// ktstr/stats/metric.rs
1use super::*;
2
3/// Definition of a metric for the comparison pipeline.
4///
5/// Each entry describes polarity (`higher_is_worse`), dual-gate
6/// significance thresholds (`default_abs`, `default_rel`), a
7/// display unit string for formatted output, and a row accessor
8/// (`accessor`) that returns the metric's value from a
9/// [`GauntletRow`] without a hand-maintained name→field match.
10///
11/// The `accessor` field is skipped in serde output because `fn`
12/// pointers are not serializable. A future `Deserialize` impl
13/// would need callers to re-hydrate the accessor by looking up
14/// `name` via [`metric_def`] — the static [`METRICS`] table is
15/// the authoritative source of the function identity. No such
16/// impl exists today; the note is a forward-conditional so that
17/// if one is added, the migration path is spelled out rather
18/// than reinvented per site.
19///
20/// # Registered vs unregistered metrics
21///
22/// The static [`METRICS`] registry is the "core metric" set with
23/// hand-authored accessors, hand-tuned dual-gate thresholds
24/// (`default_abs` / `default_rel`), and display units. Each
25/// registered `MetricDef.accessor` reads a typed field on
26/// `GauntletRow` directly (e.g. `r.spread`, `r.gap_ms`).
27///
28/// Metrics that fall OUTSIDE this registry are carried on
29/// `GauntletRow.ext_metrics: BTreeMap<String, f64>`. Registered
30/// metrics never flow through `ext_metrics`; unregistered metrics
31/// never flow through the typed fields. [`MetricDef::read`] and
32/// `read_metric` check the registered-field accessor first and
33/// fall back to an `ext_metrics.get(name)` lookup — a name that
34/// matches neither returns `None`. Consumers that want to
35/// distinguish "registered-but-null" from "unregistered-and-
36/// absent" must inspect the registry directly rather than rely
37/// on the fallback.
38///
39/// # `#[non_exhaustive]` migration note
40///
41/// Downstream code that pattern-matches an instance of `MetricDef`
42/// must end the match with `..` so a future field addition does
43/// not become a breaking change. Prefer reading values through
44/// the static [`METRICS`] registry and [`metric_def`] lookup
45/// rather than constructing `MetricDef` values by hand.
46#[derive(Debug, Clone, serde::Serialize)]
47#[non_exhaustive]
48pub struct MetricDef {
49 pub name: &'static str,
50 /// Regression direction for this metric. A metric that
51 /// previously used `higher_is_worse: true` maps to
52 /// [`Polarity::LowerBetter`](crate::test_support::Polarity::LowerBetter)
53 /// (bigger values are regressions, so smaller is better);
54 /// `false` maps to
55 /// [`Polarity::HigherBetter`](crate::test_support::Polarity::HigherBetter).
56 /// The sense is INVERSE: the old bool answered "does growing
57 /// this value mean worse?" while the enum answers "what
58 /// direction do we want this to move?".
59 pub polarity: crate::test_support::Polarity,
60 /// Temporal aggregation kind. Drives how
61 /// [`aggregate_samples`] collapses N readings of the same
62 /// metric across multiple capture samples (e.g. periodic
63 /// monitor ticks within one run, or runs pooled for a
64 /// `cargo ktstr perf-delta` comparison) into one comparable
65 /// value. Distinct from [`Self::polarity`], which is the
66 /// "good direction" of the FINAL value: kind tells us HOW to
67 /// reduce a vec of samples; polarity tells us how to interpret
68 /// the reduced number.
69 ///
70 /// Default `Counter` matches the most common shape — every
71 /// kernel monotonic counter (SCX_EV_*, ttwu_count, run_delay,
72 /// cpustat[]) collapses by sum-of-deltas. ~80% of ktstr fields
73 /// are counters; the field exists so the remaining peaks and
74 /// gauges can opt out of sum-aggregation explicitly.
75 pub kind: MetricKind,
76 /// Absolute-materiality gate: a move smaller than this (in the metric's
77 /// [`Self::display_unit`]) is never a confident change, ANDed with
78 /// [`Self::default_rel`]. Its role depends on the metric's dynamic range
79 /// across workloads:
80 ///
81 /// - **Scale-bounded** metrics (a ratio of co-scaling counters, or a
82 /// naturally-bounded unit — `%` spread, `ms`/`µs` latency, `x` ratios,
83 /// `[0,1]` fractions): `default_abs` is a fixed unit-scale measurement-
84 /// noise floor. A sub-unit move is immaterial regardless of its relative
85 /// size, so a fixed floor is correct.
86 /// - **Scale-varying** metrics (a raw per-event count, or a rate normalized
87 /// only by time — `*_per_sec`, `ops/s`, `req/s`, `ns/s`): the baseline
88 /// spans orders of magnitude across workloads, so a fixed floor calibrated
89 /// for high throughput would MASK a large relative regression on a low-
90 /// throughput workload. Here `default_abs` is only a NEAR-IDLE activity
91 /// guard (it keeps a big relative swing on a near-idle baseline from
92 /// firing) and [`Self::default_rel`] carries materiality. Three tests
93 /// enforce that every scale-varying metric keeps a near-idle floor:
94 /// `throughput_rate_floors_are_near_idle` (per-time rates + throughput
95 /// carriers), `scale_varying_count_floors_are_near_idle` (raw counts and
96 /// ns/µs accumulations), and `mixed_class_scale_varying_floors_pinned`
97 /// (the mixed-class Peak/WorstLowest/PerPhase metrics, by allowlist).
98 pub default_abs: f64,
99 pub default_rel: f64,
100 pub display_unit: &'static str,
101 #[serde(skip)]
102 pub accessor: fn(&GauntletRow) -> Option<f64>,
103}
104
105/// Temporal aggregation classification for a metric.
106///
107/// Kernel-source-grounded per the metric-semantics taxonomy.
108/// Drives [`aggregate_samples`] — the function that collapses a
109/// slice of per-sample readings of the SAME metric into one
110/// representative value for downstream regression / display.
111///
112/// Reduction semantics by variant:
113/// - [`MetricKind::Counter`] — kernel monotonic counter; the
114/// temporal aggregate is the SUM of consecutive deltas across
115/// the sample window. For pre-deltaed inputs (each sample
116/// carries its own window's count) this is `samples.iter().sum()`.
117/// - [`MetricKind::Gauge`] — instantaneous value; the
118/// [`GaugeAgg`] subkind picks Avg / Last / Max.
119/// - [`MetricKind::Peak`] — kernel-side max-of-window (e.g.
120/// `max_run_delay`, `max_newidle_lb_cost`); temporal aggregate
121/// is max-of-max so a window-wise high-water never gets
122/// diluted.
123/// - [`MetricKind::Timestamp`] — wall/rq clock; the temporal
124/// aggregate is the LAST sample's value (a snapshot of "where
125/// the clock is now"). Diffing two captures gives elapsed
126/// time, but a single window's reduction picks the latest
127/// reading — averaging timestamps is meaningless.
128// Serialize only: MetricKind is serialized as part of MetricDef (which is
129// Serialize-only) but is never deserialized. A `Deserialize` derive here
130// would narrow to `Deserialize<'static>` because the Rate variant carries
131// `&'static str` fields (serde treats `&str` as borrowed), so it would not
132// satisfy `DeserializeOwned` and would break any future container that
133// deserializes an embedded MetricKind. Drop it rather than carry a fragile,
134// unused impl.
135#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
136#[non_exhaustive]
137pub enum MetricKind {
138 /// Monotonic counter (SCX_EV_* event counters,
139 /// `cpustat[CPUTIME_*]`, `bpf_prog_stats.cnt`, `ttwu_count`,
140 /// `nr_migrations`, …). Aggregate by sum.
141 Counter,
142 /// Instantaneous value (`nr_running`, `local_dsq.nr`, current
143 /// `policy`, current `comm`). The [`GaugeAgg`] tag picks the
144 /// reduction: Avg for typical-load, Last for "what's happening
145 /// now", Max for worst-instant.
146 Gauge(GaugeAgg),
147 /// Kernel max-of-window (`max_run_delay`,
148 /// `max_newidle_lb_cost`, the per-CPU preempt-off peak).
149 /// Aggregate by max — a peak that ever fired must survive the
150 /// reduction.
151 Peak,
152 /// Clock or wall-time reading (`rq.clock`,
153 /// CLOCK_REALTIME-stamped capture timestamps). Aggregate by
154 /// Last — averaging timestamps loses meaning.
155 Timestamp,
156 /// PRE-DELTAED counter: each sample is already a delta-since-the-
157 /// previous-read, not a cumulative-since-boot total. Schedulers
158 /// that delta their scx_stats Metrics server-side per reader
159 /// request (e.g. scx_mitosis) produce this — one ktstr snapshot =
160 /// one reader request = one delta. The per-phase reduction is the
161 /// SUM of the in-phase deltas (NOT the `Counter` last-minus-first,
162 /// which would difference two deltas into nonsense); the flat-run
163 /// reduction is likewise the sum. Boundary: the first in-phase
164 /// delta straddles the phase boundary (it spans from the last
165 /// pre-phase read to the first in-phase read, so it includes a
166 /// little pre-phase activity); it is attributed to the phase its
167 /// read lands in — a slight left-edge over-attribution, the
168 /// deliberate semantic since a per-read delta cannot be split.
169 DeltaSum,
170 /// An INJECTED per-phase delta that SUMS across phases within a run but
171 /// folds by UNWEIGHTED MEAN across runs. Each per-phase value is a
172 /// thread-group CPU-time delta (`system_time_ns` / `user_time_ns`,
173 /// written directly into `PhaseBucket.metrics` by `phase_group_cpu_delta`
174 /// — there is no `read_sample` arm). The cross-PHASE fold is the SUM of the
175 /// disjoint per-phase deltas (like `Counter`), NOT a sample-count-weighted
176 /// mean: a per-phase delta is already proportional to the phase's
177 /// wall-clock duration, and the freeze `sample_count` is too, so a weighted
178 /// mean would double-count duration and collapse the run to a meaningless
179 /// duration-weighted average per-phase delta. The sum is the run's total
180 /// OBSERVED CPU time — first-to-last freeze WITHIN each phase — a lower
181 /// bound that excludes pre-first-freeze, post-last-freeze, and
182 /// inter-phase-gap windows (and single-freeze phases, which contribute
183 /// nothing), per `phase_group_cpu_delta`'s observed-window semantic; that
184 /// is sufficient for an A/B regression signal, where the same
185 /// observed-window sum is compared on both sides. Across RUNS each run
186 /// contributes exactly one total, folded by the UNWEIGHTED arithmetic mean
187 /// (`Σ / contributors`) over the runs that emitted the key — NOT weighted
188 /// by `run_sample_count` (the monitor capture count, an unrelated
189 /// population that would also silently zero-weight a monitor-off run). This
190 /// SUM-cross-phase / MEAN-cross-run pair is exactly why these deltas are
191 /// NOT `Gauge(Avg)` (weighted mean at BOTH levels — the bug this kind
192 /// fixes) and NOT `DeltaSum` (sum at both levels, which would inflate the
193 /// cross-run value by the run count). The value is run-wide POOLED (one
194 /// scalar per phase across all tgids), NOT per-cgroup; the cross-cgroup
195 /// same-step merge is `Commutative` (`a + b`, like `Counter`) but is the
196 /// defensive same-step-index path only — dead for these keys, which are
197 /// injected exactly once into the pooled host `PhaseBucket`. NOT
198 /// [`Self::is_derived`] — it carries a real per-phase value, unlike Rate /
199 /// Distribution.
200 PerPhaseDeltaSum,
201 /// Derived ratio of two component metrics — a RATE that must be
202 /// recomputed from its components at every in-map aggregation level, never
203 /// averaged as a ready-made ratio. The variant carries the registry
204 /// names of its `numerator` and `denominator` component metrics, each
205 /// itself registered with its own kind (e.g. a `Counter` numerator).
206 ///
207 /// A Rate has NO samples of its own. Its value is DERIVED from the
208 /// already-reduced component values as `map[numerator] /
209 /// map[denominator]` by the [`derive_rate_metrics`] post-pass. An
210 /// aggregation level that pools the components FIRST (each by its own
211 /// kind — a `Counter` numerator sums, a `Gauge(Avg)` averages) and
212 /// then re-derives the rate RE-POOLS correctly: for the common
213 /// `Counter / Counter` case the result is `Σnumerator / Σdenominator`,
214 /// NOT a mean of two phases' ready-made ratios `(r₁ + r₂) / 2` (which
215 /// is WRONG whenever the phases carry different denominator weight,
216 /// e.g. iterations-per-cpu-second across phases of unequal CPU time).
217 /// The numerator and denominator must already be expressed in units
218 /// whose quotient is the intended rate unit (the component
219 /// registration owns the unit choice; this variant does not scale).
220 ///
221 /// `derive_rate_metrics` runs as a post-pass at the nine aggregation
222 /// sites where the components co-locate in one map: the two per-phase
223 /// builds (`buckets_from_grouped`, `build_phase_buckets_with_stimulus`),
224 /// the cross-phase bucket merge (`merge_matched_phase_buckets`), the
225 /// three cross-RUN ext-metrics reducers (`populate_run_ext_metrics`,
226 /// `populate_run_ext_metrics_from_phases`, and `group_and_average_by`),
227 /// and the cross-CGROUP pooled re-pools
228 /// (`crate::assert::populate_run_pooled_iterations_per_cpu_sec`,
229 /// `crate::assert::populate_run_pooled_taobench`,
230 /// `crate::assert::populate_run_pooled_schbench`).
231 /// The cross-CGROUP `AssertResult::merge` ext-metrics fold itself uses
232 /// worst-case polarity (min/max) and is NOT a re-pool site; the pooled
233 /// re-pool runs separately after it, at the eval layer, reading
234 /// `stats.cgroups` directly. `iteration_rate` does not exercise the merge
235 /// fold either: it and its components are host-injected by
236 /// `populate_run_ext_metrics_from_phases` AFTER the cross-cgroup `merge`,
237 /// so the fold never sees them. The pooled `iterations_per_cpu_sec` is the
238 /// rate whose components ARE per-cgroup, and
239 /// `populate_run_pooled_iterations_per_cpu_sec` re-pools it post-merge.
240 ///
241 /// Because a single sample slice cannot express the re-pool, a Rate is
242 /// FORBIDDEN from the single-slice reducers ([`aggregate_finite`]
243 /// panics on it); the post-pass is its only producer.
244 Rate {
245 /// Registry name of the numerator component metric.
246 numerator: &'static str,
247 /// Registry name of the denominator component metric.
248 denominator: &'static str,
249 },
250 /// Derived DISTRIBUTIONAL aggregate re-pooled from a raw per-cgroup
251 /// sample set, never folded from ready-made per-cgroup reductions. The
252 /// variant names the [`SampleSource`] (which
253 /// [`crate::assert::PhaseCgroupStats`] sample vector feeds it) and the
254 /// [`SampleReduction`] (which statistic to compute over the pooled set).
255 ///
256 /// Like [`MetricKind::Rate`], a Distribution has NO value of its own at
257 /// the WITHIN-RUN levels: its run-level value is DERIVED post-merge by
258 /// `crate::assert::populate_run_distribution_metrics`, which pools the
259 /// raw samples from `stats.phases[].per_cgroup` across every phase and
260 /// cgroup and recomputes the statistic over the COMBINED set — the
261 /// percentile / CV / mean / extreme of the pooled distribution, NOT a
262 /// max or mean of per-cgroup reductions (the percentile of a union is
263 /// not the max of per-source percentiles). It is therefore FORBIDDEN
264 /// from the per-phase single-slice reducers
265 /// ([`aggregate_samples_for_phase`] returns None via
266 /// [`MetricKind::is_derived`]); the post-pass is its only within-run
267 /// producer. When the size-limited bulk frame strips the sample pools
268 /// (`crate::assert::strip_phase_cgroup_samples`), the producer falls
269 /// back to a worst-wins fold over the surviving per-cgroup `CgroupStats`
270 /// reductions so the metric degrades rather than vanishing.
271 ///
272 /// CROSS-RUN it is a HYBRID, unlike Rate: a run's components (the raw
273 /// sample vectors) do not survive into the cross-RUN ext-metrics map
274 /// (phases are dropped at the cross-RUN fold), so there is no combined
275 /// sample SET to re-pool across runs. The cross-RUN value is instead a
276 /// plain fold of the per-run derived values — an UNWEIGHTED mean (over the
277 /// runs that emitted the key, `sum / finite.len()`) for the percentile /
278 /// CV / mean reductions and a MAX for [`SampleReduction::Worst`] (the
279 /// peak run-delay) — applied by [`aggregate_finite`] over the per-run ext
280 /// values. So `is_derived`
281 /// skips it at the within-run sites, but the cross-RUN ext fold does
282 /// NOT skip it (only Rate, whose components DO survive cross-RUN, is
283 /// skipped there).
284 Distribution {
285 /// Which raw sample vector on
286 /// [`crate::assert::PhaseCgroupStats`] feeds this aggregate.
287 source: SampleSource,
288 /// Which statistic to recompute over the pooled sample set.
289 reduction: SampleReduction,
290 },
291 /// Derived LOWEST-WINS per-cgroup selector — the worst (lowest) cgroup's
292 /// `numerator / denominator` ratio across the run, re-pooled from per-cgroup
293 /// carriers rather than folded from ready-made ratios. None-aware lowest-wins
294 /// (the semantic the deleted `fold_lowest_some` carried in
295 /// [`crate::assert::AssertResult::merge`], now in
296 /// `crate::assert::populate_run_distribution_metrics`): a measured
297 /// `Some(0.0)` — a cgroup that ran zero iterations (real starvation) for the
298 /// efficiency selectors, or one with all pages off-node for
299 /// `worst_page_locality` — wins the worst bucket; a not-measured `None` (no
300 /// workers / no on-CPU time / no NUMA pages) is skipped; and an all-`None`
301 /// cohort produces no key (absence preserved as a missing ext entry, never a
302 /// `0.0`).
303 ///
304 /// Derived post-merge by
305 /// `crate::assert::populate_run_distribution_metrics`. The SOURCE depends on
306 /// the numerator: the iteration-efficiency selectors (`Iterations`) re-pool
307 /// from the `stats.cgroups[]` counters (which survive bulk-frame stripping,
308 /// so they need no degraded fallback); `worst_page_locality` (`NumaLocal`)
309 /// re-pools from the per-phase `stats.phases[].per_cgroup` NUMA carriers (the
310 /// reports-only `CgroupStats` hardcodes `page_locality` 0.0, so it cannot
311 /// source from `stats.cgroups[]`). Like Distribution it is `is_derived`
312 /// (skipped at the within-run reducers) and CROSS-RUN it MEAN-folds the
313 /// per-run derived values through [`aggregate_finite`].
314 WorstLowest {
315 /// The per-cgroup numerator (`Iterations` or `NumaLocal`).
316 numerator: WorstLowestNumerator,
317 /// The per-cgroup denominator the numerator is divided by.
318 denominator: WorstLowestDenominator,
319 },
320 /// Derived WORST-CGROUP wake-latency tail-amplification selector — the
321 /// highest per-cgroup `p99 / median` wake-latency ratio across the run.
322 /// Higher-is-worse (a stretched long tail), so "worst" is the MAX over
323 /// cgroups — the polarity-opposite of [`MetricKind::WorstLowest`]'s
324 /// lowest-wins. Re-selected post-merge by
325 /// `crate::assert::populate_run_distribution_metrics` from the
326 /// `stats.cgroups[]` entries via `CgroupStats::wake_latency_tail_ratio`
327 /// (deliberately NOT `pooled_p99 / pooled_median` of the cross-cgroup
328 /// union — that is the distinct `worst_p99_wake_latency_us` /
329 /// `worst_median_wake_latency_us` Distribution pair). Like Distribution /
330 /// WorstLowest it is [`MetricKind::is_derived`] (skipped at the within-run
331 /// reducers); the producer emits NO key when the run is below the
332 /// [`WAKE_LATENCY_TAIL_RATIO_MIN_ITERATIONS`] noise floor or no cgroup
333 /// carried a measurable tail (absence preserved as a missing ext entry,
334 /// never a `0.0` sentinel — the no-false-zero contract the deleted typed
335 /// field could not express).
336 ///
337 /// CROSS-RUN it folds, like every WorstLowest selector, by the UNWEIGHTED
338 /// exclude-missing MEAN through [`aggregate_finite`] (`sum / finite.len()`
339 /// over the runs that emitted the key) — the cohort's TYPICAL worst-cgroup
340 /// tail amplification, deliberately NOT a MAX: peak-of-peaks is reserved
341 /// for [`SampleReduction::Worst`] (a peak detector answering "did this ever
342 /// fire"), whereas this answers "what is this cohort's characteristic
343 /// worst-cgroup tail". A run below the floor never enters the mean, so no
344 /// sub-threshold run dilutes the cohort (the bug the ext relocation fixed:
345 /// the deleted typed cross-RUN fold summed every passing run's raw ratio
346 /// over `passes_observed`, folding noisy low-N runs in as real values).
347 WakeLatencyTailRatio,
348 /// Derived WORST-CGROUP cross-node migration-churn selector — the highest
349 /// per-cgroup `cross_node_migrated / numa_pages_total` ratio across the run.
350 /// LowerBetter (more cross-node migration is worse), so "worst" is the MAX
351 /// over cgroups — the polarity twin of `worst_page_locality`
352 /// ([`MetricKind::WorstLowest`] `NumaLocal`/`NumaTotal`, lowest-wins), sharing
353 /// the same per-phase NUMA carriers and the `numa_agg_per_cgroup` helper.
354 /// Re-pooled post-merge by
355 /// `crate::assert::populate_run_distribution_metrics` from
356 /// `stats.phases[].per_cgroup`: the cross-phase fold SUMs the per-phase
357 /// migration-counter deltas over the LATEST residency total
358 /// (`cross_node_migration_ratio_of(summed_migrated, latest_total)`), then
359 /// MAXes across cgroups that measured NUMA residency (`numa_pages_total > 0`);
360 /// a never-measured cohort yields no key (absence preserved as a missing ext
361 /// entry, never a `0.0`). A CHURN ratio (cumulative migration EVENTS over a
362 /// residency SNAPSHOT) — can legitimately exceed 1.0, NOT a bounded `[0,1]`
363 /// fraction. A dedicated max-selector like [`MetricKind::WakeLatencyTailRatio`]
364 /// (no generic numerator/denominator), since cross_node is the sole max-wins
365 /// phase-carrier ratio. Like the other derived kinds it is
366 /// [`MetricKind::is_derived`] (skipped at the within-run reducers) and
367 /// CROSS-RUN MEAN-folds through [`aggregate_finite`] over the runs that
368 /// EMITTED the key — the cohort's typical worst-cgroup churn. The deleted
369 /// typed `Gauge(Last)` field instead averaged its value over
370 /// `passes_observed`, folding a NUMA-less passing run's `0.0` sentinel into
371 /// the mean (dilution); the ext re-pool writes no key for a never-measured
372 /// run (absence preserved), so it never enters the divisor.
373 WorstCrossNodeRatio,
374 /// Per-phase-only scalar derived ONCE per phase from a phase-scoped
375 /// carrier, NOT from monitor samples and NOT re-pooled run-level. The sole
376 /// producer is [`crate::assert::derive_phase_metrics`], which derives two
377 /// families per phase:
378 /// - the schbench scalars: it pools each phase's
379 /// `crate::assert::PhaseCgroupStats` schbench carriers and re-derives a
380 /// percentile (from the merged latency histogram), a sample-weighted mean
381 /// (run-delay), or a count (loop_count), writing them into BOTH each
382 /// carrier's `PhaseCgroupStats::metrics` (per-cgroup, read via
383 /// `phase_cgroup_metric`) AND the pooled
384 /// `crate::assert::PhaseBucket::metrics` (read via `phase_metric`).
385 /// - the NON-schbench carrier scalars (wake/run-delay/off-cpu distributions
386 /// + migration/iterations/locality ratios, via `write_carrier_scalars`):
387 /// written ONLY into each `PhaseCgroupStats::metrics` (per-cgroup, read
388 /// via `phase_cgroup_metric`) — these have NO pooled
389 /// `crate::assert::PhaseBucket::metrics` entry; their run-level aggregate
390 /// is the `worst_*` ext-metrics key.
391 /// It is [`MetricKind::is_derived`] (so the within-run reducers —
392 /// [`aggregate_samples_for_phase`] and the phase-bucket merge loop — skip
393 /// it) and has NO run-level producer; it is ADDITIONALLY gated out of the
394 /// cross-RUN ext fold (`fold_ext_metrics`), since a per-phase scalar has no
395 /// meaningful cross-run aggregate. A unit marker (no payload): the
396 /// derivation owns the metric-name→computation mapping, so the kind need not
397 /// carry a percentile selector (which would leak `plat::Pct` through this
398 /// public enum).
399 // doc_lazy_continuation: pre-existing list-item wording surfaced by the clippy
400 // 1.94 bump; renders fine. Suppress rather than reflow the prose.
401 #[allow(clippy::doc_lazy_continuation)]
402 PerPhase,
403 /// A WHOLE-RUN distributional value (a percentile / min / max) re-pooled
404 /// run-level by UNIONING the per-phase per-cgroup raw distribution carriers
405 /// and re-deriving the statistic over the union — the schbench engine's
406 /// `*_whole` wakeup / request / rps keys, written by
407 /// `crate::assert::populate_run_pooled_schbench_distribution` from the
408 /// `PhaseCgroupStats::schbench` `PlatStats` histograms (`PlatStats::combine`
409 /// is an associative bucket-count add, so the merged histogram is the faithful
410 /// union and the re-derived percentile is the percentile OF the pooled sample
411 /// set, NOT a mean of per-source percentiles). UNLIKE
412 /// [`MetricKind::Distribution`] it is NOT cross-RUN folded: a percentile of a
413 /// union is not a mean of per-run percentiles, and the per-phase histograms
414 /// are dropped at the cross-run boundary (no pooled set survives to
415 /// re-derive), so the only honest cross-run treatment is the per-run
416 /// noise-compare — `crate::stats::noise_findings` reads each run's own
417 /// `*_whole` scalar and compares the spread, never averaging them. So it is
418 /// [`MetricKind::is_derived`] (the within-run reducers skip it; the value is
419 /// produced solely by the run-level union populate) AND gated out of the
420 /// cross-RUN ext fold (`fold_ext_metrics`); `noise_findings` is its only
421 /// consumer. ext-only (`accessor |_| None`); the `*_whole` names are distinct
422 /// from the per-phase [`MetricKind::PerPhase`] percentile keys (one registry
423 /// name = one kind) — the established per-phase-vs-whole-run coexistence (as
424 /// the taobench and schbench loop/run-delay whole-run keys also do).
425 PerRunDistribution,
426}
427
428/// Sub-classification for [`MetricKind::Gauge`] picking the
429/// per-window reduction. Most ktstr gauges are Avg ("typical-load
430/// over the window"); Last fits "current state" snapshots like
431/// `comm` / `policy`; Max fits worst-instant queue-depth probes.
432// Serialize-only, matching its container MetricKind (which is Serialize-only)
433// and the sibling MetricKind sub-enums (SampleSource / SampleReduction /
434// WorstLowestNumerator / WorstLowestDenominator). Nothing deserializes a
435// MetricKind / GaugeAgg, so the prior Deserialize derive was dead.
436#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
437#[non_exhaustive]
438pub enum GaugeAgg {
439 /// Reduce by arithmetic mean. Default for `nr_running`-style
440 /// gauges where the question is "what was the typical load".
441 Avg,
442 /// Take the latest sample. Default for `comm` / `policy` /
443 /// `cgroup_path`-style snapshots where the value is "what is
444 /// it RIGHT NOW".
445 Last,
446 /// Take the max sample. Useful when a gauge is being used to
447 /// detect a worst-case regression (e.g. queue-depth probe
448 /// where any spike is the signal of interest).
449 Max,
450}
451
452/// The raw per-cgroup sample vector on
453/// [`crate::assert::PhaseCgroupStats`] that a [`MetricKind::Distribution`]
454/// re-pools over. Each variant maps to exactly one un-reduced sample
455/// vector the per-phase per-cgroup carrier holds (stored RAW in
456/// nanoseconds; the [`SampleReduction`] applies the ns→µs scale once).
457#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
458#[non_exhaustive]
459// The `Ns` suffix on every variant documents the unit (all sources are RAW
460// nanoseconds) at each use site, not just in the enum doc; clippy's
461// enum_variant_names is a style heuristic that misfires on a meaningful shared
462// unit suffix — renaming would drop the unit, so the suffix is kept.
463#[allow(clippy::enum_variant_names)]
464pub enum SampleSource {
465 /// Per-wakeup latency samples in ns
466 /// (`crate::assert::PhaseCgroupStats::wake_latencies_ns`). One sample per
467 /// observed wakeup (reservoir-capped per cgroup), so the pooled set is the
468 /// cross-cgroup union of those capped per-wakeup samples.
469 WakeLatencyNs,
470 /// Per-worker schedstat run-delay samples in ns
471 /// (`crate::assert::PhaseCgroupStats::run_delays_ns`). One sample per worker
472 /// — each is that worker's `sched_info.run_delay` delta over the carrier's
473 /// window (whole-run last-minus-first for the step-local carrier; the
474 /// per-phase delta for the backdrop slice carrier), so the pool size is the
475 /// worker count, NOT a per-wakeup stream like `WakeLatencyNs`.
476 RunDelayNs,
477 /// Per-timer-cycle latency samples in ns
478 /// (`crate::assert::PhaseCgroupStats::timer_latencies_ns`). One sample per
479 /// `clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME)` wake — the observed
480 /// wake time minus the absolute deadline, floored at 0 — recorded by
481 /// [`crate::workload::WorkType::TimerLatency`] (reservoir-capped per cgroup
482 /// like `WakeLatencyNs`). Distinct from `WakeLatencyNs` so cyclictest-style
483 /// timer latency does not blur with the blocking variants' wake latency in a
484 /// shared sidecar.
485 TimerLatencyNs,
486}
487
488/// The statistic a [`MetricKind::Distribution`] computes over its pooled
489/// [`SampleSource`] set. Each maps to the matching reduction
490/// `crate::assert::cgroup_stats` computes per cgroup, so the run-level
491/// re-pool reproduces that reduction over the COMBINED cross-cgroup set
492/// rather than folding ready-made per-cgroup reductions.
493#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
494#[non_exhaustive]
495pub enum SampleReduction {
496 /// 99th percentile (nearest-rank), ns→µs.
497 P99,
498 /// 99.9th percentile (nearest-rank), ns→µs — the deep tail an RT /
499 /// cyclictest-style latency probe turns on (a single max is one sample;
500 /// p99.9 is the robust deep-tail percentile between p99 and max).
501 P999,
502 /// Median (50th percentile, nearest-rank), ns→µs.
503 Median,
504 /// Coefficient of variation (stddev / mean) over the pooled set,
505 /// `n = pool.len()`. Unitless.
506 Cv,
507 /// Arithmetic mean over the pooled set, ns→µs.
508 Mean,
509 /// Maximum (worst) sample over the pooled set, ns→µs. CROSS-RUN this is
510 /// the one reduction [`aggregate_finite`] folds by MAX (peak survives),
511 /// not MEAN — see [`MetricKind::Distribution`].
512 Worst,
513}
514
515/// The per-cgroup numerator of a [`MetricKind::WorstLowest`] lowest-wins
516/// selector. `#[non_exhaustive]`, mirroring [`MetricKind::Rate`]'s `numerator`.
517/// The producer branches on the numerator to pick the per-cgroup SOURCE:
518/// `Iterations` reads the `crate::assert::CgroupStats` counters; `NumaLocal`
519/// reads the per-phase NUMA carriers (`page_locality` is structurally 0.0 in the
520/// reports-only `CgroupStats`, so it must come from the phase carriers).
521#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
522#[non_exhaustive]
523pub enum WorstLowestNumerator {
524 /// Per-cgroup total iteration count
525 /// (`crate::assert::CgroupStats::total_iterations`).
526 Iterations,
527 /// Per-cgroup pages resident on the expected NUMA node(s) — the
528 /// page-locality numerator, the LATEST per-phase residency snapshot summed
529 /// across the cgroup's workers (`crate::assert::PhaseCgroupStats::numa_pages_local`),
530 /// NOT a `CgroupStats` counter.
531 NumaLocal,
532}
533
/// The per-cgroup denominator a [`MetricKind::WorstLowest`] numerator
/// ([`WorstLowestNumerator`]) is divided by to form the lowest-wins ratio.
///
/// Each variant names the counter or snapshot the denominator is read from
/// and the ratio it produces when combined with its numerator.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
#[non_exhaustive]
pub enum WorstLowestDenominator {
    /// Worker count (`crate::assert::CgroupStats::num_workers`) — yields
    /// iterations-per-worker (raw throughput, scales with the CPU budget).
    NumWorkers,
    /// On-CPU nanoseconds (`crate::assert::CgroupStats::total_cpu_time_ns`),
    /// converted ns→s ONCE on the summed counter — yields the
    /// overcommit-invariant iterations-per-CPU-second efficiency.
    CpuTimeNs,
    /// Total resident pages — the page-locality denominator, the LATEST
    /// per-phase residency snapshot (`crate::assert::PhaseCgroupStats::numa_pages_total`),
    /// shared with the cross-node ratio. Paired with
    /// [`WorstLowestNumerator::NumaLocal`] to yield the page-locality
    /// fraction; the per-cgroup ratio is absent (None) when the cgroup
    /// measured no NUMA pages.
    NumaTotal,
}
553
/// How a per-phase metric reduction merges across two
/// [`crate::assert::AssertResult`]s that both carry a
/// [`crate::assert::PhaseBucket`] at the same `step_index`.
///
/// Driven by [`MetricKind::merge_kind`] so a future
/// [`MetricKind`] addition is forced to declare its merge
/// semantic explicitly (the match is `#[non_exhaustive]`-aware
/// via the helper rather than a bare `match` in every caller).
///
/// The split mirrors the rolling-aggregation contract in
/// [`AssertResult::merge`](crate::assert::AssertResult::merge): the
/// per-phase fold must commute so the accumulator pattern
/// `AssertResult::pass().merge(real_a).merge(real_b)` yields the
/// same result whether merges arrive in `a→b` or `b→a` order
/// — EXCEPT for kinds whose reduction is intrinsically the LAST
/// sample (`Gauge(Last)`, `Timestamp`), where the merge must
/// resolve to the bucket whose `end_ms` is later.
///
/// Counter, DeltaSum, PerPhaseDeltaSum, Peak, and Gauge(Max/Avg) are
/// commutative because their reductions are sum / sum / sum / max /
/// max-or-weighted-mean respectively — all associative, commutative
/// folds over reduced values. Gauge(Last) and Timestamp are NOT
/// commutative under a per-merge cumulative fold (the "later" sample
/// wins) so the merge uses `end_ms` as the tiebreaker rather than the
/// operand order. Every derived kind (see [`MetricKind::is_derived`])
/// is classified [`MergeKind::Recompute`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum MergeKind {
    /// The reduction commutes: `merge(a, b) == merge(b, a)`. The
    /// merge folds the two reduced values via the kind's natural
    /// commutative operation (Counter / DeltaSum / PerPhaseDeltaSum →
    /// sum, Peak / Gauge(Max) → max, Gauge(Avg) → weighted mean by
    /// `sample_count`).
    Commutative,
    /// The reduction is "the LATEST sample's value" (Gauge(Last),
    /// Timestamp). The merge resolves to the value from whichever
    /// bucket has the later `end_ms`; ties keep `self`.
    NonCommutative,
    /// The value is DERIVED post-merge from pooled components, never folded
    /// from two already-reduced values. Covers every
    /// [`MetricKind::is_derived`] kind:
    /// - [`MetricKind::Rate`]: re-derived as `Σnumerator / Σdenominator` from
    ///   its component keys by [`derive_rate_metrics`];
    /// - [`MetricKind::Distribution`] / [`MetricKind::WorstLowest`] /
    ///   [`MetricKind::WakeLatencyTailRatio`] / [`MetricKind::WorstCrossNodeRatio`]:
    ///   re-pooled post-merge by
    ///   `crate::assert::populate_run_distribution_metrics`;
    /// - [`MetricKind::PerPhase`]: re-derived per phase by
    ///   `crate::assert::derive_phase_metrics`;
    /// - [`MetricKind::PerRunDistribution`]: derived run-level by
    ///   `populate_run_pooled_schbench_distribution` (percentile re-derived
    ///   over the union of the per-phase histograms).
    ///
    /// The per-metric merge loop skips these derived keys entirely and the
    /// post-pass produces them, so this variant is classification metadata: no
    /// merge dispatches on it.
    Recompute,
}
607
608impl MetricKind {
609 /// Map each [`MetricKind`] variant to the corresponding
610 /// [`MergeKind`] used by per-phase
611 /// [`AssertResult::merge`](crate::assert::AssertResult::merge).
612 /// Centralising the mapping here means a future kind
613 /// addition fails the build until the new variant is wired
614 /// (the inner `match` is exhaustive even though `MetricKind`
615 /// is `#[non_exhaustive]` because this fn lives in the same
616 /// crate).
617 pub fn merge_kind(self) -> MergeKind {
618 match self {
619 MetricKind::Counter => MergeKind::Commutative,
620 MetricKind::Peak => MergeKind::Commutative,
621 MetricKind::Gauge(GaugeAgg::Avg) => MergeKind::Commutative,
622 MetricKind::Gauge(GaugeAgg::Max) => MergeKind::Commutative,
623 MetricKind::Gauge(GaugeAgg::Last) => MergeKind::NonCommutative,
624 MetricKind::Timestamp => MergeKind::NonCommutative,
625 // Per-phase reduction is a sum of in-phase deltas — an
626 // associative, commutative fold, so cross-AssertResult merge
627 // sums the two reduced values (same as Counter).
628 MetricKind::DeltaSum => MergeKind::Commutative,
629 // PerPhaseDeltaSum: the cross-cgroup same-step merge is a
630 // commutative sum (`a + b`, like Counter), but for these run-wide
631 // POOLED keys (one scalar per phase, injected once into the host
632 // bucket) it is the defensive same-step-index path only — two real
633 // values are never summed. (The SUM-cross-phase / MEAN-cross-run
634 // split is handled at the fold sites, not in this merge.)
635 MetricKind::PerPhaseDeltaSum => MergeKind::Commutative,
636 // A Rate is re-derived from its pooled components, never
637 // folded from two ready-made ratios.
638 MetricKind::Rate { .. } => MergeKind::Recompute,
639 // Distribution and WorstLowest are derived post-merge by
640 // `populate_run_distribution_metrics` (re-pooled from the
641 // per-cgroup raw samples / counters), so the per-phase merge
642 // loop skips them and re-derives — classification-only, like
643 // Rate. See [`MetricKind::is_derived`].
644 MetricKind::Distribution { .. } => MergeKind::Recompute,
645 MetricKind::WorstLowest { .. } => MergeKind::Recompute,
646 // Worst-cgroup wake-latency tail ratio: derived post-merge by
647 // `populate_run_distribution_metrics` (max over the merged
648 // `stats.cgroups` per-cgroup ratios), so the per-phase merge loop
649 // skips and re-derives it — classification-only, like the other
650 // derived kinds.
651 MetricKind::WakeLatencyTailRatio => MergeKind::Recompute,
652 // Worst-cgroup cross-node migration churn: derived post-merge by
653 // `populate_run_distribution_metrics` (max over the per-phase NUMA
654 // carriers' per-cgroup churn ratio), so the per-phase merge loop skips
655 // and re-derives it — classification-only, like the other derived kinds.
656 MetricKind::WorstCrossNodeRatio => MergeKind::Recompute,
657 // PerPhase is derived post-merge by `derive_phase_metrics` (schbench
658 // scalars into PhaseBucket.metrics + PhaseCgroupStats::metrics;
659 // non-schbench carrier scalars into PhaseCgroupStats::metrics only);
660 // the per-phase merge loop skips it (is_derived) and never re-derives
661 // via a kind — classification-only, like the other derived kinds.
662 MetricKind::PerPhase => MergeKind::Recompute,
663 // PerRunDistribution is derived run-level by
664 // `populate_run_pooled_schbench_distribution` (union of the per-phase
665 // per-cgroup PlatStats histograms, percentile re-derived over the
666 // union); the per-phase merge loop skips it (is_derived) and never
667 // re-derives via a kind — classification-only, like the other derived
668 // kinds.
669 MetricKind::PerRunDistribution => MergeKind::Recompute,
670 }
671 }
672
673 /// Whether this kind is DERIVED post-merge from other data rather than
674 /// reduced from its own per-phase sample slice: [`MetricKind::Rate`]
675 /// (from numerator/denominator components), [`MetricKind::Distribution`]
676 /// (re-pooled from the per-cgroup raw sample sets), [`MetricKind::WorstLowest`]
677 /// (lowest-wins over per-cgroup counters),
678 /// [`MetricKind::WakeLatencyTailRatio`] (max over the per-cgroup p99/median
679 /// wake-latency ratios, floor-gated), and [`MetricKind::WorstCrossNodeRatio`]
680 /// (max over the per-cgroup cross-node migration-churn ratios).
681 ///
682 /// Drives the WITHIN-RUN skip-sites that must not reduce a derived kind
683 /// from a slice: [`aggregate_samples_for_phase`] returns None, and the
684 /// per-phase build, the cross-phase
685 /// `crate::assert::merge_matched_phase_buckets` key-loop, and
686 /// [`crate::assert::populate_run_ext_metrics_from_phases`] all skip the
687 /// key then re-derive.
688 ///
689 /// NOT a uniform cross-RUN skip: at the cross-RUN ext fold
690 /// ([`group_and_average_by`], via `fold_ext_metrics`) [`MetricKind::Rate`],
691 /// [`MetricKind::PerPhase`], AND [`MetricKind::PerRunDistribution`] are
692 /// skipped — Rate's components survive cross-RUN so it re-derives there;
693 /// PerPhase is a per-phase-only scalar with no meaningful cross-RUN aggregate
694 /// (its skip also keeps [`aggregate_finite`]'s `PerPhase => unreachable!`
695 /// unreachable); and PerRunDistribution is a percentile-of-union whose
696 /// per-phase histograms are not shipped cross-RUN (a percentile of a union is
697 /// not a mean of per-run percentiles), so it is noise-compared per-run, never
698 /// folded — while Distribution / WorstLowest / WakeLatencyTailRatio /
699 /// WorstCrossNodeRatio, whose components do NOT survive cross-RUN, fall through
700 /// to be plainly folded (MEAN, or MAX for [`SampleReduction::Worst`]) by
701 /// [`aggregate_finite`]. So callers gate on `is_derived` for the within-run
702 /// sites and on `matches!(.., Rate { .. } | PerPhase | PerRunDistribution)`
703 /// for the cross-RUN ext fold.
704 pub fn is_derived(self) -> bool {
705 matches!(
706 self,
707 MetricKind::Rate { .. }
708 | MetricKind::Distribution { .. }
709 | MetricKind::WorstLowest { .. }
710 | MetricKind::WakeLatencyTailRatio
711 | MetricKind::WorstCrossNodeRatio
712 | MetricKind::PerPhase
713 | MetricKind::PerRunDistribution
714 )
715 }
716}
717
718/// Reduce a slice of per-sample readings of the same metric into
719/// one representative value, dispatching on [`MetricKind`]. Used
720/// by sample-windowed comparison paths (e.g. multi-tick monitor
721/// captures, perf-delta across multiple snapshot
722/// subdirectories) to collapse a sample vec into the value the
723/// existing scalar-comparison pipeline already understands.
724///
725/// Returns `None` when `samples` is empty — the caller decides
726/// whether absence is a missing-data condition or a benign
727/// "no samples in window" result. NaN samples are dropped from
728/// the reduction (same semantics as the existing percentile()
729/// helper); a final all-NaN input also returns `None`.
730///
731/// Semantics by kind:
732/// - `Counter` → sum of finite samples — the flat-run reduction
733/// for cross-RUN aggregation. NOT the right semantic for
734/// per-phase reduction of a cumulative-since-boot Counter
735/// (which would over-count). Callers wanting per-phase
736/// Counter reduction use [`aggregate_samples_for_phase`],
737/// which routes Counter through a dedicated last-minus-first
738/// branch instead of dispatching through here.
739/// - `Gauge(Avg)` → arithmetic mean of finite samples.
740/// - `Gauge(Last)` → last finite sample.
741/// - `Gauge(Max)` → max of finite samples.
742/// - `Peak` → max of finite samples.
743/// - `Timestamp` → last finite sample.
744///
745/// Live caller: [`aggregate_samples_for_phase`] dispatches every
746/// non-Counter kind through this entry point so the per-phase
747/// reduction inherits the flat-run semantic for Gauge / Peak /
748/// Timestamp without restating it. That fn is itself folded by
749/// [`crate::assert::build_phase_buckets`] whose live caller is
750/// the host-side `evaluate_vm_result` AssertResult-population
751/// site at `src/test_support/eval/mod.rs`.
752pub fn aggregate_samples(samples: &[f64], kind: MetricKind) -> Option<f64> {
753 let finite: Vec<f64> = samples.iter().copied().filter(|x| x.is_finite()).collect();
754 aggregate_finite(&finite, |_| 1, kind)
755}
756
757/// Weighted variant of [`aggregate_samples`]. Takes a slice of
758/// `(value, weight)` pairs so the lock-step shape is enforced by
759/// the type — there is no length-mismatch class for the caller to
760/// trigger. Weight is consulted for [`MetricKind::Gauge`] with
761/// [`GaugeAgg::Avg`] (weighted mean); other kinds fold by their
762/// natural reduction and ignore weight.
763///
764/// NaN-valued pairs drop along with their weight (filter operates
765/// on the value field — no risk of weights misaligning to other
766/// samples after filtering, unlike the previous parallel-slice
767/// shape).
768///
769/// Zero total weight degenerates to the unweighted mean per the
770/// `merge_metric_values` precedent. Weight sum uses `checked_add`
771/// with fallback to unweighted on overflow so a pathological
772/// caller can't crash the aggregator.
773pub fn aggregate_samples_weighted(pairs: &[(f64, usize)], kind: MetricKind) -> Option<f64> {
774 let finite: Vec<(f64, usize)> = pairs
775 .iter()
776 .copied()
777 .filter(|(x, _)| x.is_finite())
778 .collect();
779 if finite.is_empty() {
780 return None;
781 }
782 let values: Vec<f64> = finite.iter().map(|(x, _)| *x).collect();
783 aggregate_finite(&values, |i| finite[i].1, kind)
784}
785
/// Inner fold shared by [`aggregate_samples`] (uniform weights)
/// and [`aggregate_samples_weighted`] (caller-supplied weights).
///
/// `weight_for(i)` returns the weight for the i-th element of
/// `finite`; callers either pass `|_| 1` (unweighted) or a
/// closure that reads from their pair vec (weighted). Both callers
/// pre-filter with `is_finite`, so `finite` carries neither NaN nor
/// ±infinity and the closure indexes into a known-good vec without
/// risking shape drift. The weight is consulted only by the
/// `Gauge(Avg)` arm.
///
/// Returns `None` when `finite` is empty; otherwise `Some` of the
/// kind-specific reduction documented on each arm below.
///
/// # Panics
///
/// Panics via `unreachable!` when `finite` is non-empty and `kind` is
/// `Rate`, `PerPhase`, or `PerRunDistribution` — those kinds are
/// produced by dedicated post-passes and reaching this fold with one
/// is a routing bug.
fn aggregate_finite(
    finite: &[f64],
    weight_for: impl Fn(usize) -> usize,
    kind: MetricKind,
) -> Option<f64> {
    if finite.is_empty() {
        return None;
    }
    Some(match kind {
        // Counter (cumulative-since-boot, cross-RUN flat sum) and
        // DeltaSum (each sample already a per-read delta) both reduce to
        // a plain sum of the finite samples here; they differ only in
        // the PER-PHASE path (Counter last-minus-first vs DeltaSum sum —
        // see aggregate_samples_for_phase).
        MetricKind::Counter | MetricKind::DeltaSum => finite.iter().sum(),
        // PerPhaseDeltaSum at the CROSS-RUN fold: each contributor is one run's
        // already-summed per-phase total, so runs fold by the UNWEIGHTED
        // arithmetic mean over the runs that emitted the key — `Σ / len`, the
        // same shape as the Distribution / WorstLowest cross-run mean below and
        // deliberately NOT weighted by `run_sample_count`. The CROSS-PHASE sum
        // that builds each run's total is done directly in
        // `crate::assert::populate_run_ext_metrics_from_phases`, which does NOT
        // route this kind through `aggregate_finite`, so this arm is intended
        // to run only at the cross-RUN ext fold.
        // NOTE(review): `aggregate_samples_for_phase` forwards every
        // non-Counter, non-derived kind to `aggregate_samples`, and
        // PerPhaseDeltaSum is not `is_derived`, so this arm would also run
        // within-run if a per-phase sample slice were ever collected for such
        // a metric — presumably none is; confirm against the producers.
        MetricKind::PerPhaseDeltaSum => finite.iter().sum::<f64>() / (finite.len() as f64),
        // Distribution Worst (peak run-delay): the cross-RUN fold is MAX
        // so the high-water peak survives, distinct from the MEAN-folded
        // percentile / CV / mean reductions below. (WITHIN-RUN no
        // derived kind reaches here — `is_derived` skips every one at
        // the per-phase reducers; this arm only fires at the cross-RUN ext
        // fold in `group_and_average_by`.) Matched before the general
        // `Distribution { .. }` mean arm so Worst takes MAX, not MEAN —
        // the arm ORDER is load-bearing.
        MetricKind::Distribution {
            reduction: SampleReduction::Worst,
            ..
        } => finite.iter().copied().fold(f64::NEG_INFINITY, f64::max),
        // Cross-RUN MEAN fold of the remaining Distribution reductions (p99 /
        // p99.9 / median / CV / mean run-delay) and every WorstLowest
        // selector: each per-run value is itself a within-run pooled reduction
        // or a lowest-wins selector, NOT a monitor-sampled gauge, so the
        // cross-RUN fold is an UNWEIGHTED arithmetic mean —
        // `sum / finite.len()`, i.e. over the runs that EMITTED a finite value
        // for the key. This matches the unweighted-mean SHAPE of the surviving
        // typed siblings (spread, migration_ratio), but its
        // divisor is the present-finite-contributor count, NOT the typed path's
        // `sum / passes_observed`: a passing run that omitted the key (absent /
        // dropped-non-finite ext entry) is EXCLUDED from the mean rather than
        // folded in as 0.0 — the deliberate no-false-zero improvement the ext
        // relocation buys (the old typed field defaulted a no-data run to 0.0).
        // Weighting by `run_sample_count` (the MONITOR capture count) would
        // weight by an unrelated population AND silently zero-weight a
        // monitor-off run, so it is deliberately NOT used here. (WITHIN-RUN
        // these never reach here — `is_derived` skips them at the per-phase
        // reducers; this arm only fires at the cross-RUN ext fold in
        // `group_and_average_by`.)
        //
        // EXTREMUM ASYMMETRY (on the record, ratified): every WorstLowest
        // selector is a within-run lowest-wins ("worst cgroup") value yet folds
        // cross-RUN by this MEAN, NOT by an extremum — UNLIKE worst_run_delay_us
        // (SampleReduction::Worst), whose dedicated MAX arm above preserves the
        // peak-of-peaks. Both reproduce the deleted typed cross-RUN folds
        // exactly: run-delay is a peak detector (MAX), the iteration
        // efficiencies are a starvation-floor cohort statistic (MEAN). Aligning
        // WorstLowest to an extremum (a MIN arm gated on HigherBetter) would be
        // a future product decision, tracked separately, not a Stage-1 fix.
        //
        // HYBRID caveat (sharpest for CV): a cross-RUN value here is a
        // mean-of-per-run-reductions, NOT a reduction recomputed over the
        // combined raw set — the raw samples do not survive cross-RUN (phases
        // are dropped), so there is no union to re-pool. For p99 / median /
        // mean run-delay this mean-of-summaries is a defensible cohort
        // statistic; for worst_wake_latency_cv it is a mean-of-ratios (the
        // fold-of-ready-made-ratios shape the Rate kind exists to avoid), not a
        // pooled CV — accepted here only because no combined set exists to
        // recompute over, and it reproduces the deleted typed path's shape
        // exactly. See [`MetricKind::Distribution`].
        MetricKind::Distribution { .. }
        | MetricKind::WorstLowest { .. }
        | MetricKind::WakeLatencyTailRatio
        | MetricKind::WorstCrossNodeRatio => finite.iter().sum::<f64>() / (finite.len() as f64),
        MetricKind::Gauge(GaugeAgg::Avg) => {
            // Weighted mean: sum(v * w) / sum(w). Uniform-weight
            // callers (aggregate_samples) reduce to arithmetic
            // mean per weight_for == |_| 1. Zero total weight
            // degenerates to the unweighted mean rather than
            // dividing by zero; mirrors `merge_metric_values` at
            // `crate::assert::merge_matched_phase_buckets` per
            // single-source-of-truth.
            //
            // `checked_add` on the running weight sum so a
            // pathological caller (huge per-RUN sample counts
            // across many runs) is DETECTED rather than wrapping
            // silently in release. The sum does not saturate: on
            // overflow `try_fold` yields `None`, `unwrap_or(0)` turns
            // that into a zero total, and the zero-total branch below
            // collapses to the unweighted-mean fallback so the
            // returned value stays plausible.
            let total_weight: usize = finite
                .iter()
                .enumerate()
                .try_fold(0usize, |acc, (i, _)| acc.checked_add(weight_for(i)))
                .unwrap_or(0);
            if total_weight == 0 {
                finite.iter().sum::<f64>() / (finite.len() as f64)
            } else {
                finite
                    .iter()
                    .enumerate()
                    .map(|(i, x)| *x * (weight_for(i) as f64))
                    .sum::<f64>()
                    / (total_weight as f64)
            }
        }
        // "Latest value" kinds: the slice is taken in caller order, so the
        // final element is the last finite sample.
        MetricKind::Gauge(GaugeAgg::Last) | MetricKind::Timestamp => {
            *finite.last().expect("non-empty by check above")
        }
        // High-water kinds: the NEG_INFINITY seed never survives because the
        // slice is non-empty and every element is finite.
        MetricKind::Gauge(GaugeAgg::Max) | MetricKind::Peak => {
            finite.iter().copied().fold(f64::NEG_INFINITY, f64::max)
        }
        // A Rate is derived from its components by `derive_rate_metrics`,
        // never reduced from a single sample slice (one slice cannot
        // express Σnum/Σdenom). EVERY aggregation path skips Rate before
        // reaching the reducers: `aggregate_samples_for_phase` returns
        // None, and the per-phase build, the cross-phase merge, and both
        // cross-RUN reducers skip Rate keys then re-derive via
        // `derive_rate_metrics`. So reaching here is a routing bug.
        MetricKind::Rate { .. } => unreachable!(
            "MetricKind::Rate must be derived via derive_rate_metrics, \
             not reduced from a sample slice"
        ),
        // PerPhase is derived post-merge by derive_phase_metrics (into
        // PhaseBucket.metrics and/or PhaseCgroupStats::metrics) and is gated out
        // of the cross-RUN ext fold (fold_ext_metrics) + the within-run reducers
        // (is_derived), so it never reaches a sample-slice reduction. Reaching
        // here is a routing bug (the gate or is_derived was bypassed).
        MetricKind::PerPhase => unreachable!(
            "MetricKind::PerPhase is derived by derive_phase_metrics, \
             not reduced from a sample slice"
        ),
        // PerRunDistribution is produced run-level by
        // populate_run_pooled_schbench_distribution / populate_run_pooled_taobench_distribution
        // (union of the per-phase PlatStats histograms) and is gated out of BOTH
        // the within-run reducers (is_derived) AND the cross-RUN ext fold
        // (fold_ext_metrics skip) — its only consumer is noise_findings reading the
        // per-run scalar. So it never reaches a sample-slice reduction; reaching
        // here is a routing bug.
        MetricKind::PerRunDistribution => unreachable!(
            "MetricKind::PerRunDistribution is produced by \
             populate_run_pooled_schbench_distribution / populate_run_pooled_taobench_distribution \
             and noise-compared per-run, not reduced from a sample slice"
        ),
    })
}
944
945/// Per-phase metric reduction with the correct semantic per
946/// [`MetricKind`].
947///
948/// Counter kinds bypass [`aggregate_samples`]'s flat-run `sum`
949/// (which is correct for cross-RUN aggregation, but wrong for
950/// cumulative-since-boot per-phase data — summing 10 samples at
951/// `[100, 150, 175, ...]` yields ~425 instead of the per-phase
952/// delta `175 - 100 = 75`) and route through
953/// [`phase_counter_delta`] instead. All other kinds use
954/// [`aggregate_samples`] verbatim, which is correct for them
955/// (Gauge avg/last/max, Peak max, Timestamp last, and DeltaSum — whose
956/// samples are ALREADY per-read deltas, so the per-phase reduction is
957/// the sum of the in-phase deltas, NOT a last-minus-first that would
958/// difference two deltas into nonsense).
959///
960/// `samples` are the per-Sample readings of `metric` collected
961/// over one phase's window of
962/// [`crate::scenario::sample::Sample`]s via `MetricDef::read_sample`
963/// once that helper is wired through.
964/// Returns `None` when every reading was `None` / `NaN`.
965///
966/// Live caller: [`crate::assert::build_phase_buckets`] folds
967/// per-phase sample slices through this entry point and the
968/// result lands on [`crate::assert::PhaseBucket::metrics`]; the
969/// host-side `evaluate_vm_result` at `src/test_support/eval/mod.rs`
970/// is the consumer that drives the call.
971pub fn aggregate_samples_for_phase(metric: &MetricDef, samples: &[f64]) -> Option<f64> {
972 match metric.kind {
973 MetricKind::Counter => phase_counter_delta(samples),
974 // Derived kinds (every `is_derived()`: Rate / Distribution / WorstLowest /
975 // WakeLatencyTailRatio / WorstCrossNodeRatio / PerPhase / PerRunDistribution)
976 // have no samples
977 // of their own: their value is produced by a post-pass
978 // (`derive_rate_metrics` / `crate::assert::populate_run_distribution_metrics`)
979 // from pooled components, not reduced from a per-phase slice. Return
980 // None so the build loop inserts no key here.
981 k if k.is_derived() => None,
982 _ => aggregate_samples(samples, metric.kind),
983 }
984}
985
986/// Per-phase reduction for [`MetricKind::Counter`]: compute the
987/// last finite sample minus the first finite sample, clamping
988/// negative results (counter reset across a scheduler restart)
989/// to 0 and emitting a `tracing::warn!` so the reset is visible
990/// in stderr. Mirrors the existing
991/// `crate::monitor`-side counter-delta clamp pattern used
992/// when reducing cumulative kernel counters across boundaries
993/// for the same reset-detection reason.
994///
995/// Edge cases (sentinel-free absent-vs-measured-zero):
996/// - 0 or 1 finite samples -> `None`. A delta is UNMEASURABLE from
997/// fewer than two points; absence here is distinct from a measured
998/// zero. The renderer's has-data signal is `PhaseBucket::sample_count`
999/// (see `expect_metric`), NOT this value, so absence loses no
1000/// diagnostic. (Previously a 1-sample phase returned a phantom
1001/// `Some(0.0)` that made a per-phase Counter claim read 0 even when
1002/// the phase fired plenty — only one freeze landed.)
1003/// - 2+ finite samples -> `Some(max(0.0, last - first))` (equal
1004/// endpoints give a REAL `Some(0.0)`: the counter did not advance).
1005///
1006/// Live caller: [`aggregate_samples_for_phase`] dispatches the
1007/// Counter variant through this entry point.
1008pub fn phase_counter_delta(samples: &[f64]) -> Option<f64> {
1009 let finite: Vec<f64> = samples.iter().copied().filter(|x| x.is_finite()).collect();
1010 match finite.as_slice() {
1011 // 0 or 1 finite samples: a delta is unmeasurable from fewer than two
1012 // points -> None (sentinel-free contract). The 2+-equal case below
1013 // still yields a real Some(0.0).
1014 [] | [_] => None,
1015 [first, .., last] => {
1016 let delta = *last - *first;
1017 if delta < 0.0 {
1018 tracing::warn!(
1019 first = *first,
1020 last = *last,
1021 "phase_counter_delta: counter reset detected (last < first); clamping to 0"
1022 );
1023 Some(0.0)
1024 } else {
1025 Some(delta)
1026 }
1027 }
1028 }
1029}
1030
1031/// Derive every registered [`MetricKind::Rate`] metric in `metrics`
1032/// from its already-present numerator / denominator component values:
1033/// `metrics[rate] = metrics[numerator] / metrics[denominator]`.
1034///
1035/// This is the SOLE producer of a Rate metric's value. It runs as a
1036/// post-pass at nine aggregation sites where the components co-locate in
1037/// one map: the two per-phase builds, the cross-phase bucket merge, the
1038/// three cross-RUN ext-metrics reducers (`populate_run_ext_metrics`,
1039/// `populate_run_ext_metrics_from_phases`, `group_and_average_by`), and the
1040/// cross-CGROUP pooled re-pools
1041/// (`crate::assert::populate_run_pooled_iterations_per_cpu_sec`, run
1042/// post-`merge` at the eval layer to re-pool `iterations_per_cpu_sec` across a
1043/// run's cgroups, plus `crate::assert::populate_run_pooled_taobench` and
1044/// `crate::assert::populate_run_pooled_schbench` for the taobench/schbench
1045/// whole-run Rates). At each, the components are
1046/// pooled FIRST by their own kinds (a `Counter` numerator summed), then
1047/// the rate is re-derived — so for `Counter / Counter` the result is
1048/// `Σnumerator / Σdenominator`, the correct re-pool rather than a mean of
1049/// ready-made ratios. (The cross-CGROUP `AssertResult::merge` ext-metrics
1050/// fold itself uses worst-case polarity and is NOT a derive site — the
1051/// pooled re-pool above runs separately after it; see [`MetricKind::Rate`].)
1052///
1053/// A rate is skipped (its key left absent) when either component key is
1054/// missing, the denominator is zero, or either component is non-finite —
1055/// keeping an absent rate distinct from a real `0.0`.
1056///
1057/// INVARIANT: the producers must co-insert both components from the same
1058/// observation (both-or-neither per map) — e.g.
1059/// `build_phase_buckets_with_stimulus` inserts `total_phase_iterations` and
1060/// `total_phase_duration_sec` together under one `rate_components` guard. A
1061/// partial pair (numerator from one source, denominator from another) is
1062/// never produced today but would derive a cross-paired rate; any second
1063/// Rate must keep the co-insertion contract.
1064pub(crate) fn derive_rate_metrics(metrics: &mut std::collections::BTreeMap<String, f64>) {
1065 derive_rate_metrics_from(
1066 metrics,
1067 METRICS.iter().filter_map(|m| match m.kind {
1068 MetricKind::Rate {
1069 numerator,
1070 denominator,
1071 } => Some((m.name, numerator, denominator)),
1072 _ => None,
1073 }),
1074 );
1075}
1076
/// Worker behind [`derive_rate_metrics`]: the rate specs arrive
/// explicitly as `(name, numerator, denominator)` key triples, so the
/// derivation math can be unit-tested without a registered Rate metric
/// in [`METRICS`].
///
/// For each spec, in iteration order, `metrics[name]` is set to
/// `metrics[numerator] / metrics[denominator]` — but only when both
/// component keys are present, both values are finite, the denominator is
/// non-zero, and the quotient is itself finite. Otherwise the spec is
/// passed over and the map is left untouched for that name. Because rates
/// are written into the same map as they are derived, a later spec can
/// read an earlier spec's result as a component.
pub(crate) fn derive_rate_metrics_from<'a>(
    metrics: &mut std::collections::BTreeMap<String, f64>,
    rates: impl Iterator<Item = (&'a str, &'a str, &'a str)>,
) {
    for (rate_key, numerator_key, denominator_key) in rates {
        let quotient = match (metrics.get(numerator_key), metrics.get(denominator_key)) {
            (Some(&top), Some(&bottom))
                if top.is_finite() && bottom.is_finite() && bottom != 0.0 =>
            {
                top / bottom
            }
            // Missing component, non-finite component, or zero denominator.
            _ => continue,
        };
        // A finite numerator over a tiny finite denominator can still
        // overflow to +/-inf; never store a non-finite rate.
        if quotient.is_finite() {
            metrics.insert(rate_key.to_string(), quotient);
        }
    }
}
1102
1103impl MetricDef {
1104 /// Read this metric's value from `row`. Consults the
1105 /// accessor first (for built-in `GauntletRow` fields) and
1106 /// falls back to `row.ext_metrics[self.name]` when the
1107 /// accessor returns `None`.
1108 pub fn read(&self, row: &GauntletRow) -> Option<f64> {
1109 (self.accessor)(row).or_else(|| row.ext_metrics.get(self.name).copied())
1110 }
1111
1112 /// Read this metric's value from a single
1113 /// [`crate::scenario::sample::Sample`] — the per-sample
1114 /// analogue of [`Self::read`] used by the per-phase
1115 /// aggregator to fold a window of samples into one
1116 /// [`crate::assert::PhaseBucket`] value per metric.
1117 ///
1118 /// Returns `None` for metrics that cannot be derived from a
1119 /// single-sample shape: most ktstr metrics are computed host-side
1120 /// (cross-CPU / cross-cgroup folds, run-level distributional
1121 /// re-pools, or monitor-axis windowing), not from one sample —
1122 /// `worst_spread`, `worst_gap_ms`, `worst_migration_ratio`,
1123 /// `max_imbalance_ratio`, the `worst_*_wake_latency_*` /
1124 /// `worst_mean_run_delay_us` / `worst_run_delay_us` distributions,
1125 /// `worst_iterations_per_worker` / `worst_iterations_per_cpu_sec`,
1126 /// `worst_page_locality`, `worst_cross_node_migration_ratio`,
1127 /// `worst_wake_latency_tail_ratio` — and have no single-sample
1128 /// reading.
1129 ///
1130 /// Wired per-sample arms (return `Some`): `max_dsq_depth` /
1131 /// `avg_dsq_depth` from `sample.snapshot`'s DSQ-walker,
1132 /// `total_fallback` / `total_keep_last` from its SCX events
1133 /// region, and the IRQ/steal cross-CPU sums `total_hardirqs`,
1134 /// `total_softirq_net_rx` / `total_softirq_net_tx` /
1135 /// `total_softirq_timer` / `total_softirq_sched`,
1136 /// `total_irq_time_ns`, `total_softirq_time_ns`, and
1137 /// `total_steal_time_ns` from its `per_cpu_time`. Every other
1138 /// registered metric falls to `_ => None`
1139 /// here, for one of three reasons: (1) it is a MONITOR-axis
1140 /// signal with no guest-`Snapshot` shape (`stuck_count`,
1141 /// `max_imbalance_ratio`, `avg_imbalance_ratio`) — folded
1142 /// per-phase from `MonitorSample` windowing in
1143 /// [`crate::assert::build_phase_buckets`], NOT from read_sample;
1144 /// (2) it has no per-sample source yet (`total_migrations`,
1145 /// `total_iterations` — per-task guest counters not captured per
1146 /// tick); or (3) it is a run-level metric with no single-sample
1147 /// reading (the `worst_*` family above).
1148 /// [`crate::stats::aggregate_samples_for_phase`] surfaces an
1149 /// all-None reduction as a `None` bucket entry — distinct from
1150 /// `Some(0.0)` (a real zero) — so the bucket renderer can paint
1151 /// "no data" vs "real zero" distinctly without losing information.
1152 ///
1153 /// Live caller: [`crate::assert::build_phase_buckets`] calls
1154 /// `read_sample` once per [`crate::stats::METRICS`] entry per
1155 /// sample to collect the per-sample readings the per-phase
1156 /// aggregator folds. The host-side `evaluate_vm_result` at
1157 /// `src/test_support/eval/mod.rs` drives the chain.
1158 pub fn read_sample(&self, sample: &crate::scenario::sample::Sample<'_>) -> Option<f64> {
1159 // Per-metric dispatch by registry name. Only the metrics
1160 // whose value is genuinely a per-sample reading are wired;
1161 // every other entry in the METRICS registry is
1162 // cross-cgroup folds or run-level distributional re-pools
1163 // computed host-side at `evaluate_vm_result` time
1164 // (worst-spread / worst-gap-ms fold; the
1165 // `worst_*_wake_latency_*` distributions + worst-iterations-per-
1166 // worker efficiencies re-pool) and have no single-sample
1167 // equivalent —
1168 // they fall through to None below and the phase
1169 // aggregator paints them as absent bucket entries
1170 // (distinct from a real zero — sentinel-free contract).
1171 match self.name {
1172 // BPF dsq-state walker captures per-DSQ depth at the
1173 // freeze instant. `local_dsq_depth` is the per-CPU
1174 // local DSQ; take max across CPUs because the metric
1175 // is Peak-kind ("worst depth this instant"). DsqState
1176 // sets `origin = "local cpu N"` for local DSQs (see
1177 // src/monitor/scx_walker.rs `DsqState::origin`); the
1178 // filter pins the metric to the local-DSQ class so
1179 // global / bypass / user DSQs do not pollute the
1180 // reading.
1181 "max_dsq_depth" => sample
1182 .snapshot
1183 .dsq_states()
1184 .iter()
1185 .filter(|d| d.origin.starts_with("local cpu "))
1186 .map(|d| u64::from(d.nr))
1187 .max()
1188 .map(|v| v as f64),
1189 // Per-sample arithmetic mean of the same local-CPU
1190 // DSQ depth readings `max_dsq_depth` walks. Returns
1191 // `None` when no local DSQs are present so the bucket
1192 // renderer can distinguish "no data" from "real zero"
1193 // (sentinel-free contract); a zero-population set
1194 // never enters the mean.
1195 "avg_dsq_depth" => {
1196 let locals: Vec<f64> = sample
1197 .snapshot
1198 .dsq_states()
1199 .iter()
1200 .filter(|d| d.origin.starts_with("local cpu "))
1201 .map(|d| u64::from(d.nr) as f64)
1202 .collect();
1203 if locals.is_empty() {
1204 None
1205 } else {
1206 Some(locals.iter().sum::<f64>() / locals.len() as f64)
1207 }
1208 }
1209 // Cumulative `select_cpu_fallback` counter at the
1210 // freeze instant. The host's event-counter walker
1211 // builds a per-tick timeline of CPU-summed counters
1212 // (`EventCounterSample` at src/monitor/dump/mod.rs:477);
1213 // `.last()` gives the cumulative reading at the most
1214 // recent tick within this freeze's capture window.
1215 // Counter-kind reduction folds `last - first` across
1216 // the phase's sample window, yielding the per-phase
1217 // delta (the genuine "how many fallbacks fired during
1218 // THIS phase").
1219 "total_fallback" => sample
1220 .snapshot
1221 .event_counter_timeline()
1222 .last()
1223 .map(|e| e.select_cpu_fallback as f64),
1224 // Cumulative `dispatch_keep_last` counter; same
1225 // per-tick timeline source as `total_fallback`. Same
1226 // Counter-kind reduction semantic; per-phase delta
1227 // surfaces the keep-last count for THIS phase.
1228 "total_keep_last" => sample
1229 .snapshot
1230 .event_counter_timeline()
1231 .last()
1232 .map(|e| e.dispatch_keep_last as f64),
1233 // IRQ observability: cross-CPU SUM of the cumulative per-CPU
1234 // counter at this freeze. Counter kind takes the per-phase
1235 // last-minus-first (phase_counter_delta) over these per-freeze
1236 // totals. The per-CPU set is fixed across freezes (every CPU is
1237 // present every freeze), so the cross-CPU sum has NO task-set-change
1238 // inflation — the exact reason system_time_ns below is NOT a
1239 // read_sample arm but these are. Empty per_cpu_time -> None
1240 // (loud-absent, never a false zero). Softirq vectors index against
1241 // the compile-pinned named consts, never bare literals.
1242 "total_hardirqs" => {
1243 // saturating fold (applies to every IRQ arm below): overflow-safe
1244 // cross-CPU spatial sum of the guest per-CPU counter — a corrupt /
1245 // hostile per-CPU u64::MAX must clamp, not wrap, this per-freeze
1246 // total (the Counter delta reads it). Exact for every in-range value.
1247 let cpus = sample.snapshot.per_cpu_time();
1248 (!cpus.is_empty()).then(|| {
1249 cpus.iter()
1250 .map(|c| c.irqs_sum)
1251 .fold(0u64, u64::saturating_add) as f64
1252 })
1253 }
1254 "total_softirq_net_rx" => {
1255 let cpus = sample.snapshot.per_cpu_time();
1256 (!cpus.is_empty()).then(|| {
1257 cpus.iter()
1258 .map(|c| c.softirqs[crate::monitor::btf_offsets::SOFTIRQ_NET_RX])
1259 .fold(0u64, u64::saturating_add) as f64
1260 })
1261 }
1262 "total_softirq_net_tx" => {
1263 let cpus = sample.snapshot.per_cpu_time();
1264 (!cpus.is_empty()).then(|| {
1265 cpus.iter()
1266 .map(|c| c.softirqs[crate::monitor::btf_offsets::SOFTIRQ_NET_TX])
1267 .fold(0u64, u64::saturating_add) as f64
1268 })
1269 }
1270 "total_softirq_timer" => {
1271 let cpus = sample.snapshot.per_cpu_time();
1272 (!cpus.is_empty()).then(|| {
1273 cpus.iter()
1274 .map(|c| c.softirqs[crate::monitor::btf_offsets::SOFTIRQ_TIMER])
1275 .fold(0u64, u64::saturating_add) as f64
1276 })
1277 }
1278 "total_softirq_sched" => {
1279 let cpus = sample.snapshot.per_cpu_time();
1280 (!cpus.is_empty()).then(|| {
1281 cpus.iter()
1282 .map(|c| c.softirqs[crate::monitor::btf_offsets::SOFTIRQ_SCHED])
1283 .fold(0u64, u64::saturating_add) as f64
1284 })
1285 }
1286 "total_irq_time_ns" => {
1287 let cpus = sample.snapshot.per_cpu_time();
1288 (!cpus.is_empty()).then(|| {
1289 cpus.iter()
1290 .map(|c| c.cpustat_irq_ns)
1291 .fold(0u64, u64::saturating_add) as f64
1292 })
1293 }
1294 "total_softirq_time_ns" => {
1295 let cpus = sample.snapshot.per_cpu_time();
1296 (!cpus.is_empty()).then(|| {
1297 cpus.iter()
1298 .map(|c| c.cpustat_softirq_ns)
1299 .fold(0u64, u64::saturating_add) as f64
1300 })
1301 }
1302 "total_steal_time_ns" => {
1303 let cpus = sample.snapshot.per_cpu_time();
1304 (!cpus.is_empty()).then(|| {
1305 cpus.iter()
1306 .map(|c| c.cpustat_steal_ns)
1307 .fold(0u64, u64::saturating_add) as f64
1308 })
1309 }
1310 // `system_time_ns` / `user_time_ns` are deliberately absent
1311 // here: they are NOT read per-sample. A per-sample
1312 // cross-thread SUM followed by a Counter `last - first`
1313 // inflates whenever the captured task set changes between
1314 // freezes — a task carrying a large cumulative counter that
1315 // appears only in a LATER sample dumps its entire pre-phase
1316 // history into the delta. They are injected post-hoc as a
1317 // per-thread-GROUP delta (each tgid's first-seen-to-last-seen
1318 // `thread_group_cputime`) by
1319 // [`crate::assert::phase_group_cpu_delta`], which subtracts
1320 // each group's own first-seen total and so bounds the result
1321 // by wall-clock × cores. Still observer-free — that injector
1322 // reads the same frozen `task_struct` enrichments.
1323 //
1324 // Every other metric stays None. The 16 host-only
1325 // names (full list in the doc comment above) compute
1326 // cross-cgroup folds at `evaluate_vm_result` time and
1327 // have no per-sample equivalent until a per-cgroup
1328 // per-sample capture path lands; surfacing them via a
1329 // synthetic single-sample value would falsify the
1330 // per-phase trajectory the bucket renderer paints.
1331 _ => None,
1332 }
1333 }
1334
1335 /// Returns `true` when a metric INCREASING is the bad direction:
1336 /// [`LowerBetter`](crate::test_support::Polarity::LowerBetter),
1337 /// [`TargetValue`](crate::test_support::Polarity::TargetValue), and the
1338 /// conservative [`Unknown`](crate::test_support::Polarity::Unknown)
1339 /// default (an unclassified metric is treated as higher-is-worse so a
1340 /// real regression in it is still caught). `false` for
1341 /// [`HigherBetter`](crate::test_support::Polarity::HigherBetter) and
1342 /// [`Informational`](crate::test_support::Polarity::Informational).
1343 ///
1344 /// This is the cross-cgroup FOLD direction (max-vs-min when merging
1345 /// per-cgroup `ext_metrics`, see [`crate::assert::AssertResult::merge`])
1346 /// and the timeline-narrative direction. The PERF-DELTA VERDICT path
1347 /// uses [`classify_direction`](Self::classify_direction) instead, which
1348 /// returns `None` for `Informational` so it never gates. `Informational`
1349 /// folding as `false` (min) here is harmless: the system-wide monitor
1350 /// counters that carry it are row-level and never hit the per-cgroup
1351 /// merge.
1352 pub const fn higher_is_worse(&self) -> bool {
1353 use crate::test_support::Polarity;
1354 matches!(
1355 self.polarity,
1356 Polarity::LowerBetter | Polarity::TargetValue(_) | Polarity::Unknown
1357 )
1358 }
1359
1360 /// Verdict direction for the perf-delta comparison:
1361 /// - `Some(true)` — an INCREASE is a regression (`LowerBetter` /
1362 /// `TargetValue` / the conservative `Unknown` default),
1363 /// - `Some(false)` — a DECREASE is a regression (`HigherBetter`),
1364 /// - `None` — [`Informational`](crate::test_support::Polarity::Informational):
1365 /// directionless; the comparison records and displays it but NEVER
1366 /// classifies it as regression/improvement and it NEVER affects the
1367 /// exit code.
1368 ///
1369 /// The verdict sites in [`compare_partitions`] branch on the `Option`:
1370 /// `None` => informational, `Some(hiw)` => the dual-gated
1371 /// regression/improvement split. This matches
1372 /// [`higher_is_worse`](Self::higher_is_worse) for every variant EXCEPT
1373 /// `Informational` (there `false`/min-fold, here `None`/never-gated) —
1374 /// the deliberate split between "fold needs a direction" and "verdict
1375 /// must stay neutral".
1376 pub const fn classify_direction(&self) -> Option<bool> {
1377 self.polarity.classify_direction()
1378 }
1379
1380 /// Whether this metric's value can reach the AGGREGATE findings the
1381 /// perf-delta failure gate reads — the cross-run scalar compare
1382 /// ([`compare_partitions`], `noise_adjust == false`) or the per-run noise
1383 /// compare ([`compare_partitions_noise`], `noise_adjust == true`). `false`
1384 /// for names whose value never lands on a compared row in that mode, so a
1385 /// `--must-fail` gate on one could never fire (a silent no-op):
1386 /// - [`MetricKind::PerPhase`]: `accessor` is `None`, it has no run-level
1387 /// producer, and it is gated out of the cross-run ext fold — its value
1388 /// lives only in per-phase carriers, never on a `GauntletRow`, so it
1389 /// reaches NEITHER compare. Always `false`.
1390 /// - [`MetricKind::PerRunDistribution`]: `accessor` is `None` and it is
1391 /// gated out of the cross-run ext fold, so it is absent on the scalar
1392 /// compare's cross-run-folded rows — but each run carries its own
1393 /// `*_whole` scalar that [`compare_partitions_noise`] reads, so it CAN
1394 /// gate under `--noise-adjust`. `noise_adjust`-only.
1395 ///
1396 /// Every other kind reaches both compares; whether it then produces a
1397 /// *regression* (vs an informational finding) is a separate
1398 /// direction question — see [`classify_direction`](Self::classify_direction).
1399 pub const fn gates_aggregate(&self, noise_adjust: bool) -> bool {
1400 match self.kind {
1401 MetricKind::PerPhase => false,
1402 MetricKind::PerRunDistribution => noise_adjust,
1403 _ => true,
1404 }
1405 }
1406}
1407
1408/// Unified metric registry covering all built-in and extensible metrics.
1409///
1410/// The comparison pipeline uses `higher_is_worse` to determine regression
1411/// direction, `default_abs`/`default_rel` for dual-gate significance
1412/// thresholds, and `display_unit` for formatted output. Per-test
1413/// assertion overrides can still use their own thresholds; this registry
1414/// is the source of truth for polarity and display.
1415///
1416/// `AssertResult::merge` consults `higher_is_worse` via [`metric_def`]
1417/// when folding per-cgroup `ext_metrics` into the scenario-level worst
1418/// case: `true` takes max, `false` takes min. Unknown names (not in
1419/// this registry) default to max; register a `MetricDef` here before
1420/// relying on min-polarity merge. The comparison system
1421/// ([`compare_partitions`]) uses `higher_is_worse` for delta direction.
1422///
1423/// # Metric-name triples (registry / field / DataFrame column)
1424///
1425/// Each metric is referenced by three names across the pipeline.
1426/// The registry name is the stable surface — sidecars, CI gates,
1427/// and `cargo ktstr perf-delta` output all quote it verbatim —
1428/// and cannot be renamed without silently invalidating downstream
1429/// consumers. The field name on [`GauntletRow`] and the polars
1430/// DataFrame column name are internal; they are kept terse and
1431/// match each other, but diverge from the registry name where
1432/// the domain-level wording adds context (`worst_*`, `total_*`,
1433/// `max_*`) that would be noise on an already-qualified field.
1434/// Nine divergent triples:
1435///
1436/// | Registry (`MetricDef.name`) | `GauntletRow` field | DataFrame column |
1437/// |---|---|---|
1438/// | `worst_spread` | `spread` | `spread` |
1439/// | `worst_gap_ms` | `gap_ms` | `gap_ms` |
1440/// | `total_migrations` | `migrations` | `migrations` |
1441/// | `worst_migration_ratio` | `migration_ratio` | `migration_ratio` |
1442/// | `max_imbalance_ratio` | `imbalance_ratio` | `imbalance` |
1443/// | `max_dsq_depth` | `max_dsq_depth` | `dsq_depth` |
1444/// | `stuck_count` | `stuck_count` | `stuck` |
1445/// | `total_fallback` | `fallback_count` | `fallback` |
1446/// | `total_keep_last` | `keep_last_count` | `keep_last` |
1447///
1448/// One of the remaining metrics in [`METRICS`] has matching
1449/// registry / field / DataFrame column names backed by a typed
1450/// `GauntletRow` field (`total_iterations`) and is not listed — no
1451/// translation to document.
1452///
1453/// The ten wake-latency / run-delay / iteration-efficiency / NUMA roll-ups
1454/// (`worst_p99_wake_latency_us`, `worst_median_wake_latency_us`,
1455/// `worst_wake_latency_cv`, `worst_mean_run_delay_us`,
1456/// `worst_run_delay_us`, `worst_iterations_per_worker`,
1457/// `worst_iterations_per_cpu_sec`, `worst_wake_latency_tail_ratio`,
1458/// `worst_page_locality`, `worst_cross_node_migration_ratio`) are
1459/// DERIVED kinds ([`MetricKind::Distribution`] / [`MetricKind::WorstLowest`]
1460/// / [`MetricKind::WakeLatencyTailRatio`] / [`MetricKind::WorstCrossNodeRatio`])
1461/// with NO typed `GauntletRow`
1462/// field: their accessors are `|_| None` and
1463/// `crate::assert::populate_run_distribution_metrics` re-pools their value
1464/// into `ext_metrics` post-merge, so [`MetricDef::read`] reads them through
1465/// the ext fallback.
1466///
1467/// `worst_` naming convention: it is the codebase-wide prefix for a
1468/// cross-cgroup roll-up, independent of polarity and of HOW the roll-up is
1469/// formed. Polarity-directional selectors (`worst_spread`, and the derived
1470/// `worst_cross_node_migration_ratio`, both LowerBetter → max) and
1471/// [`MetricKind::WorstLowest`] (`worst_page_locality` +
1472/// `worst_iterations_per_*`, None-aware lowest-wins where a measured 0.0
1473/// wins) both surface the most problematic cgroup; whereas
1474/// [`MetricKind::Distribution`] (`worst_p99_wake_latency_us` etc.) is the
1475/// POOLED cross-cgroup distribution over the combined sample set, NOT a
1476/// per-cgroup selection — here `worst_` is retained for sidecar /
1477/// DataFrame / CI-gate name stability rather than literal accuracy. A
1478/// `lowest_*` rename of the HigherBetter selectors was weighed and
1479/// rejected as a high-churn rename across sidecars / DataFrames / CI gates
1480/// for no readability gain.
1481///
1482/// Quoting the matching list instead of a bare count avoids
1483/// silent drift on rename: a metric whose registry / field /
1484/// column names diverge belongs in the table above, while a
1485/// matching triple belongs in this paragraph; a future rename
1486/// that forgets to migrate the metric across the boundary
1487/// surfaces here as a stale list rather than a wrong count.
1488///
1489/// Consumers that cross the registry / DataFrame boundary should
1490/// go through [`MetricDef::read`] / the accessor closure rather
1491/// than hand-translating by string. The four-name mapping for
1492/// `worst_spread` specifically is documented in detail on the
1493/// [`GauntletRow::spread`] field (adds the
1494/// [`ScenarioStats::worst_spread`](crate::assert::ScenarioStats::worst_spread)
1495/// upstream source as a fourth name).
1496/// Registry names for the schbench per-phase metrics ([`MetricKind::PerPhase`]).
1497/// Shared by the [`METRICS`] entries below and the schbench per-phase derivation
1498/// (`crate::assert::derive_phase_metrics`) so the registered name and
1499/// the key the derivation writes into `crate::assert::PhaseBucket::metrics` are
1500/// one source of truth. Latency keys are µs (the unit `plat` buckets in);
1501/// sched-delay keys are µs (converted from ns at derivation); loop_count is a
1502/// bare count.
1503pub(crate) const SCHBENCH_WAKEUP_P50_US: &str = "wakeup_p50_latency_us";
1504pub(crate) const SCHBENCH_WAKEUP_P90_US: &str = "wakeup_p90_latency_us";
1505pub(crate) const SCHBENCH_WAKEUP_P99_US: &str = "wakeup_p99_latency_us";
1506pub(crate) const SCHBENCH_WAKEUP_P999_US: &str = "wakeup_p999_latency_us";
1507pub(crate) const SCHBENCH_REQUEST_P50_US: &str = "request_p50_latency_us";
1508pub(crate) const SCHBENCH_REQUEST_P90_US: &str = "request_p90_latency_us";
1509pub(crate) const SCHBENCH_REQUEST_P99_US: &str = "request_p99_latency_us";
1510pub(crate) const SCHBENCH_REQUEST_P999_US: &str = "request_p999_latency_us";
1511pub(crate) const SCHBENCH_SCHED_DELAY_MSG_US: &str = "sched_delay_msg_us";
1512pub(crate) const SCHBENCH_SCHED_DELAY_WORKER_US: &str = "sched_delay_worker_us";
1513pub(crate) const SCHBENCH_LOOP_COUNT: &str = "schbench_loop_count";
1514// schbench WHOLE-RUN Class-3 keys (loop count + role-separate run-delay gate
1515// Rates) for perf-delta --noise-adjust. Re-pooled run-level by
1516// `populate_run_pooled_schbench` from the per-phase per-cgroup
1517// `SchbenchPhaseStats` raw pairs (Σ over all phases+cgroups). The four
1518// `total_schbench_*` run-delay/pcount Counters are the rate components
1519// (RENDER_SUPPRESSED) for the two sample-weighted Σrun_delay/Σpcount gate Rates
1520// (workload-scoped siblings of the system-wide `total_run_delay_ns_per_sched`);
1521// the message and worker thread ROLES pool separately (different per-schedule
1522// wait populations). The per-phase `sched_delay_msg/worker_us` is the SAME
1523// Σrun_delay_ns/Σpcount per-schedule mean at phase scope (NOT schbench's native
1524// mean-of-per-thread-means, a separate whole-run SchbenchResult stat) and stays
1525// PerPhase display-only — only these Rates gate, no double-count.
1526// `total_schbench_loops` is the whole-run loop Counter (distinct from the
1527// per-phase `schbench_loop_count`).
1528pub(crate) const TOTAL_SCHBENCH_MSG_RUN_DELAY_NS: &str = "total_schbench_msg_run_delay_ns";
1529pub(crate) const TOTAL_SCHBENCH_MSG_PCOUNT: &str = "total_schbench_msg_pcount";
1530pub(crate) const TOTAL_SCHBENCH_WORKER_RUN_DELAY_NS: &str = "total_schbench_worker_run_delay_ns";
1531pub(crate) const TOTAL_SCHBENCH_WORKER_PCOUNT: &str = "total_schbench_worker_pcount";
1532pub(crate) const TOTAL_SCHBENCH_LOOPS: &str = "total_schbench_loops";
1533pub(crate) const SCHBENCH_MSG_RUN_DELAY_NS_PER_SCHED: &str = "schbench_msg_run_delay_ns_per_sched";
1534pub(crate) const SCHBENCH_WORKER_RUN_DELAY_NS_PER_SCHED: &str =
1535 "schbench_worker_run_delay_ns_per_sched";
1536// schbench WHOLE-RUN distributional keys for perf-delta --noise-adjust:
1537// each per-phase percentile/min/max re-pooled run-level by
1538// `populate_run_pooled_schbench_distribution` (union of the per-phase per-cgroup
1539// PlatStats histograms, percentile re-derived over the union). MetricKind::
1540// PerRunDistribution: noise-compared per-run, NEVER cross-run folded. `*_whole`
1541// names keep them registry-distinct from the per-phase PerPhase keys above (one
1542// name = one kind). Latency LowerBetter; rps HigherBetter.
1543pub(crate) const SCHBENCH_WAKEUP_P50_US_WHOLE: &str = "wakeup_p50_latency_us_whole";
1544pub(crate) const SCHBENCH_WAKEUP_P90_US_WHOLE: &str = "wakeup_p90_latency_us_whole";
1545pub(crate) const SCHBENCH_WAKEUP_P99_US_WHOLE: &str = "wakeup_p99_latency_us_whole";
1546pub(crate) const SCHBENCH_WAKEUP_P999_US_WHOLE: &str = "wakeup_p999_latency_us_whole";
1547pub(crate) const SCHBENCH_WAKEUP_MIN_US_WHOLE: &str = "wakeup_min_latency_us_whole";
1548pub(crate) const SCHBENCH_WAKEUP_MAX_US_WHOLE: &str = "wakeup_max_latency_us_whole";
1549pub(crate) const SCHBENCH_REQUEST_P50_US_WHOLE: &str = "request_p50_latency_us_whole";
1550pub(crate) const SCHBENCH_REQUEST_P90_US_WHOLE: &str = "request_p90_latency_us_whole";
1551pub(crate) const SCHBENCH_REQUEST_P99_US_WHOLE: &str = "request_p99_latency_us_whole";
1552pub(crate) const SCHBENCH_REQUEST_P999_US_WHOLE: &str = "request_p999_latency_us_whole";
1553pub(crate) const SCHBENCH_REQUEST_MIN_US_WHOLE: &str = "request_min_latency_us_whole";
1554pub(crate) const SCHBENCH_REQUEST_MAX_US_WHOLE: &str = "request_max_latency_us_whole";
1555pub(crate) const SCHBENCH_RPS_P20_WHOLE: &str = "rps_p20_whole";
1556pub(crate) const SCHBENCH_RPS_P50_WHOLE: &str = "rps_p50_whole";
1557pub(crate) const SCHBENCH_RPS_P90_WHOLE: &str = "rps_p90_whole";
1558pub(crate) const SCHBENCH_RPS_MIN_WHOLE: &str = "rps_min_whole";
1559pub(crate) const SCHBENCH_RPS_MAX_WHOLE: &str = "rps_max_whole";
1560// taobench per-phase metric keys (the WorkType::Taobench engine's qps + hit
1561// ratios, derived per-phase by write_taobench_scalars; MetricKind::PerPhase).
1562// total/fast qps are HigherBetter; slow_qps + hit_ratio + hit_rate are
1563// Informational (slow_qps is a component, not a direction; the hit numbers are
1564// run-validity signals, not regression directions). The two hit keys are distinct
1565// axes: `taobench_hit_ratio` is RESPONSE-time (fast_ops / (fast_ops + slow_ops),
1566// the whole-run analog is `taobench_hit_fraction`) and `taobench_hit_rate` is
1567// COMMAND-time (1 - get_misses / get_cmds, the whole-run analog is
1568// `taobench_command_hit_rate`). Under open-loop arrival the two diverge
1569// (request-time vs response-time).
1570pub(crate) const TAOBENCH_TOTAL_QPS: &str = "taobench_total_qps";
1571pub(crate) const TAOBENCH_FAST_QPS: &str = "taobench_fast_qps";
1572pub(crate) const TAOBENCH_SLOW_QPS: &str = "taobench_slow_qps";
1573/// Response-time per-phase hit ratio: fast_ops / (fast_ops + slow_ops).
1574pub(crate) const TAOBENCH_HIT_RATIO: &str = "taobench_hit_ratio";
1575/// Command-time per-phase hit rate: 1 - get_misses / get_cmds.
1576pub(crate) const TAOBENCH_HIT_RATE: &str = "taobench_hit_rate";
1577// taobench per-phase open-loop SERVE-LATENCY percentiles (µs): the
1578// coordinated-omission serve latency distribution per phase (PerPhase,
1579// LowerBetter), pooled cross-cgroup + re-derived by `write_taobench_scalars`.
1580// Absent in closed loop (no serve samples).
1581pub(crate) const TAOBENCH_SERVE_P50_US: &str = "taobench_serve_p50_us";
1582pub(crate) const TAOBENCH_SERVE_P90_US: &str = "taobench_serve_p90_us";
1583pub(crate) const TAOBENCH_SERVE_P99_US: &str = "taobench_serve_p99_us";
1584pub(crate) const TAOBENCH_SERVE_P999_US: &str = "taobench_serve_p999_us";
1585pub(crate) const TAOBENCH_SERVE_MIN_US: &str = "taobench_serve_min_us";
1586pub(crate) const TAOBENCH_SERVE_MAX_US: &str = "taobench_serve_max_us";
1587// taobench WHOLE-RUN Rate component + Rate keys (the run-level qps + hit
1588// fraction, pooled cross-cgroup by `populate_run_pooled_taobench` and derived by
1589// `derive_rate_metrics`). Distinct from the per-phase `taobench_*_qps` above
1590// (`MetricKind::PerPhase`, invisible to the whole-run cross-run fold): these are
1591// registered `Rate`/`Counter` METRICS so they reach perf-delta `--noise-adjust`
1592// spread. The four `total_taobench_*` Counters are the rate components (their
1593// `total_` prefix satisfies the Counter naming gate); they are
1594// `RENDER_SUPPRESSED_COMPONENTS` so the compare output shows the rates, not the
1595// raw counts.
1596pub(crate) const TOTAL_TAOBENCH_OPS: &str = "total_taobench_ops";
1597pub(crate) const TOTAL_TAOBENCH_FAST_OPS: &str = "total_taobench_fast_ops";
1598pub(crate) const TOTAL_TAOBENCH_SLOW_OPS: &str = "total_taobench_slow_ops";
1599pub(crate) const TOTAL_TAOBENCH_WALL_SEC: &str = "total_taobench_wall_sec";
1600pub(crate) const TAOBENCH_TOTAL_OPS_PER_SEC: &str = "taobench_total_ops_per_sec";
1601pub(crate) const TAOBENCH_FAST_OPS_PER_SEC: &str = "taobench_fast_ops_per_sec";
1602pub(crate) const TAOBENCH_SLOW_OPS_PER_SEC: &str = "taobench_slow_ops_per_sec";
1603pub(crate) const TAOBENCH_HIT_FRACTION: &str = "taobench_hit_fraction";
1604// taobench WHOLE-RUN open-loop serve-latency percentiles (µs): the union of the
1605// per-phase per-cgroup serve histograms re-derived run-level
1606// (`MetricKind::PerRunDistribution` — noise-compared per-run, never cross-run
1607// folded), pooled by `populate_run_pooled_taobench_distribution`. `*_whole`
1608// names, distinct from the per-phase `taobench_serve_*_us` keys above.
1609pub(crate) const TAOBENCH_SERVE_P50_US_WHOLE: &str = "taobench_serve_p50_us_whole";
1610pub(crate) const TAOBENCH_SERVE_P90_US_WHOLE: &str = "taobench_serve_p90_us_whole";
1611pub(crate) const TAOBENCH_SERVE_P99_US_WHOLE: &str = "taobench_serve_p99_us_whole";
1612pub(crate) const TAOBENCH_SERVE_P999_US_WHOLE: &str = "taobench_serve_p999_us_whole";
1613pub(crate) const TAOBENCH_SERVE_MIN_US_WHOLE: &str = "taobench_serve_min_us_whole";
1614pub(crate) const TAOBENCH_SERVE_MAX_US_WHOLE: &str = "taobench_serve_max_us_whole";
1615// taobench WHOLE-RUN command-time hit: the request-time hit rate (distinct from
1616// the response-time `taobench_hit_fraction`; the two diverge under open-loop
1617// arrival). hits = cmds − misses, pooled cross-cgroup by
1618// `populate_run_pooled_taobench`; `taobench_command_hit_rate` = Σhits/Σcmds
1619// (`total_` Counter components satisfy the naming gate; the Rate ends in `_rate`).
1620// Whole-run Rates use their rate-form name, never the `_whole` suffix (which is
1621// the PerRunDistribution marker) — the same convention as `taobench_*_per_sec`.
1622pub(crate) const TOTAL_TAOBENCH_GET_CMDS: &str = "total_taobench_get_cmds";
1623pub(crate) const TOTAL_TAOBENCH_GET_HITS: &str = "total_taobench_get_hits";
1624pub(crate) const TAOBENCH_COMMAND_HIT_RATE: &str = "taobench_command_hit_rate";
1625// Per-phase latency min/max (schbench's `min=`/`max=` table footer,
1626// `schbench.c:579`): the per-phase PlatStats already carries them, so these are
1627// emitted from `q.min`/`q.max`. LowerBetter (a higher min/max latency is worse).
1628pub(crate) const SCHBENCH_WAKEUP_MIN_US: &str = "wakeup_min_latency_us";
1629pub(crate) const SCHBENCH_WAKEUP_MAX_US: &str = "wakeup_max_latency_us";
1630pub(crate) const SCHBENCH_REQUEST_MIN_US: &str = "request_min_latency_us";
1631pub(crate) const SCHBENCH_REQUEST_MAX_US: &str = "request_max_latency_us";
1632// Per-phase achieved-RPS distribution (schbench's RPS table, PLIST_FOR_RPS =
1633// 20/50/90, `schbench.c:130`) + its min/max. HigherBetter (more requests/sec =
1634// more throughput); the min/max INVERT the latency polarity (a higher worst-
1635// second rate is better). A per-second RATE, so no `_us` suffix.
1636pub(crate) const SCHBENCH_RPS_P20: &str = "rps_p20";
1637pub(crate) const SCHBENCH_RPS_P50: &str = "rps_p50";
1638pub(crate) const SCHBENCH_RPS_P90: &str = "rps_p90";
1639pub(crate) const SCHBENCH_RPS_MIN: &str = "rps_min";
1640pub(crate) const SCHBENCH_RPS_MAX: &str = "rps_max";
1641
1642pub static METRICS: &[MetricDef] = &[
1643 MetricDef {
1644 // `"worst_spread"` is the wire/surface name — emitted in
1645 // sidecars, referenced by CI gates, and printed by
1646 // `cargo ktstr perf-delta`. Internally the field on
1647 // `GauntletRow` is named `spread` and the polars DataFrame
1648 // column keeps that shorter name; see the doc on
1649 // `GauntletRow.spread` for the rationale (rename-of-
1650 // registry-name is not safe because existing gate configs
1651 // match this string by value).
1652 name: "worst_spread",
1653 polarity: crate::test_support::Polarity::LowerBetter,
1654 kind: MetricKind::Gauge(GaugeAgg::Last),
1655 default_abs: 5.0,
1656 default_rel: 0.25,
1657 display_unit: "%",
1658 accessor: |r| Some(r.spread),
1659 },
1660 MetricDef {
1661 name: "worst_gap_ms",
1662 polarity: crate::test_support::Polarity::LowerBetter,
1663 kind: MetricKind::Peak,
1664 default_abs: 500.0,
1665 default_rel: 0.50,
1666 display_unit: "ms",
1667 accessor: |r| Some(r.gap_ms as f64),
1668 },
1669 MetricDef {
1670 name: "total_migrations",
1671 polarity: crate::test_support::Polarity::LowerBetter,
1672 kind: MetricKind::Counter,
1673 default_abs: 2.0,
1674 default_rel: 0.30,
1675 display_unit: "",
1676 accessor: |r| Some(r.migrations as f64),
1677 },
1678 MetricDef {
1679 name: "worst_migration_ratio",
1680 polarity: crate::test_support::Polarity::LowerBetter,
1681 kind: MetricKind::Gauge(GaugeAgg::Last),
1682 default_abs: 0.05,
1683 default_rel: 0.20,
1684 display_unit: "",
1685 accessor: |r| Some(r.migration_ratio),
1686 },
1687 MetricDef {
1688 name: "max_imbalance_ratio",
1689 polarity: crate::test_support::Polarity::LowerBetter,
1690 kind: MetricKind::Peak,
1691 default_abs: 1.0,
1692 default_rel: 0.25,
1693 display_unit: "x",
1694 accessor: |r| Some(r.imbalance_ratio),
1695 },
1696 MetricDef {
1697 // Per-phase mean of per-tick imbalance_ratio observations
1698 // (max(nr_running) / max(1, min(nr_running)) per CPU; full-
1699 // class count). Sourced from MonitorSample (not Snapshot)
1700 // because Snapshot exposes only scx_rq.nr_running (SCX-
1701 // only) while imbalance is meaningful only across the
1702 // full per-CPU runqueue. Populated by build_phase_buckets
1703 // via per-phase MonitorSample windowing — bypasses
1704 // MetricDef::read_sample (which dispatches off
1705 // sample.snapshot only) per the data-axis split. Kind
1706 // Gauge(Avg) folds across cgroups via weighted-mean per
1707 // sample_count; Polarity::LowerBetter mirrors the Peak
1708 // sibling.
1709 name: "avg_imbalance_ratio",
1710 polarity: crate::test_support::Polarity::LowerBetter,
1711 kind: MetricKind::Gauge(GaugeAgg::Avg),
1712 default_abs: 0.5,
1713 default_rel: 0.25,
1714 display_unit: "x",
1715 accessor: |_| None,
1716 },
1717 MetricDef {
1718 name: "max_dsq_depth",
1719 polarity: crate::test_support::Polarity::LowerBetter,
1720 kind: MetricKind::Peak,
1721 default_abs: 10.0,
1722 default_rel: 0.50,
1723 display_unit: "",
1724 accessor: |r| Some(r.max_dsq_depth as f64),
1725 },
1726 MetricDef {
1727 // Per-sample mean of local-CPU DSQ depths sourced from
1728 // the BPF DSQ walker (Snapshot::dsq_states() filtered by
1729 // `origin.starts_with("local cpu ")`), reduced per phase
1730 // via the Gauge(Avg) path. The DSQ-walker axis is the
1731 // authoritative source — it reads the actual scheduler
1732 // dispatch queues. The legacy Timeline::build path
1733 // computed avg_dsq_depth from MonitorSample.CpuSnapshot.
1734 // local_dsq_depth (a per-CPU rq-level metric); the new
1735 // DSQ-walker axis is more accurate for an scx scheduler
1736 // because it observes the dispatch queue directly rather
1737 // than the rq-level reflection.
1738 //
1739 // Truncation caveat: when scx_walker hits MAX_NODES_PER_LIST
1740 // (per src/monitor/scx_walker.rs), the captured DSQs are a
1741 // prefix of the full set. The mean then shifts toward the
1742 // captured prefix's central tendency; a 64-CPU box capturing
1743 // only 20 DSQs reports the mean of those 20, not the mean
1744 // over 64. max_dsq_depth (the Peak sibling) is robust to
1745 // this (max-of-captured surfaces the deepest queue ever
1746 // captured); avg_dsq_depth has no such monotonicity. If
1747 // walker truncation becomes routine, add a denom-aware
1748 // version that sums-then-divides by the topology's
1749 // expected local-CPU count.
1750 //
1751 // Accessor falls back to ext_metrics (no typed GauntletRow
1752 // field; promotion to a typed field is deferred until a
1753 // cross-RUN aggregation need surfaces).
1754 name: "avg_dsq_depth",
1755 polarity: crate::test_support::Polarity::LowerBetter,
1756 kind: MetricKind::Gauge(GaugeAgg::Avg),
1757 default_abs: 5.0,
1758 default_rel: 0.50,
1759 display_unit: "",
1760 accessor: |_| None,
1761 },
1762 MetricDef {
1763 name: "stuck_count",
1764 polarity: crate::test_support::Polarity::LowerBetter,
1765 kind: MetricKind::Counter,
1766 // abs=1.0 (vs 5.0 for the event counters below): one additional
1767 // scheduler stall is high-signal, so a delta of a single whole
1768 // stall — gated by the 0.50 rel threshold — is worth flagging.
1769 default_abs: 1.0,
1770 default_rel: 0.50,
1771 display_unit: "",
1772 accessor: |r| Some(r.stuck_count),
1773 },
1774 MetricDef {
1775 name: "total_fallback",
1776 polarity: crate::test_support::Polarity::LowerBetter,
1777 kind: MetricKind::Counter,
1778 default_abs: 5.0,
1779 default_rel: 0.30,
1780 // Integer event count, not a rate — the source field on
1781 // `MonitorSummary::event_deltas.total_fallback` is a cumulative
1782 // delta across the run, not per-second. Empty unit matches the
1783 // other counter metrics (`stuck_count`, `total_iterations`,
1784 // `total_migrations`).
1785 display_unit: "",
1786 accessor: |r| Some(r.fallback_count as f64),
1787 },
1788 MetricDef {
1789 name: "total_keep_last",
1790 polarity: crate::test_support::Polarity::LowerBetter,
1791 kind: MetricKind::Counter,
1792 default_abs: 5.0,
1793 default_rel: 0.30,
1794 // Integer event count, not a rate — see `total_fallback`
1795 // rationale above. Source field is
1796 // `MonitorSummary::event_deltas.total_dispatch_keep_last`.
1797 display_unit: "",
1798 accessor: |r| Some(r.keep_last_count as f64),
1799 },
1800 // -- System-wide schedstat aggregates. Read host-side from guest memory at
1801 // -- freeze (zero observer effect) via `MonitorSummary::schedstat_deltas`
1802 // -- (per-rq `struct rq` schedstat fields summed across CPUs over the run);
1803 // -- `sidecar_to_row` inserts them into `GauntletRow::ext_metrics` so the
1804 // -- `|_| None` accessors surface them through the ext fallback. The seven
1805 // -- raw counters are `Polarity::Informational` — directionless (more
1806 // -- wakeups / context-switches / yields is neither inherently better nor
1807 // -- worse), so they are SHOWN but NEVER gated. They are also
1808 // -- WINDOW-DURATION- and LOAD-CONFOUNDED raw sums (a longer monitor window
1809 // -- or more offered runnable work inflates them independent of the
1810 // -- scheduler) — a second reason they are Informational, not LowerBetter:
1811 // -- a large raw delta is not a regression. The duration- and load-robust
1812 // -- GATED signals are the per-schedule mean (`total_run_delay_ns_per_sched`)
1813 // -- and the locality ratio (`ttwu_local_fraction`) derived below; four of
1814 // -- the raw counters double as those Rates' Counter components.
1815 MetricDef {
1816 // Numerator of `total_run_delay_ns_per_sched`. Cumulative runqueue-wait
1817 // delay (ns) across all tasks + all CPUs (`rq.rq_sched_info.run_delay`).
1818 // `total_` prefix satisfies the Counter naming gate.
1819 name: "total_run_delay",
1820 polarity: crate::test_support::Polarity::Informational,
1821 kind: MetricKind::Counter,
1822 default_abs: 1000.0,
1823 default_rel: 0.10,
1824 display_unit: "ns",
1825 accessor: |_| None,
1826 },
1827 MetricDef {
1828 // Denominator of `total_run_delay_ns_per_sched`. Count of non-idle task
1829 // arrivals (`rq.rq_sched_info.pcount`) — the number of schedules the
1830 // run-delay accrued over.
1831 name: "total_pcount",
1832 polarity: crate::test_support::Polarity::Informational,
1833 kind: MetricKind::Counter,
1834 default_abs: 1.0,
1835 default_rel: 0.10,
1836 display_unit: "",
1837 accessor: |_| None,
1838 },
1839 MetricDef {
1840 // schedule() invocation count (`rq.sched_count` — incremented once per
1841 // __schedule() call, a superset of context switches since re-picking the
1842 // same task still counts). Informational: more scheduler entries can
1843 // mean responsiveness OR thrashing — no direction.
1844 name: "total_sched_count",
1845 polarity: crate::test_support::Polarity::Informational,
1846 kind: MetricKind::Counter,
1847 default_abs: 1.0,
1848 default_rel: 0.10,
1849 display_unit: "",
1850 accessor: |_| None,
1851 },
1852 MetricDef {
1853 // `sched_yield()` call count (`rq.yld_count`). Informational (workload
1854 // behavior, not a scheduler-quality signal).
1855 name: "total_yld_count",
1856 polarity: crate::test_support::Polarity::Informational,
1857 kind: MetricKind::Counter,
1858 default_abs: 1.0,
1859 default_rel: 0.10,
1860 display_unit: "",
1861 accessor: |_| None,
1862 },
1863 MetricDef {
1864 // Go-idle count (`rq.sched_goidle`): times a CPU picked the idle task.
1865 // Informational (good utilization vs wasted idle — ambiguous).
1866 name: "total_sched_goidle",
1867 polarity: crate::test_support::Polarity::Informational,
1868 kind: MetricKind::Counter,
1869 default_abs: 1.0,
1870 default_rel: 0.10,
1871 display_unit: "",
1872 accessor: |_| None,
1873 },
1874 MetricDef {
1875 // Denominator of `ttwu_local_fraction`. Total wakeups (`rq.ttwu_count`)
1876 // — workload activity. Informational.
1877 name: "total_ttwu_count",
1878 polarity: crate::test_support::Polarity::Informational,
1879 kind: MetricKind::Counter,
1880 default_abs: 1.0,
1881 default_rel: 0.10,
1882 display_unit: "",
1883 accessor: |_| None,
1884 },
1885 MetricDef {
1886 // Numerator of `ttwu_local_fraction`. Wakeups kept on the waking CPU
1887 // (`rq.ttwu_local`). Informational on its own; the locality RATIO below
1888 // carries the direction.
1889 name: "total_ttwu_local",
1890 polarity: crate::test_support::Polarity::Informational,
1891 kind: MetricKind::Counter,
1892 default_abs: 1.0,
1893 default_rel: 0.10,
1894 display_unit: "",
1895 accessor: |_| None,
1896 },
1897 MetricDef {
1898 // GATED. System-wide per-schedule MEAN runqueue-wait delay =
1899 // Σrun_delay / Σpcount, re-derived across CPUs/runs by
1900 // `derive_rate_metrics` (the `Rate` kind's `MergeKind::Recompute` pools
1901 // the components — never a mean-of-ratios). Duration- and load-robust
1902 // (per-EVENT, not per-time): the system-wide analog of the
1903 // workload-scoped per-task `mean_run_delay_us` (schbench's
1904 // `mean_sched_delay = run_delay/pcount`). LowerBetter. Absent when
1905 // `total_pcount` is 0 (no schedules) or CONFIG_SCHEDSTATS is off
1906 // (components absent).
1907 name: "total_run_delay_ns_per_sched",
1908 polarity: crate::test_support::Polarity::LowerBetter,
1909 kind: MetricKind::Rate {
1910 numerator: "total_run_delay",
1911 denominator: "total_pcount",
1912 },
1913 default_abs: 100.0,
1914 default_rel: 0.15,
1915 display_unit: "ns",
1916 accessor: |_| None,
1917 },
1918 MetricDef {
1919 // GATED. Wakeup LOCALITY = Σttwu_local / Σttwu_count, re-derived by
1920 // `derive_rate_metrics`. A fraction in [0, 1]: the share of wakeups kept
1921 // on the waking CPU (better cache locality, fewer cross-CPU hops on
1922 // wakeup). HigherBetter. Absent when `total_ttwu_count` is 0 or
1923 // CONFIG_SCHEDSTATS is off.
1924 name: "ttwu_local_fraction",
1925 polarity: crate::test_support::Polarity::HigherBetter,
1926 kind: MetricKind::Rate {
1927 numerator: "total_ttwu_local",
1928 denominator: "total_ttwu_count",
1929 },
1930 default_abs: 0.05,
1931 default_rel: 0.10,
1932 display_unit: "",
1933 accessor: |_| None,
1934 },
1935 MetricDef {
1936 // NOT GATED. Go-idle FRACTION = Σsched_goidle / Σsched_count, re-derived by
1937 // `derive_rate_metrics`. A fraction in [0, 1]: the share of `schedule()`
1938 // calls that picked the idle task (the CPU found nothing runnable).
1939 // Load-normalized (per-schedule, not per-time), so it is duration- AND
1940 // arrival-rate-stable — the genuinely-useful-for-spread schedstat rate
1941 // (a bare per-second rate carries the same spread as the raw total when
1942 // cohort runs share a duration, so it adds nothing at equal duration).
1943 // Informational: a high idle fraction is ambiguous — efficient when no
1944 // runnable work exists, but a starvation symptom when runnable work is
1945 // not dispatched — so it surfaces in `--noise-adjust` spread but does
1946 // not gate a regression verdict. Absent when `total_sched_count` is 0
1947 // (no schedules) or CONFIG_SCHEDSTATS is off (components absent).
1948 name: "sched_goidle_fraction",
1949 polarity: crate::test_support::Polarity::Informational,
1950 kind: MetricKind::Rate {
1951 numerator: "total_sched_goidle",
1952 denominator: "total_sched_count",
1953 },
1954 default_abs: 0.05,
1955 default_rel: 0.10,
1956 display_unit: "",
1957 accessor: |_| None,
1958 },
1959 // Per-second schedstat rates: each total_* schedstat Counter divided by
1960 // total_schedstat_wall_sec (the monitor-window span). Unlike the
1961 // per-schedule ratios above (total_run_delay_ns_per_sched / *_fraction,
1962 // load-normalized per-EVENT), these are per-TIME — duration-normalized so
1963 // --noise-adjust can compare cohorts whose runs differ in wall duration
1964 // (raw counts are not comparable across differing durations; per-second
1965 // rates are). At EQUAL duration a per-second rate ranks identically to the
1966 // raw count, so it adds nothing then — its value is the differing-duration
1967 // case. Rate kind => cross-run Σnumerator/Σdenominator (duration-weighted),
1968 // NOT a mean of per-run rates. All Informational (raw activity rates carry
1969 // no universal better-direction) except run_delay_per_sec (latency,
1970 // LowerBetter). Absent when CONFIG_SCHEDSTATS is off or the window is
1971 // degenerate (denominator absent/0).
1972 MetricDef {
1973 // Hidden rate-denominator component (NOT user-facing): the schedstat
1974 // monitor-window span in seconds, co-inserted both-or-neither with the
1975 // total_* schedstat counters in sidecar_to_row. Counter so it survives
1976 // the cross-RUN Sum-fold (Σcount / Σsec re-derives). Distinct from
1977 // total_phase_wall_sec (the per-phase IRQ-capture window) — schedstat's
1978 // window is the monitor-sample span, a different measurement.
1979 name: "total_schedstat_wall_sec",
1980 polarity: crate::test_support::Polarity::Informational,
1981 kind: MetricKind::Counter,
1982 default_abs: 0.1,
1983 default_rel: 0.30,
1984 display_unit: "s",
1985 accessor: |_| None,
1986 },
1987 MetricDef {
1988 // Σrun_delay / Σwindow-seconds — total scheduling-wait delay accrued per
1989 // second (ns/s). LowerBetter (less accrued wait = better). Distinct from
1990 // total_run_delay_ns_per_sched (ns PER SCHEDULE): _per_sec is per-time,
1991 // _ns_per_sched is per-event.
1992 name: "run_delay_per_sec",
1993 polarity: crate::test_support::Polarity::LowerBetter,
1994 kind: MetricKind::Rate {
1995 numerator: "total_run_delay",
1996 denominator: "total_schedstat_wall_sec",
1997 },
1998 default_abs: 1000.0,
1999 default_rel: 0.30,
2000 display_unit: "ns/s",
2001 accessor: |_| None,
2002 },
2003 MetricDef {
2004 // Σpcount / Σwindow-seconds — task-arrival (non-idle schedule) rate per
2005 // second. Informational (scheduling-activity throughput tracks offered
2006 // load + scheduler behavior together, no universal direction).
2007 name: "pcount_per_sec",
2008 polarity: crate::test_support::Polarity::Informational,
2009 kind: MetricKind::Rate {
2010 numerator: "total_pcount",
2011 denominator: "total_schedstat_wall_sec",
2012 },
2013 default_abs: 1.0,
2014 default_rel: 0.30,
2015 display_unit: "/s",
2016 accessor: |_| None,
2017 },
2018 MetricDef {
2019 // Σsched_count / Σwindow-seconds — schedule() invocations per second
2020 // (rq.sched_count increments once per __schedule() call, a superset of
2021 // context switches since re-picking the same task still counts).
2022 // Informational. The per-second sibling of the precomputed struct rate
2023 // that was retired; cross-run-foldable here (Σnum/Σden), the struct
2024 // field was not.
2025 name: "sched_count_per_sec",
2026 polarity: crate::test_support::Polarity::Informational,
2027 kind: MetricKind::Rate {
2028 numerator: "total_sched_count",
2029 denominator: "total_schedstat_wall_sec",
2030 },
2031 default_abs: 1.0,
2032 default_rel: 0.30,
2033 display_unit: "/s",
2034 accessor: |_| None,
2035 },
2036 MetricDef {
2037 // Σyld_count / Σwindow-seconds — sched_yield() calls per second.
2038 // Informational; high-signal only under a yield-storm pathology.
2039 name: "yld_count_per_sec",
2040 polarity: crate::test_support::Polarity::Informational,
2041 kind: MetricKind::Rate {
2042 numerator: "total_yld_count",
2043 denominator: "total_schedstat_wall_sec",
2044 },
2045 default_abs: 1.0,
2046 default_rel: 0.30,
2047 display_unit: "/s",
2048 accessor: |_| None,
2049 },
2050 MetricDef {
2051 // Σttwu_count / Σwindow-seconds — wakeups per second. Informational
2052 // (wakeup volume; the locality DIRECTION is ttwu_local_fraction, not a
2053 // per-second magnitude — so ttwu_local has no _per_sec rate).
2054 name: "ttwu_count_per_sec",
2055 polarity: crate::test_support::Polarity::Informational,
2056 kind: MetricKind::Rate {
2057 numerator: "total_ttwu_count",
2058 denominator: "total_schedstat_wall_sec",
2059 },
2060 default_abs: 1.0,
2061 default_rel: 0.30,
2062 display_unit: "/s",
2063 accessor: |_| None,
2064 },
2065 MetricDef {
2066 // Σsched_goidle / Σwindow-seconds — go-idle transitions per second.
2067 // Informational; the per-TIME companion to sched_goidle_fraction (the
2068 // per-schedule share) — a high goidle/sec can signal wakeup-thrash.
2069 name: "sched_goidle_per_sec",
2070 polarity: crate::test_support::Polarity::Informational,
2071 kind: MetricKind::Rate {
2072 numerator: "total_sched_goidle",
2073 denominator: "total_schedstat_wall_sec",
2074 },
2075 default_abs: 1.0,
2076 default_rel: 0.30,
2077 display_unit: "/s",
2078 accessor: |_| None,
2079 },
2080 MetricDef {
2081 // Whole-run mean per-CPU runqueue depth (`rq.nr_running`, ALL scheduling
2082 // classes), read host-side from guest memory via
2083 // `MonitorSummary::avg_nr_running`. The occupancy LEVEL — distinct from
2084 // `avg_dsq_depth` (scx DSQ only) and `avg_imbalance_ratio` (cross-CPU
2085 // SKEW, not level). `MetricKind::Gauge(Avg)`: the cross-run fold is the
2086 // sample-weighted pooled mean (Σ avg×samples / Σ samples via
2087 // `aggregate_samples_weighted`, weight = `run_sample_count`). The weight
2088 // is sample count, not samples×CPUs — EXACT under same-topology pairing
2089 // (CPU count is a pairing dim, so cross-folded runs share it; the same
2090 // basis `avg_imbalance_ratio` uses). LowerBetter — higher mean depth =
2091 // more tasks waiting, but load-confounded (more offered runnable tasks
2092 // raises it independent of the scheduler), the same caveat
2093 // `avg_dsq_depth` carries; meaningful for same-offered-load A/B.
2094 // ext_metrics-only (accessor `|_| None`, surfaced via the ext fallback);
2095 // absent when the run has no monitor samples.
2096 name: "avg_nr_running",
2097 polarity: crate::test_support::Polarity::LowerBetter,
2098 kind: MetricKind::Gauge(GaugeAgg::Avg),
2099 default_abs: 0.5,
2100 default_rel: 0.20,
2101 display_unit: "",
2102 accessor: |_| None,
2103 },
2104 MetricDef {
2105 // Wake-latency p99, re-pooled over the COMBINED wake-latency sample
2106 // set across every cgroup (and phase), NOT a max of per-cgroup p99s.
2107 // Distribution kind: derived post-merge by
2108 // `crate::assert::populate_run_distribution_metrics`; accessor is
2109 // |_| None so `MetricDef::read` takes the ext_metrics value the
2110 // re-pool writes. (The `worst_` name is retained for sidecar /
2111 // DataFrame / CI-gate stability — see the `worst_` naming
2112 // convention on [`METRICS`].)
2113 name: "worst_p99_wake_latency_us",
2114 polarity: crate::test_support::Polarity::LowerBetter,
2115 kind: MetricKind::Distribution {
2116 source: SampleSource::WakeLatencyNs,
2117 reduction: SampleReduction::P99,
2118 },
2119 default_abs: 50.0,
2120 default_rel: 0.25,
2121 display_unit: "\u{00b5}s",
2122 accessor: |_| None,
2123 },
2124 MetricDef {
2125 // Wake-latency median (50th pct), re-pooled over the combined wake
2126 // set — see `worst_p99_wake_latency_us`.
2127 name: "worst_median_wake_latency_us",
2128 polarity: crate::test_support::Polarity::LowerBetter,
2129 kind: MetricKind::Distribution {
2130 source: SampleSource::WakeLatencyNs,
2131 reduction: SampleReduction::Median,
2132 },
2133 default_abs: 20.0,
2134 default_rel: 0.25,
2135 display_unit: "\u{00b5}s",
2136 accessor: |_| None,
2137 },
2138 MetricDef {
2139 // Wake-latency coefficient of variation (stddev/mean), re-pooled
2140 // over the combined wake set with a population-WEIGHTED variance and
2141 // mean (denominator = Σ per-sample population weights, i.e. the
2142 // reconstructed true wakeup population; == `pool.len()` only below the
2143 // reservoir cap, where every weight is 1.0) — see
2144 // `worst_p99_wake_latency_us`.
2145 name: "worst_wake_latency_cv",
2146 polarity: crate::test_support::Polarity::LowerBetter,
2147 kind: MetricKind::Distribution {
2148 source: SampleSource::WakeLatencyNs,
2149 reduction: SampleReduction::Cv,
2150 },
2151 default_abs: 0.10,
2152 default_rel: 0.25,
2153 display_unit: "",
2154 accessor: |_| None,
2155 },
2156 MetricDef {
2157 // Run-level timer-latency p99 (WorkType::TimerLatency cyclictest probe),
2158 // re-pooled over the combined timer-latency sample set across every
2159 // cgroup and phase (NOT a max of per-cgroup p99s). Distribution: derived
2160 // post-merge by populate_run_distribution_metrics; accessor |_| None
2161 // reads the ext_metrics value the re-pool writes.
2162 name: "worst_p99_timer_latency_us",
2163 polarity: crate::test_support::Polarity::LowerBetter,
2164 kind: MetricKind::Distribution {
2165 source: SampleSource::TimerLatencyNs,
2166 reduction: SampleReduction::P99,
2167 },
2168 default_abs: 50.0,
2169 default_rel: 0.25,
2170 display_unit: "\u{00b5}s",
2171 accessor: |_| None,
2172 },
2173 MetricDef {
2174 // Run-level timer-latency median — see worst_p99_timer_latency_us.
2175 name: "worst_median_timer_latency_us",
2176 polarity: crate::test_support::Polarity::LowerBetter,
2177 kind: MetricKind::Distribution {
2178 source: SampleSource::TimerLatencyNs,
2179 reduction: SampleReduction::Median,
2180 },
2181 default_abs: 20.0,
2182 default_rel: 0.25,
2183 display_unit: "\u{00b5}s",
2184 accessor: |_| None,
2185 },
2186 MetricDef {
2187 // Run-level timer-latency p99.9 (the deep RT tail) — see
2188 // worst_p99_timer_latency_us.
2189 name: "worst_p999_timer_latency_us",
2190 polarity: crate::test_support::Polarity::LowerBetter,
2191 kind: MetricKind::Distribution {
2192 source: SampleSource::TimerLatencyNs,
2193 reduction: SampleReduction::P999,
2194 },
2195 default_abs: 100.0,
2196 default_rel: 0.25,
2197 display_unit: "\u{00b5}s",
2198 accessor: |_| None,
2199 },
2200 MetricDef {
2201 // Run-level WORST (max) timer-latency — the cyclictest headline.
2202 // MAX-folds cross-RUN (SampleReduction::Worst, the peak survives) via
2203 // aggregate_finite, distinct from the MEAN-folded percentiles above.
2204 // Named worst_* with no pNN exactly like worst_run_delay_us
2205 // (Distribution{RunDelayNs, Worst}).
2206 name: "worst_timer_latency_us",
2207 polarity: crate::test_support::Polarity::LowerBetter,
2208 kind: MetricKind::Distribution {
2209 source: SampleSource::TimerLatencyNs,
2210 reduction: SampleReduction::Worst,
2211 },
2212 default_abs: 200.0,
2213 default_rel: 0.25,
2214 display_unit: "\u{00b5}s",
2215 accessor: |_| None,
2216 },
2217 MetricDef {
2218 // Per-phase worker iterations per second. MetricKind::Rate with
2219 // Counter components total_phase_iterations / total_phase_duration_sec:
2220 // build_phase_buckets_with_stimulus emits those two components (the
2221 // iteration delta + the window seconds) from adjacent stimulus events'
2222 // total_iterations / elapsed_ms deltas — NOT a ready ratio — and
2223 // derive_rate_metrics re-derives iteration_rate = Σiterations /
2224 // Σseconds, so it re-pools correctly across phases/runs rather than
2225 // averaging per-phase ratios. Higher-is-better (more throughput). The
2226 // registry entry exists so MetricDef::read on a
2227 // GauntletRow.ext_metrics fallback surfaces it through cargo ktstr
2228 // perf-delta like any other metric, and so
2229 // Timeline::from_phase_buckets reads it by the canonical name from
2230 // PhaseBucket.metrics. No typed GauntletRow field; accessor is the
2231 // ext_metrics fallback.
2232 name: "iteration_rate",
2233 polarity: crate::test_support::Polarity::HigherBetter,
2234 kind: MetricKind::Rate {
2235 numerator: "total_phase_iterations",
2236 denominator: "total_phase_duration_sec",
2237 },
2238 default_abs: 1.0,
2239 default_rel: 0.30,
2240 display_unit: "iter/s",
2241 accessor: |_| None,
2242 },
2243 MetricDef {
2244 name: "total_iterations",
2245 polarity: crate::test_support::Polarity::HigherBetter,
2246 kind: MetricKind::Counter,
2247 default_abs: 2.0,
2248 default_rel: 0.10,
2249 display_unit: "",
2250 accessor: |r| Some(r.total_iterations as f64),
2251 },
2252 MetricDef {
2253 // Per-phase iteration delta — the NUMERATOR component of the
2254 // `iteration_rate` Rate. ext_metrics-only (no GauntletRow field):
2255 // inserted per phase as the last-minus-first delta of the cumulative
2256 // iteration counter, alongside `total_phase_duration_sec`, so
2257 // `derive_rate_metrics` yields `iteration_rate` = Σ(iter delta) /
2258 // Σ(phase seconds). `total_` prefix satisfies the Counter naming gate.
2259 name: "total_phase_iterations",
2260 polarity: crate::test_support::Polarity::HigherBetter,
2261 kind: MetricKind::Counter,
2262 default_abs: 1.0,
2263 default_rel: 0.10,
2264 display_unit: "",
2265 accessor: |_| None,
2266 },
2267 MetricDef {
2268 // Per-phase WALL-clock duration in SECONDS — the DENOMINATOR
2269 // component of the `iteration_rate` Rate. ext_metrics-only. The
2270 // ms→s conversion is applied at the component-insertion site (NOT in
2271 // `derive_rate_metrics`, which does a bare num/den with no scaling),
2272 // so the stored value is already seconds and the derived rate is
2273 // iterations/second. `total_` prefix satisfies the Counter naming gate.
2274 name: "total_phase_duration_sec",
2275 polarity: crate::test_support::Polarity::HigherBetter,
2276 kind: MetricKind::Counter,
2277 default_abs: 1.0,
2278 default_rel: 0.30,
2279 display_unit: "s",
2280 accessor: |_| None,
2281 },
2282 MetricDef {
2283 // Run-level POOLED CPU-seconds — the DENOMINATOR component of the
2284 // pooled `iterations_per_cpu_sec` Rate. ext_metrics-only (accessor
2285 // |_| None): populate_run_pooled_iterations_per_cpu_sec sums the
2286 // MEASURED cgroups' CgroupStats.total_cpu_time_ns (total_cpu_time_ns >
2287 // 0) and inserts the ns→s value (= Σns / 1e9) at the post-merge eval
2288 // site. The measured-only filter leaves this denominator unchanged
2289 // (excluded cgroups contribute 0 ns) — it matters for the numerator,
2290 // whose excluded cgroups carry nonzero iterations. The /1e9 lives
2291 // there (NOT in derive_rate_metrics, which does a bare num/den),
2292 // applied ONCE on the summed ns. `total_` prefix satisfies the Counter
2293 // gate.
2294 name: "total_cpu_time_sec",
2295 polarity: crate::test_support::Polarity::HigherBetter,
2296 kind: MetricKind::Counter,
2297 default_abs: 1.0,
2298 default_rel: 0.30,
2299 display_unit: "s",
2300 accessor: |_| None,
2301 },
2302 MetricDef {
2303 // Run-level POOLED iteration count — the NUMERATOR component of the
2304 // pooled `iterations_per_cpu_sec` Rate, summed over cgroups with
2305 // MEASURED cpu-time (total_cpu_time_ns > 0). ext_metrics-only,
2306 // DISTINCT from the typed `total_iterations` Counter on purpose: the
2307 // typed field is skipped from ext_metrics (TYPED_FIELD_NAMES) and folds
2308 // cross-RUN as a MEAN (group_and_average_by's round_u64 divides the
2309 // accumulated sum by the contributor count — a display average), while
2310 // a Rate numerator must fold cross-RUN as a SUM (aggregate_finite
2311 // Counter arm, no divide) so Σnum/Σdenom re-pools. One shared key
2312 // cannot carry both folds, so the numerator gets its own ext key. It
2313 // also sums only MEASURED cgroups, where the typed field's per-RUN
2314 // cross-cgroup merge sums ALL cgroups — so it equals the merge-summed
2315 // typed total_iterations unless an excluded (zero-cpu-time) cgroup
2316 // carried iterations>0, in which case it is LESS.
2317 // `total_` prefix satisfies the Counter naming gate.
2318 name: "total_iterations_pooled",
2319 polarity: crate::test_support::Polarity::HigherBetter,
2320 kind: MetricKind::Counter,
2321 default_abs: 1.0,
2322 default_rel: 0.10,
2323 display_unit: "",
2324 accessor: |_| None,
2325 },
2326 MetricDef {
2327 // Run-level cohort CPU-time EFFICIENCY pooled across cgroups (and
2328 // re-pooled across runs): Σiterations / Σcpu-seconds. MetricKind::Rate
2329 // over the two Counter components above; derive_rate_metrics re-derives
2330 // it = Σtotal_iterations_pooled / Σtotal_cpu_time_sec at every level.
2331 // Distinct from the per-cgroup `worst_iterations_per_cpu_sec`
2332 // WorstLowest metric (the lowest-wins min-fold starvation selector):
2333 // this is the POOLED cohort rate, overcommit-invariant. _per_cpu_sec name + Rate kind passes the
2334 // reverse naming gate; ext_metrics-only (accessor |_| None).
2335 //
2336 // SAME physical quantity as worst_iterations_per_cpu_sec (iter/CPU-s
2337 // efficiency), so it shares that sibling's compare thresholds:
2338 // default_rel=0.10 (a 10% efficiency change is the regression signal)
2339 // and default_abs=10.0 (near-zero anti-jitter floor — a real busy
2340 // workload's rate is orders of magnitude larger). NOT the looser
2341 // iteration_rate throughput gate (rel=0.30), which would silently
2342 // swallow a 10-29% efficiency regression the per-cgroup row flags.
2343 name: "iterations_per_cpu_sec",
2344 polarity: crate::test_support::Polarity::HigherBetter,
2345 kind: MetricKind::Rate {
2346 numerator: "total_iterations_pooled",
2347 denominator: "total_cpu_time_sec",
2348 },
2349 default_abs: 10.0,
2350 default_rel: 0.10,
2351 display_unit: "iter/cpu-s",
2352 accessor: |_| None,
2353 },
2354 MetricDef {
2355 // Per-phase SYSTEM (in-kernel) CPU time in nanoseconds. Read
2356 // host-side from frozen task_struct.stime + the thread-group
2357 // signal_struct.stime accumulator (zero guest work). Injected
2358 // post-hoc — NOT a read_sample metric — as a per-thread-GROUP
2359 // delta over the phase: `crate::assert::phase_group_cpu_delta`
2360 // sums each tgid's `thread_group_cputime` (signal + live-thread
2361 // stime) at its first and last appearance among the phase's
2362 // freeze samples and takes `last - first` = system CPU time the
2363 // group spent during the phase. PerPhaseDeltaSum: the per-phase value
2364 // is already a delta, so the disjoint per-phase deltas SUM across the
2365 // run (the run's total OBSERVED system CPU time — a lower bound
2366 // excluding head / tail / inter-phase-gap windows; see the kind doc),
2367 // and the per-run totals fold by UNWEIGHTED MEAN cross-RUN (NOT
2368 // sample-count-weighted), like user_time_ns. LowerBetter — the DSQ-spinlock
2369 // regression surfaces as rising system time (CPUs spinning in
2370 // the kernel). No typed GauntletRow field; the ext_metrics
2371 // fallback carries it through cargo ktstr perf-delta.
2372 name: "system_time_ns",
2373 polarity: crate::test_support::Polarity::LowerBetter,
2374 kind: MetricKind::PerPhaseDeltaSum,
2375 default_abs: 1000.0,
2376 default_rel: 0.30,
2377 display_unit: "ns",
2378 accessor: |_| None,
2379 },
2380 MetricDef {
2381 // Per-phase USER-mode CPU time in nanoseconds. Same host-side /
2382 // injected / PerPhaseDeltaSum shape as `system_time_ns` (task_struct
2383 // .utime + the thread-group signal_struct.utime accumulator,
2384 // per-tgid delta via `crate::assert::phase_group_cpu_delta`; SUM
2385 // cross-phase, unweighted MEAN cross-run).
2386 // Pairs with it so a test can distinguish "system time rose,
2387 // user work flat" (the lock-contention signature) from "both
2388 // rose" (genuine extra work). LowerBetter — less CPU consumed
2389 // for the same work is the efficiency win; utime already
2390 // includes gtime so the two are never summed.
2391 name: "user_time_ns",
2392 polarity: crate::test_support::Polarity::LowerBetter,
2393 kind: MetricKind::PerPhaseDeltaSum,
2394 default_abs: 1000.0,
2395 default_rel: 0.30,
2396 display_unit: "ns",
2397 accessor: |_| None,
2398 },
2399 // ---- IRQ observability ----
2400 // Host-side observer-free IRQ signals from PerCpuTimeStats (freeze
2401 // Snapshot, src/monitor/dump/mod.rs), cross-CPU folded at
2402 // read_sample and carried through ext_metrics (accessor |_| None) like
2403 // system_time_ns. The time signals require CONFIG_IRQ_TIME_ACCOUNTING;
2404 // loud-absent (None), never false-zero, when off. Per-phase
2405 // reduction is the Counter last-minus-first over the bucket's freeze
2406 // captures (needs num_snapshots >= 2). The per-CPU SPATIAL axis
2407 // (max_cpu_hardirqs + max_cpu_hardirq_concentration, the busiest-CPU
2408 // dimension vs this cross-CPU SUM) is registered below; per-softirq
2409 // spatial-max is a follow-up.
2410 MetricDef {
2411 // Sum of kernel_stat.irqs_sum across CPUs — total hardirqs fired
2412 // (per-CPU monotonic count, __kstat_incr_irqs_this_cpu,
2413 // kernel/irq/internals.h). NOT gated on irqtime (always populates).
2414 name: "total_hardirqs",
2415 polarity: crate::test_support::Polarity::Informational,
2416 kind: MetricKind::Counter,
2417 default_abs: 10.0,
2418 default_rel: 0.50,
2419 display_unit: "",
2420 accessor: |_| None,
2421 },
2422 MetricDef {
2423 // Sum of kernel_stat.softirqs[NET_RX] across CPUs (index via
2424 // SOFTIRQ_NAMES; kstat_incr_softirqs_this_cpu, kernel/softirq.c). The
2425 // load-bearing softirq for NetTraffic RX.
2426 name: "total_softirq_net_rx",
2427 polarity: crate::test_support::Polarity::Informational,
2428 kind: MetricKind::Counter,
2429 default_abs: 10.0,
2430 default_rel: 0.50,
2431 display_unit: "",
2432 accessor: |_| None,
2433 },
2434 MetricDef {
2435 // Sum of kernel_stat.softirqs[NET_TX] across CPUs.
2436 name: "total_softirq_net_tx",
2437 polarity: crate::test_support::Polarity::Informational,
2438 kind: MetricKind::Counter,
2439 default_abs: 10.0,
2440 default_rel: 0.50,
2441 display_unit: "",
2442 accessor: |_| None,
2443 },
2444 MetricDef {
2445 // Sum of kernel_stat.softirqs[TIMER] across CPUs.
2446 name: "total_softirq_timer",
2447 polarity: crate::test_support::Polarity::Informational,
2448 kind: MetricKind::Counter,
2449 default_abs: 10.0,
2450 default_rel: 0.50,
2451 display_unit: "",
2452 accessor: |_| None,
2453 },
2454 MetricDef {
2455 // Sum of kernel_stat.softirqs[SCHED] across CPUs.
2456 name: "total_softirq_sched",
2457 polarity: crate::test_support::Polarity::Informational,
2458 kind: MetricKind::Counter,
2459 default_abs: 10.0,
2460 default_rel: 0.50,
2461 display_unit: "",
2462 accessor: |_| None,
2463 },
2464 MetricDef {
2465 // Sum of kernel_cpustat.cpustat[CPUTIME_IRQ] across CPUs — raw ns in
2466 // hardirq (irqtime_account_delta, kernel/sched/cputime.c). Read from
2467 // guest memory as ns (NOT /proc/stat jiffies — no nsec_to_clock_t).
2468 // Requires CONFIG_IRQ_TIME_ACCOUNTING; Counter/ns like system_time_ns.
2469 name: "total_irq_time_ns",
2470 polarity: crate::test_support::Polarity::Informational,
2471 kind: MetricKind::Counter,
2472 default_abs: 1000.0,
2473 default_rel: 0.50,
2474 display_unit: "ns",
2475 accessor: |_| None,
2476 },
2477 MetricDef {
2478 // Sum of kernel_cpustat.cpustat[CPUTIME_SOFTIRQ] across CPUs — raw ns
2479 // in softirq. Requires CONFIG_IRQ_TIME_ACCOUNTING.
2480 name: "total_softirq_time_ns",
2481 polarity: crate::test_support::Polarity::Informational,
2482 kind: MetricKind::Counter,
2483 default_abs: 1000.0,
2484 default_rel: 0.50,
2485 display_unit: "ns",
2486 accessor: |_| None,
2487 },
2488 MetricDef {
2489 // Sum of kernel_cpustat.cpustat[CPUTIME_STEAL] across CPUs — raw ns the
2490 // hypervisor stole (account_steal_time; needs CONFIG_PARAVIRT_TIME_
2491 // ACCOUNTING + kvm-clock steal-time). CPUTIME_STEAL is an unconditional
2492 // enum member (enum cpu_usage_stat, include/linux/kernel_stat.h), so
2493 // steal-accounting-off reads a constant 0 — a measured Some(0.0), NOT
2494 // loud-absent like the BTF-gated avg_irq gauge.
2495 name: "total_steal_time_ns",
2496 polarity: crate::test_support::Polarity::Informational,
2497 kind: MetricKind::Counter,
2498 default_abs: 1000.0,
2499 default_rel: 0.50,
2500 display_unit: "ns",
2501 accessor: |_| None,
2502 },
2503 MetricDef {
2504 // Mean across CPUs of rq->avg_irq.util_avg — the PELT IRQ load average
2505 // (struct sched_avg, kernel/sched/sched.h; range [0, 1024] =
2506 // SCHED_CAPACITY_SCALE). INSTANTANEOUS gauge (decaying PELT), NEVER
2507 // deltaed. Requires CONFIG_HAVE_SCHED_AVG_IRQ (def_bool y when
2508 // (IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING) && SMP — init/Kconfig).
2509 // Distinct from taskstats avg_irq_delay_ns (irq-DELAY accounting); this
2510 // is PELT util.
2511 name: "avg_irq_util",
2512 polarity: crate::test_support::Polarity::LowerBetter,
2513 kind: MetricKind::Gauge(GaugeAgg::Avg),
2514 default_abs: 20.0,
2515 default_rel: 0.30,
2516 display_unit: "",
2517 accessor: |_| None,
2518 },
2519 MetricDef {
2520 // Host spatial-max ACROSS CPUs of the INSTANTANEOUS rq->avg_irq.util_avg
2521 // gauge (worst-CPU IRQ load at the freeze) — NOT a kernel max-of-window.
2522 // Peak because both the spatial and temporal reduces are max over
2523 // instantaneous values (no cumulative-delta hazard, unlike a counter's
2524 // spatial-max; per-CPU axis is a follow-up). Range [0, 1024].
2525 name: "max_avg_irq_util",
2526 polarity: crate::test_support::Polarity::LowerBetter,
2527 kind: MetricKind::Peak,
2528 default_abs: 50.0,
2529 default_rel: 0.30,
2530 display_unit: "",
2531 accessor: |_| None,
2532 },
2533 MetricDef {
2534 // DERIVED rate: total_hardirqs / total_phase_wall_sec — hardirqs per
2535 // second over the CAPTURE WINDOW (first->last freeze span, NOT the full
2536 // phase; see total_phase_wall_sec). For A/B compare the cadence cancels.
2537 name: "hardirq_rate",
2538 polarity: crate::test_support::Polarity::Informational,
2539 kind: MetricKind::Rate {
2540 numerator: "total_hardirqs",
2541 denominator: "total_phase_wall_sec",
2542 },
2543 default_abs: 1.0,
2544 default_rel: 0.30,
2545 display_unit: "irq/s",
2546 accessor: |_| None,
2547 },
2548 MetricDef {
2549 // DERIVED rate: total_softirq_net_rx / total_phase_wall_sec — NET_RX
2550 // softirqs per second over the capture window. The NetTraffic
2551 // softirq-pressure signal.
2552 name: "net_rx_softirq_rate",
2553 polarity: crate::test_support::Polarity::Informational,
2554 kind: MetricKind::Rate {
2555 numerator: "total_softirq_net_rx",
2556 denominator: "total_phase_wall_sec",
2557 },
2558 default_abs: 1.0,
2559 default_rel: 0.30,
2560 display_unit: "softirq/s",
2561 accessor: |_| None,
2562 },
2563 MetricDef {
2564 // DERIVED rate: total_irq_time_ns / total_phase_wall_ns — the
2565 // dimensionless [0,1] fraction of the capture window spent in hardirq.
2566 // ns/ns (both over the SAME first->last freeze span) so the span-vs-
2567 // phase gap cancels. The exact-integral companion to avg_irq_util's
2568 // smoothed PELT gauge.
2569 name: "irq_time_fraction",
2570 polarity: crate::test_support::Polarity::LowerBetter,
2571 kind: MetricKind::Rate {
2572 numerator: "total_irq_time_ns",
2573 denominator: "total_phase_wall_ns",
2574 },
2575 default_abs: 0.02,
2576 default_rel: 0.30,
2577 display_unit: "",
2578 accessor: |_| None,
2579 },
2580 MetricDef {
2581 // Hidden rate-denominator component (NOT user-facing): the CAPTURE-
2582 // WINDOW duration in seconds = (bucket end_ms - start_ms)/1000, co-
2583 // inserted in buckets_from_grouped both-or-neither with the IRQ
2584 // counters (the /1000 lives at the insertion site; derive_rate_metrics
2585 // does bare num/den). Backs hardirq_rate / net_rx_softirq_rate. Counter
2586 // so it survives the cross-RUN Sum-fold (Sum count / Sum sec re-derives).
2587 name: "total_phase_wall_sec",
2588 polarity: crate::test_support::Polarity::Informational,
2589 kind: MetricKind::Counter,
2590 default_abs: 0.1,
2591 default_rel: 0.30,
2592 display_unit: "s",
2593 accessor: |_| None,
2594 },
2595 MetricDef {
2596 // Hidden rate-denominator component (NOT user-facing): the capture-
2597 // window duration in NANOSECONDS = (bucket end_ms - start_ms) * 1e6,
2598 // co-inserted with the IRQ counters. Backs irq_time_fraction (ns/ns).
2599 name: "total_phase_wall_ns",
2600 polarity: crate::test_support::Polarity::Informational,
2601 kind: MetricKind::Counter,
2602 default_abs: 1000.0,
2603 default_rel: 0.30,
2604 display_unit: "ns",
2605 accessor: |_| None,
2606 },
2607 MetricDef {
2608 // Per-CPU IRQ spatial axis: the BUSIEST CPU's hardirq delta over the
2609 // phase — max over CPUs of each CPU's (last - first freeze) irqs_sum,
2610 // correlated by the per_cpu_time cpu field (NOT the cross-CPU sum, which
2611 // is total_hardirqs). Custom per-CPU-delta fold in assert::phase_build,
2612 // NOT a read_sample arm (read_sample yields one f64 per freeze, no
2613 // per-CPU vector). Peak = spatial-max of a per-CPU cumulative-counter
2614 // delta. Informational: a high busiest-CPU count is ambiguous (high
2615 // traffic vs concentration) — the concentration ratio below is the
2616 // balance signal, mirroring the raw-counts-Informational split.
2617 name: "max_cpu_hardirqs",
2618 polarity: crate::test_support::Polarity::Informational,
2619 kind: MetricKind::Peak,
2620 default_abs: 10.0,
2621 default_rel: 0.50,
2622 display_unit: "",
2623 accessor: |_| None,
2624 },
2625 MetricDef {
2626 // IRQ-concentration ratio: max_cpu_hardirqs / mean per-CPU hardirq delta
2627 // over the SAME reporting-CPU set — the busiest CPU's share of the
2628 // average. Range [1, num_cpus]: 1.0 = perfectly even, higher = IRQs
2629 // concentrated on one CPU. Peak (worst per-phase concentration),
2630 // LowerBetter. Computed in the same per-CPU-delta fold, NOT a Rate (a
2631 // Peak numerator fails every_rate_metric_has_registered_counter_components,
2632 // and max/mean is not Σ-poolable). DELIBERATELY max/MEAN, distinct from
2633 // the sibling max_imbalance_ratio's max/MIN: max/min explodes when any
2634 // CPU takes ~0 IRQs, whereas max/mean measures disproportionate SHARE
2635 // (the IRQ-steering question). Absent (None) when < 2 reporting CPUs or
2636 // mean == 0.
2637 name: "max_cpu_hardirq_concentration",
2638 polarity: crate::test_support::Polarity::LowerBetter,
2639 kind: MetricKind::Peak,
2640 default_abs: 1.0,
2641 default_rel: 0.25,
2642 display_unit: "x",
2643 accessor: |_| None,
2644 },
2645 MetricDef {
2646 // Per-CPU NET_RX softirq spatial axis: the BUSIEST CPU's NET_RX softirq
2647 // delta over the phase — max over CPUs of each CPU's (last - first freeze)
2648 // kstat.softirqs[NET_RX] delta, correlated by the per_cpu_time cpu field
2649 // (NOT the cross-CPU sum, which is total_softirq_net_rx). Counts softirq
2650 // RUNS/invocations (handle_softirqs increments once per pending NET_RX bit
2651 // per dispatch via kstat_incr_softirqs_this_cpu), NOT packets — a
2652 // softirq-frequency / affinity-concentration signal. Custom per-CPU-delta
2653 // fold in assert::phase_build (fold_per_cpu_spatial_max), NOT a read_sample
2654 // arm. Peak = spatial-max of a per-CPU cumulative-counter delta. The
2655 // softirq sibling of max_cpu_hardirqs; Informational for the same reason —
2656 // a high busiest-CPU count is ambiguous (high RX traffic vs concentration),
2657 // the concentration ratio below is the balance signal.
2658 name: "max_cpu_softirq_net_rx",
2659 polarity: crate::test_support::Polarity::Informational,
2660 kind: MetricKind::Peak,
2661 default_abs: 10.0,
2662 default_rel: 0.50,
2663 display_unit: "",
2664 accessor: |_| None,
2665 },
2666 MetricDef {
2667 // NET_RX-softirq-concentration ratio: max_cpu_softirq_net_rx / mean
2668 // per-CPU NET_RX softirq delta over the SAME reporting-CPU set — the
2669 // busiest CPU's share of the average. Range [1, num_cpus]: 1.0 = even,
2670 // higher = NET_RX softirqs concentrated on one CPU (the single-queue-NIC
2671 // vs RPS/RSS-spread signal). Peak, LowerBetter, max/mean — the softirq
2672 // sibling of max_cpu_hardirq_concentration (same NOT-a-Rate, max/MEAN-not-
2673 // max/MIN, >=2-reporting-CPU + mean>0 discipline). Absent (None) when
2674 // < 2 reporting CPUs or mean == 0.
2675 name: "max_cpu_softirq_net_rx_concentration",
2676 polarity: crate::test_support::Polarity::LowerBetter,
2677 kind: MetricKind::Peak,
2678 default_abs: 1.0,
2679 default_rel: 0.25,
2680 display_unit: "x",
2681 accessor: |_| None,
2682 },
2683 MetricDef {
2684 // Mean ACROSS CPUs of the scx_layered util-compensation SCALE over the
2685 // capture window — the factor by which a CPU's useful-work capacity is
2686 // scaled up to compensate for IRQ / softirq / stolen time. Per CPU over
2687 // the first->last per_cpu_time freeze: scale = delta_total / available,
2688 // where delta_total = Σ of ALL 8 kernel_cpustat[] ns deltas
2689 // (user+nice+system+idle+iowait+irq+softirq+steal) and available =
2690 // delta_total - (irq+softirq+steal); clamped to [1.0, 20.0], and
2691 // available == 0 yields the 1.0 floor. Byte-faithful to scx_layered's
2692 // util_compensation compute — the ns-vs-µs unit cancels in the ratio
2693 // (scx_layered reads /proc microseconds; we read kernel_cpustat ns, the
2694 // same slots /proc/stat formats from). 1.0 = no IRQ/steal interference;
2695 // higher = more capacity stolen, so LowerBetter. An idle ktstr VM reads
2696 // exactly 1.0 — the MEASURED clamp floor (a real Some), NOT loud-absent;
2697 // a compensation > 1.0 requires an IRQ/steal-generating workload.
2698 // Gauge(Avg): cross-phase folds weighted-mean to run-level, cross-run
2699 // means — the typical compensation magnitude. Custom per-CPU-delta fold
2700 // in assert::phase_build (fold_util_comp_scale), NOT a read_sample arm: a
2701 // per-CPU clamp-then-mean is not expressible as a scalar Counter/Rate.
2702 // System-axis mean: scx_layered clamps per-CPU then applies per-LAYER;
2703 // ktstr has no layers, so the run-level signal is the mean of the
2704 // per-CPU scale distribution. cpustat[CPUTIME_SOFTIRQ] excludes
2705 // softirq deferred to ksoftirqd (irqtime_account_irq's
2706 // curr != this_cpu_ksoftirqd() guard, kernel/sched/cputime.c) — the same
2707 // undercount scx_layered inherits from /proc, so faithful to it; the
2708 // scale is a lower bound on true IRQ+softirq pressure.
2709 name: "avg_cpu_util_comp_scale",
2710 polarity: crate::test_support::Polarity::LowerBetter,
2711 kind: MetricKind::Gauge(GaugeAgg::Avg),
2712 default_abs: 0.5,
2713 default_rel: 0.30,
2714 display_unit: "x",
2715 accessor: |_| None,
2716 },
2717 MetricDef {
2718 // Mean across (freeze, live task) of scx_lavd's per-task
2719 // normalized_lat_cri (task_ctx.normalized_lat_cri, [0,1024]) — the
2720 // scheduler's latency-criticality score, host-read from the sdt_alloc
2721 // arena (BPF_MAP_TYPE_ARENA) each freeze and BTF-rendered, NOT a kernel
2722 // counter and NOT a BPF .bss field. A GAUGE (an instantaneous per-task
2723 // value lavd recomputes each schedule, scx_lavd lat_cri.bpf.c: lat_cri is
2724 // squared then waker/wakee-propagated, normalized to [0,1024]), so folded
2725 // as a mean over every (freeze, task) observation. Informational: a
2726 // scheduler-internal decision signal with no good/bad direction.
2727 // normalized (not raw lat_cri) for cross-run comparability — raw lat_cri
2728 // is squared + propagated + load-dependent. Custom per-task fold in
2729 // assert::phase_build (fold_lat_cri), NOT a read_sample arm. Distinct from
2730 // lavd's own .bss sys_stat.avg_lat_cri EWMA over SCHEDULED tasks (surfaced
2731 // via watch_bpf_map as the scx_lavd_avg_lat_cri key) — this is an
2732 // instantaneous host walk over ALL live task_ctx, INCLUDING not-yet-scored
2733 // slots that render 0, so the mean is population-sensitive to task-alloc
2734 // churn. Loud-absent for non-lavd schedulers (the rendered payload has no
2735 // such member).
2736 name: "avg_task_lat_cri",
2737 polarity: crate::test_support::Polarity::Informational,
2738 kind: MetricKind::Gauge(GaugeAgg::Avg),
2739 default_abs: 50.0,
2740 default_rel: 0.30,
2741 display_unit: "",
2742 accessor: |_| None,
2743 },
2744 MetricDef {
2745 // Max across (freeze, live task) of scx_lavd's per-task
2746 // normalized_lat_cri ([0,1024]) — the worst-case latency-criticality
2747 // observed over the phase. Peak (spatial+temporal max of an instantaneous
2748 // gauge, no delta). Informational. Same host sdt_alloc-arena source +
2749 // per-task fold (fold_lat_cri) + normalized rationale + loud-absent as
2750 // avg_task_lat_cri.
2751 name: "max_task_lat_cri",
2752 polarity: crate::test_support::Polarity::Informational,
2753 kind: MetricKind::Peak,
2754 default_abs: 100.0,
2755 default_rel: 0.30,
2756 display_unit: "",
2757 accessor: |_| None,
2758 },
2759 MetricDef {
2760 // Per-cgroup IRQ-pressure spatial axis: the busiest workload-leaf cgroup's
2761 // PSI-irq `full` stall DELTA over the phase (decoded µs) — max over the
2762 // workload-root leaf cgroups of each leaf's (last - first freeze)
2763 // cgroup->psi total[PSI_AVGS][PSI_IRQ_FULL], correlated across freezes by
2764 // (cgroup_kva, serial_nr) — the serial disambiguates a freed slab KVA
        // reused by a new cgroup. The per-cgroup analog of max_cpu_hardirqs (the
        // busiest-CELL dimension): it attributes IRQ-servicing stall to the
        // workload cell that induced it, which the system-wide
        // total_irq_pressure_us cannot. Custom per-cgroup
2769 // delta fold (assert::phase_build fold_per_cgroup_psi), NOT a read_sample
2770 // arm. Peak = spatial-max of a per-cgroup cumulative-counter delta.
2771 // Informational: an absolute per-cell stall is workload-confounded (more
2772 // work → more stall) — the concentration ratio below is the isolation
2773 // signal, mirroring the max_cpu_hardirqs raw-counts split.
2774 name: "max_cgroup_irq_pressure",
2775 polarity: crate::test_support::Polarity::Informational,
2776 kind: MetricKind::Peak,
2777 default_abs: 1.0,
2778 default_rel: 0.50,
2779 display_unit: "µs",
2780 accessor: |_| None,
2781 },
2782 MetricDef {
2783 // Per-cgroup IRQ-pressure concentration: max_cgroup_irq_pressure / the
2784 // mean per-leaf IRQ-full stall delta over the SAME reporting-leaf set —
2785 // the busiest cell's share of the average. Range [1, num_leaves]: 1.0 =
2786 // evenly spread, higher = IRQ-servicing stall concentrated on one workload
2787 // cell (the cgroup-isolation / cell-steering signal). Peak, LowerBetter,
2788 // max/MEAN — the per-cgroup sibling of max_cpu_hardirq_concentration (same
2789 // NOT-a-Rate, max/MEAN-not-max/MIN, >=2-reporting-leaf + mean>0 discipline;
2790 // disjoint leaves — cgroup2's no-internal-process rule — so no
2791 // double-count). Absent (None) when < 2 reporting leaves or mean == 0.
2792 name: "max_cgroup_irq_pressure_concentration",
2793 polarity: crate::test_support::Polarity::LowerBetter,
2794 kind: MetricKind::Peak,
2795 default_abs: 1.0,
2796 default_rel: 0.25,
2797 display_unit: "x",
2798 accessor: |_| None,
2799 },
2800 MetricDef {
2801 // Per-cgroup IRQ-pressure GAUGE: the worst workload-leaf cgroup's PSI-irq
2802 // `full` avg10 (decoded 10s-EWMA percent, 0..=100) — per freeze the max
2803 // across the leaves, then the max across the phase's freezes. The
2804 // instantaneous-pressure companion to max_cgroup_irq_pressure (a gauge, so
2805 // a spatial-max with no delta — the max_avg_irq_util shape on the cgroup
2806 // axis). Peak; LowerBetter (less IRQ pressure on the worst cell is better).
2807 // Custom fold (fold_per_cgroup_psi), NOT a read_sample arm. Loud-absent
2808 // when no leaf reported PSI (psi_cgroups off / absent workload root).
2809 name: "max_cgroup_psi_irq_avg10",
2810 polarity: crate::test_support::Polarity::LowerBetter,
2811 kind: MetricKind::Peak,
2812 default_abs: 5.0,
2813 default_rel: 0.30,
2814 display_unit: "%",
2815 accessor: |_| None,
2816 },
2817 MetricDef {
2818 // System-wide PSI-irq `full` avg10: the mean over monitor samples of the
2819 // decoded 10s-EWMA full IRQ pressure (percent, 0..=100), host-walked from
2820 // the global `psi_system` (NOT a guest /proc read). Gauge(Avg) like
2821 // `avg_irq_util` — an instantaneous smoothed gauge, never deltaed; the
2822 // cross-run fold sample-weights it. LowerBetter (less IRQ pressure is
2823 // better). ext-only (accessor |_| None), folded from MonitorSummary in
2824 // group::sidecar_to_row. Loud-absent (None) when CONFIG_PSI /
2825 // CONFIG_IRQ_TIME_ACCOUNTING is off (no PSI_IRQ_FULL in BTF), never 0.0.
2826 name: "psi_irq_full_avg10",
2827 polarity: crate::test_support::Polarity::LowerBetter,
2828 kind: MetricKind::Gauge(GaugeAgg::Avg),
2829 default_abs: 5.0,
2830 default_rel: 0.30,
2831 display_unit: "%",
2832 accessor: |_| None,
2833 },
2834 MetricDef {
2835 // Cumulative system-wide PSI-irq `full` stall over the monitoring window
2836 // (µs): the end-start delta of `total[PSI_AVGS][PSI_IRQ_FULL]` (decoded
2837 // ns→µs), host-walked from `psi_system`. Counter (a monotonic cumulative
2838 // total, end-start deltaed; saturating on reset) like `total_irq_time_ns`,
2839 // so the cross-run fold Σ-pools it. Informational: an absolute stall time
2840 // is workload-confounded (longer run → more stall) — the avg10 gauge is
2841 // the magnitude-normalized signal; mirrors the total_irq_time_ns split.
2842 // ext-only, same loud-absent gate as `psi_irq_full_avg10`.
2843 name: "total_irq_pressure_us",
2844 polarity: crate::test_support::Polarity::Informational,
2845 kind: MetricKind::Counter,
2846 default_abs: 1.0,
2847 default_rel: 0.50,
2848 display_unit: "µs",
2849 accessor: |_| None,
2850 },
2851 MetricDef {
2852 // Mean schedstat run-delay, re-pooled as the mean over the COMBINED
2853 // run-delay sample set across every cgroup (and phase), RAW ns→µs
2854 // once — see `worst_p99_wake_latency_us`. Each sample is one per-WORKER
2855 // cumulative sched_info.run_delay total (NOT per-dispatch), so the pool
2856 // size is the worker count — see
2857 // [`crate::assert::PhaseCgroupStats::run_delays_ns`]. Distribution kind;
2858 // accessor |_| None (ext_metrics-sourced from the re-pool).
2859 name: "worst_mean_run_delay_us",
2860 polarity: crate::test_support::Polarity::LowerBetter,
2861 kind: MetricKind::Distribution {
2862 source: SampleSource::RunDelayNs,
2863 reduction: SampleReduction::Mean,
2864 },
2865 default_abs: 50.0,
2866 default_rel: 0.25,
2867 display_unit: "\u{00b5}s",
2868 accessor: |_| None,
2869 },
2870 MetricDef {
2871 // Worst (max) schedstat run-delay over the combined run-delay sample
2872 // set, RAW ns→µs once. Distribution kind with the Worst reduction:
2873 // the one Distribution reduction whose cross-RUN fold is MAX (the
2874 // peak survives), not MEAN — see [`crate::stats::SampleReduction::Worst`].
2875 name: "worst_run_delay_us",
2876 polarity: crate::test_support::Polarity::LowerBetter,
2877 kind: MetricKind::Distribution {
2878 source: SampleSource::RunDelayNs,
2879 reduction: SampleReduction::Worst,
2880 },
2881 default_abs: 100.0,
2882 default_rel: 0.50,
2883 display_unit: "\u{00b5}s",
2884 accessor: |_| None,
2885 },
2886 MetricDef {
2887 // Ratio of p99 / median wake latency, worst-case across
2888 // cgroups. `LowerBetter` because a higher ratio signals a
2889 // stretched long tail. Unitless; baseline is 1.0 (p99 == median
2890 // is the perfect-uniform floor set by order-statistic
2891 // ordering). `default_abs = 0.5` guards against trivially
2892 // small deltas that percent-only gates would flag; `default_rel
2893 // = 0.25` matches the wake-latency metrics' percent gate.
2894 //
2895 // BASIS: the per-cgroup worst — the MAX over each cgroup's own
2896 // p99/median ratio (`CgroupStats::wake_latency_tail_ratio`), selected
2897 // post-merge over `stats.cgroups`. Deliberately NOT
2898 // `pooled_p99 / pooled_median` of the `worst_p99_wake_latency_us` /
2899 // `worst_median_wake_latency_us` Distributions (those re-pool the
2900 // cross-cgroup union), so the two do not satisfy
2901 // `tail_ratio == pooled_p99/pooled_median`.
2902 //
2903 // CROSS-RUN FOLD = unweighted exclude-missing MEAN (NOT MAX), by
2904 // design. `MetricKind::WakeLatencyTailRatio` is a WITHIN-RUN
2905 // worst-across-cgroups selector; cross-RUN `aggregate_finite`
2906 // MEAN-folds the per-run worst values over ONLY the runs that cleared
2907 // the floor (divisor = present-finite-contributor count), so a cohort
2908 // of repeated runs reports its TYPICAL worst-cgroup tail amplification
2909 // — the operator-facing cohort-comparison default shared with every
2910 // WorstLowest selector. It deliberately does NOT fold by MAX: MAX
2911 // (peak-of-peaks) is reserved for `SampleReduction::Worst`
2912 // (worst_run_delay_us), a peak DETECTOR; this answers "what is this
2913 // cohort's characteristic worst-cgroup tail". Aligning worst-across
2914 // selectors to a cross-RUN extremum is a tracked product decision (see
2915 // the EXTREMUM ASYMMETRY note in `aggregate_finite`), not this fix.
2916 //
2917 // Samples-required noise gate, enforced at the PRODUCER (not an
2918 // accessor): `crate::assert::populate_run_distribution_metrics` emits
2919 // NO ext key when the run completed fewer than
2920 // [`WAKE_LATENCY_TAIL_RATIO_MIN_ITERATIONS`] iterations (with few
2921 // samples the p99 estimate is effectively the observed maximum and the
2922 // ratio is dominated by a single outlier, not a distributional signal),
2923 // and none when no cgroup carried a measurable tail. An absent key is
2924 // EXCLUDED from the cross-RUN mean (no sub-threshold run dilutes the
2925 // cohort) and read as `None` by `compare_rows`, where the `(None,
2926 // None)` arm skips the pair entirely (no verdict, no coverage diff).
2927 // This REPLACES the deleted
2928 // typed field's accessor gate, which (a) summed every passing run's raw
2929 // ratio over `passes_observed` cross-RUN — folding noisy low-N runs in
2930 // as real values — and (b) re-gated the AGGREGATED row against a MEANED
2931 // iteration count. See [`WAKE_LATENCY_TAIL_RATIO_MIN_ITERATIONS`] for
2932 // the threshold-value rationale.
2933 //
2934 // accessor |_| None: ext_metrics-sourced from the post-merge producer.
2935 name: "worst_wake_latency_tail_ratio",
2936 polarity: crate::test_support::Polarity::LowerBetter,
2937 kind: MetricKind::WakeLatencyTailRatio,
2938 default_abs: 0.5,
2939 default_rel: 0.25,
2940 display_unit: "x",
2941 accessor: |_| None,
2942 },
2943 MetricDef {
2944 // Per-worker iteration throughput, worst (lowest) cgroup.
2945 // `HigherBetter` mirrors [`total_iterations`]: a cgroup that
2946 // fell behind regresses this downward, and a cross-variant
2947 // improvement raises it. `default_abs = 10.0` is the absolute
2948 // iteration-count floor below which deltas are noise;
2949 // `default_rel = 0.10` mirrors the `total_iterations` gate.
2950 //
2951 // Derivation of `abs = 10`: this metric is PER-WORKER. In-tree
2952 // fixtures span `workers_per_cgroup` from 1 through 8 (see
2953 // the KtstrTestEntry declarations under src/scenario/*.rs and
2954 // tests/*.rs); `CtxBuilder`'s `workers_per_cgroup`
2955 // defaults to 1, with scenario-level overrides raising it. A
2956 // per-worker floor of 10 therefore corresponds to
2957 // aggregate regressions of 10-80 total iterations across the
2958 // supported worker counts — high enough that a lightly-
2959 // loaded scheduler's jitter does not flag a regression, low
2960 // enough that a genuine drop (e.g. a cgroup that fell behind
2961 // by 10 iterations at workers=1, or 80 at workers=8) still
2962 // trips the gate. Going below 10 would flag normal cross-run
2963 // jitter on single-worker configs; going above 10 would mask
2964 // regressions on low-worker-count tests. The `rel=0.10`
2965 // companion gate handles larger throughputs proportionally,
2966 // so the `abs=10` floor only binds in the small-count regime
2967 // where rel-only would let single-digit losses slip through.
2968 //
2969 // WorstLowest kind: the lowest (worst) cgroup's
2970 // total_iterations / num_workers, re-pooled post-merge by
2971 // `crate::assert::populate_run_distribution_metrics` from the
2972 // per-cgroup counters via the None-aware lowest-wins fold (a
2973 // measured Some(0.0) wins; a no-workers None is skipped). Accessor
2974 // |_| None — ext_metrics-sourced; an all-None cohort writes no key.
2975 name: "worst_iterations_per_worker",
2976 polarity: crate::test_support::Polarity::HigherBetter,
2977 kind: MetricKind::WorstLowest {
2978 numerator: WorstLowestNumerator::Iterations,
2979 denominator: WorstLowestDenominator::NumWorkers,
2980 },
2981 default_abs: 10.0,
2982 default_rel: 0.10,
2983 display_unit: "",
2984 accessor: |_| None,
2985 },
2986 MetricDef {
2987 // Overcommit-INVARIANT per-cgroup efficiency (iterations per
2988 // CPU-second). `HigherBetter`: a cgroup that lost efficiency
2989 // regresses this downward. Unlike worst_iterations_per_worker
2990 // (raw work, scales with the host-CPU budget), this is the metric
2991 // to compare across `cpu_budget` settings — the overcommit marker
2992 // and compare-path warning point operators here.
2993 //
2994 // `default_rel = 0.10` is the binding proportional gate (a 10%
2995 // efficiency change is the regression signal), mirroring the
2996 // per-worker sibling. `default_abs = 10.0` (iterations/CPU-second)
2997 // is a near-zero noise floor: for any real busy workload the rate
2998 // is orders of magnitude larger, so the floor only binds for a
2999 // near-idle cgroup, where it stops a large rel% on a tiny rate
3000 // from flagging jitter. Distinct from the per-worker metric's
3001 // floor (which scales with worker count) — this is a per-second
3002 // rate, so the floor is a flat anti-noise guard, not a per-worker
3003 // derivation.
3004 //
3005 // WorstLowest kind: the lowest (worst) cgroup's
3006 // total_iterations / (total_cpu_time_ns / 1e9), re-pooled post-merge
3007 // by `crate::assert::populate_run_distribution_metrics` (None when a
3008 // cgroup has no workers or no on-CPU time; lowest measured wins).
3009 // Accessor |_| None — ext_metrics-sourced.
3010 name: "worst_iterations_per_cpu_sec",
3011 polarity: crate::test_support::Polarity::HigherBetter,
3012 kind: MetricKind::WorstLowest {
3013 numerator: WorstLowestNumerator::Iterations,
3014 denominator: WorstLowestDenominator::CpuTimeNs,
3015 },
3016 default_abs: 10.0,
3017 default_rel: 0.10,
3018 // Same physical quantity as the pooled iterations_per_cpu_sec Rate;
3019 // share its unit string rather than leaving this one under-specified.
3020 display_unit: "iter/cpu-s",
3021 accessor: |_| None,
3022 },
3023 MetricDef {
3024 // The WORST (lowest) per-cgroup page-locality fraction across the run.
3025 // HigherBetter, so lowest-wins = worst — a WorstLowest selector
3026 // (None-aware: a measured 0.0, all pages off-node, WINS the lowest; a
3027 // cgroup that measured no NUMA pages is skipped, not a 0.0 sentinel).
3028 // Re-pooled post-merge from the per-phase NUMA carriers
3029 // (assert::populate_run_distribution_metrics, numa_agg_per_cgroup) — NOT
3030 // a typed field: the reports-only CgroupStats hardcodes page_locality 0.0
3031 // (no expected-node set), and the prior typed Gauge field folded via
3032 // fold_lowest_nonzero, which SKIPPED a measured 0.0 and reported a
3033 // better-than-worst cross-run value. accessor None: ext-sourced.
3034 name: "worst_page_locality",
3035 polarity: crate::test_support::Polarity::HigherBetter,
3036 kind: MetricKind::WorstLowest {
3037 numerator: WorstLowestNumerator::NumaLocal,
3038 denominator: WorstLowestDenominator::NumaTotal,
3039 },
3040 default_abs: 0.05,
3041 default_rel: 0.10,
3042 display_unit: "",
3043 accessor: |_| None,
3044 },
3045 MetricDef {
3046 // The WORST (highest) per-cgroup cross-node migration-churn ratio across
3047 // the run. LowerBetter, so highest-wins = worst — a WorstCrossNodeRatio
3048 // max-selector re-pooled post-merge from the per-phase NUMA carriers
3049 // (assert::populate_run_distribution_metrics, numa_agg_per_cgroup) — NOT a
3050 // typed field: the prior typed Gauge(Last) field/GauntletRow column was
3051 // merge-max-folded within-run but cross-run averaged each run's value over
3052 // passes_observed (folding a NUMA-less run's 0.0 sentinel in), AND diverged
3053 // from run_metric (which already re-derived from the per-phase carriers), so
3054 // the sidecar and the in-test read gave different values on multi-phase
3055 // runs. accessor None: ext-sourced.
3056 name: "worst_cross_node_migration_ratio",
3057 polarity: crate::test_support::Polarity::LowerBetter,
3058 kind: MetricKind::WorstCrossNodeRatio,
3059 default_abs: 0.05,
3060 default_rel: 0.20,
3061 display_unit: "",
3062 accessor: |_| None,
3063 },
3064 // -- schbench per-phase metrics (MetricKind::PerPhase) --
3065 // Derived ONCE per phase by `crate::assert::derive_phase_metrics`
3066 // from the phase's pooled schbench histograms / run-delay raw pairs, written
3067 // directly into `PhaseBucket::metrics`. is_derived (skipped by the within-run
3068 // reducers + the phase-bucket merge) with no run-level producer; a per-phase
3069 // A/B claim reads them via `phase_metric`. `accessor: |_| None` — they never
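    // live on a `GauntletRow`. Latency p50/p90 mirror worst_median_wake_latency_us
    // (abs 20), p99/p999 + sched-delay mirror worst_p99/mean (abs 50); every
    // latency / sched-delay entry uses rel 0.25. The loop-count entry is a
    // throughput count with its own gate (abs 1.0, rel 0.30).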
3072 MetricDef {
3073 name: SCHBENCH_WAKEUP_P50_US,
3074 polarity: crate::test_support::Polarity::LowerBetter,
3075 kind: MetricKind::PerPhase,
3076 default_abs: 20.0,
3077 default_rel: 0.25,
3078 display_unit: "\u{00b5}s",
3079 accessor: |_| None,
3080 },
3081 MetricDef {
3082 name: SCHBENCH_WAKEUP_P90_US,
3083 polarity: crate::test_support::Polarity::LowerBetter,
3084 kind: MetricKind::PerPhase,
3085 default_abs: 20.0,
3086 default_rel: 0.25,
3087 display_unit: "\u{00b5}s",
3088 accessor: |_| None,
3089 },
3090 MetricDef {
3091 name: SCHBENCH_WAKEUP_P99_US,
3092 polarity: crate::test_support::Polarity::LowerBetter,
3093 kind: MetricKind::PerPhase,
3094 default_abs: 50.0,
3095 default_rel: 0.25,
3096 display_unit: "\u{00b5}s",
3097 accessor: |_| None,
3098 },
3099 MetricDef {
3100 name: SCHBENCH_WAKEUP_P999_US,
3101 polarity: crate::test_support::Polarity::LowerBetter,
3102 kind: MetricKind::PerPhase,
3103 default_abs: 50.0,
3104 default_rel: 0.25,
3105 display_unit: "\u{00b5}s",
3106 accessor: |_| None,
3107 },
3108 MetricDef {
3109 name: SCHBENCH_REQUEST_P50_US,
3110 polarity: crate::test_support::Polarity::LowerBetter,
3111 kind: MetricKind::PerPhase,
3112 default_abs: 20.0,
3113 default_rel: 0.25,
3114 display_unit: "\u{00b5}s",
3115 accessor: |_| None,
3116 },
3117 MetricDef {
3118 name: SCHBENCH_REQUEST_P90_US,
3119 polarity: crate::test_support::Polarity::LowerBetter,
3120 kind: MetricKind::PerPhase,
3121 default_abs: 20.0,
3122 default_rel: 0.25,
3123 display_unit: "\u{00b5}s",
3124 accessor: |_| None,
3125 },
3126 MetricDef {
3127 name: SCHBENCH_REQUEST_P99_US,
3128 polarity: crate::test_support::Polarity::LowerBetter,
3129 kind: MetricKind::PerPhase,
3130 default_abs: 50.0,
3131 default_rel: 0.25,
3132 display_unit: "\u{00b5}s",
3133 accessor: |_| None,
3134 },
3135 MetricDef {
3136 name: SCHBENCH_REQUEST_P999_US,
3137 polarity: crate::test_support::Polarity::LowerBetter,
3138 kind: MetricKind::PerPhase,
3139 default_abs: 50.0,
3140 default_rel: 0.25,
3141 display_unit: "\u{00b5}s",
3142 accessor: |_| None,
3143 },
3144 MetricDef {
3145 name: SCHBENCH_SCHED_DELAY_MSG_US,
3146 polarity: crate::test_support::Polarity::LowerBetter,
3147 kind: MetricKind::PerPhase,
3148 default_abs: 50.0,
3149 default_rel: 0.25,
3150 display_unit: "\u{00b5}s",
3151 accessor: |_| None,
3152 },
3153 MetricDef {
3154 name: SCHBENCH_SCHED_DELAY_WORKER_US,
3155 polarity: crate::test_support::Polarity::LowerBetter,
3156 kind: MetricKind::PerPhase,
3157 default_abs: 50.0,
3158 default_rel: 0.25,
3159 display_unit: "\u{00b5}s",
3160 accessor: |_| None,
3161 },
3162 MetricDef {
3163 // Completed work cycles in the phase — more is better (throughput).
3164 name: SCHBENCH_LOOP_COUNT,
3165 polarity: crate::test_support::Polarity::HigherBetter,
3166 kind: MetricKind::PerPhase,
3167 default_abs: 1.0,
3168 default_rel: 0.30,
3169 display_unit: "",
3170 accessor: |_| None,
3171 },
3172 // taobench per-phase qps + hit ratios (WorkType::Taobench engine, derived by
3173 // write_taobench_scalars). total/fast qps HigherBetter (throughput); slow_qps
3174 // + hit_ratio + hit_rate Informational (slow_qps: a throughput component; hit_*:
3175 // run-validity signals — never a regression direction, see classify_direction).
3176 MetricDef {
3177 name: TAOBENCH_TOTAL_QPS,
3178 polarity: crate::test_support::Polarity::HigherBetter,
3179 kind: MetricKind::PerPhase,
3180 default_abs: 10.0,
3181 default_rel: 0.10,
3182 display_unit: "ops/s",
3183 accessor: |_| None,
3184 },
3185 MetricDef {
3186 name: TAOBENCH_FAST_QPS,
3187 polarity: crate::test_support::Polarity::HigherBetter,
3188 kind: MetricKind::PerPhase,
3189 default_abs: 10.0,
3190 default_rel: 0.10,
3191 display_unit: "ops/s",
3192 accessor: |_| None,
3193 },
3194 MetricDef {
3195 name: TAOBENCH_SLOW_QPS,
3196 polarity: crate::test_support::Polarity::Informational,
3197 kind: MetricKind::PerPhase,
3198 default_abs: 10.0,
3199 default_rel: 0.10,
3200 display_unit: "ops/s",
3201 accessor: |_| None,
3202 },
3203 MetricDef {
3204 name: TAOBENCH_HIT_RATIO,
3205 polarity: crate::test_support::Polarity::Informational,
3206 kind: MetricKind::PerPhase,
3207 default_abs: 0.02,
3208 default_rel: 0.05,
3209 display_unit: "",
3210 accessor: |_| None,
3211 },
3212 MetricDef {
3213 name: TAOBENCH_HIT_RATE,
3214 polarity: crate::test_support::Polarity::Informational,
3215 kind: MetricKind::PerPhase,
3216 default_abs: 0.02,
3217 default_rel: 0.05,
3218 display_unit: "",
3219 accessor: |_| None,
3220 },
3221 // taobench per-phase open-loop SERVE-LATENCY percentiles (µs, LowerBetter,
3222 // PerPhase): the coordinated-omission serve distribution per phase. Thresholds
3223 // mirror the schbench per-phase latency siblings (p50/p90/min abs 20; p99/p999
3224 // abs 50; max abs 50 / rel 0.50 for the noisier tail). Absent in closed loop.
3225 MetricDef {
3226 name: TAOBENCH_SERVE_P50_US,
3227 polarity: crate::test_support::Polarity::LowerBetter,
3228 kind: MetricKind::PerPhase,
3229 default_abs: 20.0,
3230 default_rel: 0.25,
3231 display_unit: "\u{00b5}s",
3232 accessor: |_| None,
3233 },
3234 MetricDef {
3235 name: TAOBENCH_SERVE_P90_US,
3236 polarity: crate::test_support::Polarity::LowerBetter,
3237 kind: MetricKind::PerPhase,
3238 default_abs: 20.0,
3239 default_rel: 0.25,
3240 display_unit: "\u{00b5}s",
3241 accessor: |_| None,
3242 },
3243 MetricDef {
3244 name: TAOBENCH_SERVE_P99_US,
3245 polarity: crate::test_support::Polarity::LowerBetter,
3246 kind: MetricKind::PerPhase,
3247 default_abs: 50.0,
3248 default_rel: 0.25,
3249 display_unit: "\u{00b5}s",
3250 accessor: |_| None,
3251 },
3252 MetricDef {
3253 name: TAOBENCH_SERVE_P999_US,
3254 polarity: crate::test_support::Polarity::LowerBetter,
3255 kind: MetricKind::PerPhase,
3256 default_abs: 50.0,
3257 default_rel: 0.25,
3258 display_unit: "\u{00b5}s",
3259 accessor: |_| None,
3260 },
3261 MetricDef {
3262 name: TAOBENCH_SERVE_MIN_US,
3263 polarity: crate::test_support::Polarity::LowerBetter,
3264 kind: MetricKind::PerPhase,
3265 default_abs: 20.0,
3266 default_rel: 0.25,
3267 display_unit: "\u{00b5}s",
3268 accessor: |_| None,
3269 },
3270 MetricDef {
3271 name: TAOBENCH_SERVE_MAX_US,
3272 polarity: crate::test_support::Polarity::LowerBetter,
3273 kind: MetricKind::PerPhase,
3274 default_abs: 50.0,
3275 default_rel: 0.50,
3276 display_unit: "\u{00b5}s",
3277 accessor: |_| None,
3278 },
3279 // taobench WHOLE-RUN qps + hit Rates and their Counter components, pooled
3280 // cross-cgroup by `crate::assert::populate_run_pooled_taobench` and derived
3281 // by `derive_rate_metrics`. The four `total_taobench_*` Counters are the rate
3282 // components (ext_metrics-only, accessor |_| None; `total_` prefix satisfies
3283 // the Counter naming gate) and are `RENDER_SUPPRESSED_COMPONENTS`, so their
3284 // default_abs/default_rel are inert at the compare layer — the entries exist
3285 // for the re-pool (`name` is the component key, `kind` drives the Counter
3286 // SUM-fold). Cross-RUN each component SUMs, so the Rates re-pool as
3287 // Σnumerator / Σdenominator (aggregate throughput, not a mean of per-run qps).
3288 // HIT is exposed whole-run BOTH ways: the RESPONSE-time taobench_hit_fraction
3289 // (Σfast/Σcompleted) AND the COMMAND-time taobench_command_hit_rate (Σhits/Σcmds,
3290 // hits = cmds − misses — the whole-run analog of the per-phase
3291 // taobench_hit_rate). Under closed-loop every issued lookup completes so the
3292 // two converge; under OPEN-LOOP arrival they diverge (a slow/overloaded run
3293 // issues lookups that have not yet completed), which is why both carry distinct
3294 // --noise-adjust spread and both are registered.
3295 MetricDef {
3296 name: TOTAL_TAOBENCH_OPS,
3297 polarity: crate::test_support::Polarity::HigherBetter,
3298 kind: MetricKind::Counter,
3299 default_abs: 10.0,
3300 default_rel: 0.10,
3301 display_unit: "",
3302 accessor: |_| None,
3303 },
3304 MetricDef {
3305 name: TOTAL_TAOBENCH_FAST_OPS,
3306 polarity: crate::test_support::Polarity::HigherBetter,
3307 kind: MetricKind::Counter,
3308 default_abs: 10.0,
3309 default_rel: 0.10,
3310 display_unit: "",
3311 accessor: |_| None,
3312 },
3313 MetricDef {
3314 name: TOTAL_TAOBENCH_SLOW_OPS,
3315 polarity: crate::test_support::Polarity::Informational,
3316 kind: MetricKind::Counter,
3317 default_abs: 10.0,
3318 default_rel: 0.10,
3319 display_unit: "",
3320 accessor: |_| None,
3321 },
3322 MetricDef {
3323 // Whole-run wall window (ns→s applied once at the producer), the qps
3324 // DENOMINATOR. Counter — cross-RUN SUM, mirroring `total_cpu_time_sec`,
3325 // so Σops/Σwall re-pools the cohort throughput. `total_` prefix satisfies
3326 // the Counter naming gate.
3327 name: TOTAL_TAOBENCH_WALL_SEC,
3328 polarity: crate::test_support::Polarity::HigherBetter,
3329 kind: MetricKind::Counter,
3330 default_abs: 1.0,
3331 default_rel: 0.30,
3332 display_unit: "s",
3333 accessor: |_| None,
3334 },
3335 MetricDef {
3336 // Whole-run total throughput = Σcompleted ops / Σwall-seconds.
3337 // HigherBetter (throughput). Shares the per-phase `taobench_total_qps`
3338 // thresholds. Absent when no Taobench cgroup ran or the wall window was
3339 // unmeasured (components absent).
3340 name: TAOBENCH_TOTAL_OPS_PER_SEC,
3341 polarity: crate::test_support::Polarity::HigherBetter,
3342 kind: MetricKind::Rate {
3343 numerator: TOTAL_TAOBENCH_OPS,
3344 denominator: TOTAL_TAOBENCH_WALL_SEC,
3345 },
3346 default_abs: 10.0,
3347 default_rel: 0.10,
3348 display_unit: "ops/s",
3349 accessor: |_| None,
3350 },
3351 MetricDef {
3352 // Whole-run hit (fast-path) throughput = Σfast ops / Σwall-seconds.
3353 // HigherBetter.
3354 name: TAOBENCH_FAST_OPS_PER_SEC,
3355 polarity: crate::test_support::Polarity::HigherBetter,
3356 kind: MetricKind::Rate {
3357 numerator: TOTAL_TAOBENCH_FAST_OPS,
3358 denominator: TOTAL_TAOBENCH_WALL_SEC,
3359 },
3360 default_abs: 10.0,
3361 default_rel: 0.10,
3362 display_unit: "ops/s",
3363 accessor: |_| None,
3364 },
3365 MetricDef {
3366 // Whole-run slow-path throughput = Σslow ops / Σwall-seconds.
3367 // Informational — the slow path is a component of total throughput, not a
3368 // standalone regression direction (mirrors the per-phase
3369 // `taobench_slow_qps`).
3370 name: TAOBENCH_SLOW_OPS_PER_SEC,
3371 polarity: crate::test_support::Polarity::Informational,
3372 kind: MetricKind::Rate {
3373 numerator: TOTAL_TAOBENCH_SLOW_OPS,
3374 denominator: TOTAL_TAOBENCH_WALL_SEC,
3375 },
3376 default_abs: 10.0,
3377 default_rel: 0.10,
3378 display_unit: "ops/s",
3379 accessor: |_| None,
3380 },
3381 MetricDef {
3382 // Whole-run cache hit FRACTION = Σfast ops / Σcompleted ops — the SAME
3383 // response-time hit measurement as the per-phase `taobench_hit_ratio`, at
3384 // whole-run scope (its run-level Σ/Σ pool). The name differs only by axis:
3385 // per-phase `_ratio` vs whole-run `_fraction` — a distinct registry key is
3386 // required because a Rate cannot share a name with the per-phase PerPhase
3387 // entry, and `_fraction` reads as the pooled [0, 1] ratio-of-counters (the
3388 // qps siblings diverge the same way: per-phase `_qps` vs whole-run
3389 // `_ops_per_sec`). DISTINCT from the command-time `taobench_hit_rate`
3390 // (1 - misses/cmds), which is request-time, not response-time — see the
3391 // block comment above. A fraction in [0, 1]. Informational — a hit-rate
3392 // change is a workload-shape signal, not a scheduler regression direction.
3393 // Absent when no ops completed (`total_taobench_ops` is 0 →
3394 // `derive_rate_metrics` skips the zero denominator).
3395 name: TAOBENCH_HIT_FRACTION,
3396 polarity: crate::test_support::Polarity::Informational,
3397 kind: MetricKind::Rate {
3398 numerator: TOTAL_TAOBENCH_FAST_OPS,
3399 denominator: TOTAL_TAOBENCH_OPS,
3400 },
3401 default_abs: 0.02,
3402 default_rel: 0.05,
3403 display_unit: "",
3404 accessor: |_| None,
3405 },
3406 // taobench WHOLE-RUN open-loop serve-latency percentiles (µs, LowerBetter,
3407 // PerRunDistribution): the union of the per-phase per-cgroup serve histograms,
3408 // percentile re-derived over the union by
3409 // `crate::assert::populate_run_pooled_taobench_distribution`. Noise-compared
3410 // per-run, never cross-run folded (is_derived). Thresholds mirror the
3411 // per-phase serve siblings. Absent in closed loop (no serve samples).
3412 MetricDef {
3413 name: TAOBENCH_SERVE_P50_US_WHOLE,
3414 polarity: crate::test_support::Polarity::LowerBetter,
3415 kind: MetricKind::PerRunDistribution,
3416 default_abs: 20.0,
3417 default_rel: 0.25,
3418 display_unit: "\u{00b5}s",
3419 accessor: |_| None,
3420 },
3421 MetricDef {
3422 name: TAOBENCH_SERVE_P90_US_WHOLE,
3423 polarity: crate::test_support::Polarity::LowerBetter,
3424 kind: MetricKind::PerRunDistribution,
3425 default_abs: 20.0,
3426 default_rel: 0.25,
3427 display_unit: "\u{00b5}s",
3428 accessor: |_| None,
3429 },
3430 MetricDef {
3431 name: TAOBENCH_SERVE_P99_US_WHOLE,
3432 polarity: crate::test_support::Polarity::LowerBetter,
3433 kind: MetricKind::PerRunDistribution,
3434 default_abs: 50.0,
3435 default_rel: 0.25,
3436 display_unit: "\u{00b5}s",
3437 accessor: |_| None,
3438 },
3439 MetricDef {
3440 name: TAOBENCH_SERVE_P999_US_WHOLE,
3441 polarity: crate::test_support::Polarity::LowerBetter,
3442 kind: MetricKind::PerRunDistribution,
3443 default_abs: 50.0,
3444 default_rel: 0.25,
3445 display_unit: "\u{00b5}s",
3446 accessor: |_| None,
3447 },
3448 MetricDef {
3449 name: TAOBENCH_SERVE_MIN_US_WHOLE,
3450 polarity: crate::test_support::Polarity::LowerBetter,
3451 kind: MetricKind::PerRunDistribution,
3452 default_abs: 20.0,
3453 default_rel: 0.25,
3454 display_unit: "\u{00b5}s",
3455 accessor: |_| None,
3456 },
3457 MetricDef {
3458 name: TAOBENCH_SERVE_MAX_US_WHOLE,
3459 polarity: crate::test_support::Polarity::LowerBetter,
3460 kind: MetricKind::PerRunDistribution,
3461 default_abs: 50.0,
3462 default_rel: 0.50,
3463 display_unit: "\u{00b5}s",
3464 accessor: |_| None,
3465 },
3466 // taobench WHOLE-RUN command-time hit: get_cmds + get_hits (= cmds − misses)
3467 // Counter components (ext-only, RENDER_SUPPRESSED, `total_` gate) →
3468 // taobench_command_hit_rate = Σhits/Σcmds (the request-time hit, which diverges
3469 // from the response-time taobench_hit_fraction under open-loop). Pooled by
3470 // `crate::assert::populate_run_pooled_taobench`.
3471 MetricDef {
3472 name: TOTAL_TAOBENCH_GET_CMDS,
3473 polarity: crate::test_support::Polarity::HigherBetter,
3474 kind: MetricKind::Counter,
3475 default_abs: 10.0,
3476 default_rel: 0.10,
3477 display_unit: "",
3478 accessor: |_| None,
3479 },
3480 MetricDef {
3481 name: TOTAL_TAOBENCH_GET_HITS,
3482 polarity: crate::test_support::Polarity::HigherBetter,
3483 kind: MetricKind::Counter,
3484 default_abs: 10.0,
3485 default_rel: 0.10,
3486 display_unit: "",
3487 accessor: |_| None,
3488 },
3489 MetricDef {
3490 name: TAOBENCH_COMMAND_HIT_RATE,
3491 polarity: crate::test_support::Polarity::Informational,
3492 kind: MetricKind::Rate {
3493 numerator: TOTAL_TAOBENCH_GET_HITS,
3494 denominator: TOTAL_TAOBENCH_GET_CMDS,
3495 },
3496 default_abs: 0.02,
3497 default_rel: 0.05,
3498 display_unit: "",
3499 accessor: |_| None,
3500 },
3501 // schbench WHOLE-RUN Class-3: role-separate run-delay gate Rates + their
3502 // Counter components + the whole-run loop Counter, re-pooled run-level by
3503 // `crate::assert::populate_run_pooled_schbench` from the per-phase per-cgroup
3504 // SchbenchPhaseStats raw pairs (Σ over phases+cgroups). The four run-delay /
3505 // pcount Counters are ext-only rate components (accessor |_| None;
3506 // RENDER_SUPPRESSED; `total_` prefix → Counter gate). The two Rates are the
3507 // sample-weighted Σrun_delay/Σpcount per-schedule means (the workload-scoped
3508 // siblings of the system-wide `total_run_delay_ns_per_sched`); message and
3509 // worker roles pool separately. The per-phase `sched_delay_msg/worker_us`
3510 // (PerPhase, display-only) is the SAME Σrun_delay_ns/Σpcount per-schedule
3511 // mean at phase scope -- NOT schbench's native mean-of-per-thread-means
3512 // (that is a separate whole-run stat on `SchbenchResult`, see
3513 // workload/schbench). Only these Rates gate, so no double-count.
3514 MetricDef {
3515 name: TOTAL_SCHBENCH_MSG_RUN_DELAY_NS,
3516 polarity: crate::test_support::Polarity::LowerBetter,
3517 kind: MetricKind::Counter,
3518 default_abs: 1000.0,
3519 default_rel: 0.10,
3520 display_unit: "ns",
3521 accessor: |_| None,
3522 },
3523 MetricDef {
3524 name: TOTAL_SCHBENCH_MSG_PCOUNT,
3525 polarity: crate::test_support::Polarity::Informational,
3526 kind: MetricKind::Counter,
3527 default_abs: 1.0,
3528 default_rel: 0.10,
3529 display_unit: "",
3530 accessor: |_| None,
3531 },
3532 MetricDef {
3533 name: TOTAL_SCHBENCH_WORKER_RUN_DELAY_NS,
3534 polarity: crate::test_support::Polarity::LowerBetter,
3535 kind: MetricKind::Counter,
3536 default_abs: 1000.0,
3537 default_rel: 0.10,
3538 display_unit: "ns",
3539 accessor: |_| None,
3540 },
3541 MetricDef {
3542 name: TOTAL_SCHBENCH_WORKER_PCOUNT,
3543 polarity: crate::test_support::Polarity::Informational,
3544 kind: MetricKind::Counter,
3545 default_abs: 1.0,
3546 default_rel: 0.10,
3547 display_unit: "",
3548 accessor: |_| None,
3549 },
3550 MetricDef {
3551 // Whole-run completed work cycles (Σ over phases+cgroups). HigherBetter
3552 // (throughput). NOT a rate component, so NOT suppressed. Uses the tighter
3553 // rel 0.10 throughput-Counter band shared by its structural peers
3554 // total_iterations / total_phase_iterations (HigherBetter completed-work
3555 // Counters): the whole-run Σ pools every cycle, so a 10-29% drop is a real
3556 // regression, not noise. The per-phase twin `schbench_loop_count`
3557 // (PerPhase) DELIBERATELY keeps the looser rel 0.30 for a
3558 // SMALL-SAMPLE-WINDOW reason, not an accounting one: a single phase pools
3559 // far fewer completed cycles than the whole-run Σ, so its run-to-run
3560 // relative variance (CV) is higher and needs a wider band. It is NOT
3561 // phase-edge jitter -- the per-phase counts partition EXACTLY to the
3562 // whole-run total and cycles are whole, never fractional (schbench/run.rs
3563 // increments once per completed cycle, drains a whole count at the phase
3564 // boundary). The nearest per-phase RAW-COUNT peer, total_phase_iterations,
3565 // itself gates at 0.10; loop_count's 0.30 is the small-window-CV
3566 // exception, not a like-for-like registry precedent. default_abs is the
3567 // near-idle activity floor; default_rel carries materiality (see
3568 // MetricDef::default_abs).
3569 name: TOTAL_SCHBENCH_LOOPS,
3570 polarity: crate::test_support::Polarity::HigherBetter,
3571 kind: MetricKind::Counter,
3572 default_abs: 1.0,
3573 default_rel: 0.10,
3574 display_unit: "",
3575 accessor: |_| None,
3576 },
3577 MetricDef {
3578 // Message-thread per-schedule run-delay mean = Σrun_delay_ns / Σpcount
3579 // (sample-weighted, NOT mean-of-per-run-means). LowerBetter (higher
3580 // scheduling wait is worse). Absent when no message thread was scheduled
3581 // (Σpcount == 0).
3582 name: SCHBENCH_MSG_RUN_DELAY_NS_PER_SCHED,
3583 polarity: crate::test_support::Polarity::LowerBetter,
3584 kind: MetricKind::Rate {
3585 numerator: TOTAL_SCHBENCH_MSG_RUN_DELAY_NS,
3586 denominator: TOTAL_SCHBENCH_MSG_PCOUNT,
3587 },
3588 default_abs: 100.0,
3589 default_rel: 0.10,
3590 display_unit: "ns",
3591 accessor: |_| None,
3592 },
3593 MetricDef {
3594 // Worker per-schedule run-delay mean = Σrun_delay_ns / Σpcount. LowerBetter.
3595 // Pooled SEPARATELY from the message role (different per-schedule wait
3596 // populations). Absent when no worker was scheduled (Σpcount == 0).
3597 name: SCHBENCH_WORKER_RUN_DELAY_NS_PER_SCHED,
3598 polarity: crate::test_support::Polarity::LowerBetter,
3599 kind: MetricKind::Rate {
3600 numerator: TOTAL_SCHBENCH_WORKER_RUN_DELAY_NS,
3601 denominator: TOTAL_SCHBENCH_WORKER_PCOUNT,
3602 },
3603 default_abs: 100.0,
3604 default_rel: 0.10,
3605 display_unit: "ns",
3606 accessor: |_| None,
3607 },
3608 // Per-phase latency min/max. LowerBetter (a higher min/max latency is worse).
3609 // min is a low-tail value → p50/p90 abs tier (20). max is a PEAK (a single
3610 // extreme sample, the flakiest latency stat) → the peak rel tolerance (0.50,
3611 // matching worst_gap_ms) so one outlier spike does not fabricate a regression.
3612 MetricDef {
3613 name: SCHBENCH_WAKEUP_MIN_US,
3614 polarity: crate::test_support::Polarity::LowerBetter,
3615 kind: MetricKind::PerPhase,
3616 default_abs: 20.0,
3617 default_rel: 0.25,
3618 display_unit: "\u{00b5}s",
3619 accessor: |_| None,
3620 },
3621 MetricDef {
3622 name: SCHBENCH_WAKEUP_MAX_US,
3623 polarity: crate::test_support::Polarity::LowerBetter,
3624 kind: MetricKind::PerPhase,
3625 default_abs: 50.0,
3626 default_rel: 0.50,
3627 display_unit: "\u{00b5}s",
3628 accessor: |_| None,
3629 },
3630 MetricDef {
3631 name: SCHBENCH_REQUEST_MIN_US,
3632 polarity: crate::test_support::Polarity::LowerBetter,
3633 kind: MetricKind::PerPhase,
3634 default_abs: 20.0,
3635 default_rel: 0.25,
3636 display_unit: "\u{00b5}s",
3637 accessor: |_| None,
3638 },
3639 MetricDef {
3640 name: SCHBENCH_REQUEST_MAX_US,
3641 polarity: crate::test_support::Polarity::LowerBetter,
3642 kind: MetricKind::PerPhase,
3643 default_abs: 50.0,
3644 default_rel: 0.50,
3645 display_unit: "\u{00b5}s",
3646 accessor: |_| None,
3647 },
3648 // Per-phase achieved-RPS distribution (PLIST_FOR_RPS = 20/50/90, schbench.c:130)
3649 // + min/max (schbench.c:579 stderr footer + :713-714/:1963 JSON — parity with
3650 // what schbench emits, not an extension). HigherBetter (more requests/sec = more
3651 // throughput) — note min/max INVERT the latency polarity (a higher worst-second
3652 // rate is better). A per-second RATE spanning tens..tens-of-thousands, so
3653 // rel-dominant (rel 0.10) with a near-idle abs floor (10) — NOT loop_count's
3654 // count-style abs 1/rel 0.30.
3655 //
3656 // rps min/max keep the percentile-tier rel (0.10), NOT the loosened latency-max
3657 // tier (0.50): each rps sample is a 1-second-AVERAGED rate (cycles completed that
3658 // second), not a single event like a latency sample, so an rps extreme is the
3659 // worst/best SECOND — far less flaky than a latency per-request peak, and the
3660 // worst-second is a meaningful scheduler-tail signal worth a tight gate. (The
3661 // latency-max 0.50 loosening guards single-request spikes that do not exist in the
3662 // 1s-averaged rps series.)
3663 //
3664 // rps_min is UNRELIABLE when any 0-rps (starvation) second occurs: TWO independent
3665 // paths drop a real 0 from the min. (1) Within a histogram, add_lat's min==0
3666 // sentinel (plat.rs `if min==0 || us<min`) treats 0 as "unset" — a 0 sets min=0 but
3667 // the next sample replaces it (e.g. [100,0,200] -> min=200), so min reads 0 only
3668 // when a 0 is the final sample added. (2) Across cgroups, PlatStats::combine's
3669 // `other.min != 0` guard (plat.rs:230 — correct for latency, where 0 means empty)
3670 // skips a starved cgroup's min=0 when pooling, so a 0-rps cgroup pooled with a
3671 // nonzero one leaves rps_min nonzero. rps_min is thus a trustworthy worst-second
3672 // floor only absent 0-seconds. Sustained starvation (0-seconds >= 20% of the
3673 // window) still shows in rps_p20, which reads the pooled histogram's bucket 0
3674 // (folded unconditionally, plat.rs:223-224) in both cases; a single 0-second in a longer
3675 // window is below p20 and lost from rps_min — invisible to both. Faithful to
3676 // schbench's add_lat min sentinel.
3677 MetricDef {
3678 name: SCHBENCH_RPS_P20,
3679 polarity: crate::test_support::Polarity::HigherBetter,
3680 kind: MetricKind::PerPhase,
3681 default_abs: 10.0,
3682 default_rel: 0.10,
3683 display_unit: "req/s",
3684 accessor: |_| None,
3685 },
3686 MetricDef {
3687 name: SCHBENCH_RPS_P50,
3688 polarity: crate::test_support::Polarity::HigherBetter,
3689 kind: MetricKind::PerPhase,
3690 default_abs: 10.0,
3691 default_rel: 0.10,
3692 display_unit: "req/s",
3693 accessor: |_| None,
3694 },
3695 MetricDef {
3696 name: SCHBENCH_RPS_P90,
3697 polarity: crate::test_support::Polarity::HigherBetter,
3698 kind: MetricKind::PerPhase,
3699 default_abs: 10.0,
3700 default_rel: 0.10,
3701 display_unit: "req/s",
3702 accessor: |_| None,
3703 },
3704 MetricDef {
3705 name: SCHBENCH_RPS_MIN,
3706 polarity: crate::test_support::Polarity::HigherBetter,
3707 kind: MetricKind::PerPhase,
3708 default_abs: 10.0,
3709 default_rel: 0.10,
3710 display_unit: "req/s",
3711 accessor: |_| None,
3712 },
3713 MetricDef {
3714 name: SCHBENCH_RPS_MAX,
3715 polarity: crate::test_support::Polarity::HigherBetter,
3716 kind: MetricKind::PerPhase,
3717 default_abs: 10.0,
3718 default_rel: 0.10,
3719 display_unit: "req/s",
3720 accessor: |_| None,
3721 },
3722 // schbench WHOLE-RUN distributional metrics (MetricKind::
3723 // PerRunDistribution): each per-phase percentile/min/max above, re-pooled
3724 // run-level by populate_run_pooled_schbench_distribution (union of the
3725 // per-phase per-cgroup PlatStats histograms, percentile re-derived over the
3726 // union — the faithful percentile-of-union). Noise-compared per-run (never
3727 // cross-run folded). Thresholds + polarity + unit mirror the per-phase
3728 // sibling. accessor |_| None (ext-only, written by the union populate).
3729 MetricDef {
3730 name: SCHBENCH_WAKEUP_P50_US_WHOLE,
3731 polarity: crate::test_support::Polarity::LowerBetter,
3732 kind: MetricKind::PerRunDistribution,
3733 default_abs: 20.0,
3734 default_rel: 0.25,
3735 display_unit: "\u{00b5}s",
3736 accessor: |_| None,
3737 },
3738 MetricDef {
3739 name: SCHBENCH_WAKEUP_P90_US_WHOLE,
3740 polarity: crate::test_support::Polarity::LowerBetter,
3741 kind: MetricKind::PerRunDistribution,
3742 default_abs: 20.0,
3743 default_rel: 0.25,
3744 display_unit: "\u{00b5}s",
3745 accessor: |_| None,
3746 },
3747 MetricDef {
3748 name: SCHBENCH_WAKEUP_P99_US_WHOLE,
3749 polarity: crate::test_support::Polarity::LowerBetter,
3750 kind: MetricKind::PerRunDistribution,
3751 default_abs: 50.0,
3752 default_rel: 0.25,
3753 display_unit: "\u{00b5}s",
3754 accessor: |_| None,
3755 },
3756 MetricDef {
3757 name: SCHBENCH_WAKEUP_P999_US_WHOLE,
3758 polarity: crate::test_support::Polarity::LowerBetter,
3759 kind: MetricKind::PerRunDistribution,
3760 default_abs: 50.0,
3761 default_rel: 0.25,
3762 display_unit: "\u{00b5}s",
3763 accessor: |_| None,
3764 },
3765 MetricDef {
3766 name: SCHBENCH_WAKEUP_MIN_US_WHOLE,
3767 polarity: crate::test_support::Polarity::LowerBetter,
3768 kind: MetricKind::PerRunDistribution,
3769 default_abs: 20.0,
3770 default_rel: 0.25,
3771 display_unit: "\u{00b5}s",
3772 accessor: |_| None,
3773 },
3774 MetricDef {
3775 name: SCHBENCH_WAKEUP_MAX_US_WHOLE,
3776 polarity: crate::test_support::Polarity::LowerBetter,
3777 kind: MetricKind::PerRunDistribution,
3778 default_abs: 50.0,
3779 default_rel: 0.50,
3780 display_unit: "\u{00b5}s",
3781 accessor: |_| None,
3782 },
3783 MetricDef {
3784 name: SCHBENCH_REQUEST_P50_US_WHOLE,
3785 polarity: crate::test_support::Polarity::LowerBetter,
3786 kind: MetricKind::PerRunDistribution,
3787 default_abs: 20.0,
3788 default_rel: 0.25,
3789 display_unit: "\u{00b5}s",
3790 accessor: |_| None,
3791 },
3792 MetricDef {
3793 name: SCHBENCH_REQUEST_P90_US_WHOLE,
3794 polarity: crate::test_support::Polarity::LowerBetter,
3795 kind: MetricKind::PerRunDistribution,
3796 default_abs: 20.0,
3797 default_rel: 0.25,
3798 display_unit: "\u{00b5}s",
3799 accessor: |_| None,
3800 },
3801 MetricDef {
3802 name: SCHBENCH_REQUEST_P99_US_WHOLE,
3803 polarity: crate::test_support::Polarity::LowerBetter,
3804 kind: MetricKind::PerRunDistribution,
3805 default_abs: 50.0,
3806 default_rel: 0.25,
3807 display_unit: "\u{00b5}s",
3808 accessor: |_| None,
3809 },
3810 MetricDef {
3811 name: SCHBENCH_REQUEST_P999_US_WHOLE,
3812 polarity: crate::test_support::Polarity::LowerBetter,
3813 kind: MetricKind::PerRunDistribution,
3814 default_abs: 50.0,
3815 default_rel: 0.25,
3816 display_unit: "\u{00b5}s",
3817 accessor: |_| None,
3818 },
3819 MetricDef {
3820 name: SCHBENCH_REQUEST_MIN_US_WHOLE,
3821 polarity: crate::test_support::Polarity::LowerBetter,
3822 kind: MetricKind::PerRunDistribution,
3823 default_abs: 20.0,
3824 default_rel: 0.25,
3825 display_unit: "\u{00b5}s",
3826 accessor: |_| None,
3827 },
3828 MetricDef {
3829 name: SCHBENCH_REQUEST_MAX_US_WHOLE,
3830 polarity: crate::test_support::Polarity::LowerBetter,
3831 kind: MetricKind::PerRunDistribution,
3832 default_abs: 50.0,
3833 default_rel: 0.50,
3834 display_unit: "\u{00b5}s",
3835 accessor: |_| None,
3836 },
3837 MetricDef {
3838 name: SCHBENCH_RPS_P20_WHOLE,
3839 polarity: crate::test_support::Polarity::HigherBetter,
3840 kind: MetricKind::PerRunDistribution,
3841 default_abs: 10.0,
3842 default_rel: 0.10,
3843 display_unit: "req/s",
3844 accessor: |_| None,
3845 },
3846 MetricDef {
3847 name: SCHBENCH_RPS_P50_WHOLE,
3848 polarity: crate::test_support::Polarity::HigherBetter,
3849 kind: MetricKind::PerRunDistribution,
3850 default_abs: 10.0,
3851 default_rel: 0.10,
3852 display_unit: "req/s",
3853 accessor: |_| None,
3854 },
3855 MetricDef {
3856 name: SCHBENCH_RPS_P90_WHOLE,
3857 polarity: crate::test_support::Polarity::HigherBetter,
3858 kind: MetricKind::PerRunDistribution,
3859 default_abs: 10.0,
3860 default_rel: 0.10,
3861 display_unit: "req/s",
3862 accessor: |_| None,
3863 },
3864 MetricDef {
3865 name: SCHBENCH_RPS_MIN_WHOLE,
3866 polarity: crate::test_support::Polarity::HigherBetter,
3867 kind: MetricKind::PerRunDistribution,
3868 default_abs: 10.0,
3869 default_rel: 0.10,
3870 display_unit: "req/s",
3871 accessor: |_| None,
3872 },
3873 MetricDef {
3874 name: SCHBENCH_RPS_MAX_WHOLE,
3875 polarity: crate::test_support::Polarity::HigherBetter,
3876 kind: MetricKind::PerRunDistribution,
3877 default_abs: 10.0,
3878 default_rel: 0.10,
3879 display_unit: "req/s",
3880 accessor: |_| None,
3881 },
3882 // -- Per-cgroup per-phase NON-schbench families. PerPhase, `accessor:
3883 // |_| None`: read from `PhaseCgroupStats::metrics` by name (written by
3884 // `write_carrier_scalars`), never from a `GauntletRow`. BARE per-cgroup
3885 // names (NOT the run-level `worst_*`): a single cgroup's value is not a
3886 // "worst across cgroups", and reusing `worst_*` would make the name collide
3887 // in `metric_def` with the run-level selector. Thresholds mirror the
3888 // analogous `worst_*` entries. (`iterations_per_cpu_sec` is intentionally
3889 // absent — it is already a Rate entry above; the per-cgroup value resolves
3890 // through that name without a second registration.)
3891 MetricDef {
3892 name: "p99_wake_latency_us",
3893 polarity: crate::test_support::Polarity::LowerBetter,
3894 kind: MetricKind::PerPhase,
3895 default_abs: 50.0,
3896 default_rel: 0.25,
3897 display_unit: "\u{00b5}s",
3898 accessor: |_| None,
3899 },
3900 MetricDef {
3901 name: "median_wake_latency_us",
3902 polarity: crate::test_support::Polarity::LowerBetter,
3903 kind: MetricKind::PerPhase,
3904 default_abs: 20.0,
3905 default_rel: 0.25,
3906 display_unit: "\u{00b5}s",
3907 accessor: |_| None,
3908 },
3909 MetricDef {
3910 name: "wake_latency_cv",
3911 polarity: crate::test_support::Polarity::LowerBetter,
3912 kind: MetricKind::PerPhase,
3913 default_abs: 0.10,
3914 default_rel: 0.25,
3915 display_unit: "",
3916 accessor: |_| None,
3917 },
3918 MetricDef {
3919 // Per-cgroup per-phase timer-latency (WorkType::TimerLatency). PerPhase,
3920 // accessor |_| None: read from PhaseCgroupStats::metrics by name
3921 // (written by write_carrier_scalars). Bare name (not worst_*) — a single
3922 // cgroup's value, not a worst-across-cgroups.
3923 name: "p99_timer_latency_us",
3924 polarity: crate::test_support::Polarity::LowerBetter,
3925 kind: MetricKind::PerPhase,
3926 default_abs: 50.0,
3927 default_rel: 0.25,
3928 display_unit: "\u{00b5}s",
3929 accessor: |_| None,
3930 },
3931 MetricDef {
3932 name: "median_timer_latency_us",
3933 polarity: crate::test_support::Polarity::LowerBetter,
3934 kind: MetricKind::PerPhase,
3935 default_abs: 20.0,
3936 default_rel: 0.25,
3937 display_unit: "\u{00b5}s",
3938 accessor: |_| None,
3939 },
3940 MetricDef {
3941 name: "p999_timer_latency_us",
3942 polarity: crate::test_support::Polarity::LowerBetter,
3943 kind: MetricKind::PerPhase,
3944 default_abs: 100.0,
3945 default_rel: 0.25,
3946 display_unit: "\u{00b5}s",
3947 accessor: |_| None,
3948 },
3949 MetricDef {
3950 name: "mean_run_delay_us",
3951 polarity: crate::test_support::Polarity::LowerBetter,
3952 kind: MetricKind::PerPhase,
3953 default_abs: 50.0,
3954 default_rel: 0.25,
3955 display_unit: "\u{00b5}s",
3956 accessor: |_| None,
3957 },
3958 MetricDef {
3959 name: "max_run_delay_us",
3960 polarity: crate::test_support::Polarity::LowerBetter,
3961 kind: MetricKind::PerPhase,
3962 default_abs: 100.0,
3963 default_rel: 0.50,
3964 display_unit: "\u{00b5}s",
3965 accessor: |_| None,
3966 },
3967 MetricDef {
3968 name: "avg_off_cpu_pct",
3969 polarity: crate::test_support::Polarity::LowerBetter,
3970 kind: MetricKind::PerPhase,
3971 default_abs: 5.0,
3972 default_rel: 0.25,
3973 display_unit: "%",
3974 accessor: |_| None,
3975 },
3976 MetricDef {
3977 name: "min_off_cpu_pct",
3978 polarity: crate::test_support::Polarity::LowerBetter,
3979 kind: MetricKind::PerPhase,
3980 default_abs: 5.0,
3981 default_rel: 0.25,
3982 display_unit: "%",
3983 accessor: |_| None,
3984 },
3985 MetricDef {
3986 name: "max_off_cpu_pct",
3987 polarity: crate::test_support::Polarity::LowerBetter,
3988 kind: MetricKind::PerPhase,
3989 default_abs: 5.0,
3990 default_rel: 0.25,
3991 display_unit: "%",
3992 accessor: |_| None,
3993 },
3994 MetricDef {
3995 name: "off_cpu_spread_pct",
3996 polarity: crate::test_support::Polarity::LowerBetter,
3997 kind: MetricKind::PerPhase,
3998 default_abs: 5.0,
3999 default_rel: 0.25,
4000 display_unit: "%",
4001 accessor: |_| None,
4002 },
4003 MetricDef {
4004 name: "migration_ratio",
4005 polarity: crate::test_support::Polarity::LowerBetter,
4006 kind: MetricKind::PerPhase,
4007 default_abs: 0.05,
4008 default_rel: 0.20,
4009 display_unit: "",
4010 accessor: |_| None,
4011 },
4012 MetricDef {
4013 name: "iterations_per_worker",
4014 polarity: crate::test_support::Polarity::HigherBetter,
4015 kind: MetricKind::PerPhase,
4016 default_abs: 10.0,
4017 default_rel: 0.10,
4018 display_unit: "",
4019 accessor: |_| None,
4020 },
4021 MetricDef {
4022 name: "page_locality",
4023 polarity: crate::test_support::Polarity::HigherBetter,
4024 kind: MetricKind::PerPhase,
4025 default_abs: 0.05,
4026 default_rel: 0.10,
4027 display_unit: "",
4028 accessor: |_| None,
4029 },
4030 MetricDef {
4031 name: "cross_node_migration_ratio",
4032 polarity: crate::test_support::Polarity::LowerBetter,
4033 kind: MetricKind::PerPhase,
4034 default_abs: 0.05,
4035 default_rel: 0.20,
4036 display_unit: "",
4037 accessor: |_| None,
4038 },
4039 // Per-cgroup carrier counter (read via `cgroup_counter` /
4040 // `cgroup_counter_total`), Counter kind, `accessor: |_| None` (no
4041 // GauntletRow field). Mirrors `total_iterations`' HigherBetter polarity.
4042 MetricDef {
4043 name: "total_cpu_time_ns",
4044 polarity: crate::test_support::Polarity::HigherBetter,
4045 kind: MetricKind::Counter,
4046 default_abs: 100.0,
4047 default_rel: 0.10,
4048 display_unit: "ns",
4049 accessor: |_| None,
4050 },
4051];
4052
/// Minimum total iterations a run must have accumulated before the
/// `worst_wake_latency_tail_ratio` metric participates in regression
/// math.
///
/// Below this threshold the p99 / median ratio is dominated by a
/// handful of outlier samples rather than a distributional signal:
/// p99 on an N-sample set where `N < 100` collapses to approximately
/// `samples.max()` (the empirical p99 sits at the Nth item of a
/// sorted set, rounded down, so with N=10 every "p99" is in fact the
/// maximum), and the ratio `max/median` swings by an order of magnitude
/// across runs that differ only in which worker happened to hit a
/// scheduling stall. `compare_rows` would report those swings as
/// regressions / improvements, burying real signal under low-N noise.
///
/// 100 is the threshold of interest because percentile estimation
/// stabilizes when the sample count crosses `1 / (1 - target_p)` —
/// i.e. 100 samples for a p99 — which is the point at which the
/// expected number of samples in the 99th-percentile tail reaches one.
/// At the floor itself the p99 estimator is still the observed
/// maximum (`samples[99]` when N is exactly 100), and the tail is
/// still sparse at N just above 100. Only as N grows past 100 does the
/// ratio begin to reflect actual tail behavior rather than
/// single-sample extrema.
///
/// The gate uses `total_iterations` (scenario-wide sum across every
/// cgroup in the run) as a coarse floor, not an exact per-cgroup
/// sample count. That sum OVERESTIMATES the per-cgroup iteration
/// count when the scenario has multiple cgroups sharing load, so a
/// scenario whose total just clears the floor may still have
/// individual cgroups with fewer than 100 iterations and therefore
/// noisy per-cgroup tail ratios. The floor is a minimum-viable
/// filter against the lowest-N degeneracy, not a guarantee that
/// every cgroup in a passing row has a stable p99.
///
/// The gate is applied at the PRODUCER, not an accessor:
/// `crate::assert::populate_run_distribution_metrics` emits no
/// `worst_wake_latency_tail_ratio` ext key for a run with
/// `total_iterations < WAKE_LATENCY_TAIL_RATIO_MIN_ITERATIONS`. The absent key
/// is excluded from the cross-RUN mean and read as `None` by `compare_rows`,
/// where the `(None, None)` arm skips the pair entirely (no verdict, no
/// coverage diff) when the key is absent on both sides.
pub const WAKE_LATENCY_TAIL_RATIO_MIN_ITERATIONS: u64 = 100;
4094
4095/// Look up a metric definition by name.
4096pub fn metric_def(name: &str) -> Option<&'static MetricDef> {
4097 METRICS.iter().find(|m| m.name == name)
4098}
4099
/// Names of rate-COMPONENT metrics that are hidden from compare OUTPUT (both
/// the scalar findings and the noise per-phase spread + coverage rows).
///
/// These are the internal numerator/denominator Counters of the derived
/// rates — `iteration_rate`
/// (`total_phase_iterations` / `total_phase_duration_sec`) and the pooled
/// `iterations_per_cpu_sec` (`total_iterations_pooled` / `total_cpu_time_sec`) —
/// and emitting them alongside their rate is redundant: three rows for one
/// user-facing concept.
///
/// They are suppressed ONLY at the compare-render layer. They REMAIN in the
/// persisted sidecar, in `GauntletRow::ext_metrics`, and in
/// `PhaseBucket::metrics`, because the cross-RUN re-pool
/// ([`group_and_average_by`]) re-derives the rates as `Σnum / Σdenom` from these
/// components read out of the rows — stripping them from storage would break
/// rate aggregation. The two user-facing rates and the typed `total_iterations`
/// are NOT suppressed.
///
/// The `default_abs`/`default_rel` thresholds of the suppressed entries are
/// inert while suppressed — the compare significance gate never reads them —
/// but the entries keep their registry slot: `name` is the re-pool component
/// key and `kind` drives the fold dispatch.
const RENDER_SUPPRESSED_COMPONENTS: &[&str] = &[
    "total_phase_iterations",
    "total_phase_duration_sec",
    "total_iterations_pooled",
    "total_cpu_time_sec",
    // taobench whole-run qps / hit_fraction Rate components (the raw op counts +
    // wall window). Suppressed so compare shows the four `taobench_*` Rates, not
    // the redundant raw counts. Remain in the sidecar / row for the cross-RUN
    // Σnum/Σdenom re-pool, like the iterations components above.
    TOTAL_TAOBENCH_OPS,
    TOTAL_TAOBENCH_FAST_OPS,
    TOTAL_TAOBENCH_SLOW_OPS,
    TOTAL_TAOBENCH_WALL_SEC,
    // taobench command-time hit Rate components (get_cmds + get_hits): suppressed
    // so compare shows `taobench_command_hit_rate`, not the raw get counts; remain
    // in the row for the cross-RUN Σhits/Σcmds re-pool.
    TOTAL_TAOBENCH_GET_CMDS,
    TOTAL_TAOBENCH_GET_HITS,
    // schbench role-separate run-delay gate-Rate components (raw run_delay_ns +
    // pcount per role). Suppressed so compare shows the two
    // `schbench_*_run_delay_ns_per_sched` Rates, not the raw Σ pairs. Remain in
    // the row for the cross-RUN Σ/Σ re-pool. (total_schbench_loops is NOT here —
    // it is a standalone throughput Counter, not a rate component.)
    TOTAL_SCHBENCH_MSG_RUN_DELAY_NS,
    TOTAL_SCHBENCH_MSG_PCOUNT,
    TOTAL_SCHBENCH_WORKER_RUN_DELAY_NS,
    TOTAL_SCHBENCH_WORKER_PCOUNT,
];
4146
4147/// True when `name` is a Rate component suppressed from compare output (see
4148/// the private `RENDER_SUPPRESSED_COMPONENTS` list).
4149pub fn is_render_suppressed_component(name: &str) -> bool {
4150 RENDER_SUPPRESSED_COMPONENTS.contains(&name)
4151}
4152
/// Infer the regression polarity (`higher_is_worse`) of a metric
/// not present in [`METRICS`].
///
/// Used by [`crate::assert::AssertResult::merge`] when it folds an
/// `ext_metrics` value whose name is not registered. Returning the
/// wrong polarity here surfaces as a silent merge bug: a
/// throughput-shaped metric (`*_iops`, `*_throughput`) folded with
/// `max` keeps the BETTER value across cgroups instead of the
/// worst, masking the cgroup that fell behind. The previous
/// fallback (`unwrap_or(true)` — always max) had this exact bug
/// for any payload-author metric whose name was not pre-registered
/// in the static `METRICS` table.
///
/// The inference is name-based, in the style of the
/// `Polarity::Unknown` fallback used by `MetricHint`. The token
/// lists mirror the polarity choices in [`METRICS`] for the
/// metrics already registered there:
///
/// - Tokens that signal HigherBetter (returned `false`):
///   `iops`, `throughput`, `bandwidth`, `iterations`, `ops_per_sec`,
///   `locality`, `_score`, `goodput`. `total_iterations` and
///   `worst_iterations_per_worker` resolve this way, and so does a
///   payload-author metric like `jobs.0.read.iops`.
/// - Substring tokens that signal LowerBetter (returned `true`):
///   `latency`, `delay`, `_gap`, `stall`, `stuck`, `_cv`, `error`,
///   `fail`, `drop`, `spread`, `migration_ratio`, `imbalance`,
///   `_depth`, `dsq`. These are the polarity signals of names such as
///   `worst_p99_wake_latency_us`, `worst_run_delay_us`, `worst_gap_ms`,
///   `stuck_count`, `worst_wake_latency_cv`, `worst_spread`,
///   `worst_migration_ratio`, `max_imbalance_ratio`.
/// - Time-unit tokens that signal LowerBetter (returned `true`):
///   `_us`, `_ms`, `_ns`. Unlike the tokens above these match only
///   when NOT immediately followed by an ASCII letter, i.e. at the end
///   of the name or before `_`, `.`, a digit, etc. A raw substring
///   match would also fire inside `_msg`, `_used`, `_user`, `_usage`
///   or `_nsamples` and flip a throughput-shaped name such as
///   `schbench_msg_throughput` or `bandwidth_used` to higher-is-worse —
///   the silent merge bug described above.
///
/// When a name matches no token (e.g. `bogo_ops`, `read_kb`,
/// `jobs.0.runtime`), returns `true` (LowerBetter). The fallback
/// is conservative for regression detection: surfacing the maximum
/// across cgroups is more useful than silently keeping the minimum.
/// Authors who need a different default should register
/// a [`MetricDef`] in [`METRICS`] or tag the metric via
/// [`crate::test_support::MetricHint`].
///
/// Check order matters when a name carries both kinds of signal: the
/// higher-is-worse tokens are tested first, so the hypothetical
/// `low_iops_latency_ms` is classified as higher-is-worse.
pub fn infer_higher_is_worse(name: &str) -> bool {
    /// True when `unit` occurs in `name` at a position where the next
    /// character is not an ASCII letter (or where the name ends).
    fn has_unit_token(name: &str, unit: &str) -> bool {
        name.match_indices(unit).any(|(start, matched)| {
            !name[start + matched.len()..].starts_with(|c: char| c.is_ascii_alphabetic())
        })
    }

    // First pass: explicit "higher value is the regression" signals.
    // Checked before the higher-is-better tokens so a name carrying
    // both (rare; e.g. `*_iops_latency_us`) resolves to the latency
    // interpretation.
    const HIGHER_IS_WORSE_TOKENS: &[&str] = &[
        "latency",
        "delay",
        "_gap",
        "stall",
        "stuck",
        "_cv",
        "error",
        "fail",
        "drop",
        "spread",
        "migration_ratio",
        "imbalance",
        // DSQ depth is "lower is better" — a shallower queue means the
        // scheduler is keeping up. `_depth` catches names like
        // `max_dsq_depth` / `avg_dsq_depth`; `dsq` is the defensive
        // fallback for DSQ-related names without `_depth`
        // (e.g. `dsq_overflow_count`).
        //
        // False-positive caveat: a metric named `dsq_throughput` /
        // `dsq_iops` / `cache_depth` / `tree_depth` would be classified
        // higher-is-worse here when the truth may be the opposite.
        // Register every new dsq-or-depth metric in METRICS so this
        // token-based inference never runs for it.
        "_depth",
        "dsq",
    ];
    // Time-unit suffixes: a value measured in µs / ms / ns is a duration,
    // and a longer duration is the regression. Matched on a word boundary
    // (see `has_unit_token`) so `_msg`, `_used`, `_nsamples`, ... do not
    // count as units.
    const HIGHER_IS_WORSE_UNIT_TOKENS: &[&str] = &["_us", "_ms", "_ns"];

    let signals_worse = HIGHER_IS_WORSE_TOKENS.iter().any(|t| name.contains(t))
        || HIGHER_IS_WORSE_UNIT_TOKENS
            .iter()
            .any(|unit| has_unit_token(name, unit));
    if signals_worse {
        return true;
    }

    // Second pass: "higher value is the improvement" signals. A match
    // yields `false`, i.e. the minimum is the worst-case fold.
    const HIGHER_IS_BETTER_TOKENS: &[&str] = &[
        "iops",
        "throughput",
        "bandwidth",
        "iterations",
        "ops_per_sec",
        "locality",
        "_score",
        "goodput",
    ];

    // No higher-is-better token means the conservative fallback applies:
    // treat the metric as higher-is-worse so the folded value is the
    // maximum across cgroups.
    !HIGHER_IS_BETTER_TOKENS.iter().any(|t| name.contains(t))
}
4276/// Render the [`METRICS`] registry for `cargo ktstr stats list-metrics`.
4277///
4278/// `json=false` renders a comfy-table with one row per registered
4279/// metric and columns NAME / POLARITY / DEFAULT_ABS / DEFAULT_REL
4280/// / UNIT. `json=true` emits `serde_json::to_string_pretty`
4281/// on the whole [`METRICS`] slice — the `accessor` fn-pointer is
4282/// `#[serde(skip)]` so the array carries only wire-stable fields.
4283///
4284/// Iteration order equals [`METRICS`] declaration order (the
4285/// canonical surface order for sidecar / CI-gate consumers).
4286///
4287/// The return is owned `String` rather than a print-direct helper so
4288/// callers can pin output via `assert_eq!` in tests; the cargo-ktstr
4289/// dispatch arm at `run_stats` writes it to stdout verbatim.
4290pub fn list_metrics(json: bool) -> anyhow::Result<String> {
4291 if json {
4292 return serde_json::to_string_pretty(METRICS)
4293 .map_err(|e| anyhow::anyhow!("serialize METRICS to JSON: {e}"));
4294 }
4295
4296 let mut table = crate::cli::new_table();
4297 table.set_header(vec![
4298 "NAME",
4299 "POLARITY",
4300 "DEFAULT_ABS",
4301 "DEFAULT_REL",
4302 "UNIT",
4303 ]);
4304 for m in METRICS {
4305 table.add_row(vec![
4306 m.name.to_string(),
4307 polarity_label(m.polarity),
4308 format!("{}", m.default_abs),
4309 format!("{}", m.default_rel),
4310 m.display_unit.to_string(),
4311 ]);
4312 }
4313 Ok(format!("{table}\n"))
4314}
4315
4316/// Short human label for a [`Polarity`](crate::test_support::Polarity)
4317/// variant in the list-metrics table.
4318///
4319/// `HigherBetter` → `higher`, `LowerBetter` → `lower`,
4320/// `TargetValue(t)` → `target(t)`, `Unknown` → `unknown`. Match is
4321/// total; adding a new `Polarity` variant without extending this
4322/// rendering surfaces as a compile error.
4323fn polarity_label(p: crate::test_support::Polarity) -> String {
4324 use crate::test_support::Polarity;
4325 match p {
4326 Polarity::HigherBetter => "higher".to_string(),
4327 Polarity::LowerBetter => "lower".to_string(),
4328 Polarity::TargetValue(t) => format!("target({t})"),
4329 Polarity::Unknown => "unknown".to_string(),
4330 Polarity::Informational => "informational".to_string(),
4331 }
4332}