ktstr/ctprof_compare/
metrics.rs

1//! Metric registry — the master catalog the comparison pipeline
2//! parameterizes itself over.
3//!
4//! Two registries live here:
5//!
6//! 1. [`CTPROF_METRICS`] — array of [`CtprofMetricDef`] entries,
7//!    one per primary metric. Each entry pairs a name with an
8//!    [`super::AggRule`] (typed reduction over a thread bucket),
9//!    scheduler-class scope, kernel CONFIG gating, dead-counter
10//!    flag, operator-facing description, and rendered-section tag.
11//!    Order of entries IS load-bearing — it's the default
12//!    display order for rows that have no numeric delta to sort by.
13//!
14//! 2. [`CTPROF_DERIVED_METRICS`] — array of [`DerivedMetricDef`]
15//!    entries, one per derived metric (ratio, average, signed
16//!    difference). Each entry consumes already-aggregated input
17//!    metrics from a group's metrics map and produces a single
18//!    [`DerivedValue`] scalar with its own scale ladder. The
19//!    helpers [`input_scalar`], [`ratio_compute`], and
20//!    [`ratio_of_sum_compute`] are private to this module and
21//!    feed the closures stored in each entry's `compute` field.
22//!
23//! [`metric_display_name`] and [`metric_tags`] are pure formatters
24//! for the metric-list rendering path; they take a [`CtprofMetricDef`]
25//! and return the user-visible name + bracketed tag suffix.
26//!
27//! **PSI is intentionally NOT in this registry.** Each
28//! [`super::AggRule`] variant's accessor takes
29//! `&crate::ctprof::ThreadState` and returns a
30//! [`crate::metric_types`] newtype (or a primitive the dispatch
31//! coerces via `to_string()` for `ModeChar` / `ModeBool`); only
32//! per-thread data fits that signature, while Pressure Stall
33//! Information is per-snapshot (host-level) and per-cgroup. PSI
34//! surfaces in dedicated secondary tables under
35//! `## Host pressure / ...` and `## Pressure / ...` headers,
36//! rendered by [`super::write_diff`] / `write_show` directly
37//! rather than via [`super::AggRule`].
38
39use std::collections::BTreeMap;
40
41use super::{AggRule, Aggregated, ScaleLadder, Section};
42
43/// One metric exposed by the comparison pipeline.
44///
45/// The auto-scale ladder for the rendered cell is derived from
46/// [`AggRule::ladder`] at render time — there is no separate
47/// `unit` tag on the metric def. A registry entry that pairs an
48/// AggRule variant with a category-mismatched ladder fails at
49/// compile time (the ladder mapping is a closed match on the
50/// variant, not a free-form string).
///
/// Marked `#[non_exhaustive]`: code outside the defining crate
/// cannot build a value with struct-literal syntax, so new fields
/// can be added without breaking downstream users.
51#[derive(Debug, Clone, Copy)]
52#[non_exhaustive]
53pub struct CtprofMetricDef {
    /// Metric name as rendered in tables. The same ASCII short
    /// form is used on the capture side — there is no translation
    /// layer between the two.
54    pub name: &'static str,
    /// Typed reduction applied over a thread bucket to produce the
    /// group's value. Also selects the auto-scale ladder for the
    /// rendered cell via [`AggRule::ladder`].
55    pub rule: AggRule,
56    /// Scheduler-class scope for the metric. `None` means
57    /// class-agnostic — every task class accumulates the value
58    /// (e.g. `nr_migrations`). Concrete spellings:
59    /// - `"cfs-only"` — incremented strictly inside CFS-class
60    ///   call paths (`kernel/sched/fair.c`), zero under
61    ///   SCHED_EXT / SCHED_FIFO / SCHED_RR / SCHED_DEADLINE /
62    ///   SCHED_IDLE. Examples: `nr_wakeups_affine`,
63    ///   `nr_wakeups_affine_attempts`, `nr_failed_migrations_*`,
64    ///   `nr_forced_migrations`, `slice_max`.
65    /// - `"fair-policy"` — emitted only when
66    ///   `fair_policy(p->policy)` returns true. Per
67    ///   `kernel/sched/sched.h:194,203`, that admits
68    ///   SCHED_NORMAL, SCHED_BATCH, AND SCHED_EXT (under
69    ///   CONFIG_SCHED_CLASS_EXT). Zero under SCHED_FIFO/RR/DL/IDLE.
70    ///   Example: `fair_slice_ns`.
71    /// - `"non-ext"` — written by the schedstats sleep/wait
72    ///   family wrappers `__update_stats_enqueue_sleeper`
73    ///   (kernel/sched/stats.c:48) and `__update_stats_wait_end`
74    ///   (kernel/sched/stats.c:21), called from fair.c, rt.c,
75    ///   deadline.c but NOT ext.c — i.e. CFS/RT/DL accumulate,
76    ///   sched_ext bypasses. Examples: `wait_sum`, `wait_count`,
77    ///   `wait_max`, `voluntary_sleep_ns`, `sleep_max`,
78    ///   `block_sum`, `block_max`, `iowait_sum`, `iowait_count`.
79    pub sched_class: Option<&'static str>,
80    /// Kernel CONFIG options that gate the metric. `&[]` means
81    /// no gating (always populated when the source path runs).
82    /// One element typically; multi-element when more than one
83    /// gate is required (e.g. `core_forceidle_sum` requires
84    /// CONFIG_SCHED_CORE AND CONFIG_SCHEDSTATS). Concrete
85    /// spellings match the literal `Kconfig` symbol so an
86    /// operator can `grep CONFIG_X /boot/config-$(uname -r)` to
87    /// confirm. Verified gates:
88    /// - `"CONFIG_SCHEDSTATS"` — gates every `__schedstat_*` /
89    ///   `schedstat_*` macro call. Off → the macro is
90    ///   `do { } while (0)` per `kernel/sched/stats.h:75-82`.
91    /// - `"CONFIG_SCHED_INFO"` — gates the lighter-weight
92    ///   `sched_info_*` accounting (`run_time_ns`,
93    ///   `wait_time_ns`, `timeslices`); the schedstat file is
94    ///   gated by `sched_info_on()` at
95    ///   `proc_pid_schedstat` (fs/proc/base.c:511-523).
96    /// - `"CONFIG_SCHED_CORE"` — gates the core-scheduling
97    ///   subsystem (`__account_forceidle_time`).
98    /// - `"CONFIG_SCHED_CLASS_EXT"` — gates the sched_ext
99    ///   class. When off, no task can land on ext, so
100    ///   `ext_enabled` reads false uniformly.
101    /// - `"CONFIG_TASK_DELAY_ACCT"` — gates the delayacct
102    ///   accounting path that populates the taskstats genetlink
103    ///   delay-family fields (`cpu_delay_*`, `blkio_delay_*`,
104    ///   etc.).
105    /// - `"CONFIG_TASK_IO_ACCOUNTING"` — gates the per-task
106    ///   I/O accounting fields exposed by `/proc/<tid>/io`
107    ///   (`rchar`, `wchar`, `syscr`, `syscw`, `read_bytes`,
108    ///   `write_bytes`, `cancelled_write_bytes`). The kernel
109    ///   emits all 7 fields under one `do_io_accounting` call,
110    ///   and CONFIG_TASK_IO_ACCOUNTING `depends on`
111    ///   CONFIG_TASK_XACCT in `init/Kconfig` — so from the
112    ///   procfs-reader perspective the file is all-or-nothing.
113    pub config_gates: &'static [&'static str],
114    /// True for kernel counters that are exposed in `/proc`
115    /// but never incremented anywhere in the kernel tree —
116    /// always reads zero. Operators reading the rendered table
117    /// see the `[dead]` flag and stop chasing the always-zero
118    /// cell. The registry is currently empty of `is_dead: true`
119    /// entries: the previously-registered dead counters
120    /// (`nr_wakeups_idle`, `nr_wakeups_passive`,
121    /// `nr_migrations_cold`) were dropped from `ThreadState`
122    /// and the registry; the kernel still emits the lines, which
123    /// the parser now silently ignores. The flag remains as
124    /// infrastructure: a future kernel that resurrects a dead
125    /// counter (or exposes a new always-zero one) registers
126    /// with `is_dead: true` and the `[dead]` rendering path
127    /// fires.
128    pub is_dead: bool,
129    /// One-line operator-facing description of what this metric
130    /// counts. Surfaced by the `ctprof metric-list`
131    /// subcommand alongside the bracketed tag suffix so an
132    /// operator scanning a rendered table can map an unfamiliar
133    /// metric name to its semantics without leaving the CLI.
134    /// Plain ASCII. "Cumulative" is load-bearing — use it to
135    /// distinguish counters from gauges; the [`AggRule`] only
136    /// names the per-group reduction, not the per-thread
137    /// counter shape.
138    pub description: &'static str,
139    /// Section this metric belongs to for the `--sections`
140    /// per-row filter. Most rows tag [`Section::Primary`];
141    /// taskstats-sourced rows (the eight delay-accounting
142    /// categories plus the two memory watermarks) carry
143    /// [`Section::TaskstatsDelay`] so an operator can scope
144    /// the rendered table down to (or away from) the taskstats
145    /// rows. The primary-table emitter checks
146    /// `DisplayOptions::is_section_enabled` per row before
147    /// rendering — `--sections taskstats-delay` keeps only
148    /// taskstats rows, `--sections primary` excludes them, and
149    /// either alone keeps the primary table open. The default
150    /// (empty filter) renders every row regardless of section.
151    pub section: Section,
152}
153
154/// Registry of per-thread metrics. Order here is the default
155/// display order for rows that have no numeric delta to sort by
156/// (ties fall back to registry order). Names are the ASCII
157/// short-form used in capture code; long-form display is the
158/// same — no translation layer.
159///
160/// **PSI is intentionally not in this registry.** Each
161/// [`AggRule`] variant's accessor takes `&ThreadState` and
162/// returns a [`crate::metric_types`] newtype (or a primitive
163/// the dispatch coerces via `to_string()` for `ModeChar` /
164/// `ModeBool`); only per-thread data fits that signature, while
165/// Pressure Stall Information is per-snapshot (host-level) and
166/// per-cgroup. PSI surfaces in dedicated secondary tables
167/// under "## Host pressure / ..." and "## Pressure / ..."
168/// headers, rendered by `write_diff` / `write_show` directly
169/// rather than via [`AggRule`]. See `Psi` / `PsiResource` /
170/// `PsiHalf` for the data model.
171pub static CTPROF_METRICS: &[CtprofMetricDef] = &[
172    // structural: group population count
173    CtprofMetricDef {
174        name: "thread_count",
175        rule: AggRule::SumCount(|_| crate::metric_types::MonotonicCount(1)),
176        sched_class: None,
177        config_gates: &[],
178        is_dead: false,
179        description: "Number of threads in this group. Each thread contributes 1; the sum is the group population. Useful for --sort-by thread_count:desc to find groups where thread count changed the most.",
180        section: Section::Primary,
181    },
182    // identity / structural (non-numeric aggregation)
183    CtprofMetricDef {
184        name: "policy",
185        rule: AggRule::Mode(|t| t.policy.clone()),
186        sched_class: None,
187        config_gates: &[],
188        is_dead: false,
189        description: "Scheduling policy (SCHED_OTHER, SCHED_FIFO, SCHED_RR, SCHED_BATCH, SCHED_IDLE, SCHED_DEADLINE, SCHED_EXT).",
190        section: Section::Primary,
191    },
192    CtprofMetricDef {
193        name: "nice",
194        rule: AggRule::RangeI32(|t| t.nice),
195        sched_class: None,
196        config_gates: &[],
197        is_dead: false,
198        description: "Nice value (-20..19); CFS priority knob.",
199        section: Section::Primary,
200    },
201    // `task_prio()` value from `/proc/<tid>/stat` field 18.
202    // Per-thread ordinal — aggregate as a range (`AggRule::RangeI32`,
203    // mirroring `nice` directly above), not a sum. Kernel ranges per
204    // `task_prio()` at `kernel/sched/syscalls.c:170`:
205    // CFS=[0..39], RT=[-2..-100], DL=-101 — see the field
206    // doc on [`ThreadState::priority`].
207    CtprofMetricDef {
208        name: "priority",
209        rule: AggRule::RangeI32(|t| t.priority),
210        sched_class: None,
211        config_gates: &[],
212        is_dead: false,
213        description: "Kernel task priority from /proc/<tid>/stat field 18 (CFS=[0..39], RT=[-2..-100], DL=-101).",
214        section: Section::Primary,
215    },
216    // Real-time scheduler priority from `/proc/<tid>/stat`
217    // field 40. Bounded 0..99 in practice (SCHED_FIFO /
218    // SCHED_RR range); zero for non-RT tasks. Aggregated as a
219    // range (`AggRule::RangeU32`) to surface the spread across a
220    // group, like `nice` and `priority`.
221    CtprofMetricDef {
222        name: "rt_priority",
223        rule: AggRule::RangeU32(|t| t.rt_priority),
224        sched_class: None,
225        config_gates: &[],
226        is_dead: false,
227        description: "Real-time scheduler priority (0..99); 0 for non-RT tasks.",
228        section: Section::Primary,
229    },
230    CtprofMetricDef {
231        name: "cpu_affinity",
232        rule: AggRule::Affinity(|t| t.cpu_affinity.clone()),
233        sched_class: None,
234        config_gates: &[],
235        is_dead: false,
236        description: "Set of CPUs the task is allowed to run on (sched_getaffinity result).",
237        section: Section::Primary,
238    },
239    CtprofMetricDef {
240        name: "processor",
241        rule: AggRule::RangeI32(|t| t.processor),
242        sched_class: None,
243        config_gates: &[],
244        is_dead: false,
245        description: "Last CPU the task ran on.",
246        section: Section::Primary,
247    },
248    CtprofMetricDef {
249        name: "state",
250        rule: AggRule::ModeChar(|t| t.state),
251        sched_class: None,
252        config_gates: &[],
253        is_dead: false,
254        description: "Task state letter (R running, S sleeping, D uninterruptible, Z zombie, T stopped).",
255        section: Section::Primary,
256    },
257    // `ext_enabled` reflects whether the task is currently on
258    // the sched_ext class. Gated by CONFIG_SCHED_CLASS_EXT —
259    // when off, no task can land on ext, so the field reads
260    // `false` uniformly across every thread.
261    CtprofMetricDef {
262        name: "ext_enabled",
263        rule: AggRule::ModeBool(|t| t.ext_enabled),
264        sched_class: None,
265        config_gates: &["CONFIG_SCHED_CLASS_EXT"],
266        is_dead: false,
267        description: "Whether the task is currently dispatched on the sched_ext class.",
268        section: Section::Primary,
269    },
270    // Process-wide thread count (`signal_struct->nr_threads`)
271    // from `/proc/<tid>/status` `Threads:`. Capture-side
272    // populates only on tid == tgid threads (leader dedup), so
273    // every non-leader thread carries 0 — Sum across a group
274    // would render 0 for any bucket whose leader is not part of
275    // the bucket (e.g. `--group-by comm` puts non-leader threads
276    // in their own comm bucket). `Max` answers "largest process
277    // represented in this bucket"; the row count already covers
278    // "how many threads are here". Identity/structural rather
279    // than counter — placement here mirrors `state` and
280    // `ext_enabled` (per-thread snapshots, not deltas).
281    CtprofMetricDef {
282        name: "nr_threads",
283        rule: AggRule::MaxGaugeCount(|t| t.nr_threads),
284        sched_class: None,
285        config_gates: &[],
286        is_dead: false,
287        description: "Process-wide thread count (signal_struct->nr_threads); leader-only.",
288        section: Section::Primary,
289    },
290    // scheduling
291    // `run_time_ns` from `/proc/<tid>/schedstat` field 1 —
292    // gated by CONFIG_SCHED_INFO via `sched_info_on()` at
293    // `proc_pid_schedstat` (fs/proc/base.c:511-523).
294    CtprofMetricDef {
295        name: "run_time_ns",
296        rule: AggRule::SumNs(|t| t.run_time_ns),
297        sched_class: None,
298        config_gates: &["CONFIG_SCHED_INFO"],
299        is_dead: false,
300        description: "Cumulative on-CPU time, ns; /proc/<tid>/schedstat field 1.",
301        section: Section::Primary,
302    },
303    // `wait_time_ns` from `/proc/<tid>/schedstat` field 2 —
304    // gated by CONFIG_SCHED_INFO via `sched_info_on()` at
305    // `proc_pid_schedstat` (fs/proc/base.c:511-523).
306    CtprofMetricDef {
307        name: "wait_time_ns",
308        rule: AggRule::SumNs(|t| t.wait_time_ns),
309        sched_class: None,
310        config_gates: &["CONFIG_SCHED_INFO"],
311        is_dead: false,
312        description: "Cumulative time waiting on the runqueue, ns; schedstat field 2.",
313        section: Section::Primary,
314    },
315    // `timeslices` from `/proc/<tid>/schedstat` field 3 —
316    // same gate as `wait_time_ns`.
317    CtprofMetricDef {
318        name: "timeslices",
319        rule: AggRule::SumCount(|t| t.timeslices),
320        sched_class: None,
321        config_gates: &["CONFIG_SCHED_INFO"],
322        is_dead: false,
323        description: "Number of times the task was run on a CPU; schedstat field 3.",
324        section: Section::Primary,
325    },
326    CtprofMetricDef {
327        name: "voluntary_csw",
328        rule: AggRule::SumCount(|t| t.voluntary_csw),
329        sched_class: None,
330        config_gates: &[],
331        is_dead: false,
332        description: "Voluntary context switches (task gave up the CPU itself).",
333        section: Section::Primary,
334    },
335    CtprofMetricDef {
336        name: "nonvoluntary_csw",
337        rule: AggRule::SumCount(|t| t.nonvoluntary_csw),
338        sched_class: None,
339        config_gates: &[],
340        is_dead: false,
341        description: "Involuntary context switches (task was preempted).",
342        section: Section::Primary,
343    },
344    // `nr_wakeups`, `_local`, `_remote`, `_sync`, `_migrate`
345    // are class-agnostic — `__schedstat_inc` from
346    // `kernel/sched/core.c::ttwu_stat` (e.g. line 3677 for the
347    // base counter) fires for every task class. The macro
348    // expands to `do { } while (0)` under !CONFIG_SCHEDSTATS
349    // per `kernel/sched/stats.h:75-82`.
350    CtprofMetricDef {
351        name: "nr_wakeups",
352        rule: AggRule::SumCount(|t| t.nr_wakeups),
353        sched_class: None,
354        config_gates: &["CONFIG_SCHEDSTATS"],
355        is_dead: false,
356        description: "Total wakeups via try_to_wake_up().",
357        section: Section::Primary,
358    },
359    CtprofMetricDef {
360        name: "nr_wakeups_local",
361        rule: AggRule::SumCount(|t| t.nr_wakeups_local),
362        sched_class: None,
363        config_gates: &["CONFIG_SCHEDSTATS"],
364        is_dead: false,
365        description: "Wakeups landed on the same CPU as the waker.",
366        section: Section::Primary,
367    },
368    CtprofMetricDef {
369        name: "nr_wakeups_remote",
370        rule: AggRule::SumCount(|t| t.nr_wakeups_remote),
371        sched_class: None,
372        config_gates: &["CONFIG_SCHEDSTATS"],
373        is_dead: false,
374        description: "Wakeups landed on a different CPU than the waker.",
375        section: Section::Primary,
376    },
377    CtprofMetricDef {
378        name: "nr_wakeups_sync",
379        rule: AggRule::SumCount(|t| t.nr_wakeups_sync),
380        sched_class: None,
381        config_gates: &["CONFIG_SCHEDSTATS"],
382        is_dead: false,
383        description: "WF_SYNC wakeups (synchronous wakeup hint to scheduler).",
384        section: Section::Primary,
385    },
386    CtprofMetricDef {
387        name: "nr_wakeups_migrate",
388        rule: AggRule::SumCount(|t| t.nr_wakeups_migrate),
389        sched_class: None,
390        config_gates: &["CONFIG_SCHEDSTATS"],
391        is_dead: false,
392        description: "Wakeups where the task migrated to a different CPU than its prior one (WF_MIGRATED); distinct from nr_wakeups_remote (waker CPU != target CPU).",
393        section: Section::Primary,
394    },
395    // `nr_wakeups_affine`, `_attempts` are CFS-only —
396    // `kernel/sched/fair.c::wake_affine` calls
397    // `schedstat_inc(p->stats.nr_wakeups_affine_attempts)` at
398    // line 7681 and the matching `_affine` increment at line
399    // 7686. Both fire only on CFS code paths, so a
400    // task on SCHED_EXT / SCHED_FIFO / SCHED_RR / SCHED_DL
401    // never accumulates them.
402    CtprofMetricDef {
403        name: "nr_wakeups_affine",
404        rule: AggRule::SumCount(|t| t.nr_wakeups_affine),
405        sched_class: Some("cfs-only"),
406        config_gates: &["CONFIG_SCHEDSTATS"],
407        is_dead: false,
408        description: "Wakeups that succeeded under the wake_affine() heuristic.",
409        section: Section::Primary,
410    },
411    CtprofMetricDef {
412        name: "nr_wakeups_affine_attempts",
413        rule: AggRule::SumCount(|t| t.nr_wakeups_affine_attempts),
414        sched_class: Some("cfs-only"),
415        config_gates: &["CONFIG_SCHEDSTATS"],
416        is_dead: false,
417        description: "wake_affine() attempts; success rate = nr_wakeups_affine / attempts.",
418        section: Section::Primary,
419    },
420    // `nr_migrations` is incremented unconditionally at
421    // `kernel/sched/core.c:3346` (`p->se.nr_migrations++`) — no
422    // schedstat macro, no class gating. Always populated.
423    CtprofMetricDef {
424        name: "nr_migrations",
425        rule: AggRule::SumCount(|t| t.nr_migrations),
426        sched_class: None,
427        config_gates: &[],
428        is_dead: false,
429        description: "Cumulative cross-CPU migrations of the task.",
430        section: Section::Primary,
431    },
432    // `nr_forced_migrations` is set by
433    // `kernel/sched/fair.c:9857` (`schedstat_inc`) inside
434    // CFS-only load-balancing.
435    CtprofMetricDef {
436        name: "nr_forced_migrations",
437        rule: AggRule::SumCount(|t| t.nr_forced_migrations),
438        sched_class: Some("cfs-only"),
439        config_gates: &["CONFIG_SCHEDSTATS"],
440        is_dead: false,
441        description: "Migrations forced by the CFS load balancer.",
442        section: Section::Primary,
443    },
444    // `nr_failed_migrations_*` family — all CFS-only,
445    // incremented in `kernel/sched/fair.c::can_migrate_task`
446    // (lines 9783, 9817, 9843).
447    CtprofMetricDef {
448        name: "nr_failed_migrations_affine",
449        rule: AggRule::SumCount(|t| t.nr_failed_migrations_affine),
450        sched_class: Some("cfs-only"),
451        config_gates: &["CONFIG_SCHEDSTATS"],
452        is_dead: false,
453        description: "Load-balancer migrations rejected for cpu-affinity reasons.",
454        section: Section::Primary,
455    },
456    CtprofMetricDef {
457        name: "nr_failed_migrations_running",
458        rule: AggRule::SumCount(|t| t.nr_failed_migrations_running),
459        sched_class: Some("cfs-only"),
460        config_gates: &["CONFIG_SCHEDSTATS"],
461        is_dead: false,
462        description: "Load-balancer migrations rejected because the task was running.",
463        section: Section::Primary,
464    },
465    CtprofMetricDef {
466        name: "nr_failed_migrations_hot",
467        rule: AggRule::SumCount(|t| t.nr_failed_migrations_hot),
468        sched_class: Some("cfs-only"),
469        config_gates: &["CONFIG_SCHEDSTATS"],
470        is_dead: false,
471        description: "Load-balancer migrations rejected because the task was cache-hot.",
472        section: Section::Primary,
473    },
474    // `wait_sum` / `wait_count` / `wait_max` — written by
475    // `__update_stats_wait_end` (`kernel/sched/stats.c:21`),
476    // which is called from `update_stats_wait_end_fair`
477    // (kernel/sched/fair.c:1478), `update_stats_wait_end_dl`
478    // (kernel/sched/deadline.c:2114), and
479    // `update_stats_wait_end_rt` (kernel/sched/rt.c:1282) —
480    // i.e. CFS, RT, AND DL classes accumulate. Sched_ext bypasses
481    // these wrappers, so the counters stay at zero for SCHED_EXT
482    // tasks. Tagged `non-ext`. Expanded to a no-op under
483    // !CONFIG_SCHEDSTATS via the schedstat macros at
484    // `kernel/sched/stats.h:75-82`.
485    CtprofMetricDef {
486        name: "wait_sum",
487        rule: AggRule::SumNs(|t| t.wait_sum),
488        sched_class: Some("non-ext"),
489        config_gates: &["CONFIG_SCHEDSTATS"],
490        is_dead: false,
491        description: "Cumulative time the task waited on the runqueue, ns.",
492        section: Section::Primary,
493    },
494    CtprofMetricDef {
495        name: "wait_count",
496        rule: AggRule::SumCount(|t| t.wait_count),
497        sched_class: Some("non-ext"),
498        config_gates: &["CONFIG_SCHEDSTATS"],
499        is_dead: false,
500        description: "Number of distinct runqueue-wait intervals the task accumulated.",
501        section: Section::Primary,
502    },
503    CtprofMetricDef {
504        name: "wait_max",
505        rule: AggRule::MaxPeak(|t| t.wait_max),
506        sched_class: Some("non-ext"),
507        config_gates: &["CONFIG_SCHEDSTATS"],
508        is_dead: false,
509        description: "Longest single runqueue-wait interval observed, ns.",
510        section: Section::Primary,
511    },
512    // `voluntary_sleep_ns` / `sleep_max` / `block_sum` /
513    // `block_max` / `iowait_sum` / `iowait_count` — written by
514    // `__update_stats_enqueue_sleeper` (kernel/sched/stats.c:48),
515    // which is called from `update_stats_enqueue_sleeper_fair`
516    // (kernel/sched/fair.c:1504),
517    // `update_stats_enqueue_sleeper_dl`
518    // (kernel/sched/deadline.c:2122), and
519    // `update_stats_enqueue_sleeper_rt`
520    // (kernel/sched/rt.c:1252). Same shape as the wait_* family
521    // above: CFS+RT+DL accumulate, sched_ext bypasses, so the
522    // counters stay at zero for SCHED_EXT tasks. Tagged `non-ext`.
523    // Expanded to a no-op under !CONFIG_SCHEDSTATS via the
524    // schedstat macros at `kernel/sched/stats.h:75-82`.
525    // `voluntary_sleep_ns` is the capture-side normalization of
526    // the kernel's `sum_sleep_runtime` — the raw value
527    // double-counts block under sleep, so capture subtracts
528    // `sum_block_runtime` before storing.
529    CtprofMetricDef {
530        name: "voluntary_sleep_ns",
531        rule: AggRule::SumNs(|t| t.voluntary_sleep_ns),
532        sched_class: Some("non-ext"),
533        config_gates: &["CONFIG_SCHEDSTATS"],
534        is_dead: false,
535        description: "Pure voluntary sleep time (TASK_INTERRUPTIBLE only), ns; capture-side normalized as sum_sleep_runtime - sum_block_runtime so the kernel's sleep/block double-count is stripped before delta math.",
536        section: Section::Primary,
537    },
538    CtprofMetricDef {
539        name: "sleep_max",
540        rule: AggRule::MaxPeak(|t| t.sleep_max),
541        sched_class: Some("non-ext"),
542        config_gates: &["CONFIG_SCHEDSTATS"],
543        is_dead: false,
544        description: "Longest single sleep interval observed, ns.",
545        section: Section::Primary,
546    },
547    // No `sleep_count` metric: the kernel does not emit that
548    // counter — the wake-side tally is captured by `nr_wakeups`
549    // already.
550    CtprofMetricDef {
551        name: "block_sum",
552        rule: AggRule::SumNs(|t| t.block_sum),
553        sched_class: Some("non-ext"),
554        config_gates: &["CONFIG_SCHEDSTATS"],
555        is_dead: false,
556        description: "Cumulative time the task spent blocked (TASK_UNINTERRUPTIBLE), ns.",
557        section: Section::Primary,
558    },
559    CtprofMetricDef {
560        name: "block_max",
561        rule: AggRule::MaxPeak(|t| t.block_max),
562        sched_class: Some("non-ext"),
563        config_gates: &["CONFIG_SCHEDSTATS"],
564        is_dead: false,
565        description: "Longest single uninterruptible-block interval observed, ns.",
566        section: Section::Primary,
567    },
568    // No `block_count` metric: the kernel emits no per-event
569    // counter for `sum_block_runtime` (unlike `wait_sum/wait_count`
570    // and `iowait_sum/iowait_count` pairs).
571    CtprofMetricDef {
572        name: "iowait_sum",
573        rule: AggRule::SumNs(|t| t.iowait_sum),
574        sched_class: Some("non-ext"),
575        config_gates: &["CONFIG_SCHEDSTATS"],
576        is_dead: false,
577        description: "Cumulative time the task spent in iowait, ns.",
578        section: Section::Primary,
579    },
580    CtprofMetricDef {
581        name: "iowait_count",
582        rule: AggRule::SumCount(|t| t.iowait_count),
583        sched_class: Some("non-ext"),
584        config_gates: &["CONFIG_SCHEDSTATS"],
585        is_dead: false,
586        description: "Number of distinct iowait intervals the task accumulated.",
587        section: Section::Primary,
588    },
589    // delayacct_blkio_ticks (the procfs USER_HZ-ticks delivery
590    // of the same delay-accounting block-I/O bucket) was removed
591    // because `blkio_delay_total_ns` from the taskstats genetlink
592    // path supersedes it: same kernel data via the same
593    // CONFIG_TASK_DELAY_ACCT gate, but ns precision instead of
594    // USER_HZ truncation, no procfs round-trip, and one row in
595    // the rendered registry instead of two. ktstr always runs as
596    // root (CAP_NET_ADMIN is implicit), so the procfs fallback
597    // bought no extra coverage.
598    // `exec_max` is set inside `update_se`
599    // (`kernel/sched/fair.c:1353`), guarded by
600    // `if (schedstat_enabled())`. Reachable from sched_ext via
601    // `update_curr_common` (`kernel/sched/ext.c:1343`), so
602    // class-agnostic at runtime, gated only by CONFIG_SCHEDSTATS.
603    CtprofMetricDef {
604        name: "exec_max",
605        rule: AggRule::MaxPeak(|t| t.exec_max),
606        sched_class: None,
607        config_gates: &["CONFIG_SCHEDSTATS"],
608        is_dead: false,
609        description: "Longest single uninterrupted on-CPU run observed, ns.",
610        section: Section::Primary,
611    },
612    // `slice_max` is part of the CFS-class statistics struct.
613    // Per the kernel-field-semantics audit, zero under
614    // sched_ext / RT / DL because the populating call sites
615    // live in CFS-class entry points.
616    CtprofMetricDef {
617        name: "slice_max",
618        rule: AggRule::MaxPeak(|t| t.slice_max),
619        sched_class: Some("cfs-only"),
620        config_gates: &["CONFIG_SCHEDSTATS"],
621        is_dead: false,
622        description: "Longest CFS slice the task was granted, ns.",
623        section: Section::Primary,
624    },
625    // Cumulative core-scheduling forced-idle time, ns. Counter
626    // (Sum). Increment is class-agnostic: `__account_forceidle_time()`
627    // at `kernel/sched/cputime.c:244` does a plain
628    // `__schedstat_add(p->stats.core_forceidle_sum, delta)` on
629    // whichever task is running on each SMT sibling, called
630    // from `__sched_core_account_forceidle()` in
631    // `kernel/sched/core_sched.c:287`. Real gating is at
632    // build/rq level: CONFIG_SCHED_CORE + CONFIG_SCHEDSTATS +
633    // `core_forceidle_count > 0`. See [`ThreadState::core_forceidle_sum`]
634    // for the full caller chain.
635    // Auto_scale ns ladder takes ns → µs → ms → s. Lives next
636    // to `slice_max` because both relate to scheduler-decision
637    // moments rather than wait/sleep accumulation.
638    CtprofMetricDef {
639        name: "core_forceidle_sum",
640        rule: AggRule::SumNs(|t| t.core_forceidle_sum),
641        sched_class: None,
642        config_gates: &["CONFIG_SCHED_CORE", "CONFIG_SCHEDSTATS"],
643        is_dead: false,
644        description: "Cumulative time this task forced its SMT sibling idle, ns (core scheduling).",
645        section: Section::Primary,
646    },
647    // Current scheduler slice in ns (stale under SCHED_EXT —
648    // see field doc) from `/proc/<tid>/sched`'s `slice` line.
649    // Per-thread instantaneous gauge (NOT a high-water counter
650    // — `slice_max` directly above is the historical max).
651    // Aggregating across a group via Max surfaces the longest
652    // current slice any thread is running with — Sum would
653    // multiply a near-identical value across the group and
654    // obscure the signal. Name `fair_slice_ns` mirrors the
655    // kernel emission gate `fair_policy(p->policy)` at
656    // `kernel/sched/debug.c:1363`, which (per
657    // `kernel/sched/sched.h:194,203`) accepts SCHED_NORMAL,
658    // SCHED_BATCH, AND SCHED_EXT under CONFIG_SCHED_CLASS_EXT.
659    CtprofMetricDef {
660        name: "fair_slice_ns",
661        rule: AggRule::MaxGaugeNs(|t| t.fair_slice_ns),
662        sched_class: Some("fair-policy"),
663        config_gates: &[],
664        is_dead: false,
665        description: "Current scheduler slice, ns; snapshot from /proc/<tid>/sched (stale under sched_ext).",
666        section: Section::Primary,
667    },
668    // memory
669    CtprofMetricDef {
670        name: "allocated_bytes",
671        rule: AggRule::SumBytes(|t| t.allocated_bytes),
672        sched_class: None,
673        config_gates: &[],
674        is_dead: false,
675        description: "jemalloc per-thread allocated bytes (TSD thread_allocated counter).",
676        section: Section::Primary,
677    },
678    CtprofMetricDef {
679        name: "deallocated_bytes",
680        rule: AggRule::SumBytes(|t| t.deallocated_bytes),
681        sched_class: None,
682        config_gates: &[],
683        is_dead: false,
684        description: "jemalloc per-thread deallocated bytes (TSD thread_deallocated counter).",
685        section: Section::Primary,
686    },
687    CtprofMetricDef {
688        name: "minflt",
689        rule: AggRule::SumCount(|t| t.minflt),
690        sched_class: None,
691        config_gates: &[],
692        is_dead: false,
693        description: "Minor page faults (resolved without I/O).",
694        section: Section::Primary,
695    },
696    CtprofMetricDef {
697        name: "majflt",
698        rule: AggRule::SumCount(|t| t.majflt),
699        sched_class: None,
700        config_gates: &[],
701        is_dead: false,
702        description: "Major page faults (required disk I/O to resolve).",
703        section: Section::Primary,
704    },
705    CtprofMetricDef {
706        name: "utime_clock_ticks",
707        rule: AggRule::SumTicks(|t| t.utime_clock_ticks),
708        sched_class: None,
709        config_gates: &[],
710        is_dead: false,
711        description: "User-mode CPU time, USER_HZ ticks; /proc/<tid>/stat field 14.",
712        section: Section::Primary,
713    },
714    CtprofMetricDef {
715        name: "stime_clock_ticks",
716        rule: AggRule::SumTicks(|t| t.stime_clock_ticks),
717        sched_class: None,
718        config_gates: &[],
719        is_dead: false,
720        description: "Kernel-mode CPU time, USER_HZ ticks; /proc/<tid>/stat field 15.",
721        section: Section::Primary,
722    },
723    // I/O — `/proc/<tid>/io` is emitted by
724    // `do_io_accounting` (`fs/proc/base.c`) under a single
725    // `CONFIG_TASK_IO_ACCOUNTING` gate, and CONFIG_TASK_IO_ACCOUNTING
726    // `depends on` CONFIG_TASK_XACCT in init/Kconfig — so from
727    // the capture-pipeline perspective the file is
    // all-or-nothing. All 7 fields share the same
    // `CONFIG_TASK_IO_ACCOUNTING` gate.
730    CtprofMetricDef {
731        name: "rchar",
732        rule: AggRule::SumBytes(|t| t.rchar),
733        sched_class: None,
734        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
735        is_dead: false,
736        description: "Bytes read at the read syscall layer (incl. cached / pagecache hits).",
737        section: Section::Primary,
738    },
739    CtprofMetricDef {
740        name: "wchar",
741        rule: AggRule::SumBytes(|t| t.wchar),
742        sched_class: None,
743        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
744        is_dead: false,
745        description: "Bytes written at the write syscall layer (incl. pagecache / writeback).",
746        section: Section::Primary,
747    },
748    CtprofMetricDef {
749        name: "syscr",
750        rule: AggRule::SumCount(|t| t.syscr),
751        sched_class: None,
752        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
753        is_dead: false,
754        description: "Number of read syscalls.",
755        section: Section::Primary,
756    },
757    CtprofMetricDef {
758        name: "syscw",
759        rule: AggRule::SumCount(|t| t.syscw),
760        sched_class: None,
761        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
762        is_dead: false,
763        description: "Number of write syscalls.",
764        section: Section::Primary,
765    },
766    CtprofMetricDef {
767        name: "read_bytes",
768        rule: AggRule::SumBytes(|t| t.read_bytes),
769        sched_class: None,
770        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
771        is_dead: false,
772        description: "Bytes that hit the storage device on read (excludes pagecache hits).",
773        section: Section::Primary,
774    },
775    CtprofMetricDef {
776        name: "write_bytes",
777        rule: AggRule::SumBytes(|t| t.write_bytes),
778        sched_class: None,
779        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
780        is_dead: false,
781        description: "Bytes that hit the storage device on write (post-writeback).",
782        section: Section::Primary,
783    },
784    // `cancelled_write_bytes` from `/proc/<tid>/io` 7th line.
785    // `task_io_account_cancelled_write` (kernel
786    // include/linux/task_io_accounting_ops.h:39-42) increments
787    // `current->ioac.cancelled_write_bytes` from
788    // `folio_account_cleaned` (mm/page-writeback.c:2652) when a
789    // dirty folio is reclaimed without writeback (truncate /
790    // inode invalidation), so the per-thread value records on
791    // the truncating task — not necessarily the original writer.
792    // Group-level Sum is meaningful (total cancelled-write
793    // bytes for the bucket); per-thread `write_bytes -
794    // cancelled_write_bytes` is NOT a derived metric because
795    // the two counters track distinct parties — see the field
796    // doc on ThreadState::cancelled_write_bytes.
797    CtprofMetricDef {
798        name: "cancelled_write_bytes",
799        rule: AggRule::SumBytes(|t| t.cancelled_write_bytes),
800        sched_class: None,
801        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
802        is_dead: false,
803        description: "Bytes the kernel deaccounted from a prior dirty-write because the page was reclaimed without writeback (truncate / inode invalidation); recorded on the truncating task, not the writer. Per-thread `write_bytes - cancelled_write_bytes` is NOT a valid derivation — see field doc.",
804        section: Section::Primary,
805    },
806    // taskstats — captured via the kernel's genetlink TASKSTATS
807    // family ([`crate::taskstats`]). Two field families share the
808    // CONFIG_TASKSTATS netlink-family gate but differ in the
809    // per-family kconfig:
810    //
811    //   - delay-accounting fields (cpu/blkio/swapin/freepages/
812    //     thrashing/compact/wpcopy/irq × count/total/max/min,
813    //     32 entries) are gated on CONFIG_TASKSTATS +
814    //     CONFIG_TASK_DELAY_ACCT (the per-task counters in
815    //     `kernel/delayacct.c`); the runtime `delayacct=on` toggle
816    //     (sysctl `kernel.task_delayacct` or boot param
817    //     `delayacct`) is a separate condition that must hold for
818    //     the counters to actually update.
819    //   - memory-watermark fields (hiwater_rss_bytes,
820    //     hiwater_vm_bytes) are gated on CONFIG_TASKSTATS +
821    //     CONFIG_TASK_XACCT (the extended-accounting path in
822    //     `kernel/tsacct.c::xacct_add_tsk`); they do NOT respond
823    //     to the `delayacct=on` toggle.
824    //
825    // Calling the netlink family additionally requires
826    // `CAP_NET_ADMIN`. Any failed gate / missing cap collapses
827    // the affected fields to zero per the best-effort capture
828    // contract.
829    //
830    // CPU-delay block: cpu_count + cpu_delay_total are RACY —
831    // updated by the sched_info path without a lock, so a reader
832    // may observe count or total advance ahead of the other.
833    // (cpu_delay_max / cpu_delay_min are PeakNs lifetime
834    // watermarks updated at delayacct path entries; same race
835    // window in principle, but the watermark semantics already
836    // mask brief skew.)
837    CtprofMetricDef {
838        name: "cpu_delay_count",
839        rule: AggRule::SumCount(|t| t.cpu_delay_count),
840        sched_class: None,
841        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
842        is_dead: false,
843        description: "Number of off-CPU windows the task waited for the runqueue to schedule it (taskstats cpu_count). RACY: count + total are not updated atomically.",
844        section: Section::TaskstatsDelay,
845    },
846    CtprofMetricDef {
847        name: "cpu_delay_total_ns",
848        rule: AggRule::SumNs(|t| t.cpu_delay_total_ns),
849        sched_class: None,
850        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
851        is_dead: false,
852        description: "Cumulative ns the task waited on the runqueue (taskstats cpu_delay_total). Distinct from `wait_sum` (schedstat) which captures the same wait-for-CPU bucket via a different code path. RACY (see cpu_delay_count).",
853        section: Section::TaskstatsDelay,
854    },
855    CtprofMetricDef {
856        name: "cpu_delay_max_ns",
857        rule: AggRule::MaxPeak(|t| t.cpu_delay_max_ns),
858        sched_class: None,
859        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
860        is_dead: false,
861        description: "Longest single CPU-wait window observed, ns (taskstats cpu_delay_max).",
862        section: Section::TaskstatsDelay,
863    },
864    CtprofMetricDef {
865        name: "cpu_delay_min_ns",
866        rule: AggRule::MaxPeak(|t| t.cpu_delay_min_ns),
867        sched_class: None,
868        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
869        is_dead: false,
870        description: "Shortest non-zero CPU-wait window observed, ns (taskstats cpu_delay_min). Sentinel 0 means \"no events observed\" — compare against cpu_delay_count.",
871        section: Section::TaskstatsDelay,
872    },
873    // Block-I/O delay block: serializes through `task->delays->lock`
874    // so count + total are atomic (unlike cpu_*).
875    CtprofMetricDef {
876        name: "blkio_delay_count",
877        rule: AggRule::SumCount(|t| t.blkio_delay_count),
878        sched_class: None,
879        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
880        is_dead: false,
881        description: "Number of synchronous block-I/O wait windows (taskstats blkio_count).",
882        section: Section::TaskstatsDelay,
883    },
884    CtprofMetricDef {
885        name: "blkio_delay_total_ns",
886        rule: AggRule::SumNs(|t| t.blkio_delay_total_ns),
887        sched_class: None,
888        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
889        is_dead: false,
890        description: "Cumulative ns waiting on synchronous block I/O (taskstats blkio_delay_total). Distinct from `iowait_sum` (schedstat).",
891        section: Section::TaskstatsDelay,
892    },
893    CtprofMetricDef {
894        name: "blkio_delay_max_ns",
895        rule: AggRule::MaxPeak(|t| t.blkio_delay_max_ns),
896        sched_class: None,
897        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
898        is_dead: false,
899        description: "Longest single block-I/O wait observed, ns (taskstats blkio_delay_max).",
900        section: Section::TaskstatsDelay,
901    },
902    CtprofMetricDef {
903        name: "blkio_delay_min_ns",
904        rule: AggRule::MaxPeak(|t| t.blkio_delay_min_ns),
905        sched_class: None,
906        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
907        is_dead: false,
908        description: "Shortest non-zero block-I/O wait observed, ns (taskstats blkio_delay_min). Sentinel 0 means \"no events observed\".",
909        section: Section::TaskstatsDelay,
910    },
911    // Swap-in delay block: OVERLAPS with thrashing_* — every
912    // thrashing event is also a swapin event from the syscall
913    // layer. Do not sum swapin and thrashing.
914    CtprofMetricDef {
915        name: "swapin_delay_count",
916        rule: AggRule::SumCount(|t| t.swapin_delay_count),
917        sched_class: None,
918        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
919        is_dead: false,
920        description: "Number of swap-in wait windows (taskstats swapin_count). OVERLAPS with thrashing_delay_count — do not sum.",
921        section: Section::TaskstatsDelay,
922    },
923    CtprofMetricDef {
924        name: "swapin_delay_total_ns",
925        rule: AggRule::SumNs(|t| t.swapin_delay_total_ns),
926        sched_class: None,
927        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
928        is_dead: false,
929        description: "Cumulative ns waiting for swap-in to complete (taskstats swapin_delay_total).",
930        section: Section::TaskstatsDelay,
931    },
932    CtprofMetricDef {
933        name: "swapin_delay_max_ns",
934        rule: AggRule::MaxPeak(|t| t.swapin_delay_max_ns),
935        sched_class: None,
936        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
937        is_dead: false,
938        description: "Longest single swap-in wait observed, ns (taskstats swapin_delay_max).",
939        section: Section::TaskstatsDelay,
940    },
941    CtprofMetricDef {
942        name: "swapin_delay_min_ns",
943        rule: AggRule::MaxPeak(|t| t.swapin_delay_min_ns),
944        sched_class: None,
945        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
946        is_dead: false,
947        description: "Shortest non-zero swap-in wait observed, ns (taskstats swapin_delay_min). Sentinel 0 means \"no events observed\".",
948        section: Section::TaskstatsDelay,
949    },
950    // Direct memory reclaim (free-pages) block.
951    CtprofMetricDef {
952        name: "freepages_delay_count",
953        rule: AggRule::SumCount(|t| t.freepages_delay_count),
954        sched_class: None,
955        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
956        is_dead: false,
957        description: "Number of direct-reclaim wait windows (taskstats freepages_count).",
958        section: Section::TaskstatsDelay,
959    },
960    CtprofMetricDef {
961        name: "freepages_delay_total_ns",
962        rule: AggRule::SumNs(|t| t.freepages_delay_total_ns),
963        sched_class: None,
964        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
965        is_dead: false,
966        description: "Cumulative ns waiting in direct memory reclaim (taskstats freepages_delay_total).",
967        section: Section::TaskstatsDelay,
968    },
969    CtprofMetricDef {
970        name: "freepages_delay_max_ns",
971        rule: AggRule::MaxPeak(|t| t.freepages_delay_max_ns),
972        sched_class: None,
973        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
974        is_dead: false,
975        description: "Longest single direct-reclaim wait observed, ns (taskstats freepages_delay_max).",
976        section: Section::TaskstatsDelay,
977    },
978    CtprofMetricDef {
979        name: "freepages_delay_min_ns",
980        rule: AggRule::MaxPeak(|t| t.freepages_delay_min_ns),
981        sched_class: None,
982        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
983        is_dead: false,
984        description: "Shortest non-zero direct-reclaim wait observed, ns (taskstats freepages_delay_min). Sentinel 0 means \"no events observed\".",
985        section: Section::TaskstatsDelay,
986    },
987    // Thrashing block: OVERLAPS with swapin_* (see above).
988    CtprofMetricDef {
989        name: "thrashing_delay_count",
990        rule: AggRule::SumCount(|t| t.thrashing_delay_count),
991        sched_class: None,
992        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
993        is_dead: false,
994        description: "Number of thrashing wait windows (taskstats thrashing_count). OVERLAPS with swapin_delay_count — do not sum.",
995        section: Section::TaskstatsDelay,
996    },
997    CtprofMetricDef {
998        name: "thrashing_delay_total_ns",
999        rule: AggRule::SumNs(|t| t.thrashing_delay_total_ns),
1000        sched_class: None,
1001        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1002        is_dead: false,
1003        description: "Cumulative ns waiting under thrashing pressure (taskstats thrashing_delay_total).",
1004        section: Section::TaskstatsDelay,
1005    },
1006    CtprofMetricDef {
1007        name: "thrashing_delay_max_ns",
1008        rule: AggRule::MaxPeak(|t| t.thrashing_delay_max_ns),
1009        sched_class: None,
1010        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1011        is_dead: false,
1012        description: "Longest single thrashing wait observed, ns (taskstats thrashing_delay_max).",
1013        section: Section::TaskstatsDelay,
1014    },
1015    CtprofMetricDef {
1016        name: "thrashing_delay_min_ns",
1017        rule: AggRule::MaxPeak(|t| t.thrashing_delay_min_ns),
1018        sched_class: None,
1019        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1020        is_dead: false,
1021        description: "Shortest non-zero thrashing wait observed, ns (taskstats thrashing_delay_min). Sentinel 0 means \"no events observed\".",
1022        section: Section::TaskstatsDelay,
1023    },
1024    // Memory compaction block.
1025    CtprofMetricDef {
1026        name: "compact_delay_count",
1027        rule: AggRule::SumCount(|t| t.compact_delay_count),
1028        sched_class: None,
1029        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1030        is_dead: false,
1031        description: "Number of memory-compaction wait windows (taskstats compact_count).",
1032        section: Section::TaskstatsDelay,
1033    },
1034    CtprofMetricDef {
1035        name: "compact_delay_total_ns",
1036        rule: AggRule::SumNs(|t| t.compact_delay_total_ns),
1037        sched_class: None,
1038        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1039        is_dead: false,
1040        description: "Cumulative ns waiting on memory compaction (taskstats compact_delay_total).",
1041        section: Section::TaskstatsDelay,
1042    },
1043    CtprofMetricDef {
1044        name: "compact_delay_max_ns",
1045        rule: AggRule::MaxPeak(|t| t.compact_delay_max_ns),
1046        sched_class: None,
1047        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1048        is_dead: false,
1049        description: "Longest single compaction wait observed, ns (taskstats compact_delay_max).",
1050        section: Section::TaskstatsDelay,
1051    },
1052    CtprofMetricDef {
1053        name: "compact_delay_min_ns",
1054        rule: AggRule::MaxPeak(|t| t.compact_delay_min_ns),
1055        sched_class: None,
1056        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1057        is_dead: false,
1058        description: "Shortest non-zero compaction wait observed, ns (taskstats compact_delay_min). Sentinel 0 means \"no events observed\".",
1059        section: Section::TaskstatsDelay,
1060    },
1061    // Write-protect-copy (CoW) fault block.
1062    CtprofMetricDef {
1063        name: "wpcopy_delay_count",
1064        rule: AggRule::SumCount(|t| t.wpcopy_delay_count),
1065        sched_class: None,
1066        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1067        is_dead: false,
1068        description: "Number of write-protect-copy (CoW) fault wait windows (taskstats wpcopy_count).",
1069        section: Section::TaskstatsDelay,
1070    },
1071    CtprofMetricDef {
1072        name: "wpcopy_delay_total_ns",
1073        rule: AggRule::SumNs(|t| t.wpcopy_delay_total_ns),
1074        sched_class: None,
1075        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1076        is_dead: false,
1077        description: "Cumulative ns waiting on write-protect-copy faults (taskstats wpcopy_delay_total).",
1078        section: Section::TaskstatsDelay,
1079    },
1080    CtprofMetricDef {
1081        name: "wpcopy_delay_max_ns",
1082        rule: AggRule::MaxPeak(|t| t.wpcopy_delay_max_ns),
1083        sched_class: None,
1084        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1085        is_dead: false,
1086        description: "Longest single write-protect-copy fault wait observed, ns (taskstats wpcopy_delay_max).",
1087        section: Section::TaskstatsDelay,
1088    },
1089    CtprofMetricDef {
1090        name: "wpcopy_delay_min_ns",
1091        rule: AggRule::MaxPeak(|t| t.wpcopy_delay_min_ns),
1092        sched_class: None,
1093        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1094        is_dead: false,
1095        description: "Shortest non-zero write-protect-copy fault wait observed, ns (taskstats wpcopy_delay_min). Sentinel 0 means \"no events observed\".",
1096        section: Section::TaskstatsDelay,
1097    },
1098    // IRQ-handler delay block. Updates from `delayacct_irq` in
1099    // `kernel/delayacct.c` — counts kernel-IRQ time charged to
1100    // the task.
1101    CtprofMetricDef {
1102        name: "irq_delay_count",
1103        rule: AggRule::SumCount(|t| t.irq_delay_count),
1104        sched_class: None,
1105        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1106        is_dead: false,
1107        description: "Number of IRQ-handler windows charged to the task (taskstats irq_count).",
1108        section: Section::TaskstatsDelay,
1109    },
1110    CtprofMetricDef {
1111        name: "irq_delay_total_ns",
1112        rule: AggRule::SumNs(|t| t.irq_delay_total_ns),
1113        sched_class: None,
1114        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1115        is_dead: false,
1116        description: "Cumulative ns of IRQ handling charged to the task (taskstats irq_delay_total).",
1117        section: Section::TaskstatsDelay,
1118    },
1119    CtprofMetricDef {
1120        name: "irq_delay_max_ns",
1121        rule: AggRule::MaxPeak(|t| t.irq_delay_max_ns),
1122        sched_class: None,
1123        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1124        is_dead: false,
1125        description: "Longest single IRQ-handler window observed, ns (taskstats irq_delay_max).",
1126        section: Section::TaskstatsDelay,
1127    },
1128    CtprofMetricDef {
1129        name: "irq_delay_min_ns",
1130        rule: AggRule::MaxPeak(|t| t.irq_delay_min_ns),
1131        sched_class: None,
1132        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
1133        is_dead: false,
1134        description: "Shortest non-zero IRQ-handler window observed, ns (taskstats irq_delay_min). Sentinel 0 means \"no events observed\".",
1135        section: Section::TaskstatsDelay,
1136    },
1137    // Lifetime memory watermarks. Updates from `xacct_add_tsk` in
1138    // `kernel/tsacct.c` — kB → bytes conversion happens at parse
1139    // time in `crate::taskstats::parse_taskstats_payload`. Gated
1140    // on CONFIG_TASK_XACCT (the "extended accounting" path), NOT
1141    // CONFIG_TASK_DELAY_ACCT — `xacct_add_tsk` lives behind
1142    // `CONFIG_TASK_XACCT` while delayacct is the parallel
1143    // `CONFIG_TASK_DELAY_ACCT` subsystem; the two are
1144    // independently selectable.
1145    CtprofMetricDef {
1146        name: "hiwater_rss_bytes",
1147        rule: AggRule::MaxPeakBytes(|t| t.hiwater_rss_bytes),
1148        sched_class: None,
1149        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_XACCT"],
1150        is_dead: false,
1151        description: "Lifetime high-watermark of resident-set size, bytes (taskstats hiwater_rss). Distinct from smaps_rollup_kib[\"Rss\"] which is the CURRENT RSS.",
1152        section: Section::TaskstatsDelay,
1153    },
1154    CtprofMetricDef {
1155        name: "hiwater_vm_bytes",
1156        rule: AggRule::MaxPeakBytes(|t| t.hiwater_vm_bytes),
1157        sched_class: None,
1158        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_XACCT"],
1159        is_dead: false,
1160        description: "Lifetime high-watermark of virtual-memory size, bytes (taskstats hiwater_vm).",
1161        section: Section::TaskstatsDelay,
1162    },
1163];
1164
1165// ---------------------------------------------------------------------------
1166// Derived metrics
1167// ---------------------------------------------------------------------------
1168
/// Scalar result produced by a derived-metric computation.
///
/// Every derivation reports its result as an `f64`, because the
/// range of the value depends on the formula:
/// - ratios bounded to `[0, 1]`: `cpu_efficiency`,
///   `affine_success_ratio`, `involuntary_csw_ratio`.
/// - unbounded ratios in `[0, ∞)`: `disk_io_fraction`, which
///   exceeds 1.0 in practice when readahead pulls more
///   block-device bytes than the syscall requested.
/// - per-event means in `[0, ∞)`: `avg_wait_ns`, `avg_slice_ns`,
///   `avg_iowait_ns` — a non-negative sum over a non-negative
///   count.
/// - signed differences in `(-∞, ∞)`: `live_heap_estimate` =
///   `allocated_bytes - deallocated_bytes`, which goes negative
///   when the deallocation total exceeds the allocation total
///   (a freelist drains memory allocated before capture began,
///   or the per-thread TSD counters were sampled mid-update on
///   a thread that has just released a large arena).
///
/// The value itself carries no unit. Unit selection lives on
/// the definition instead: [`DerivedMetricDef::ladder`] names
/// the auto-scale ladder (ns / Bytes / unitless) the renderer
/// applies per row, and [`DerivedMetricDef::is_ratio`] switches
/// between the auto-scaled path (e.g. `1.500ms`, `7.500GiB`)
/// and the raw three-decimal path (`0.873` for ratios). This
/// holds whether the value is positive, zero, negative,
/// fractional, or in the millions.
///
/// Negative values keep their sign end to end: `auto_scale`
/// only takes `abs()` for its threshold check and scales the
/// original signed value, while `format_derived_value_cell` and
/// `format_derived_delta_cell` use `{value:.2}` / `{value:.3}`
/// formatters that print the explicit `-`. The
/// `auto_scale_preserves_sign_on_negative_input` regression
/// test pins this for the Bytes and ns ladders.
#[derive(Debug, Clone, Copy, PartialEq)]
#[non_exhaustive]
pub enum DerivedValue {
    /// A floating-point value. How it renders is decided by the
    /// [`DerivedMetricDef::ladder`] + [`DerivedMetricDef::is_ratio`]
    /// pair: ratios print with three decimals (`0.873`,
    /// `+0.100`), while ladder-bearing values
    /// ([`ScaleLadder::Ns`] / [`ScaleLadder::Bytes`] / etc.) go
    /// through the same auto-scale ladders the main table uses.
    Scalar(f64),
}

impl DerivedValue {
    /// Unwrap the carried `f64`, for delta math downstream of
    /// `DerivedRow` consumers.
    pub fn as_f64(&self) -> f64 {
        // Irrefutable today because `Scalar` is the only
        // variant; adding a variant turns this into a compile
        // error, just as the exhaustive `match` it replaces did.
        let Self::Scalar(value) = *self;
        value
    }
}
1226
1227/// Definition of a derived metric: a function that consumes the
1228/// already-aggregated input metrics for a group and produces a
1229/// single scalar (with its own unit and operator-facing
1230/// description).
1231///
1232/// The compute fn returns `None` when an input metric is missing
1233/// from the group's metrics map (capture-side gated by a kernel
1234/// CONFIG that wasn't enabled, or jemalloc not linked) OR when
1235/// the formula would divide by zero. The renderer surfaces a
1236/// `None` cell as `-` so the operator can distinguish "not
1237/// computable" from "computed as zero".
1238#[derive(Debug, Clone, Copy)]
1239#[non_exhaustive]
1240pub struct DerivedMetricDef {
1241    pub name: &'static str,
1242    /// Auto-scale ladder for the cell. [`ScaleLadder::None`] for
1243    /// ratio rows (renders as a bare three-decimal scalar with
1244    /// no suffix), [`ScaleLadder::Ns`] / [`ScaleLadder::Bytes`] /
1245    /// etc. for unit-bearing derivations. The same closed-match
1246    /// dispatch [`AggRule::ladder`] feeds.
1247    pub ladder: ScaleLadder,
1248    /// Operator-facing one-line description; surfaced by the
1249    /// `ctprof metric-list` subcommand.
1250    pub description: &'static str,
1251    /// Names of input metrics from [`CTPROF_METRICS`]. Pure
1252    /// documentation — surfaces in the `metric-list` output so
1253    /// the operator sees what each derivation depends on.
1254    pub inputs: &'static [&'static str],
1255    /// Render-shape flag for dimensionless quantities. When true,
1256    /// the renderer (1) suppresses the `%` (delta_pct) column,
1257    /// (2) renders the value as `N.NNN` with three decimals
1258    /// instead of routing through the auto-scale ladder, and
1259    /// (3) renders the delta as `+/-N.NNN` (no scaled unit
1260    /// suffix).
1261    ///
1262    /// The `[0, 1]` interval is the common case where this flag
1263    /// applies: `cpu_efficiency`, `affine_success_ratio`, and
1264    /// `involuntary_csw_ratio` all live in `[0, 1]`. Delta on a
1265    /// `[0, 1]` ratio reads as percentage points
1266    /// (0.5 → 0.6 = +0.100 = +10pp), and `delta / baseline` as
1267    /// a fraction (the `%` column) becomes confusing — `+20%` on
1268    /// a `[0, 1]` ratio is already in percentage points, so a
1269    /// percentage-of-percentage readout double-encodes the
1270    /// signal.
1271    ///
1272    /// `disk_io_fraction` (range `[0, ∞)`) carries `is_ratio: true`
1273    /// for the rendering shape but does NOT satisfy the
1274    /// percentage-points interpretation: a value of 1.5 is
1275    /// possible (readahead pulls more block-device bytes than
1276    /// the syscall requested), so a delta of +0.100 reads as
1277    /// "ratio rose by 0.1" rather than "ratio rose by 10
1278    /// percentage points." The render shape is still correct
1279    /// (suppress `%`, three decimals, no auto-scale) — only the
1280    /// pp interpretation is invalid.
1281    pub is_ratio: bool,
1282    /// The computation. Pulls input scalars from the group's
1283    /// metrics map via `Aggregated::numeric()` and produces the
1284    /// derived scalar.
1285    pub compute: fn(&BTreeMap<String, Aggregated>) -> Option<DerivedValue>,
1286    /// Section this derived metric belongs to for the
1287    /// `--sections` per-row filter, mirroring
1288    /// [`CtprofMetricDef::section`]. Most derivations tag
1289    /// [`Section::Derived`]; the 9 derivations whose inputs are
1290    /// taskstats fields (the eight `avg_*_delay_ns` averages
1291    /// plus `total_offcpu_delay_ns`) tag
1292    /// [`Section::TaskstatsDelay`] so an operator running
1293    /// `--sections taskstats-delay` gets a full taskstats view
1294    /// — the 34 raw rows AND the 9 derivations that depend on
1295    /// them — without dragging in unrelated derived metrics.
1296    /// The `## Derived metrics` table emitter checks
1297    /// `DisplayOptions::is_section_enabled` per row before
1298    /// rendering, and the outer-table gate opens whenever EITHER
1299    /// section in the rendered set is enabled.
1300    pub section: Section,
1301}
1302
1303/// Helper: pull an input metric's `Aggregated::numeric()`
1304/// projection out of the group's metrics map.
1305fn input_scalar(metrics: &BTreeMap<String, Aggregated>, name: &str) -> Option<f64> {
1306    metrics.get(name).and_then(|a| a.numeric())
1307}
1308
1309/// Helper: compute `num / den` for a simple ratio. Returns
1310/// `None` when either input is missing OR `den == 0` (so the
1311/// renderer surfaces `-` rather than NaN/inf). Used by the
1312/// majority of derived metrics whose formula is a plain
1313/// quotient over two registry inputs.
1314fn ratio_compute(
1315    metrics: &BTreeMap<String, Aggregated>,
1316    numerator: &str,
1317    denominator: &str,
1318) -> Option<DerivedValue> {
1319    let num = input_scalar(metrics, numerator)?;
1320    let den = input_scalar(metrics, denominator)?;
1321    if den == 0.0 {
1322        return None;
1323    }
1324    Some(DerivedValue::Scalar(num / den))
1325}
1326
1327/// Helper: compute `num / (num + addend)` for ratios whose
1328/// denominator is a sum of two registry inputs. Returns `None`
1329/// when either input is missing OR the synthesized denominator
1330/// is zero. Used by `cpu_efficiency` (run / (run + wait)) and
1331/// `involuntary_csw_ratio` (nvcsw / (vcsw + nvcsw)).
1332fn ratio_of_sum_compute(
1333    metrics: &BTreeMap<String, Aggregated>,
1334    numerator: &str,
1335    addend: &str,
1336) -> Option<DerivedValue> {
1337    let num = input_scalar(metrics, numerator)?;
1338    let other = input_scalar(metrics, addend)?;
1339    let den = num + other;
1340    if den == 0.0 {
1341        return None;
1342    }
1343    Some(DerivedValue::Scalar(num / den))
1344}
1345
1346/// Registry of derived metrics. Each entry consumes one or more
1347/// already-aggregated input metrics from
1348/// [`CTPROF_METRICS`] and produces a single scalar with its
1349/// own unit. See the per-entry doc strings for the formula and
1350/// kernel-source rationale.
1351pub static CTPROF_DERIVED_METRICS: &[DerivedMetricDef] = &[
1352    DerivedMetricDef {
1353        name: "affine_success_ratio",
1354        ladder: ScaleLadder::None,
1355        description: "wake_affine() success ratio: nr_wakeups_affine / nr_wakeups_affine_attempts.",
1356        inputs: &["nr_wakeups_affine", "nr_wakeups_affine_attempts"],
1357        is_ratio: true,
1358        compute: |m| ratio_compute(m, "nr_wakeups_affine", "nr_wakeups_affine_attempts"),
1359        section: Section::Derived,
1360    },
1361    DerivedMetricDef {
1362        name: "avg_wait_ns",
1363        ladder: ScaleLadder::Ns,
1364        description: "Average runqueue-wait duration per scheduling event: wait_sum / wait_count (ns/event).",
1365        inputs: &["wait_sum", "wait_count"],
1366        is_ratio: false,
1367        compute: |m| ratio_compute(m, "wait_sum", "wait_count"),
1368        section: Section::Derived,
1369    },
1370    // `voluntary_sleep_sum` derived metric was removed when
1371    // `voluntary_sleep_ns` became a first-class capture field.
1372    // The kernel's `sum_sleep_runtime - sum_block_runtime`
1373    // computation now happens at capture time inside
1374    // `capture_thread_at_with_tally` so every consumer reads the
1375    // pre-normalized value without re-deriving.
1376    DerivedMetricDef {
1377        name: "cpu_efficiency",
1378        ladder: ScaleLadder::None,
1379        description: "Fraction of total scheduler-tracked time spent on-CPU: run_time_ns / (run_time_ns + wait_time_ns).",
1380        inputs: &["run_time_ns", "wait_time_ns"],
1381        is_ratio: true,
1382        compute: |m| ratio_of_sum_compute(m, "run_time_ns", "wait_time_ns"),
1383        section: Section::Derived,
1384    },
1385    DerivedMetricDef {
1386        name: "avg_slice_ns",
1387        ladder: ScaleLadder::Ns,
1388        description: "Average on-CPU slice length per timeslice: run_time_ns / timeslices (ns/timeslice).",
1389        inputs: &["run_time_ns", "timeslices"],
1390        is_ratio: false,
1391        compute: |m| ratio_compute(m, "run_time_ns", "timeslices"),
1392        section: Section::Derived,
1393    },
1394    DerivedMetricDef {
1395        name: "involuntary_csw_ratio",
1396        ladder: ScaleLadder::None,
1397        description: "Fraction of context switches that were preemptions: nonvoluntary_csw / (voluntary_csw + nonvoluntary_csw).",
1398        inputs: &["nonvoluntary_csw", "voluntary_csw"],
1399        is_ratio: true,
1400        compute: |m| ratio_of_sum_compute(m, "nonvoluntary_csw", "voluntary_csw"),
1401        section: Section::Derived,
1402    },
1403    DerivedMetricDef {
1404        name: "disk_io_fraction",
1405        ladder: ScaleLadder::None,
1406        description: "Fraction of read syscall bytes that hit storage: read_bytes / rchar. Typically <= 1.0 but can exceed when readahead pulls more block-device bytes than the syscall requested.",
1407        inputs: &["read_bytes", "rchar"],
1408        is_ratio: true,
1409        compute: |m| ratio_compute(m, "read_bytes", "rchar"),
1410        section: Section::Derived,
1411    },
1412    DerivedMetricDef {
1413        name: "live_heap_estimate",
1414        ladder: ScaleLadder::Bytes,
1415        description: "jemalloc live-heap estimate: allocated_bytes - deallocated_bytes. Signed: negative when deallocations dominate (freelist drains memory allocated before capture, or sampled mid-update on a thread that just released a large arena). Renders a negative value with an explicit minus and the IEC binary suffix (e.g. `-1.907MiB`). Absent (rendered `-`, no value) when the jemalloc family was not captured for the group — a non-jemalloc process, or the TSD probe could not attach — distinct from a measured zero.",
1416        inputs: &["allocated_bytes", "deallocated_bytes"],
1417        is_ratio: false,
1418        compute: |m| {
1419            let alloc = input_scalar(m, "allocated_bytes")?;
1420            let dealloc = input_scalar(m, "deallocated_bytes")?;
1421            Some(DerivedValue::Scalar(alloc - dealloc))
1422        },
1423        section: Section::Derived,
1424    },
1425    DerivedMetricDef {
1426        name: "avg_iowait_ns",
1427        ladder: ScaleLadder::Ns,
1428        description: "Average iowait interval per iowait event: iowait_sum / iowait_count (ns/event).",
1429        inputs: &["iowait_sum", "iowait_count"],
1430        is_ratio: false,
1431        compute: |m| ratio_compute(m, "iowait_sum", "iowait_count"),
1432        section: Section::Derived,
1433    },
1434    // -- taskstats per-category averages (delay_total / count) --
1435    //
1436    // One average per delay-accounting category. Same shape as
1437    // avg_wait_ns / avg_iowait_ns above (sum-over-count quotient,
1438    // ns ladder, non-ratio). The category-specific caveats from
1439    // the registry (cpu RACY, swapin/thrashing OVERLAP, sentinel
1440    // semantics) carry forward into the description so an operator
1441    // reading `metric-list` for the derived row sees the same
1442    // gating discipline they get for the raw count/total fields.
1443    DerivedMetricDef {
1444        name: "avg_cpu_delay_ns",
1445        ladder: ScaleLadder::Ns,
1446        description: "Average CPU-wait per scheduling event: cpu_delay_total_ns / cpu_delay_count (ns/event). RACY: the kernel updates count + total via the lockless sched_info path, so a concurrent reader may observe one ahead of the other; the quotient is approximate at the sub-event scale and stable at the integrated scale.",
1447        inputs: &["cpu_delay_total_ns", "cpu_delay_count"],
1448        is_ratio: false,
1449        compute: |m| ratio_compute(m, "cpu_delay_total_ns", "cpu_delay_count"),
1450        section: Section::TaskstatsDelay,
1451    },
1452    DerivedMetricDef {
1453        name: "avg_blkio_delay_ns",
1454        ladder: ScaleLadder::Ns,
1455        description: "Average synchronous block-I/O wait per event: blkio_delay_total_ns / blkio_delay_count (ns/event). Distinct from avg_iowait_ns (schedstat) — this travels through the delayacct path and is the canonical delay-accounting block-I/O reading.",
1456        inputs: &["blkio_delay_total_ns", "blkio_delay_count"],
1457        is_ratio: false,
1458        compute: |m| ratio_compute(m, "blkio_delay_total_ns", "blkio_delay_count"),
1459        section: Section::TaskstatsDelay,
1460    },
1461    DerivedMetricDef {
1462        name: "avg_swapin_delay_ns",
1463        ladder: ScaleLadder::Ns,
1464        description: "Average swap-in wait per event: swapin_delay_total_ns / swapin_delay_count (ns/event). OVERLAPS with thrashing — every thrashing event is also a swapin event from the syscall layer; do not sum the two averages or the underlying totals directly.",
1465        inputs: &["swapin_delay_total_ns", "swapin_delay_count"],
1466        is_ratio: false,
1467        compute: |m| ratio_compute(m, "swapin_delay_total_ns", "swapin_delay_count"),
1468        section: Section::TaskstatsDelay,
1469    },
1470    DerivedMetricDef {
1471        name: "avg_freepages_delay_ns",
1472        ladder: ScaleLadder::Ns,
1473        description: "Average direct-reclaim wait per event: freepages_delay_total_ns / freepages_delay_count (ns/event).",
1474        inputs: &["freepages_delay_total_ns", "freepages_delay_count"],
1475        is_ratio: false,
1476        compute: |m| ratio_compute(m, "freepages_delay_total_ns", "freepages_delay_count"),
1477        section: Section::TaskstatsDelay,
1478    },
1479    DerivedMetricDef {
1480        name: "avg_thrashing_delay_ns",
1481        ladder: ScaleLadder::Ns,
1482        description: "Average thrashing wait per event: thrashing_delay_total_ns / thrashing_delay_count (ns/event). OVERLAPS with swapin (see avg_swapin_delay_ns).",
1483        inputs: &["thrashing_delay_total_ns", "thrashing_delay_count"],
1484        is_ratio: false,
1485        compute: |m| ratio_compute(m, "thrashing_delay_total_ns", "thrashing_delay_count"),
1486        section: Section::TaskstatsDelay,
1487    },
1488    DerivedMetricDef {
1489        name: "avg_compact_delay_ns",
1490        ladder: ScaleLadder::Ns,
1491        description: "Average memory-compaction wait per event: compact_delay_total_ns / compact_delay_count (ns/event).",
1492        inputs: &["compact_delay_total_ns", "compact_delay_count"],
1493        is_ratio: false,
1494        compute: |m| ratio_compute(m, "compact_delay_total_ns", "compact_delay_count"),
1495        section: Section::TaskstatsDelay,
1496    },
1497    DerivedMetricDef {
1498        name: "avg_wpcopy_delay_ns",
1499        ladder: ScaleLadder::Ns,
1500        description: "Average write-protect-copy fault wait per event: wpcopy_delay_total_ns / wpcopy_delay_count (ns/event).",
1501        inputs: &["wpcopy_delay_total_ns", "wpcopy_delay_count"],
1502        is_ratio: false,
1503        compute: |m| ratio_compute(m, "wpcopy_delay_total_ns", "wpcopy_delay_count"),
1504        section: Section::TaskstatsDelay,
1505    },
1506    DerivedMetricDef {
1507        name: "avg_irq_delay_ns",
1508        ladder: ScaleLadder::Ns,
1509        description: "Average IRQ-handler window per event: irq_delay_total_ns / irq_delay_count (ns/event).",
1510        inputs: &["irq_delay_total_ns", "irq_delay_count"],
1511        is_ratio: false,
1512        compute: |m| ratio_compute(m, "irq_delay_total_ns", "irq_delay_count"),
1513        section: Section::TaskstatsDelay,
1514    },
1515    // -- taskstats off-CPU rollup --
1516    //
1517    // Sum of every meaningful off-CPU delay category. Combines
1518    // cpu (runqueue wait), blkio (sync I/O wait), freepages
1519    // (direct reclaim), compact (compaction), wpcopy (CoW fault),
1520    // irq (IRQ-handler windows), and the LARGER of (swapin,
1521    // thrashing) — the two share the same syscall-layer event,
1522    // so summing both would double-count a thrashing-induced
1523    // swapin. `?` propagates None when any input is missing
1524    // (gating off, kernel pre-v14, etc.); `.max()` over the
1525    // overlap pair picks the dominant signal.
1526    DerivedMetricDef {
1527        name: "total_offcpu_delay_ns",
1528        ladder: ScaleLadder::Ns,
1529        description: "Sum of all off-CPU delay-accounting buckets, ns: cpu + blkio + freepages + compact + wpcopy + irq + max(swapin, thrashing). The swapin/thrashing pair is OR'd with .max() rather than summed because the two share syscall-layer events (every thrashing event is also a swapin). Returns `-` when any input is missing (CONFIG_TASK_DELAY_ACCT off, runtime toggle off, or kernel older than the bucket's introduction version).",
1530        inputs: &[
1531            "cpu_delay_total_ns",
1532            "blkio_delay_total_ns",
1533            "swapin_delay_total_ns",
1534            "freepages_delay_total_ns",
1535            "thrashing_delay_total_ns",
1536            "compact_delay_total_ns",
1537            "wpcopy_delay_total_ns",
1538            "irq_delay_total_ns",
1539        ],
1540        is_ratio: false,
1541        compute: |m| {
1542            let cpu = input_scalar(m, "cpu_delay_total_ns")?;
1543            let blkio = input_scalar(m, "blkio_delay_total_ns")?;
1544            let swapin = input_scalar(m, "swapin_delay_total_ns")?;
1545            let freepages = input_scalar(m, "freepages_delay_total_ns")?;
1546            let thrashing = input_scalar(m, "thrashing_delay_total_ns")?;
1547            let compact = input_scalar(m, "compact_delay_total_ns")?;
1548            let wpcopy = input_scalar(m, "wpcopy_delay_total_ns")?;
1549            let irq = input_scalar(m, "irq_delay_total_ns")?;
1550            let mem_overlap = swapin.max(thrashing);
1551            Some(DerivedValue::Scalar(
1552                cpu + blkio + freepages + compact + wpcopy + irq + mem_overlap,
1553            ))
1554        },
1555        section: Section::TaskstatsDelay,
1556    },
1557];
1558
1559/// Borrow the metric's bare name from the registry. The
1560/// `&'static str` lifetime piggybacks on
1561/// [`CtprofMetricDef::name`]'s static-string storage —
1562/// callers may borrow the static name without allocation;
1563/// render sites that need owned `String`s allocate at the
1564/// table-cell boundary (see `super::render` at the
1565/// `metric_display_name(metric_def).to_string()` call site
1566/// and [`super::runner::write_metric_list`]).
1567///
1568/// Companion to [`metric_tags`], which renders the bracketed
1569/// `[<class>] [<tag>] ...` suffix separately. Render sites
1570/// concatenate the two into the final display column.
1571pub fn metric_display_name(metric: &CtprofMetricDef) -> &'static str {
1572    metric.name
1573}
1574
1575/// Render a metric's bracketed gating tags as a single
1576/// space-separated string. Returns the empty string when
1577/// `sched_class` is `None`, `is_dead` is false, AND
1578/// `config_gates` is empty.
1579///
1580/// Tag emission order: `[<sched_class>]` first when
1581/// `sched_class` is `Some`, then `[dead]` when `is_dead`, then
1582/// each `config_gate` in registry-declared order. Examples:
1583/// - `nr_wakeups_affine` → `[cfs-only] [SCHEDSTATS]`
1584/// - `core_forceidle_sum` → `[SCHED_CORE] [SCHEDSTATS]`
1585/// - `fair_slice_ns` → `[fair-policy]`
1586///
1587/// Compact rendering: each `config_gate` is stripped of its
1588/// `CONFIG_` prefix before emission so the rendered cell stays
1589/// scannable in narrow tables. The data field
1590/// [`CtprofMetricDef::config_gates`] keeps the full `CONFIG_X`
1591/// spelling so an operator can grep their kconfig directly.
1592/// `sched_class` tags are rendered as-is (already short, e.g.
1593/// `[cfs-only]`, `[fair-policy]`, `[non-ext]`).
1594///
1595/// Pure formatting layer — does not interpret tag values; the
1596/// metric's own [`CtprofMetricDef::sched_class`] /
1597/// [`CtprofMetricDef::config_gates`] / [`CtprofMetricDef::is_dead`]
1598/// docs are the source of truth for what each spelling means.
1599pub fn metric_tags(metric: &CtprofMetricDef) -> String {
1600    let mut out = String::new();
1601    if let Some(class) = metric.sched_class {
1602        out.push('[');
1603        out.push_str(class);
1604        out.push(']');
1605    }
1606    if metric.is_dead {
1607        if !out.is_empty() {
1608            out.push(' ');
1609        }
1610        out.push_str("[dead]");
1611    }
1612    for gate in metric.config_gates {
1613        if !out.is_empty() {
1614            out.push(' ');
1615        }
1616        out.push('[');
1617        let short = gate.strip_prefix("CONFIG_").unwrap_or(gate);
1618        out.push_str(short);
1619        out.push(']');
1620    }
1621    out
1622}