ktstr/monitor/dump/
mod.rs

1//! BPF map state dump for scheduler-failure post-mortem.
2//!
3//! [`dump_state`] is invoked by the freeze coordinator after the vCPU
4//! rendezvous succeeds (see `src/vmm/freeze_coord/mod.rs`). It enumerates every
5//! BPF map in the guest via [`BpfMapAccessor::maps`], filters out
6//! ktstr-internal probes (the framework's own probe and fentry skel
7//! maps), and dispatches per map type:
8//!
9//! - `BPF_MAP_TYPE_ARRAY` (and the `.bss` / `.data` / `.rodata`
10//!   global-section maps libbpf creates as single-key arrays) — read
11//!   the whole value buffer and render it via [`super::btf_render::render_value_with_mem`]
12//!   so embedded `__arena` pointers chase into the captured arena pages.
13//! - `BPF_MAP_TYPE_HASH` — iterate (key, value) pairs, capped at
14//!   [`MAX_HASH_ENTRIES`].
15//! - `BPF_MAP_TYPE_PERCPU_ARRAY` — read each CPU's slot for keys
16//!   `0..min(max_entries, MAX_PERCPU_KEYS)`.
17//! - Other types — recorded as [`FailureDumpMap::error`] so the operator
18//!   sees the gap rather than a silent omission.
19//!
20//! # BTF source — per-map program BTF loading
21//!
22//! The renderer loads each map's program BTF from guest memory at
23//! [`BpfMapInfo::btf_kva`], following the kernel `struct btf`'s
24//! `data`/`data_size`/`base_btf` fields. Split BTF (program types
25//! extending vmlinux) is parsed via [`Btf::from_split_bytes`] with
26//! the host's vmlinux BTF as the base (correct when host kernel ==
27//! guest kernel — ktstr's default and the common CI configuration).
28//! A per-`btf_kva` cache dedupes parses across maps sharing a
29//! program's BTF object. When per-map load fails (still-booting
30//! guest, untranslatable page, corrupted blob), the renderer falls
31//! back to the caller-supplied vmlinux BTF.
32//!
33//! # sdt_alloc pre-pass
34//!
35//! Before the per-map walk runs, [`dump_state`] runs a pre-pass
36//! that locates `sdt_alloc`-backed allocator instances inside the
37//! scheduler's `.bss` and surfaces every live per-task / per-cgroup
38//! allocation as structured records under
39//! [`FailureDumpReport::sdt_allocations`]. The walk runs only when
40//! every prerequisite is present:
41//!   - the per-map dump deadline has not been exceeded (the
42//!     pre-pass runs before every map render, and an earlier
43//!     phase exhausting the budget skips the walk to keep the
44//!     dump bounded),
45//!   - the arena's `user_vm_start` is 4 GiB-aligned (low 32 bits
46//!     zero — the bridge's address arithmetic treats slot starts as
47//!     low-32 keys against a `[user_vm_start, user_vm_start + 4 GiB)`
48//!     window; misalignment would silently mismap chases),
49//!   - the scheduler exposes a `.bss` ARRAY map with non-zero
50//!     `btf_kva` (so we can read its raw bytes and have a program
51//!     BTF to resolve types against),
52//!   - at least one `BPF_MAP_TYPE_ARENA` map snapshot succeeded
53//!     (so we have `kern_vm_start` for arena pointer translation),
54//!   - the program BTF carries `struct scx_allocator` (the scheduler
55//!     links `lib/sdt_alloc.bpf.c`).
56//!
57//! When any prerequisite is missing, the pre-pass leaves
58//! `sdt_allocations` empty rather than failing the dump — the
59//! per-map page-granular [`super::arena::ArenaSnapshot`] still
60//! captures raw arena content for callers that don't need
61//! structured rendering. See [`super::sdt_alloc`] for the walker
62//! design.
63
64mod display;
65mod render_map;
66#[cfg(test)]
67mod tests;
68use render_map::*;
69
70use serde::{Deserialize, Serialize};
71
72use btf_rs::Btf;
73
74use super::arena::{ArenaSnapshot, BpfArenaOffsets, snapshot_arena};
75use super::bpf_map::{
76    BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_ARRAY, BpfMapAccessor, BpfMapInfo, GuestMemMapAccessor,
77};
78use super::btf_render::RenderedValue;
79use super::sdt_alloc::{
80    SdtAllocOffsets, SdtAllocatorSnapshot, discover_payload_btf_id, walk_sdt_allocator,
81};
82
/// Borrow-only capture context for per-program runtime stats
/// (cnt/nsecs/misses) populated alongside the BPF map dump.
///
/// Bundles the two inputs the capture needs: a borrowed
/// [`super::bpf_prog::BpfProgAccessor`] and the per-CPU offset array
/// obtained from [`super::symbols::read_per_cpu_offsets`].
/// [`dump_state`] calls
/// [`super::bpf_prog::BpfProgAccessor::struct_ops_runtime_stats`]
/// with the supplied offsets and stores the resulting
/// [`super::bpf_prog::ProgRuntimeStats`] vector in
/// [`FailureDumpReport::prog_runtime_stats`].
///
/// The capture is optional. Pass `None` to skip it (e.g. when the
/// accessor could not be constructed because `prog_idr` is missing
/// or the BPF prog offsets did not resolve); the dump still renders
/// every map the [`super::bpf_map::BpfMapAccessor`] enumerates.
pub struct ProgRuntimeCapture<'a> {
    /// Accessor for walking `prog_idr` and reading per-program
    /// `bpf_prog_stats` slots. Held as a trait object so the same
    /// dump site can consume either the guest-memory backend or the
    /// planned live-host backend without committing to a concrete
    /// type.
    pub accessor: &'a dyn super::bpf_prog::BpfProgAccessor,
    /// Per-CPU offset array (`__per_cpu_offset[cpu]`, indexed by CPU)
    /// used to address each CPU's `bpf_prog_stats` slot for
    /// summation.
    pub per_cpu_offsets: &'a [u64],
}
108
/// Borrow-only capture context for per-CPU CPU-time / softirq / IRQ
/// counters populated alongside the BPF map dump.
///
/// Carries the BTF-resolved field offsets for `kernel_cpustat`,
/// `kernel_stat`, and `tick_sched`, the resolved `.data..percpu`
/// section offsets of the three per-CPU symbols, and the
/// `__per_cpu_offset[cpu]` array used to address each CPU's slot.
///
/// [`dump_state`] reads each CPU's slot via direct guest-memory
/// reads against the supplied [`super::reader::GuestMem`] and
/// records the result into [`FailureDumpReport::per_cpu_time`].
/// Mirrors [`ProgRuntimeCapture`]'s "borrowed-only, optional"
/// shape — when `None`, the dump skips the per-CPU time capture
/// and leaves the field empty.
///
/// The capture is skipped silently in any of these cases:
///   - the resolver could not locate any of the three per-CPU
///     symbols (stripped vmlinux),
///   - the BTF offsets are not present (CPU-time accounting types
///     missing),
///   - `__per_cpu_offset` resolution returned an empty array.
///
/// The capture is best-effort diagnostic data; its absence does not
/// fail the dump.
pub struct CpuTimeCapture<'a> {
    /// Guest memory handle used to read each per-CPU slot.
    pub mem: &'a super::reader::GuestMem,
    /// BTF-resolved offsets for `kernel_cpustat::cpustat[]`,
    /// `kernel_stat::softirqs[]`, `kernel_stat::irqs_sum`, and
    /// optionally `tick_sched::iowait_sleeptime`.
    pub offsets: &'a super::btf_offsets::CpuTimeOffsets,
    /// Link-time KVA of the `kernel_cpustat` per-CPU symbol (the
    /// value `st_value` carries in the vmlinux symbol table — the
    /// template address the linker assigned), not a runtime
    /// address. The runtime KVA on CPU `cpu` is
    /// [`super::symbols::per_cpu_kva`]`(kernel_cpustat_kva,
    /// kaslr_offset, per_cpu_offsets[cpu])`.
    pub kernel_cpustat_kva: u64,
    /// Link-time KVA of the `kstat` per-CPU symbol. See
    /// `kernel_cpustat_kva` for the runtime KVA formula.
    pub kstat_kva: u64,
    /// Link-time KVA of the `tick_cpu_sched` per-CPU symbol.
    /// `None` when the kernel was built without
    /// `CONFIG_NO_HZ_COMMON`; in that case the iowait_sleeptime
    /// capture is skipped.
    pub tick_cpu_sched_kva: Option<u64>,
    /// Per-CPU offset array (`__per_cpu_offset[cpu]`) — same array
    /// the BPF prog-stats walker uses (see
    /// [`super::symbols::read_per_cpu_offsets`]). Its length
    /// determines how many CPUs the walker visits.
    pub per_cpu_offsets: &'a [u64],
    /// Guest's `PAGE_OFFSET` (resolved via
    /// [`super::symbols::resolve_page_offset`]). Used to translate
    /// each CPU's per-CPU KVA to a guest physical address for the
    /// memory read.
    pub page_offset: u64,
    /// Virtual KASLR offset that per-CPU KVA derivation needs to
    /// bridge the link-time (`__per_cpu_start_LINK`) and runtime
    /// (`__per_cpu_start_RUNTIME`) bases.
    ///
    /// Sourced from the shared `kern_virt_kaslr` Arc populated by
    /// either the BSP MSR_LSTAR derive
    /// (`crate::vmm::x86_64::msr_kaslr::read_and_derive`,
    /// x86_64-only) or the guest-channel KERN_ADDRS `_text`
    /// subtraction (`crate::vmm::freeze_coord::dispatch`, both
    /// arches).
    ///
    /// A value of 0 is the fallback: it matches KASLR-off /
    /// nokaslr-karg semantics and collapses
    /// [`super::symbols::per_cpu_kva`] to the no-slide formula. On
    /// aarch64 without `_text` in /proc/kallsyms (kptr_restrict
    /// masked) the value stays 0 and per-CPU resolution relies on
    /// the `nokaslr` karg (`src/vmm/setup.rs`) instead.
    pub kaslr_offset: u64,
}
176
/// Borrow-only capture context for the per-cgroup PSI-irq host walk
/// (Phase A).
///
/// Borrowed-only/optional, mirroring [`CpuTimeCapture`]: the freeze
/// coordinator builds it only when the cgroup-walk offsets, the
/// `cgrp_dfl_root` symbol, and the `psi_group` offsets all resolve;
/// otherwise [`DumpContext::cgroup_psi_capture`] is `None` and the
/// per-cgroup axis reads loud-absent.
///
/// The walk descends `cgrp_dfl_root` → the host-held workload-root
/// path → leaf cgroups and reads each leaf's `cgroup->psi`
/// PSI_IRQ_FULL (see [`super::cgroup_walk`]).
pub struct CgroupPsiCapture<'a> {
    /// Guest memory handle used to read the cgroup hierarchy + each psi_group.
    pub mem: &'a super::reader::GuestMem,
    /// BTF-resolved cgroup-hierarchy field offsets (`cgroup.{self,kn,psi}`,
    /// `cgroup_subsys_state.{sibling,children}`, `cgroup_root.cgrp`,
    /// `kernfs_node.name`).
    pub cgroup_offsets: &'a super::btf_offsets::CgroupWalkOffsets,
    /// BTF-resolved `psi_group` offsets (shared with the system-wide walk —
    /// a per-cgroup psi_group is the same `struct psi_group`).
    pub psi_offsets: &'a super::btf_offsets::PsiGroupOffsets,
    /// RUNTIME KVA of the hierarchy root cgroup (`cgrp_dfl_root +
    /// offsetof(cgroup_root, cgrp)`, with any virtual-KASLR slide applied by
    /// the caller) — used for the children-list anchor compare at the root
    /// level. Every descendant is a direct-mapped slab object, so only the
    /// root needs this separately supplied address.
    pub root_cgroup_kva: u64,
    /// Guest physical address of the hierarchy root cgroup (the kernel-image
    /// translation of the link-time root KVA, done by the caller via
    /// `GuestKernel::text_kva_to_pa`) — the walk's entry read.
    pub root_cgroup_pa: u64,
    /// The test's workload-root cgroup path (host-held VM config, default
    /// `/sys/fs/cgroup/ktstr`). The walk descends ONLY this subtree, so the
    /// scheduler's separate cgroup does not confound the per-cgroup axis.
    pub workload_root_path: &'a str,
    /// Guest `PAGE_OFFSET` for the direct-map (`kva_to_pa`) hops to every
    /// descendant cgroup / kernfs_node / psi_group.
    pub page_offset: u64,
}
211
/// Borrow-only capture context for per-task enrichment.
///
/// Carries everything the enrichment walk reads:
/// - the [`super::guest::GuestKernel`] (guest memory + symbol
///   table),
/// - the BTF-resolved task/signal/pid/upid offsets,
/// - the cached sched_class symbol KVAs (for class-name decode and
///   the PI-boost-out-of-SCX flag),
/// - the lock-slowpath symbol cache (for stack-trace pattern
///   matching),
/// - the task list itself — a pre-collected `&[TaskWalkerEntry]`
///   produced by a task walker (rq->scx walk, DSQ walk,
///   init_task→tasks enumeration).
///
/// Mirrors the [`ProgRuntimeCapture`] / [`CpuTimeCapture`]
/// borrowed-only-optional shape. When `dump_state` receives
/// `Some(TaskEnrichmentCapture)`, it iterates `tasks` and calls
/// [`super::task_enrichment::walk_task_enrichment`] for each entry,
/// pushing results into [`FailureDumpReport::task_enrichments`]. When
/// `None`, the field stays empty and
/// [`FailureDumpReport::task_enrichments_unavailable`] gets a
/// "no task walker available" diagnostic.
///
/// The walker producer (rq->scx walker etc.) is responsible for
/// building this struct. Until walker dispatch lands, no walker
/// exists; the freeze coordinator passes `None` and the field is
/// plumbed but empty.
pub struct TaskEnrichmentCapture<'a> {
    /// Borrowed GuestKernel — provides memory access, page-table
    /// translation context, and the vmlinux symbol table.
    pub kernel: &'a super::guest::GuestKernel,
    /// BTF-resolved offsets for the task/signal/pid/upid walk.
    pub offsets: &'a super::btf_offsets::TaskEnrichmentOffsets,
    /// Cached sched_class symbol KVAs for class decode + PI-boost
    /// flag.
    pub sched_classes: &'a super::task_enrichment::SchedClassRegistry,
    /// Cached lock-slowpath symbol KVAs for stack-PC pattern
    /// matching.
    pub lock_slowpaths: &'a super::task_enrichment::LockSlowpathRegistry,
    /// Tasks the walker discovered, each paired with the per-task
    /// metadata `walk_task_enrichment` needs (see
    /// [`TaskWalkerEntry`]).
    pub tasks: &'a [TaskWalkerEntry],
}
251
/// One entry produced by a task walker (rq->scx, DSQ, etc.) for the
/// enrichment capture pipeline.
///
/// Each task walker discovers task KVAs by traversing the kernel's
/// own scheduling data structures. Besides the KVA, the walker also
/// knows two signals that are recorded here:
/// - whether the task was reachable via `rq->scx.runnable_list`
///   (used for the PI-boost-out-of-SCX flag), and
/// - which vCPU's instruction pointer matches the running task
///   (used for the lock-slowpath stack matcher).
///
/// Capturing those signals at the walker site keeps the enrichment
/// surface side-effect free — `walk_task_enrichment` only reads
/// guest memory; it does not perform discovery itself.
#[derive(Debug, Clone, Copy)]
pub struct TaskWalkerEntry {
    /// Kernel virtual address of the `task_struct`.
    pub task_kva: u64,
    /// True iff the task was reached via `rq->scx.runnable_list`.
    /// Required for the PI-boost-out-of-SCX flag — see
    /// [`super::task_enrichment::TaskEnrichment::pi_boosted_out_of_scx`].
    pub is_runnable_in_scx: bool,
    /// Optional instruction pointer for the lock-slowpath stack
    /// matcher. Pass the corresponding vCPU's
    /// [`VcpuRegSnapshot::instruction_pointer`] when this task was
    /// running on that vCPU at freeze time; pass `None` for tasks
    /// not actively running.
    pub running_pc: Option<u64>,
}
278
/// Per-CPU CPU-time / softirq / IRQ snapshot captured at freeze
/// time. One entry per CPU index visible to the host walker.
///
/// All counter fields are monotonic in the kernel — the freeze
/// captures the instantaneous value at the moment the vCPUs
/// rendezvous-park. Diffing two snapshots (or comparing against a
/// pre-test baseline) is the consumer's job; this type does not
/// derive deltas.
///
/// Field semantics match the kernel sources:
///   - `cpustat_*_ns`: ns counter from
///     `kernel_cpustat::cpustat[CPUTIME_*]`. Updated by
///     `account_user_time` / `account_system_index_time` and
///     siblings (`kernel/sched/cputime.c`). The kernel stores
///     nanoseconds; `/proc/stat` reports the same counters
///     converted to clock ticks.
///   - `softirqs[i]`: `kernel_stat::softirqs[i]` cumulative count
///     incremented by `kstat_incr_softirqs_this_cpu` on every
///     softirq raise. Indexed by `super::btf_offsets::SOFTIRQ_NAMES`.
///   - `irqs_sum`: `kernel_stat::irqs_sum` cumulative count
///     incremented by `kstat_incr_irq_this_cpu` on every hardirq.
///   - `iowait_sleeptime_ns`: `tick_sched::iowait_sleeptime`
///     accumulated only under NO_HZ when the CPU enters idle with
///     `nr_iowait > 0`. `None` when CONFIG_NO_HZ_COMMON is off or
///     the resolver couldn't locate `tick_cpu_sched`.
///
/// On the wire, `iowait_sleeptime_ns` is omitted when `None` and
/// defaults to `None` when absent on deserialization (see the
/// `serde` attribute on the field).
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct PerCpuTimeStats {
    /// CPU index (0-based) this entry describes.
    pub cpu: u32,
    /// `cpustat[CPUTIME_USER]` (ns).
    pub cpustat_user_ns: u64,
    /// `cpustat[CPUTIME_NICE]` (ns).
    pub cpustat_nice_ns: u64,
    /// `cpustat[CPUTIME_SYSTEM]` (ns).
    pub cpustat_system_ns: u64,
    /// `cpustat[CPUTIME_SOFTIRQ]` (ns).
    pub cpustat_softirq_ns: u64,
    /// `cpustat[CPUTIME_IRQ]` (ns).
    pub cpustat_irq_ns: u64,
    /// `cpustat[CPUTIME_IDLE]` (ns).
    pub cpustat_idle_ns: u64,
    /// `cpustat[CPUTIME_IOWAIT]` (ns).
    pub cpustat_iowait_ns: u64,
    /// `cpustat[CPUTIME_STEAL]` (ns).
    pub cpustat_steal_ns: u64,
    /// `kernel_stat::softirqs[]` per-vector cumulative counts.
    /// Indexed by `super::btf_offsets::SOFTIRQ_NAMES`.
    pub softirqs: [u64; super::btf_offsets::NR_SOFTIRQS],
    /// `kernel_stat::irqs_sum` cumulative hardirq count.
    pub irqs_sum: u64,
    /// `tick_sched::iowait_sleeptime` accumulated NO_HZ idle time
    /// with outstanding IO (ns). `None` when NO_HZ disabled or
    /// `tick_cpu_sched` symbol was absent at resolve time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub iowait_sleeptime_ns: Option<u64>,
}
335
/// Per-node NUMA event counters captured from
/// `pglist_data->node_zones[]->vm_numa_event[]` at freeze time.
///
/// Each row holds the NUMA event counters of a single node, summed
/// across all zones on that node. The six counters mirror the
/// kernel's `enum numa_stat_item` (see
/// `super::btf_offsets::NUMA_HIT` etc. for the enum-stable
/// indices). All counters are monotonic-since-boot; consumers diff
/// against a baseline (or against another node's row) to extract
/// the test-window delta.
///
/// Diagnostic value for sched_ext stalls is informational only —
/// the NUMA balancer is not active for ext tasks. The rows
/// surface here so an operator triaging a NUMA-aware workload
/// (e.g. a memory-tiering test) can verify the kernel actually
/// observed the expected node-locality distribution.
///
/// **Live walker status:** the wire shape, BTF offsets
/// (`super::btf_offsets::NumaStatsOffsets`), and report field
/// are wired through. The actual host-side walker that resolves
/// `node_data[]` and reads per-zone counters is pending; until it
/// lands, the report's [`FailureDumpReport::per_node_numa`] vec
/// stays empty and
/// [`FailureDumpReport::per_node_numa_unavailable`] carries the
/// `"no NUMA walker"` reason.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct PerNodeNumaStats {
    /// NUMA node id this row describes.
    pub node: u32,
    /// `vm_numa_event[NUMA_HIT]` summed across zones — allocations
    /// that were intended for this node and were satisfied from it.
    pub numa_hit: u64,
    /// `vm_numa_event[NUMA_MISS]` — allocations satisfied from this
    /// node although a different node was the preferred one.
    pub numa_miss: u64,
    /// `vm_numa_event[NUMA_FOREIGN]` — allocations that preferred
    /// this node but were satisfied from a different node.
    pub numa_foreign: u64,
    /// `vm_numa_event[NUMA_INTERLEAVE_HIT]` — interleave policy
    /// allocations that were intended for this node and landed on
    /// it.
    pub numa_interleave_hit: u64,
    /// `vm_numa_event[NUMA_LOCAL]` — allocations on this node by
    /// processes running on this node.
    pub numa_local: u64,
    /// `vm_numa_event[NUMA_OTHER]` — allocations on this node by
    /// processes running on a different node.
    pub numa_other: u64,
}
384
/// Reason string written into [`FailureDumpReport::per_node_numa_unavailable`]
/// while the per-node NUMA walker has not landed yet.
///
/// Kept distinct from other unavailable reasons so a downstream
/// consumer can tell "walker not implemented" apart from "walker ran
/// and produced no data" once the live producer ships.
pub const REASON_NO_NUMA_WALKER: &str = "no NUMA walker (host-side walker pending)";
391
/// Borrow-only capture context for the per-sample SCX event counter
/// timeline.
///
/// The freeze coordinator forwards the monitor sampler's accumulated
/// `super::MonitorSample` vec via [`Self::samples`]; the dump path
/// folds each sample's per-CPU `super::ScxEventCounters` into a
/// single cross-CPU sum and produces one [`EventCounterSample`] per
/// monitor tick (see [`EventCounterSample::from_monitor_sample`]).
///
/// `None` skips the timeline capture; the dump still renders the
/// rest of the report. Mirrors [`ProgRuntimeCapture`] /
/// [`CpuTimeCapture`]'s "borrowed-only, optional" shape.
pub struct EventCounterCapture<'a> {
    /// Periodic monitor samples gathered between VM start and the
    /// freeze trigger. Each sample carries per-CPU
    /// `super::ScxEventCounters` when scx event-stat offsets
    /// resolved; the dump folder skips samples whose CPUs all
    /// reported `event_counters: None`.
    pub samples: &'a [super::MonitorSample],
}
412
/// Borrow-only capture context for the rq->scx + DSQ walkers.
/// Mirrors [`TaskEnrichmentCapture`] / [`CpuTimeCapture`] shape —
/// `dump_state` consumes everything by reference.
///
/// Carries:
/// - `kernel`: GuestKernel handle for guest-memory reads
///   (PTE walks, symbol resolution).
/// - `offsets`: BTF-resolved
///   [`super::btf_offsets::ScxWalkerOffsets`] covering scx_rq,
///   scx_sched, scx_sched_pcpu, scx_sched_pnode, scx_dispatch_q,
///   sched_ext_entity, scx_dsq_list_node, rhashtable, bucket_table,
///   rhash_head.
/// - `scx_root_kva`: kernel-text-mapped pointer the walker
///   dereferences to find the active `scx_sched`.
/// - `rq_kvas` / `rq_pas`: per-CPU rq KVA + PA arrays; same vecs
///   the runnable_at scanner uses.
/// - `per_cpu_offsets`: `__per_cpu_offset[]` array — needed for
///   per-CPU bypass DSQ resolution.
/// - `nr_nodes`: NUMA node count, for the per-node global-DSQ
///   walk. Pass `1` on UMA / unknown configurations; the walker
///   gracefully skips slots whose pnode pointers are NULL.
///
/// The capture is optional: when `None` is passed in
/// [`DumpContext::scx_walker_capture`], the dump emits empty
/// `rq_scx_states` / `dsq_states` and records
/// `scx_walker_unavailable` with a diagnostic reason.
pub struct ScxWalkerCapture<'a> {
    /// Borrowed GuestKernel — provides memory access, page-table
    /// translation context, and the vmlinux symbol table.
    pub kernel: &'a super::guest::GuestKernel,
    /// BTF-resolved offsets for the scx walker.
    pub offsets: &'a super::btf_offsets::ScxWalkerOffsets,
    /// `scx_root` symbol KVA (resolved via vmlinux ELF symtab).
    /// The walker reads `*scx_root` to find the active scx_sched.
    pub scx_root_kva: u64,
    /// Per-CPU rq kernel virtual addresses (one per CPU).
    pub rq_kvas: &'a [u64],
    /// Per-CPU rq guest physical addresses (parallel to `rq_kvas`:
    /// entry `i` of both slices describes the same CPU's rq).
    pub rq_pas: &'a [u64],
    /// `__per_cpu_offset[]` array, used to address each CPU's
    /// `scx_sched_pcpu.bypass_dsq`.
    pub per_cpu_offsets: &'a [u64],
    /// NUMA node count for the per-node global-DSQ walk. Pass `1`
    /// on UMA / unknown configurations.
    pub nr_nodes: u32,
}
458
/// One per-monitor-tick snapshot of the 13 SCX_EV_* event counters
/// summed across every CPU at that tick.
///
/// The kernel stores per-CPU `s64` counters in `scx_sched_pcpu`
/// (kernel/sched/ext.c); the monitor sampler reads them at every
/// tick and stores per-CPU `event_counters` on each
/// `super::CpuSnapshot`. The dump path sums across CPUs into the
/// fields here so a downstream consumer can render the run's
/// counter timeline (sparkline, delta plot, ...) without
/// re-iterating the per-CPU vec.
///
/// Field semantics match `super::ScxEventCounters` one-to-one — see
/// that struct's per-field doc for kernel-source provenance. Every
/// counter field reuses the name of the per-CPU counter it sums (no
/// `total_` prefix), even though each value is a cross-CPU
/// aggregate; the granularity is per-tick, not per-window as in
/// `super::ScxEventDeltas`.
///
/// Sums are built with saturating addition (see
/// [`EventCounterSample::from_monitor_sample`]), so a field pinned
/// at `i64::MIN` / `i64::MAX` indicates an overflowed sum rather
/// than a genuine counter value.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct EventCounterSample {
    /// Milliseconds since VM start (mirrors
    /// `super::MonitorSample::elapsed_ms`). Zero on the first
    /// sample.
    pub elapsed_ms: u64,
    /// Sum of `select_cpu_fallback` across all CPUs at this tick.
    pub select_cpu_fallback: i64,
    /// Sum of `dispatch_local_dsq_offline` across all CPUs.
    pub dispatch_local_dsq_offline: i64,
    /// Sum of `dispatch_keep_last` across all CPUs.
    pub dispatch_keep_last: i64,
    /// Sum of `enq_skip_exiting` across all CPUs.
    pub enq_skip_exiting: i64,
    /// Sum of `enq_skip_migration_disabled` across all CPUs.
    pub enq_skip_migration_disabled: i64,
    /// Sum of `reenq_immed` across all CPUs.
    pub reenq_immed: i64,
    /// Sum of `reenq_local_repeat` across all CPUs.
    pub reenq_local_repeat: i64,
    /// Sum of `refill_slice_dfl` across all CPUs.
    pub refill_slice_dfl: i64,
    /// Sum of `bypass_duration` across all CPUs (ns).
    pub bypass_duration: i64,
    /// Sum of `bypass_dispatch` across all CPUs.
    pub bypass_dispatch: i64,
    /// Sum of `bypass_activate` across all CPUs.
    pub bypass_activate: i64,
    /// Sum of `insert_not_owned` across all CPUs.
    pub insert_not_owned: i64,
    /// Sum of `sub_bypass_dispatch` across all CPUs.
    pub sub_bypass_dispatch: i64,
}
509
510impl EventCounterSample {
511    /// Construct from a `super::MonitorSample` by summing every
512    /// CPU's `super::ScxEventCounters`. CPUs whose
513    /// `event_counters` is `None` (event-stat offsets unresolved)
514    /// contribute 0 to every field.
515    ///
516    /// Returns `None` when no CPU on the sample reported event
517    /// counters — propagating that to the timeline would emit a
518    /// row of all zeros that's indistinguishable from a real
519    /// "every counter at zero" tick. Callers filter `None` out.
520    pub fn from_monitor_sample(sample: &super::MonitorSample) -> Option<Self> {
521        let mut any = false;
522        let mut out = Self {
523            elapsed_ms: sample.elapsed_ms,
524            ..Self::default()
525        };
526        for cpu in &sample.cpus {
527            if let Some(ev) = &cpu.event_counters {
528                any = true;
529                // Per-CPU SCX event counters are s64 in the kernel
530                // and originate from BPF map reads of guest memory.
531                // A corrupt counter could trip i64 addition overflow
532                // when summed across many CPUs; saturating_add pins
533                // the sum at i64::{MIN,MAX} rather than panicking
534                // (debug) or wrapping (release) into a misleading
535                // value.
536                out.select_cpu_fallback = out
537                    .select_cpu_fallback
538                    .saturating_add(ev.select_cpu_fallback);
539                out.dispatch_local_dsq_offline = out
540                    .dispatch_local_dsq_offline
541                    .saturating_add(ev.dispatch_local_dsq_offline);
542                out.dispatch_keep_last =
543                    out.dispatch_keep_last.saturating_add(ev.dispatch_keep_last);
544                out.enq_skip_exiting = out.enq_skip_exiting.saturating_add(ev.enq_skip_exiting);
545                out.enq_skip_migration_disabled = out
546                    .enq_skip_migration_disabled
547                    .saturating_add(ev.enq_skip_migration_disabled);
548                out.reenq_immed = out.reenq_immed.saturating_add(ev.reenq_immed);
549                out.reenq_local_repeat =
550                    out.reenq_local_repeat.saturating_add(ev.reenq_local_repeat);
551                out.refill_slice_dfl = out.refill_slice_dfl.saturating_add(ev.refill_slice_dfl);
552                out.bypass_duration = out.bypass_duration.saturating_add(ev.bypass_duration);
553                out.bypass_dispatch = out.bypass_dispatch.saturating_add(ev.bypass_dispatch);
554                out.bypass_activate = out.bypass_activate.saturating_add(ev.bypass_activate);
555                out.insert_not_owned = out.insert_not_owned.saturating_add(ev.insert_not_owned);
556                out.sub_bypass_dispatch = out
557                    .sub_bypass_dispatch
558                    .saturating_add(ev.sub_bypass_dispatch);
559            }
560        }
561        if any { Some(out) } else { None }
562    }
563}
564
/// Render a u64 counter series as a 1-line UTF-8 sparkline.
///
/// Maps each value into one of 8 unicode block-element glyphs
/// (`▁▂▃▄▅▆▇█`) by min-max scaling: the series minimum renders as
/// the lowest glyph, the maximum as the highest, and values in
/// between are placed linearly (rounded down).
///
/// Special cases:
/// - Empty input renders as the empty string.
/// - A constant non-zero series renders as repeated mid-tier glyphs
///   (matches the "no variation" reading in the data, not as
///   misleading monotonic up-bars).
/// - A constant zero series renders as repeated lowest glyphs.
///
/// The scaling is safe for the full `u64` range: the intermediate
/// product is computed in `u128`, so a series such as
/// `[0, u64::MAX]` neither panics (debug) nor wraps into a wrong
/// glyph (release).
///
/// Used by the `Display` impl for the event-counter timeline. Pure
/// helper — no allocation outside the returned `String`.
pub fn render_sparkline(values: &[u64]) -> String {
    const GLYPHS: [char; 8] = ['▁', '▂', '▃', '▄', '▅', '▆', '▇', '█'];
    // Every glyph above lies in U+2581..=U+2588 and therefore encodes
    // to exactly 3 bytes in UTF-8.
    const GLYPH_UTF8_LEN: usize = 3;

    let (min, max) = match (values.iter().min(), values.iter().max()) {
        (Some(&lo), Some(&hi)) => (lo, hi),
        // `min`/`max` only return `None` for an empty slice.
        _ => return String::new(),
    };

    let top = GLYPHS.len() - 1;
    let mut line = String::with_capacity(values.len().saturating_mul(GLYPH_UTF8_LEN));

    if min == max {
        // No variation: min-max scaling would divide by zero, so pick
        // a fixed glyph instead.
        let glyph = if max == 0 {
            GLYPHS[0]
        } else {
            GLYPHS[GLYPHS.len() / 2]
        };
        line.extend(std::iter::repeat(glyph).take(values.len()));
        return line;
    }

    // Linear scale [min, max] → [0, GLYPHS.len()-1] using integer
    // math. The product `(v - min) * top` can exceed u64::MAX when
    // the span is large, so it is widened to u128 before multiplying.
    let span = u128::from(max - min);
    line.extend(values.iter().map(|&v| {
        let scaled = u128::from(v - min) * top as u128 / span;
        // `v - min <= span`, so `scaled <= top`; the clamp is purely
        // defensive and keeps the index provably in bounds.
        let idx = usize::try_from(scaled).map_or(top, |i| i.min(top));
        GLYPHS[idx]
    }));
    line
}
607
608/// Saturating-cast wrapper around [`render_sparkline`] for signed
609/// (i64) counter series. Negative values clamp to 0; the kernel
610/// stores SCX_EV_* as `s64` but every counter is non-negative in
611/// practice, so the saturation only fires on a corrupt read.
612pub fn render_sparkline_i64(values: &[i64]) -> String {
613    let widened: Vec<u64> = values.iter().map(|&v| v.max(0) as u64).collect();
614    render_sparkline(&widened)
615}
616
617/// Snapshot of one vCPU's instruction-pointer / stack-pointer / page-
618/// table-root at freeze time. Re-export of the freeze-side type so
619/// dump consumers don't have to depend on `vmm::exit_dispatch`
620/// internals.
621pub use crate::vmm::exit_dispatch::VcpuRegSnapshot;
622
/// Schema discriminant value emitted in `FailureDumpReport.schema`.
///
/// Consumers that read a `.failure-dump.json` file use the `schema`
/// field's value to choose between [`FailureDumpReport`],
/// [`DualFailureDumpReport`], and [`DegradedFailureDumpReport`]
/// before attempting deserialization. The
/// [`FailureDumpReportAny::from_json`] dispatcher handles this
/// routing for in-process consumers.
///
/// Values are stable wire constants — extending the dump pipeline
/// with a new shape adds a new constant rather than changing this
/// one.
pub const SCHEMA_SINGLE: &str = "single";

/// Schema discriminant value emitted in `DualFailureDumpReport.schema`.
/// See [`SCHEMA_SINGLE`] for the discriminant contract.
pub const SCHEMA_DUAL: &str = "dual";

/// Schema discriminant value emitted in `DegradedFailureDumpReport.schema`.
///
/// Carried by failure-dumps the freeze coordinator was unable to
/// capture as a full [`SCHEMA_SINGLE`] / [`SCHEMA_DUAL`] report — the
/// trigger fired but rendezvous, gate cross-reference, or KVA
/// translation aborted the dump path. Per the wire-format contract on
/// [`SCHEMA_SINGLE`], degraded dumps are a stable variant added by
/// new constant, not a mutation of the existing two.
pub const SCHEMA_DEGRADED: &str = "degraded";
649
/// Reason recorded in [`DegradedFailureDumpReport::reason`] when the
/// freeze coordinator's vCPU rendezvous timed out before every
/// parked acknowledgement arrived.
///
/// Wire-format-stable: it follows the same operator-grep contract as
/// every other `REASON_*` constant in this module. Dynamic detail is
/// appended at emit time (`<timeout_ms>` / `<parked>` /
/// `<expected>`), so an operator can see which vCPUs stalled without
/// needing a separate field.
pub const REASON_DEGRADED_RENDEZVOUS_TIMEOUT: &str =
    "vCPU rendezvous timed out before parked acknowledgement";

/// Reason recorded in [`DegradedFailureDumpReport::reason`] when the
/// host kill signal flipped during the vCPU freeze rendezvous and
/// short-circuited the wait before every parked acknowledgement
/// arrived.
///
/// Kept separate from [`REASON_DEGRADED_RENDEZVOUS_TIMEOUT`], which
/// fires only when the 30s deadline expires. A kill-mid-rendezvous
/// typically lands in the rendezvous loop within milliseconds, so
/// the `elapsed_ms` in the dynamic detail (appended at emit time)
/// reads as a small number; labelling that case "timed out" would be
/// internally contradictory.
///
/// The kill sources are SCHED_EXIT propagation from a vCPU thread,
/// watchdog hard-deadline expiry, and panic-hook flips.
///
/// Wire-format-stable: it follows the same operator-grep contract as
/// every other `REASON_*` constant.
pub const REASON_DEGRADED_KILL_DURING_RENDEZVOUS: &str =
    "vCPU rendezvous aborted by external kill before parked acknowledgement";
675
/// Tag for the degraded JSON written when the early-snapshot trigger
/// fires but
/// `freeze_and_dispatch(FreezeMode::Capture { gate_on_exit_kind: false })`
/// returns `Degraded` (early-half rendezvous timeout).
///
/// The freeze coordinator names a sibling path from this tag via
/// `super::super::vmm::freeze_coord::snapshot_tagged_path` and writes
/// the degraded JSON there, so the main `{stem}.failure-dump.json`
/// stays available for the subsequent late-trigger emission. The tag
/// is an operator-readable wire-format constant: kebab-case, stable
/// across releases.
///
/// MAINTENANCE: a new `SNAPSHOT_TAG_*` const has to be added in BOTH
/// (a) the `ALL_SNAPSHOT_TAGS` slice below AND (b) the `expected`
/// hand-list in
/// `all_snapshot_tags_enumerates_every_pub_const_in_module` in
/// `src/monitor/dump/tests.rs`. That pinning test detects the slice
/// and the hand-list drifting apart; it does NOT detect a const that
/// was added to neither, because both arrays then keep the same
/// length.
pub const SNAPSHOT_TAG_EARLY_DEGRADED: &str = "early-degraded";

/// Tag for the early snapshot when dual-snapshot mode held a
/// Captured early snapshot AND the late-trigger path returned
/// `Degraded`.
///
/// The early snapshot goes to a sibling path carrying this tag,
/// while the late degraded JSON takes the main dump path. The tag
/// separates "early itself degraded"
/// ([`SNAPSHOT_TAG_EARLY_DEGRADED`]) from "early captured, late
/// degraded" (this tag), so an operator browsing the dump directory
/// can tell which case produced which file. Every captured snapshot
/// reaches disk.
pub const SNAPSHOT_TAG_EARLY_PRE_LATE_DEGRADED: &str = "early-pre-late-degraded";

/// Tag for the early snapshot when dual-snapshot mode held a
/// Captured early snapshot AND the late-trigger path ran AND
/// returned `Suppressed` (the gate examined `*scx_root->exit_kind`,
/// found it below SCX_EXIT_ERROR, and decided that no failure dump
/// was warranted).
///
/// Differs from [`SNAPSHOT_TAG_EARLY_ONLY_LATE_NEVER_FIRED`]: here
/// the late trigger DID fire and the gate explicitly decided the
/// exit was clean. Operator triage: the scheduler recovered from the
/// early stall and reached a clean shutdown via the SCX_EXIT_NONE /
/// SCX_EXIT_DONE path. The early observation (runnable-age spike) is
/// independently meaningful and reaches disk at the tagged sibling.
///
/// Symmetric with [`SNAPSHOT_TAG_EARLY_PRE_LATE_DEGRADED`]: the
/// snapshot lands at a tagged sibling rather than the main path, so
/// the main `{stem}.failure-dump.json` keeps its "scheduler had a
/// failure-class exit" semantic.
pub const SNAPSHOT_TAG_EARLY_ONLY_LATE_SUPPRESSED: &str = "early-only-late-suppressed";

/// Tag for the early snapshot when dual-snapshot mode held a
/// Captured early snapshot AND the late-trigger path NEVER FIRED
/// during the run (no `err_exit_detected` BPF latch flip; the
/// scheduler never reached an error-class late event).
///
/// Differs from [`SNAPSHOT_TAG_EARLY_ONLY_LATE_SUPPRESSED`]: here
/// the late trigger never ran at all, as opposed to running and
/// deciding the exit was clean. Operator triage: the scheduler
/// crossed the half-watchdog runnable-age threshold (the early
/// trigger fired) but then either recovered or terminated before
/// reaching the late-trigger path; `freeze_state` stayed at `Idle`
/// or `TookEarly` through coord exit. The end-of-coord drain writes
/// the early observation to the tagged sibling instead of letting it
/// drop with the closure.
pub const SNAPSHOT_TAG_EARLY_ONLY_LATE_NEVER_FIRED: &str = "early-only-late-never-fired";

/// Canonical list of every `SNAPSHOT_TAG_*` constant in this module.
///
/// Tests that negative-scan tag locations (e.g. asserting that no
/// file landed at any wrong tag) iterate this slice instead of
/// hardcoding the 4-element list. A new `SNAPSHOT_TAG_*` constant
/// must be added here and to the `expected` hand-list of the pinning
/// test `all_snapshot_tags_enumerates_every_pub_const_in_module` in
/// `src/monitor/dump/tests.rs`. That test fails while the two lists
/// disagree, flagging the inconsistency at test time; as the
/// MAINTENANCE note on [`SNAPSHOT_TAG_EARLY_DEGRADED`] points out, it
/// cannot notice a constant that is missing from both.
///
/// Order: NEVER_FIRED comes first because it is the default tag in
/// `EarlySnapshotGuard::drain_to_disk`'s `unwrap_or` arm at
/// src/vmm/freeze_coord/mod.rs. (The guard struct is `pub(super)`,
/// so rustdoc cannot intra-doc-link it from this module; citing it
/// by file path follows the cross-reference convention used
/// elsewhere.) The remaining three follow dispatch-arm order:
/// SUPPRESSED (late-trigger Suppressed write-failure) →
/// PRE_LATE_DEGRADED (late-trigger Degraded write-failure) →
/// EARLY_DEGRADED (early-trigger Degraded direct write). Readers can
/// therefore map a slice index to its dispatch arm by inspection.
///
/// Gated behind `#[cfg(test)]`: production code never iterates this
/// slice, only the negative-scan tests and the pinning canary refer
/// to it. The gate keeps the lib's dead-code lint clean.
#[cfg(test)]
pub const ALL_SNAPSHOT_TAGS: &[&str] = &[
    SNAPSHOT_TAG_EARLY_ONLY_LATE_NEVER_FIRED,
    SNAPSHOT_TAG_EARLY_ONLY_LATE_SUPPRESSED,
    SNAPSHOT_TAG_EARLY_PRE_LATE_DEGRADED,
    SNAPSHOT_TAG_EARLY_DEGRADED,
];
764
/// Reason recorded in
/// [`FailureDumpReport::prog_runtime_stats_unavailable`] when
/// [`DumpContext::prog_capture`] was supplied but the per-program
/// walker found no struct_ops programs in `prog_idr` at freeze time.
///
/// Wire-format-stable: an operator parsing the sidecar JSON looks
/// for this exact string to tell this case apart from the
/// prog-accessor-missing case.
pub const REASON_NO_STRUCT_OPS_LOADED: &str = "no struct_ops programs loaded";

/// Reason recorded in
/// [`FailureDumpReport::prog_runtime_stats_unavailable`] when
/// [`DumpContext::prog_capture`] was `None`.
///
/// Unlike [`REASON_NO_STRUCT_OPS_LOADED`], the walker never ran in
/// this case, because the accessor wasn't constructed (e.g. the
/// `prog_idr` symbol was missing).
pub const REASON_PROG_ACCESSOR_UNAVAILABLE: &str = "prog accessor unavailable";

/// Reason recorded in
/// [`FailureDumpReport::task_enrichments_unavailable`] when
/// [`DumpContext::task_enrichment_capture`] was supplied but every
/// walker entry produced no enrichment (an idle guest with no
/// runnable scx tasks at the freeze instant).
pub const REASON_TASK_WALKER_ZERO_TASKS: &str = "task walker yielded zero tasks";

/// Reason recorded in
/// [`FailureDumpReport::task_enrichments_unavailable`] when
/// [`DumpContext::task_enrichment_capture`] was `None`.
///
/// Unlike [`REASON_TASK_WALKER_ZERO_TASKS`], the walker never ran,
/// because the capture wasn't supplied.
pub const REASON_NO_TASK_WALKER: &str = "no task walker available";

/// Reason recorded in [`FailureDumpReport::scx_walker_unavailable`]
/// when offsets resolved AND the walker found rq->scx + local DSQ
/// data BUT `*scx_root == 0`, i.e. no scheduler was attached at the
/// freeze instant.
///
/// The sched-rooted passes (bypass / global / user-hash) have
/// nothing to walk, yet the rq->scx and per-CPU local DSQ captures
/// still produced data. The dedicated reason lets the operator tell
/// "the scheduler isn't loaded" from "the walker is broken".
pub const REASON_SCX_ROOT_NULL: &str = "scx_root is NULL (no scheduler attached)";

/// Reason recorded in [`FailureDumpReport::scx_walker_unavailable`]
/// when [`DumpContext::scx_walker_capture`] was supplied AND every
/// offset sub-group resolved BUT the walker reached no rq, no DSQ,
/// and no scx_sched state at all (every read failed).
///
/// Contrast with [`REASON_SCX_ROOT_NULL`]: that case has rq->scx +
/// local DSQ data but no sched_state, whereas this case has nothing.
pub const REASON_SCX_WALKER_NO_STATE: &str = "scx walker reached no state";

/// Reason recorded in [`FailureDumpReport::scx_walker_unavailable`]
/// when [`DumpContext::scx_walker_capture`] was `None`.
///
/// Unlike [`REASON_SCX_WALKER_NO_STATE`], the walker never ran at
/// all, because no capture was supplied.
pub const REASON_NO_SCX_WALKER: &str = "no scx walker capture";
815
/// Reason recorded in [`FailureDumpReport::sdt_alloc_unavailable`]
/// when the sdt_alloc pre-pass could not run because the scheduler's
/// arena `user_vm_start` was not 4 GiB-aligned.
///
/// See the gate in [`dump_state`]: the low-32 keying of the per-pass
/// [`crate::monitor::dump::render_map::ArenaSlotIndex`] is only
/// correct when `user_vm_start & 0xFFFF_FFFF == 0`.
pub const REASON_SDT_ALLOC_UNALIGNED_USER_VM: &str =
    "user_vm_start is not 4 GiB-aligned; low-32 keying disabled";

/// Reason recorded in [`FailureDumpReport::sdt_alloc_unavailable`]
/// when the dump enumerated no `*.bss` ARRAY map with a non-zero
/// `btf_kva`.
///
/// Without the scheduler's `.bss` bytes the pre-pass cannot read any
/// allocator's in-memory state, and without `btf_kva` the program
/// BTF that names `struct scx_allocator` is not loadable.
pub const REASON_SDT_ALLOC_NO_BSS: &str = "no scheduler .bss map (or .bss has no program BTF)";

/// Reason recorded in [`FailureDumpReport::sdt_alloc_unavailable`]
/// when no `BPF_MAP_TYPE_ARENA` map snapshot was captured.
///
/// The pre-pass then has no `kern_vm_start` to translate arena
/// pointers against, so no allocator slot can be walked.
pub const REASON_SDT_ALLOC_NO_ARENA: &str = "no arena map captured (kern_vm_start unavailable)";

/// Reason recorded in [`FailureDumpReport::sdt_alloc_unavailable`]
/// when the scheduler's program BTF does not carry
/// `struct scx_allocator` or its peer types (`sdt_pool`, `sdt_desc`,
/// `sdt_chunk`).
///
/// In that case the scheduler does not link `lib/sdt_alloc.bpf.c`
/// and there are no allocator slots to walk.
/// [`crate::monitor::sdt_alloc::SdtAllocOffsets::from_btf`] returns
/// the underlying `anyhow::Error` describing which struct was
/// missing; the dump caller folds that into this reason string.
pub const REASON_SDT_ALLOC_NO_TYPE: &str = "scheduler BTF does not declare struct scx_allocator";

/// Reason recorded in [`FailureDumpReport::sdt_alloc_unavailable`]
/// when every prerequisite resolved but no `.bss` variable of type
/// `struct scx_allocator` was discovered.
///
/// The scheduler links `lib/sdt_alloc.bpf.c` for its types but has
/// not declared a typed allocator instance.
pub const REASON_SDT_ALLOC_NO_INSTANCE: &str = "no scx_allocator instance in .bss";

/// Reason recorded in [`FailureDumpReport::sdt_alloc_unavailable`]
/// when the per-map dump deadline was exhausted before the pre-pass
/// could run.
///
/// The `dump_truncated_at_us` field also surfaces the truncation;
/// this string separates "deadline" from the other no-data causes
/// for a consumer that scans sdt_alloc-related diagnostics
/// specifically.
pub const REASON_SDT_ALLOC_DEADLINE_EXCEEDED: &str =
    "dump deadline exceeded before sdt_alloc pre-pass could run";
862
863/// Cross-CPU sum of every per-CPU diagnostic counter slot in the
864/// probe BPF program's `.bss` `ktstr_pcpu_counters` array.
865///
866/// The probe declares one fixed-shape per-CPU array
867/// (`pcpu_counter ktstr_pcpu_counters[MAX_CPUS][KTSTR_PCPU_NR]` —
868/// see `src/bpf/probe.bpf.c`); each tracepoint / kprobe handler
869/// bumps a slot via `ktstr_pcpu_inc(KTSTR_PCPU_<NAME>)`. The host
870/// reader sums across the CPU axis to recover the cumulative count
871/// each handler reports. Field names mirror the slot names from
872/// `enum ktstr_pcpu_idx` so an operator can walk back from the
873/// failure-dump field to the probe source by exact name.
874///
875/// All counters are monotonic-since-probe-attach. Zero values
876/// indicate either "the corresponding tracepoint never fired" (the
877/// common case for `pi_*` and `lock_contend_*` on tests that don't
878/// exercise PI / lock contention) or "the tracepoint never attached"
879/// (e.g. `preempt_*` on a kernel without
880/// `CONFIG_TRACE_PREEMPT_TOGGLE`); the counter alone cannot
881/// distinguish those two cases — pair with the attach-state surface
882/// in `crate::probe::process::ProbeDiagnostics` when the
883/// distinction matters.
884#[derive(Debug, Clone, Default, Serialize, Deserialize)]
885#[non_exhaustive]
886pub struct ProbeBssCounters {
887    /// `KTSTR_PCPU_PROBE_COUNT` summed across CPUs — total kprobe
888    /// fires past the `ktstr_enabled` gate.
889    pub probe_count: u64,
890    /// `KTSTR_PCPU_KPROBE_RETURNS` summed across CPUs — kprobe fires
891    /// that committed an entry to `probe_data` (past `func_meta_map`
892    /// lookup and scratch-slot allocation).
893    pub kprobe_returns: u64,
894    /// `KTSTR_PCPU_META_MISS` summed across CPUs — kprobe fires
895    /// whose IP missed `func_meta_map`. `probe_count -
896    /// kprobe_returns` is the total bail count; `meta_miss` is the
897    /// subset whose bail came from the `func_meta_map` lookup.
898    pub meta_miss: u64,
899    /// `KTSTR_PCPU_RINGBUF_DROPS` summed across CPUs — failed
900    /// `bpf_ringbuf_reserve` calls inside the trigger handler.
901    pub ringbuf_drops: u64,
902    /// `KTSTR_PCPU_TIMELINE_COUNT` summed across CPUs — successful
903    /// timeline-event submissions across the three timeline
904    /// tracepoints (sched_switch + sched_migrate_task + sched_wakeup).
905    pub timeline_count: u64,
906    /// `KTSTR_PCPU_TIMELINE_DROPS` summed across CPUs — timeline
907    /// submissions that failed because the dedicated
908    /// `timeline_events` ringbuf was full at submit time.
909    pub timeline_drops: u64,
910    /// `KTSTR_PCPU_PI_COUNT` summed across CPUs — PI boost / unboost
911    /// records committed via `fexit/rt_mutex_setprio`.
912    pub pi_count: u64,
913    /// `KTSTR_PCPU_PI_ORPHAN_FEXITS` summed across CPUs — fexit
914    /// fires whose entry-side snapshot was never recorded (attach
915    /// race or `pi_scratch` overflow).
916    pub pi_orphan_fexits: u64,
917    /// `KTSTR_PCPU_PI_CLASS_CHANGE_COUNT` summed across CPUs —
918    /// PI events that observed a `sched_class` flip from fentry
919    /// to fexit (e.g. CFS → RT under a boost).
920    pub pi_class_change_count: u64,
921    /// `KTSTR_PCPU_PI_DROPS` summed across CPUs — TL_EVT_PI_BOOST
922    /// submissions that failed because the timeline ringbuf was
923    /// full at the PI fexit handler.
924    pub pi_drops: u64,
925    /// `KTSTR_PCPU_LOCK_CONTEND_COUNT` summed across CPUs —
926    /// `tp_btf/contention_begin` fires that committed a
927    /// TL_EVT_LOCK_CONTEND timeline record.
928    pub lock_contend_count: u64,
929    /// `KTSTR_PCPU_LOCK_CONTEND_DROPS` summed across CPUs —
930    /// TL_EVT_LOCK_CONTEND submissions that failed because the
931    /// timeline ringbuf was full.
932    pub lock_contend_drops: u64,
933    /// `KTSTR_PCPU_PREEMPT_DISABLE_COUNT` summed across CPUs —
934    /// `tp_btf/preempt_disable` outermost-transition fires.
935    pub preempt_disable_count: u64,
936    /// `KTSTR_PCPU_PREEMPT_ENABLE_COUNT` summed across CPUs —
937    /// `tp_btf/preempt_enable` outermost-transition fires.
938    pub preempt_enable_count: u64,
939    /// `KTSTR_PCPU_TRIGGER_COUNT` summed across CPUs — every
940    /// `tp_btf/sched_ext_exit` fire (including non-error
941    /// kinds like DONE / UNREG, not just error-class exits).
942    pub trigger_count: u64,
943}
944
945/// Top-level failure-dump report. One per freeze trigger.
946#[derive(Debug, Clone, Serialize, Deserialize)]
947#[non_exhaustive]
948pub struct FailureDumpReport {
949    /// Wire-format discriminant. Always `"single"` for this variant,
950    /// pinning [`SCHEMA_SINGLE`]. Consumers branch on this to
951    /// choose between [`FailureDumpReport`], [`DualFailureDumpReport`],
952    /// and [`DegradedFailureDumpReport`] before deserializing. Single
953    /// and Dual share top-level field names that would collide without
954    /// an explicit tag; Degraded carries a distinct field set
955    /// (`reason`, `watchpoint_hit`, `bss_latch_state`, `exit_kind`,
956    /// `elapsed_ms`) but still gets the tag so
957    /// [`FailureDumpReportAny::from_json`] can dispatch uniformly.
958    pub schema: String,
959    /// One entry per BPF map enumerated. Order matches the IDR walk
960    /// (i.e. allocation order); the report is otherwise unsorted so
961    /// callers that want a stable view should sort by name.
962    pub maps: Vec<FailureDumpMap>,
963    /// Per-vCPU register snapshots captured on each vCPU thread at
964    /// freeze time. Index matches vCPU id (BSP at 0, APs at 1..N).
965    /// `None` when a vCPU never parked (rendezvous timeout) or its
966    /// `KVM_GET_REGS` failed mid-shutdown. Attached to the report by
967    /// the freeze coordinator after `dump_state` returns.
968    #[serde(default, skip_serializing_if = "Vec::is_empty")]
969    pub vcpu_regs: Vec<Option<VcpuRegSnapshot>>,
970    /// Obj name of the currently-attached scheduler, identified by
971    /// matching each `BPF_MAP_TYPE_STRUCT_OPS` map's `value_kva` (the
972    /// guest-KVA of its `kvalue.data` payload) against the dereferenced
973    /// `*scx_root` value (the guest-KVA of the active `struct scx_sched`,
974    /// which is also the KVA of `scx_sched.ops` since `ops` sits at
975    /// offset 0). When the match succeeds, the struct_ops map's name
976    /// carries the obj prefix (libbpf convention: `<obj>.<struct_ops_var>`);
977    /// the prefix is split at the first `.` and stored here.
978    ///
979    /// `None` when:
980    /// - `scx_sched_state` is unavailable (no scheduler attached, BTF
981    ///   missing the `scx_sched` type, or `*scx_root` could not be
982    ///   resolved at capture time).
983    /// - No `BPF_MAP_TYPE_STRUCT_OPS` map had `value_kva` matching the
984    ///   active sched_kva (capture race during a mid-attach window, or
985    ///   the struct_ops map's value_kva was not yet populated).
986    /// - The matched map's name lacks a `<obj>.` prefix.
987    ///
988    /// [`crate::scenario::snapshot::Snapshot::active`] uses this as
989    /// the principled tiebreaker when the projection sees multiple
990    /// obj prefixes in global-section maps. On `None` the consumer
991    /// falls back to the prefix-grouping heuristic (single obj →
992    /// that one; multiple obj → `NoActiveScheduler` with a
993    /// diagnostic naming the observed obj_names + the walker's
994    /// failure cause).
995    #[serde(default, skip_serializing_if = "Option::is_none")]
996    pub active_obj_name: Option<String>,
997    /// Guest-KVAs of every `struct bpf_map` belonging to the
998    /// currently-attached scheduler's loaded BPF object, captured
999    /// alongside [`Self::active_obj_name`] from the same
1000    /// `*scx_root → struct_ops map → owning bpf_prog → used_maps`
1001    /// walk. The walker enumerates the matched struct_ops prog's
1002    /// `used_maps` array and records each entry's KVA so a
1003    /// downstream filter can identify the active scheduler's maps
1004    /// uniquely — even when two scheduler instances loaded from the
1005    /// SAME binary coexist post-[`crate::scenario::ops::Op::ReplaceScheduler`]
1006    /// (where their bss / data / rodata maps share the
1007    /// `<obj_name>.` prefix and cannot be distinguished by name
1008    /// alone).
1009    ///
1010    /// Empty when:
1011    /// - [`Self::active_obj_name`] is `None` (walker did not
1012    ///   resolve the active obj — same reasons; see that field's
1013    ///   doc).
1014    /// - The matched prog's `used_maps` could not be safely read
1015    ///   (torn race per the kernel's `used_map_cnt` / `used_maps`
1016    ///   pointer publication TOCTOU described at
1017    ///   `monitor/bpf_prog.rs::find_active_struct_ops_obj_no_target`).
1018    /// - The walker ran but the kernel published an empty
1019    ///   `used_maps` for the active prog (no maps registered — an
1020    ///   unusual but legal sched_ext shape).
1021    ///
1022    /// **KVA aliasing caveat:** kernel BPF map allocations are
1023    /// vmalloc/slab-backed; a freed map's KVA can be reassigned to a
1024    /// new allocation across captures. [`crate::scenario::snapshot::Snapshot::active`]
1025    /// combines this set with [`Self::active_obj_name`] (both must
1026    /// match) to reject the aliasing case — a KVA hit whose owning
1027    /// map name does not share the active obj prefix is treated as
1028    /// stale and falls through to the obj-prefix heuristic.
1029    ///
1030    /// **Within-run identity ONLY.** KVAs reflect kernel address
1031    /// space allocation at capture time (subject to KASLR slide).
1032    /// Stable for the life of the map within a single VM run; NOT
1033    /// comparable across runs. Never persist or compare against
1034    /// checked-in baselines.
1035    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1036    pub active_map_kvas: Vec<u64>,
1037    /// Structured per-allocation views from sdt_alloc-backed
1038    /// allocators. One entry per discovered allocator; each carries
1039    /// every live leaf slot (capped at
1040    /// `super::sdt_alloc::MAX_SDT_ALLOC_ENTRIES`) BTF-rendered to
1041    /// named field views. Empty when no scheduler-side allocator
1042    /// could be located, when arena offsets / sdt_alloc offsets are
1043    /// absent, or when the program BTF lacks the `scx_allocator`
1044    /// type (scheduler doesn't link `lib/sdt_alloc.bpf.c`).
1045    ///
1046    /// Populated alongside the page-granular `ArenaSnapshot` in
1047    /// each map: a consumer can read either representation depending
1048    /// on whether they want raw bytes or named-field allocations.
1049    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1050    pub sdt_allocations: Vec<SdtAllocatorSnapshot>,
1051    /// Live `scx_static` bump-allocator regions discovered in
1052    /// `.bss`. One entry per `struct scx_static` instance with
1053    /// `memory != 0` and an in-range `(off, max_alloc_bytes)` pair.
1054    /// Distinct from [`Self::sdt_allocations`]: scx_static is a
1055    /// program-lifetime bump allocator (`lib/sdt_alloc.bpf.c:577`)
1056    /// with no per-allocation header, so the surfaced view is
1057    /// region-granular ranges rather than per-slot named allocations.
1058    /// Empty when the scheduler doesn't link `lib/sdt_alloc.bpf.c`,
1059    /// when the program BTF lacks `struct scx_static`, or when no
1060    /// `scx_static` instance has been initialised at freeze time.
1061    ///
1062    /// The dump pipeline uses the same ranges to populate the
1063    /// renderer's `ScxStaticRangeIndex` so a deferred-resolve arena
1064    /// chase whose target lives inside scx_static memory can
1065    /// fail-closed cleanly (no per-slot type recovery is possible
1066    /// without a per-call-site type hook from cast analysis).
1067    #[serde(
1068        default,
1069        skip_serializing_if = "super::scx_static_alloc::ScxStaticSnapshot::is_empty"
1070    )]
1071    pub scx_static_ranges: super::scx_static_alloc::ScxStaticSnapshot,
1072    /// Diagnostic reason for `sdt_allocations` being empty.
1073    ///
1074    /// - `None` → either the pre-pass ran and produced records (the vec
1075    ///   is non-empty), or the pre-pass ran cleanly but the scheduler
1076    ///   simply has no live allocations (the vec is empty for legitimate
1077    ///   reasons that aren't worth a diagnostic). Default.
1078    /// - `Some(REASON_SDT_ALLOC_*)` → the pre-pass skipped before it
1079    ///   could surface any allocator state. The string identifies which
1080    ///   prerequisite was missing: deadline exhaustion, unaligned
1081    ///   `user_vm_start`, missing scheduler `.bss`, missing arena
1082    ///   snapshot, scheduler BTF without `struct scx_allocator`, or no
1083    ///   `.bss` `scx_allocator` instance.
1084    ///
1085    /// Distinct from `dump_truncated_at_us` (which records deadline
1086    /// truncation across the whole dump) and from
1087    /// [`Self::scx_static_ranges`] (which has its own walker independent
1088    /// of the typed-allocator pre-pass). Mirrors the
1089    /// `prog_runtime_stats_unavailable` / `task_enrichments_unavailable`
1090    /// pattern.
1091    #[serde(default, skip_serializing_if = "Option::is_none")]
1092    pub sdt_alloc_unavailable: Option<String>,
1093    /// Per-program BPF runtime stats summed across CPUs at freeze
1094    /// time (cnt, nsecs, misses). One entry per discovered
1095    /// struct_ops BPF program. Empty when no struct_ops programs are
1096    /// loaded OR when the prog accessor was unavailable to
1097    /// `dump_state` — see [`Self::prog_runtime_stats_unavailable`]
1098    /// for the reason.
1099    ///
1100    /// Per-CPU offset resolution failure does NOT empty the vec —
1101    /// each program still contributes one entry, but with
1102    /// `cnt`/`nsecs`/`misses` summed only over CPUs whose per-CPU
1103    /// `bpf_prog_stats` slot translated successfully (out-of-range
1104    /// CPUs return None per `super::bpf_map::read_percpu_array_value`
1105    /// semantics).
1106    ///
1107    /// See `super::bpf_prog::ProgRuntimeStats` for field semantics
1108    /// and the kernel-source-grounded provenance of each counter.
1109    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1110    pub prog_runtime_stats: Vec<super::bpf_prog::ProgRuntimeStats>,
1111    /// Diagnostic reason for `prog_runtime_stats` being empty.
1112    ///
1113    /// Distinguishes the three causes a consumer can't otherwise tell
1114    /// apart from an empty vec:
1115    /// - `None` (field absent on wire) → vec was populated normally
1116    ///   (or the dump path didn't run). Default.
1117    /// - `Some("no struct_ops programs loaded")` → walker ran, no
1118    ///   struct_ops programs were in `prog_idr` at freeze time.
1119    /// - `Some("prog accessor unavailable")` → caller passed
1120    ///   `prog_capture: None`. Typical causes: `prog_idr` symbol
1121    ///   missing, `BpfProgOffsets` BTF parse failed, or
1122    ///   `__per_cpu_offset` resolution didn't yield non-zero offsets
1123    ///   yet (still-booting guest).
1124    ///
1125    /// Set by `dump_state` only when prog_runtime_stats ends up
1126    /// empty AND a definite cause is identifiable; left None
1127    /// otherwise so the field stays absent in the JSON for
1128    /// already-populated dumps.
1129    #[serde(default, skip_serializing_if = "Option::is_none")]
1130    pub prog_runtime_stats_unavailable: Option<String>,
1131    /// Per-CPU CPU-time / softirq / IRQ counters captured from
1132    /// `kernel_cpustat`, `kernel_stat`, and (under NO_HZ)
1133    /// `tick_sched`. One entry per CPU enumerated by the walker.
1134    /// Empty when the dump caller passed no `CpuTimeCapture` or
1135    /// when symbol/BTF resolution failed.
1136    ///
1137    /// See `PerCpuTimeStats` for field semantics. Surfaces the
1138    /// per-CPU interrupt and idle-time data the failure dump
1139    /// otherwise leaves implicit (the existing scx walker reads
1140    /// `rq->nr_iowait` but not the cumulative time accounting).
1141    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1142    pub per_cpu_time: Vec<PerCpuTimeStats>,
1143    /// Per-cgroup PSI-irq samples for the test's workload cgroups, host-walked
1144    /// from the cgroup hierarchy at this freeze (Phase A). One entry per
1145    /// workload-root leaf cgroup with per-cgroup PSI accounting enabled. Empty
1146    /// when the dump caller passed no `CgroupPsiCapture`, the workload root
1147    /// isn't present yet, or `psi_cgroups_enabled` is off — loud-absent. RAW
1148    /// values; decoded + folded at the metric layer. See
1149    /// `super::cgroup_walk::CgroupPsiStat`.
1150    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1151    pub cgroup_psi: Vec<super::cgroup_walk::CgroupPsiStat>,
1152    /// Per-node NUMA event counters captured from
1153    /// `pglist_data->node_zones[]->vm_numa_event[]`. One row per
1154    /// NUMA node enumerated by the walker. Empty when the live
1155    /// walker has not landed yet (the BTF offsets and wire shape
1156    /// are wired; the reader is a follow-up).
1157    ///
1158    /// See `PerNodeNumaStats` for field semantics; see
1159    /// [`Self::per_node_numa_unavailable`] for the "why empty"
1160    /// diagnostic.
1161    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1162    pub per_node_numa: Vec<PerNodeNumaStats>,
1163    /// Diagnostic reason for `per_node_numa` being empty.
1164    /// `None` when the vec was populated normally (or the dump
1165    /// path didn't run); `Some(REASON_NO_NUMA_WALKER)` until the
1166    /// host-side walker lands.
1167    #[serde(default, skip_serializing_if = "Option::is_none")]
1168    pub per_node_numa_unavailable: Option<String>,
1169    /// Per-task failure-dump enrichments — identity (pid, tgid,
1170    /// comm), process tree (group_leader, real_parent, pgid, sid,
1171    /// nr_threads), scheduling (prio family, sched_class name,
1172    /// scx.weight, core_cookie), context-switch counters, watchdog
1173    /// disambiguation flag, and lock-slowpath stack matches.
1174    ///
1175    /// One entry per task the dump path's task walker reaches —
1176    /// today's task walkers are the rq->scx walker and the DSQ
1177    /// walker; both produce task KVAs that get enriched here.
1178    /// Empty when no task walker ran (typical until walker
1179    /// dispatch lands) or when the `TaskEnrichmentCapture` was
1180    /// absent.
1181    ///
1182    /// See `super::task_enrichment::TaskEnrichment` for field
1183    /// semantics; see [`Self::task_enrichments_unavailable`] for the
1184    /// "why empty" diagnostic.
1185    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1186    pub task_enrichments: Vec<super::task_enrichment::TaskEnrichment>,
1187    /// Diagnostic reason for `task_enrichments` being empty.
1188    ///
1189    /// - `None` → vec was populated normally (or the dump path
1190    ///   didn't run).
1191    /// - `Some("no task walker available")` → the
1192    ///   `TaskEnrichmentCapture` was missing from
1193    ///   `DumpContext`. Until DSQ + rq->scx walker dispatch
1194    ///   lands, this is the expected steady state for the dump
1195    ///   pipeline; the offsets + walker library is wired and
1196    ///   ready to populate as soon as a task-list producer hooks
1197    ///   in.
1198    /// - `Some("task walker yielded zero tasks")` → walker
1199    ///   produced no task KVAs (frozen guest with no runnable /
1200    ///   queued scx tasks at the dump instant — possible on a
1201    ///   completely-idle stall trigger).
1202    #[serde(default, skip_serializing_if = "Option::is_none")]
1203    pub task_enrichments_unavailable: Option<String>,
1204    /// Per-monitor-tick SCX_EV_* event counter timeline. Each entry
1205    /// is the cross-CPU sum of the 13 SCX_EV_* counters at one
1206    /// monitor sample. Empty when the dump caller passed no
1207    /// `EventCounterCapture` or no sample reported event counters
1208    /// (event-stat offsets unresolved, scx_root unset). Renderers
1209    /// build sparklines / per-counter delta plots from this vec.
1210    ///
1211    /// See `EventCounterSample` for field semantics; the kernel-
1212    /// source provenance lives on
1213    /// `super::ScxEventCounters` field doc.
1214    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1215    pub event_counter_timeline: Vec<EventCounterSample>,
1216    /// Per-CPU `rq->scx` snapshots — scalar fields the kernel's
1217    /// own `scx_dump_state` reads plus the runnable_list per-task
1218    /// KVAs that fed into the per-task enrichment capture.
1219    /// One entry per CPU walked. Empty when the
1220    /// `ScxWalkerCapture` was absent or every CPU's translate
1221    /// failed.
1222    ///
1223    /// See `super::scx_walker::RqScxState` for field semantics.
1224    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1225    pub rq_scx_states: Vec<super::scx_walker::RqScxState>,
1226    /// Per-DSQ snapshots — local, bypass, global, and user DSQs
1227    /// reachable from `*scx_root`. Each entry carries `nr` (depth),
1228    /// `seq` (BPF-iter counter), and the queued task KVAs.
1229    /// Surfaces data the kernel's own `scx_dump_state` does not
1230    /// emit (per-DSQ depth enumeration), so this vec adds value
1231    /// even on a kernel that prints its own dump.
1232    ///
1233    /// Empty when the `ScxWalkerCapture` was absent.
1234    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1235    pub dsq_states: Vec<super::scx_walker::DsqState>,
1236    /// Top-level `scx_sched` state captured from `*scx_root`:
1237    /// aborting flag, bypass_depth, exit_kind. `None` when no
1238    /// scheduler is attached or `*scx_root` was unreadable.
1239    #[serde(default, skip_serializing_if = "Option::is_none")]
1240    pub scx_sched_state: Option<super::scx_walker::ScxSchedState>,
1241    /// Diagnostic reason for `rq_scx_states` / `dsq_states` /
1242    /// `scx_sched_state` being absent. Mirrors the
1243    /// `prog_runtime_stats_unavailable` / `task_enrichments_unavailable`
1244    /// pattern.
1245    #[serde(default, skip_serializing_if = "Option::is_none")]
1246    pub scx_walker_unavailable: Option<String>,
1247    /// Per-vCPU hardware perf counter snapshot captured at the
1248    /// instant the failure dump fired. One entry per vCPU; index
1249    /// matches vCPU id (0 = BSP, 1..N = APs). `None` per-entry when
1250    /// the freeze-time `read(2)` failed for that vCPU. Empty vec
1251    /// when `DumpContext::perf_capture` was None (perf
1252    /// unavailable on this host) or the read errored wholesale.
1253    ///
1254    /// `exclude_host=1` means each counter ticks only during guest
1255    /// execution; the values here record the cumulative count from
1256    /// the start of the run. Diff against any
1257    /// `super::CpuSnapshot::vcpu_perf` in the monitor timeline to
1258    /// recover the count over a freeze-aligned window. See
1259    /// `super::perf_counters::VcpuPerfSample` for field semantics
1260    /// and the multiplexing math.
1261    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1262    pub vcpu_perf_at_freeze: Vec<Option<super::perf_counters::VcpuPerfSample>>,
1263    /// Microseconds from dump_state entry to the phase that exceeded
1264    /// the soft deadline supplied via `DumpContext::deadline`. `None`
1265    /// when no deadline was supplied, when every phase finished within
1266    /// the deadline, or when the deadline check happened before the
1267    /// dump started any heavy phase. A `Some(us)` value means the dump
1268    /// truncated remaining work (skipped further maps / tasks /
1269    /// walkers) at that elapsed offset to keep the freeze window
1270    /// bounded — the freeze coordinator's parked vCPUs cannot
1271    /// service guest IRQs or MMIO traps while the dump is running,
1272    /// so unbounded dump latency stretches every guest's KVM_RUN
1273    /// pause and risks the freeze rendezvous timeout firing on the
1274    /// next iteration.
1275    #[serde(default, skip_serializing_if = "Option::is_none")]
1276    pub dump_truncated_at_us: Option<u64>,
1277    /// Count of scheduler-under-test maps the per-map render loop
1278    /// skipped because the soft deadline had already been crossed
1279    /// (`dump_truncated_at_us` records WHEN, this records HOW MANY).
1280    /// `0` on a complete dump. A skipped map is absent from `maps`
1281    /// entirely — without this count a consumer reading `maps`
1282    /// cannot tell "the scheduler has N maps" from "the scheduler
1283    /// has N+k maps but k were dropped by truncation", so a
1284    /// degraded dump would silently under-report map state. Excludes
1285    /// ktstr's own framework maps, which are filtered before the
1286    /// deadline check and never counted here.
1287    #[serde(default, skip_serializing_if = "is_zero_u32")]
1288    pub maps_truncated: u32,
1289    /// Probe BPF program's per-CPU diagnostic counter snapshot
1290    /// (see `ProbeBssCounters`). Populated by the host-side
1291    /// reader in `decode_probe_counters_snapshot` which sums
1292    /// each `KTSTR_PCPU_*` slot across CPUs. `None` when the
1293    /// probe `.bss` map isn't enumerated (probe not loaded), the
1294    /// program BTF can't be parsed, or the array's offset doesn't
1295    /// resolve.
1296    ///
1297    /// A populated `trigger_count > 0` is the structural signal
1298    /// that the BPF tp_btf/sched_ext_exit handler fired during
1299    /// the run — distinct from the boolean `trigger_fired` flag
1300    /// in `super::probe::process::ProbeDiagnostics` (which
1301    /// also records host-side observations like a watchdog
1302    /// teardown). The cross-product is the failure-dump E2E
1303    /// test's structural assertion: a stall scenario must show
1304    /// both flag=true AND `trigger_count > 0`, otherwise the
1305    /// probe attached without firing or fired without the host
1306    /// observing.
1307    #[serde(default, skip_serializing_if = "Option::is_none")]
1308    pub probe_counters: Option<ProbeBssCounters>,
1309    /// `true` when this report was produced by
1310    /// [`Self::placeholder`] — i.e. the capture pipeline could
1311    /// not produce real data (typical cause: freeze rendezvous
1312    /// timed out). Periodic-sample temporal assertions skip
1313    /// placeholder reports rather than treating their empty
1314    /// vectors as "no progress" signals; the `*_unavailable`
1315    /// fields carry the reason string for human consumers, but
1316    /// the boolean flag is the machine-checkable discriminant a
1317    /// pattern can branch on without re-deriving placeholder
1318    /// status from the absence of every field.
1319    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
1320    pub is_placeholder: bool,
1321}
1322
1323impl Default for FailureDumpReport {
1324    /// Empty report with `schema = "single"`. Pinning the schema
1325    /// here keeps `FailureDumpReport::default()` and a
1326    /// freshly-constructed `FailureDumpReport { ..., schema:
1327    /// SCHEMA_SINGLE.into(), ... }` indistinguishable to consumers,
1328    /// so the schema discriminant is never quietly missing on a
1329    /// default-built report.
1330    fn default() -> Self {
1331        Self {
1332            schema: SCHEMA_SINGLE.to_string(),
1333            active_map_kvas: Vec::new(),
1334            maps: Vec::new(),
1335            vcpu_regs: Vec::new(),
1336            sdt_allocations: Vec::new(),
1337            scx_static_ranges: super::scx_static_alloc::ScxStaticSnapshot::default(),
1338            sdt_alloc_unavailable: None,
1339            prog_runtime_stats: Vec::new(),
1340            prog_runtime_stats_unavailable: None,
1341            per_cpu_time: Vec::new(),
1342            cgroup_psi: Vec::new(),
1343            per_node_numa: Vec::new(),
1344            per_node_numa_unavailable: None,
1345            task_enrichments: Vec::new(),
1346            task_enrichments_unavailable: None,
1347            event_counter_timeline: Vec::new(),
1348            rq_scx_states: Vec::new(),
1349            dsq_states: Vec::new(),
1350            scx_sched_state: None,
1351            scx_walker_unavailable: None,
1352            vcpu_perf_at_freeze: Vec::new(),
1353            dump_truncated_at_us: None,
1354            maps_truncated: 0,
1355            probe_counters: None,
1356            is_placeholder: false,
1357            active_obj_name: None,
1358        }
1359    }
1360}
1361
1362impl FailureDumpReport {
1363    /// Build a placeholder report for a capture that could not
1364    /// produce real data. Every `*_unavailable` field is set to
1365    /// `Some(reason)` so downstream consumers (`perf-delta`,
1366    /// failure-rendering tooling) can distinguish "capture
1367    /// happened, no data" from "capture path failed for reason X".
1368    /// All vector / option fields stay at their `Default` empty
1369    /// state so the report is structurally a real
1370    /// `FailureDumpReport`, not a sentinel that breaks consumer
1371    /// type contracts.
1372    ///
1373    /// Used by the freeze coordinator's user-watchpoint dispatch,
1374    /// periodic-capture drain, and final-drain teardown — every
1375    /// site that needs to publish a "capture attempted, did not
1376    /// land" entry on the snapshot bridge.
1377    pub fn placeholder(reason: impl Into<String>) -> Self {
1378        let reason = reason.into();
1379        Self {
1380            prog_runtime_stats_unavailable: Some(reason.clone()),
1381            per_node_numa_unavailable: Some(reason.clone()),
1382            task_enrichments_unavailable: Some(reason.clone()),
1383            scx_walker_unavailable: Some(reason.clone()),
1384            sdt_alloc_unavailable: Some(reason),
1385            is_placeholder: true,
1386            ..Self::default()
1387        }
1388    }
1389}
1390
1391/// Identify the obj name (libbpf `<obj>` prefix) of the
1392/// currently-attached scheduler from the captured BPF state, plus
1393/// the live scheduler's `used_maps` KVA set when available.
1394///
1395/// **No `*scx_root` dependency -- works on every supported kernel.**
1396/// `scx_root` (the global pointer to the active `struct scx_sched`)
1397/// only exists on v6.16+ (added by commit 48e126777386); pre-6.16
1398/// kernels track the active scheduler via the global `scx_ops`
1399/// (`struct sched_ext_ops`) plus the atomic enable-state instead, so
1400/// any `*scx_root`-based identification is blind on 6.14/6.15. Both
1401/// paths below read only `prog_idr` / struct_ops map names /
1402/// global-section map names, which are present and BTF-offset-stable
1403/// across the whole range, so this helper resolves the active obj
1404/// uniformly regardless of kernel version.
1405///
1406/// **PRIMARY: target-free `prog_idr` walk.** Delegates to
1407/// `prog_accessor.find_active_struct_ops_obj_no_target` (see
1408/// `monitor::bpf_prog`), which walks `prog_idr` for an alive
1409/// `BPF_PROG_TYPE_STRUCT_OPS` prog whose `aux->used_maps` carries a
1410/// sibling `<obj>.bss/.data/.rodata` global-section map, and returns
1411/// that prog's obj prefix + full `used_map_kvas` snapshot. The
1412/// returned prefix is cross-checked against the captured `maps[]`
1413/// (a `<prefix>.bss/.data/.rodata` must exist) so a torn `used_maps`
1414/// read cannot publish a garbage prefix. The walk takes no target
1415/// and reads live guest memory, so it produces the correct live
1416/// scheduler even mid-swap and on pre-6.16 kernels. Used only when
1417/// the walker returns a NON-EMPTY `used_map_kvas`; an empty
1418/// whitelist cannot disambiguate two same-prefix copies downstream.
1419///
1420/// The walker matches the FIRST such prog and does not gate on the
1421/// kernel enable-state, so uniqueness rests on ktstr's swap
1422/// sequencing (kill the outgoing scheduler and wait for its process
1423/// to exit before loading the next), NOT on a kernel
1424/// old-prog-removed-before-new-prog-added guarantee: the kernel
1425/// does not serialize those (a detached struct_ops prog leaves
1426/// `prog_idr` only when its owning map's last fd closes and an RCU
1427/// grace elapses). See `find_active_struct_ops_obj_no_target` for
1428/// the single-tenant threat model.
1429///
1430/// **FALLBACK: prefix grouping over struct_ops map names.** Runs
1431/// when `prog_walker` is `None` (the prog accessor has not published
1432/// yet) OR the walker found no global-section-bearing prog OR its
1433/// whitelist was empty. Picks the first `BPF_MAP_TYPE_STRUCT_OPS`
1434/// map whose name prefix has an UNAMBIGUOUS global-section sibling
1435/// set (each of `.bss/.data/.rodata` count <= 1) and returns
1436/// `(prefix, vec![])`. Section counts use full-name equality so they
1437/// stay in lockstep with the consumer's classifier at
1438/// [`crate::scenario::snapshot::Snapshot::active`] and the walker's
1439/// `strip_suffix` in
1440/// `monitor::bpf_prog::extract_global_section_obj_prefix` (private
1441/// helper, cited by path rather than intra-doc link). A multi-copy
1442/// prefix (the same-binary `Op::ReplaceScheduler` swap window leaves
1443/// the dying scheduler's globals beside the new scheduler's) is
1444/// skipped here: the empty-whitelist `(prefix, vec![])` cannot
1445/// disambiguate downstream, so the helper instead returns `None` and
1446/// the consumer surfaces an actionable `NoActiveScheduler`.
1447///
1448/// Returns `None` when neither path resolves: the walker was absent
1449/// / found nothing / returned an unbacked or empty result, AND no
1450/// struct_ops map had an unambiguous global-section sibling set. On
1451/// `None`, callers fall back to
1452/// [`crate::scenario::snapshot::Snapshot::active`]'s own
1453/// prefix-grouping, whose per-section-count check enforces the same
1454/// multi-copy detection, so a helper-`None` does not silently
1455/// downgrade into the prefix-only `AmbiguousVar` surface.
1456fn identify_active_obj_from_struct_ops(
1457    maps: &[super::bpf_map::BpfMapInfo],
1458    prog_walker: Option<(
1459        &dyn super::bpf_prog::BpfProgAccessor,
1460        &super::btf_offsets::BpfMapOffsets,
1461    )>,
1462) -> Option<(String, Vec<u64>)> {
1463    // PRIMARY PATH: target-free prog_idr walk. Returns the first
1464    // alive `BPF_PROG_TYPE_STRUCT_OPS` prog whose `aux->used_maps`
1465    // contains a sibling `<obj>.bss/.data/.rodata` global-section map,
1466    // with that prog's obj prefix + full `used_map_kvas` snapshot.
1467    // Reads live guest memory and takes no target, so it resolves the
1468    // live scheduler on every supported kernel -- including pre-6.16,
1469    // where `scx_root` does not exist. In ktstr scenarios only one
1470    // matching prog is alive at a time: ktstr's swap sequence kills
1471    // the outgoing scheduler and waits for its process to exit before
1472    // loading the next, which closes the outgoing struct_ops map's
1473    // last fd and lets the RCU-deferred map+prog free remove the OLD
1474    // prog from `prog_idr`. (The kernel does NOT serialize
1475    // old-prog-removal before new-prog-add; the single-alive-prog
1476    // property is ktstr's sequencing, not a kernel invariant.) The
1477    // returned prefix is cross-checked against the captured `maps[]`
1478    // so a torn `used_maps` read can't publish a garbage prefix.
1479    if let Some((prog_accessor, map_offsets)) = prog_walker
1480        && let Some(walker_match) = prog_accessor.find_active_struct_ops_obj_no_target(map_offsets)
1481        && !walker_match.used_map_kvas.is_empty()
1482    {
1483        // Defense against torn used_maps reads: the walker's returned
1484        // prefix MUST appear as `<prefix>.<section>` in the captured
1485        // `maps[]`. A walker that read garbage from a mid-mutation
1486        // used_maps window would name an obj prefix that no captured
1487        // map matches; the cross-check rejects that case.
1488        let (wb, wd, wr) = count_global_sections_for_prefix(maps, &walker_match.obj_name);
1489        if wb + wd + wr > 0 {
1490            return Some((walker_match.obj_name, walker_match.used_map_kvas));
1491        }
1492    }
1493
1494    // FALLBACK PATH: prefix grouping by struct_ops map name. Runs
1495    // when the prog walker is unavailable (`prog_walker` is `None`,
1496    // e.g. `owned_prog_accessor` hasn't published yet at boot) OR the
1497    // live STRUCT_OPS prog has no global-section maps in its
1498    // used_maps (libbpf-named struct_ops case without a `.bss/.data/
1499    // .rodata` sibling, observed when the scheduler keeps all its
1500    // state in non-libbpf-named maps).
1501    //
1502    // Pick the first STRUCT_OPS map whose prefix has an unambiguous
1503    // global-section sibling set (each section ≤ 1). Multi-copy
1504    // collisions skip -- the consumer surfaces NoActiveScheduler with
1505    // an actionable diagnostic.
1506    for active_struct_ops in maps
1507        .iter()
1508        .filter(|m| m.map_type == super::bpf_map::BPF_MAP_TYPE_STRUCT_OPS)
1509    {
1510        if active_struct_ops.map_kva == 0 {
1511            continue;
1512        }
1513        let so_name = active_struct_ops.name();
1514        let Some(prefix) = so_name.split('.').next().filter(|s| !s.is_empty()) else {
1515            continue;
1516        };
1517        let (bss_count, data_count, rodata_count) = count_global_sections_for_prefix(maps, prefix);
1518        let has_matching_global = bss_count + data_count + rodata_count > 0;
1519        let unambiguous = bss_count <= 1 && data_count <= 1 && rodata_count <= 1;
1520        if has_matching_global && unambiguous {
1521            return Some((prefix.to_string(), Vec::new()));
1522        }
1523    }
1524    None
1525}
1526
1527/// Count captured maps named exactly `<prefix>.bss`, `<prefix>.data`,
1528/// `<prefix>.rodata`. Full-name equality (not prefix matching) so it
1529/// aligns with the consumer's `name.ends_with(".bss"/...)` classifier
1530/// at [`crate::scenario::snapshot::Snapshot::active`] and with the
1531/// walker's `strip_suffix(".bss")` in
1532/// `monitor::bpf_prog::extract_global_section_obj_prefix` (private
1533/// helper, cited by path rather than intra-doc link). A hypothetical
1534/// `<prefix>.bss.shared` map would count as zero here (the walker
1535/// treats it the same way), so the counts stay in lockstep across
1536/// the three sites that classify global-section maps for a scheduler
1537/// obj.
1538///
1539/// Skips `BPF_MAP_TYPE_STRUCT_OPS` maps so the active scheduler's
1540/// own struct_ops map (which `Snapshot::active` filters by type as
1541/// well) never inflates the global-section totals.
1542fn count_global_sections_for_prefix(
1543    maps: &[super::bpf_map::BpfMapInfo],
1544    prefix: &str,
1545) -> (usize, usize, usize) {
1546    let bss_name = format!("{prefix}.bss");
1547    let data_name = format!("{prefix}.data");
1548    let rodata_name = format!("{prefix}.rodata");
1549    maps.iter()
1550        .filter(|m| m.map_type != super::bpf_map::BPF_MAP_TYPE_STRUCT_OPS)
1551        .fold((0usize, 0usize, 0usize), |(b, d, r), m| {
1552            let n = m.name();
1553            (
1554                b + usize::from(n == bss_name),
1555                d + usize::from(n == data_name),
1556                r + usize::from(n == rodata_name),
1557            )
1558        })
1559}
1560
/// Pair of failure-dump snapshots captured at two points in a stall.
///
/// `early` is taken when the host-side runnable_at scanner observes
/// any task with `jiffies - p->scx.runnable_at > watchdog_timeout/2`
/// (mirrors the kernel's `check_rq_for_timeouts` walk over
/// `rq->scx.runnable_list`). `late` is taken at the same trigger as
/// the single-snapshot path: the BPF probe's
/// `ktstr_err_exit_detected` latch flipping after a sched_ext
/// error-class exit.
///
/// `early == None` when the watchdog half-way threshold never
/// triggered before `late` fired (e.g. an immediate scheduler error
/// in `init_task` before any task became runnable). Diffing
/// `late` against `early` shows what BPF state changed during the
/// stall window — the value-add over the single-snapshot dump.
///
/// **No user toggle — auto-repro engages this automatically.** Only
/// the auto-repro VM emits this shape;
/// `crate::test_support::probe::attempt_auto_repro` is the
/// single call site flipping the builder's `dual_snapshot` flag,
/// and there is no public ktstr surface for asking for it from a
/// primary VM. Test authors don't need to know about it — when an
/// auto-repro fires, the file at
/// `<test>-<variant_hash>.repro.failure-dump.json` changes shape from
/// [`FailureDumpReport`] to this wrapper.
///
/// Note: there is no `Default` impl. The `late` field is required
/// by the doc invariant ("the freeze coordinator only writes a
/// `DualFailureDumpReport` after the late snapshot has been
/// captured"); a `Default::default()` would have produced a wrapper
/// with an empty late report whose `maps`/`vcpu_regs` vectors
/// silently lie about a successful capture. Construct via the
/// struct literal with an explicit `late: FailureDumpReport`.
///
/// Wire-format summary (from the serde attributes below): `schema`
/// and `late` carry no `serde(default)` and are always serialized;
/// `early`, `early_max_age_jiffies`, `early_threshold_jiffies`, and
/// `early_skipped_reason` are omitted from the output when `None` /
/// zero and default to `None` / `0` when absent from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct DualFailureDumpReport {
    /// Wire-format discriminant. Always `"dual"` for this variant,
    /// pinning [`SCHEMA_DUAL`]. Mirror of [`FailureDumpReport::schema`]
    /// — consumers branch on it before deserializing.
    ///
    /// Always serialized; no `serde(default)` is declared for it.
    pub schema: String,
    /// Snapshot at the watchdog half-way point. `None` when the
    /// stall fired before the half-way scanner crossed its threshold.
    /// See [`Self::early_skipped_reason`] for the structured "why
    /// absent" diagnostic.
    ///
    /// Omitted from the serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub early: Option<FailureDumpReport>,
    /// Snapshot at the error-exit latch trigger. Always present
    /// (the freeze coordinator only writes a `DualFailureDumpReport`
    /// after the late snapshot has been captured; if the run ends
    /// with only an early snapshot the file is not written at all).
    ///
    /// Always serialized; no `serde(default)` is declared for it.
    pub late: FailureDumpReport,
    /// Maximum `jiffies - p->scx.runnable_at` observed by the
    /// runnable_at scanner at the moment the early snapshot fired.
    /// Zero when `early` is `None`.
    ///
    /// To recover the kernel's full `watchdog_timeout`, double
    /// [`Self::early_threshold_jiffies`] — the scanner trigger
    /// fires at half the watchdog, so the threshold field carries
    /// `watchdog_timeout / 2`. Diff `early_max_age_jiffies` against
    /// `2 * early_threshold_jiffies` to see how close the system
    /// was to the SCX_EXIT_ERROR_STALL emission line at the
    /// early-trigger point.
    ///
    /// Omitted from the serialized output when zero (`is_zero_u64`);
    /// reads back as `0` when the key is absent.
    #[serde(default, skip_serializing_if = "is_zero_u64")]
    pub early_max_age_jiffies: u64,
    /// The half-way trigger threshold the scanner compared against
    /// when capturing the early snapshot, expressed in guest
    /// jiffies. Equals `(watchdog_timeout_ms * CONFIG_HZ) / 1000 / 2`
    /// at the moment the snapshot fired. Zero when `early` is
    /// `None`.
    ///
    /// Surfaced alongside `early_max_age_jiffies` so a downstream
    /// consumer reading the JSON does not have to recompute the
    /// kernel-internal jiffies arithmetic to reproduce the
    /// trigger condition.
    ///
    /// Omitted from the serialized output when zero (`is_zero_u64`);
    /// reads back as `0` when the key is absent.
    #[serde(default, skip_serializing_if = "is_zero_u64")]
    pub early_threshold_jiffies: u64,
    /// Structured reason the early snapshot is absent. `None` when
    /// the early snapshot was captured (the [`Self::early`] field is
    /// `Some`). When the early field is `None`, this carries a short
    /// machine-friendly string identifying which of the known
    /// failure modes occurred:
    ///
    /// - `"scan prerequisites unavailable: <prereq>"` — the
    ///   per-CPU `runnable_at` scan never resolved its dependencies
    ///   (most often `<prereq>` names the missing kernel symbol /
    ///   BTF entry).
    /// - `"max_age never crossed threshold (peak={peak}j,
    ///   threshold={threshold}j)"` — the scan ran but the maximum
    ///   observed runnable-age stayed below the half-way mark for
    ///   the whole VM lifetime. Indicates a non-stall err-class exit
    ///   (e.g. `scx_bpf_error()`).
    /// - `"scx_tick stall — no per-task runnable_at data"` — the
    ///   stall path that drove the late capture has no per-task
    ///   `runnable_at` to scan (the kernel's "watchdog failed to
    ///   check in" path raises `SCX_EXIT_ERROR_STALL` from the
    ///   scx_tick kernel side without any task on
    ///   `rq->scx.runnable_list`).
    ///
    /// Display rendering at `super::display` surfaces this string
    /// directly; the previous "stall fired before half-way threshold,
    /// or runnable_at scan setup failed" generic text is replaced
    /// with the structured reason whenever this field is `Some`.
    ///
    /// Omitted from the serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub early_skipped_reason: Option<String>,
}
1664
/// Serde `skip_serializing_if` predicate: `true` exactly when the
/// referenced `u64` is zero, so zero-valued fields are left out of the
/// serialized report. Takes `&u64` because serde hands the predicate a
/// reference to the field.
fn is_zero_u64(v: &u64) -> bool {
    matches!(*v, 0)
}
1668
/// Serde `skip_serializing_if` predicate: `true` exactly when the
/// referenced `u32` is zero, so zero-valued fields are left out of the
/// serialized report. Takes `&u32` because serde hands the predicate a
/// reference to the field.
fn is_zero_u32(v: &u32) -> bool {
    matches!(*v, 0)
}
1672
/// Top-level degraded failure-dump report. Emitted by the freeze
/// coordinator when a real error-class trigger fires but the dump
/// path aborts before a full [`FailureDumpReport`] can be captured —
/// today only the vCPU rendezvous-timeout path produces this shape.
///
/// Carries the partial state the coordinator did collect (per-vCPU
/// registers from any vCPU that parked before timeout) plus the
/// observable trigger state at the moment of degradation
/// (watchpoint hit, BPF `.bss` latch status, live `exit_kind` if the
/// gate read it). An operator inspecting the JSON learns WHY the
/// dump degraded from the `reason` field and WHICH vCPUs stalled
/// from the `vcpu_regs` Vec's per-slot `None` / `Some` pattern.
///
/// Schema discriminant: [`SCHEMA_DEGRADED`]. Parsed via the same
/// [`FailureDumpReportAny::from_json`] dispatcher as the other two
/// variants.
///
/// Wire-format summary (from the serde attributes below): `schema`,
/// `reason`, `watchpoint_hit`, and `bss_latch_state` carry no
/// `serde(default)` and are always serialized; `vcpu_regs`,
/// `exit_kind`, and `elapsed_ms` are omitted from the output when
/// empty / `None` / zero and take their default when absent from the
/// input.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct DegradedFailureDumpReport {
    /// Wire-format discriminant. Always `"degraded"` for this
    /// variant, pinning [`SCHEMA_DEGRADED`]. Mirror of
    /// [`FailureDumpReport::schema`] / [`DualFailureDumpReport::schema`]
    /// — consumers branch on it before deserializing.
    pub schema: String,
    /// Operator-readable reason the dump degraded. Carries one of
    /// the `REASON_DEGRADED_*` constants as the canonical prefix,
    /// followed by dynamic detail filled in at emit time (e.g.
    /// timeout milliseconds, parked-vCPU counts). Stable wire format
    /// per the [`SCHEMA_DEGRADED`] discriminant contract: new degraded
    /// causes add new `REASON_DEGRADED_*` constants rather than
    /// mutating the existing ones.
    pub reason: String,
    /// Per-vCPU register snapshots collected before degradation.
    /// Index matches vCPU id (BSP at 0, APs at 1..N). `None` entries
    /// identify the vCPUs that never parked (the operator's
    /// signal for which vCPUs stalled) — distinct from
    /// [`FailureDumpReport::vcpu_regs`]'s `None`, which usually
    /// means `KVM_GET_REGS` failed mid-shutdown.
    ///
    /// Omitted from the serialized output when the vec is empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub vcpu_regs: Vec<Option<VcpuRegSnapshot>>,
    /// Hardware-watchpoint hit state at degradation. `true` when
    /// the freeze-coordinator's `*scx_root->exit_kind` watchpoint
    /// fired on a vCPU thread; `false` when only the BPF `.bss`
    /// latch fired (or the trigger source was a deferred-capture
    /// request).
    pub watchpoint_hit: bool,
    /// BPF probe `.bss` latch state at degradation. One of
    /// `"triggered"` (probe latched err exit), `"not_triggered"`
    /// (latch readable, value still 0), `"out_of_bounds"` (cached
    /// `.bss` PA no longer 4-byte-readable — probe map freed
    /// mid-run), or `"not_resolved"` (cached `.bss` PA was never
    /// populated). Mirror of `crate::vmm::freeze_coord`'s
    /// internal `BssReadState` enum, serialised as the snake-case
    /// of each variant. String-typed for wire-format stability with
    /// the rest of the `REASON_*` / state-name surface — see
    /// [`SCHEMA_DEGRADED`] for the contract.
    pub bss_latch_state: String,
    /// Live `*scx_root->exit_kind` value at degradation, when the
    /// gate read it. `None` when the dump path aborted before
    /// reaching the gate (rendezvous timed out earlier) or when the
    /// KVA translation failed. `Some(kind)` carries the raw `u32`
    /// from `enum scx_exit_kind` — operators read it against
    /// the kernel's `scx_exit_kind` enum definition to identify
    /// whether the scheduler's intended exit class matched the
    /// trigger that fired.
    ///
    /// Omitted from the serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub exit_kind: Option<u32>,
    /// Wall-clock milliseconds from the freeze trigger (capture
    /// start) to the degraded-emit decision. Lets an operator see
    /// how long the coordinator spent trying to capture before
    /// giving up. Mirrors the `elapsed_ms` value that the full
    /// [`FailureDumpReport`] path surfaces via its post-dump log
    /// line — here it is a structured field so consumers can read
    /// it without parsing the log.
    ///
    /// Omitted from the serialized output when zero (`is_zero_u64`);
    /// reads back as `0` when the key is absent.
    #[serde(default, skip_serializing_if = "is_zero_u64")]
    pub elapsed_ms: u64,
}
1750
1751/// Either-or wrapper that owns a parsed [`FailureDumpReport`],
1752/// [`DualFailureDumpReport`], or [`DegradedFailureDumpReport`]. Lets
1753/// a consumer hold and render a failure-dump file without prematurely
1754/// committing to one schema — the discriminant lives in the JSON's
1755/// `schema` field, not in the type the consumer holds.
1756///
1757/// Centralises the schema-tag dispatch logic that previously lived
1758/// inline at every read site (the auto-repro tail renderer, the
1759/// failure-dump-e2e test, any future consumer that wants to inspect
1760/// any shape). Use [`Self::from_json`] to parse an arbitrary
1761/// failure-dump JSON blob; the Display impl forwards to the
1762/// underlying report's existing Display so the rendered output is
1763/// indistinguishable from holding the unwrapped report directly.
1764///
1765/// `non_exhaustive` so a future fourth schema can be added without
1766/// breaking external pattern matches.
1767#[non_exhaustive]
1768pub enum FailureDumpReportAny {
1769    /// Single-snapshot report, schema=`"single"`. Emitted by the
1770    /// primary VM's freeze coordinator when an error-class SCX exit
1771    /// fires.
1772    Single(Box<FailureDumpReport>),
1773    /// Dual-snapshot wrapper, schema=`"dual"`. Emitted by the
1774    /// auto-repro VM when the dual-snapshot path is enabled. Carries
1775    /// optional `early` + required `late` snapshots plus jiffies
1776    /// metadata for the early-trigger condition.
1777    ///
1778    /// Boxed to keep [`FailureDumpReportAny`]'s on-stack size bounded
1779    /// — `DualFailureDumpReport` carries the early+late snapshots
1780    /// inline and is roughly 2x the size of [`FailureDumpReport`].
1781    Dual(Box<DualFailureDumpReport>),
1782    /// Degraded report, schema=`"degraded"`. Emitted when an
1783    /// error-class trigger fires but the dump path aborts before a
1784    /// full single/dual report can be captured — today only the
1785    /// vCPU rendezvous-timeout path produces this shape. Carries
1786    /// partial vCPU register data + trigger-state diagnostics
1787    /// instead of the full map / scx-walker output.
1788    ///
1789    /// Boxed for size parity with the other variants.
1790    Degraded(Box<DegradedFailureDumpReport>),
1791}
1792
1793impl FailureDumpReportAny {
1794    /// Parse a failure-dump JSON blob, choosing the variant by the
1795    /// `schema` field. Returns `None` on any of:
1796    ///
1797    /// - the blob does not parse as JSON
1798    /// - the `schema` field is absent (degraded variant requires an
1799    ///   explicit discriminant; the previous "absent ⇒ single"
1800    ///   fallback would silently mis-route a richer wrapper as a
1801    ///   lossy single shape)
1802    /// - the `schema` field carries an unknown value
1803    /// - the typed deserialisation under the chosen schema fails
1804    pub fn from_json(json: &str) -> Option<Self> {
1805        let value: serde_json::Value = serde_json::from_str(json).ok()?;
1806        let schema = value.get("schema").and_then(|v| v.as_str())?;
1807        match schema {
1808            SCHEMA_DUAL => serde_json::from_str(json)
1809                .ok()
1810                .map(|d| Self::Dual(Box::new(d))),
1811            SCHEMA_SINGLE => serde_json::from_str(json)
1812                .ok()
1813                .map(|r| Self::Single(Box::new(r))),
1814            SCHEMA_DEGRADED => serde_json::from_str(json)
1815                .ok()
1816                .map(|d| Self::Degraded(Box::new(d))),
1817            _ => None,
1818        }
1819    }
1820}
1821
1822/// Rendering of one BPF map's contents.
1823///
1824/// Unifies the map-type rendering paths under a single
1825/// representation: single-entry ARRAY maps (incl. the
1826/// `.bss`/`.data`/`.rodata` global sections) populate `value`;
1827/// multi-entry ARRAY maps populate `array_entries`; keyed HASH maps
1828/// populate `entries`; per-CPU maps populate `percpu_entries`.
1829/// Exactly one of these is non-empty for a successful render; on
1830/// failure `error` is set and the rest empty.
1831#[derive(Debug, Clone, Default, Serialize, Deserialize)]
1832#[non_exhaustive]
1833pub struct FailureDumpMap {
1834    /// Map name as registered with the kernel. Truncated to
1835    /// `BPF_OBJ_NAME_LEN` (16) by the kernel; libbpf composes
1836    /// `"<obj_name>.<section>"` for global-section maps.
1837    pub name: String,
1838    /// Guest-KVA of this map's `struct bpf_map` allocation. Unique
1839    /// per loaded map instance — two map copies sharing the same
1840    /// `name` (e.g. two `<obj>.bss` maps from two scheduler
1841    /// instances loaded from the same binary post-
1842    /// [`crate::scenario::ops::Op::ReplaceScheduler`]) have distinct
1843    /// KVAs and are distinguishable on this field alone.
1844    ///
1845    /// Sourced from [`crate::monitor::bpf_map::BpfMapInfo::map_kva`]
1846    /// at capture time. Within-run stable (the kernel does not
1847    /// relocate `struct bpf_map`); not comparable across runs
1848    /// (KASLR slide differs).
1849    ///
1850    /// `0` when capture did not record a KVA (e.g., synthetic test
1851    /// fixtures constructed via `..Default::default()`); consumers
1852    /// treating `0` as "no kernel identity" gracefully fall back to
1853    /// name-based matching.
1854    #[serde(default, skip_serializing_if = "is_zero_u64")]
1855    pub map_kva: u64,
1856    /// Raw `map_type` from `struct bpf_map` (e.g. `BPF_MAP_TYPE_ARRAY`).
1857    /// Kept as `u32` rather than an enum to avoid bumping a serde
1858    /// schema each time the kernel adds a kind.
1859    pub map_type: u32,
1860    /// Declared per-entry value size. Captured even when rendering
1861    /// fails so the operator can see the map shape.
1862    pub value_size: u32,
1863    /// Declared maximum entry count from `struct bpf_map.max_entries`.
1864    /// Surfaces alongside the rendered slice so a consumer can spot
1865    /// when the dump shows fewer entries than the map declares
1866    /// (e.g. ARRAY / HASH truncated at `MAX_ARRAY_KEYS` /
1867    /// `MAX_HASH_ENTRIES`; PERCPU_ARRAY truncated at
1868    /// `MAX_PERCPU_KEYS`).
1869    pub max_entries: u32,
1870    /// Single-value render for a single-entry ARRAY map
1871    /// (`max_entries <= 1`, incl. the `.bss`/`.data`/`.rodata`
1872    /// global sections). Multi-entry ARRAY maps use `array_entries`.
1873    #[serde(default, skip_serializing_if = "Option::is_none")]
1874    pub value: Option<RenderedValue>,
1875    /// (key, value) entries for HASH maps.
1876    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1877    pub entries: Vec<FailureDumpEntry>,
1878    /// Per-entry values for a multi-entry `BPF_MAP_TYPE_ARRAY` map,
1879    /// indexed by the array key (`u32`). Populated for
1880    /// `max_entries > 1`; the single-entry case uses `value`, so
1881    /// exactly one of `value` / `array_entries` is set for an ARRAY
1882    /// render. Capped at `MAX_ARRAY_KEYS`; truncation and per-key
1883    /// read failures surface in `error`.
1884    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1885    pub array_entries: Vec<FailureDumpArrayEntry>,
1886    /// Per-CPU slots for PERCPU_ARRAY maps. Outer Vec indexed by key,
1887    /// inner Vec indexed by CPU id.
1888    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1889    pub percpu_entries: Vec<FailureDumpPercpuEntry>,
1890    /// Per-key per-CPU slots for `PERCPU_HASH` / `LRU_PERCPU_HASH`
1891    /// maps. Same shape as `percpu_entries` but the outer key is
1892    /// arbitrary bytes (rendered via BTF when a key type id is
1893    /// available, hex otherwise) instead of the implicit u32 key
1894    /// of a per-CPU array.
1895    #[serde(default, skip_serializing_if = "Vec::is_empty")]
1896    pub percpu_hash_entries: Vec<FailureDumpPercpuHashEntry>,
1897    /// Page snapshot for `BPF_MAP_TYPE_ARENA` maps. `None` for all
1898    /// other map types.
1899    #[serde(default, skip_serializing_if = "Option::is_none")]
1900    pub arena: Option<ArenaSnapshot>,
1901    /// Position counters and capacity for `BPF_MAP_TYPE_RINGBUF` /
1902    /// `BPF_MAP_TYPE_USER_RINGBUF` maps. Surfaces stuck-consumer
1903    /// diagnostics — pending bytes far below the watermark plus
1904    /// non-zero `pending_pos` indicates a producer holding a
1905    /// reservation; pending bytes near capacity indicates a stalled
1906    /// consumer. `None` for non-ringbuf maps or when the BTF offsets
1907    /// for `bpf_ringbuf_map` / `bpf_ringbuf` weren't resolvable.
1908    #[serde(default, skip_serializing_if = "Option::is_none")]
1909    pub ringbuf: Option<FailureDumpRingbuf>,
1910    /// Per-bucket trace summary for `BPF_MAP_TYPE_STACK_TRACE` maps.
1911    /// `None` for non-STACK_TRACE maps or when the BTF offsets for
1912    /// `bpf_stack_map` / `stack_map_bucket` weren't resolvable.
1913    #[serde(default, skip_serializing_if = "Option::is_none")]
1914    pub stack_trace: Option<FailureDumpStackTrace>,
1915    /// Populated-slot summary for FD-array families (`PROG_ARRAY`,
1916    /// `PERF_EVENT_ARRAY`, `CGROUP_ARRAY`, `ARRAY_OF_MAPS`,
1917    /// `HASH_OF_MAPS`, `DEVMAP*`, `SOCKMAP*`, `CPUMAP`, `XSKMAP`,
1918    /// `REUSEPORT_SOCKARRAY`). `None` for non-FD-array maps.
1919    #[serde(default, skip_serializing_if = "Option::is_none")]
1920    pub fd_array: Option<FailureDumpFdArray>,
1921    /// Reason this map's contents are missing or partial. Empty on
1922    /// successful render.
1923    #[serde(default, skip_serializing_if = "Option::is_none")]
1924    pub error: Option<String>,
1925}
1926
1927/// Ringbuf occupancy snapshot read from `struct bpf_ringbuf` at the
1928/// freeze instant.
1929///
1930/// Capacity, consumer/producer positions, and the in-flight reservation
1931/// frontier (`pending_pos`) are all that's readable without walking the
1932/// records. Pending bytes (= `producer_pos - consumer_pos`, computed
1933/// with unsigned wraparound) is the operator-visible indicator: low
1934/// values = consumer keeping up; values approaching capacity = consumer
1935/// stuck or kernel producer overrunning. A non-zero gap between
1936/// `producer_pos` and `pending_pos` means a producer is mid-reserve
1937/// and the consumer can't advance past `pending_pos`.
1938///
1939/// Read via `crate::monitor::btf_offsets::BpfRingbufOffsets`; rendered
1940/// in `render_ringbuf_state`.
1941#[derive(Debug, Clone, Default, Serialize, Deserialize)]
1942#[non_exhaustive]
1943pub struct FailureDumpRingbuf {
1944    /// Ring data area capacity in bytes (= `mask + 1`). Always a
1945    /// power of two; matches the map's declared `max_entries`.
1946    pub capacity: u64,
1947    /// Consumer position. Byte index of the next record userspace
1948    /// will read. Monotonically advances; the kernel never writes
1949    /// here.
1950    pub consumer_pos: u64,
1951    /// Producer position. Byte index past the last reserved record.
1952    /// Monotonically advances; updated by the kernel on each
1953    /// `bpf_ringbuf_reserve`.
1954    pub producer_pos: u64,
1955    /// Pending position. Byte index of the oldest in-flight (still
1956    /// being filled) reservation. Records below `pending_pos` are
1957    /// committed and visible to the consumer; records between
1958    /// `pending_pos` and `producer_pos` are reserved but not yet
1959    /// committed.
1960    pub pending_pos: u64,
1961    /// Pending bytes (= `producer_pos.wrapping_sub(consumer_pos)`).
1962    /// 0 = consumer caught up; capacity = ring full / consumer
1963    /// stalled. Computed with unsigned wraparound to match the
1964    /// kernel's dispatch-path arithmetic.
1965    pub pending_bytes: u64,
1966}
1967
1968/// Per-bucket summary of populated stack traces in a STACK_TRACE map.
1969///
1970/// Each `entry` is one populated bucket whose pointer was non-null at
1971/// the freeze instant. `nr` is the number of trace samples (PCs) in
1972/// the bucket; `pcs` carries the actual u64 PC values when readable
1973/// (build-id stacks render the raw bytes hex since the per-entry
1974/// shape is `struct bpf_stack_build_id`, not a u64). The dump caps
1975/// per-bucket entries at `MAX_STACK_TRACE_PCS` to bound memory.
1976#[derive(Debug, Clone, Default, Serialize, Deserialize)]
1977#[non_exhaustive]
1978pub struct FailureDumpStackTrace {
1979    /// `bpf_stack_map.n_buckets` — the rounded-up power-of-two slot
1980    /// count. Iteration upper bound; differs from `max_entries` which
1981    /// the kernel rounds.
1982    pub n_buckets: u32,
1983    /// One entry per non-null bucket pointer. Sorted by bucket id.
1984    pub entries: Vec<FailureDumpStackTraceEntry>,
1985    /// True when any populated bucket was truncated at
1986    /// `MAX_STACK_TRACE_PCS` PCs.
1987    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
1988    pub truncated: bool,
1989    /// Count of buckets whose pointer slot or bucket struct could
1990    /// not be translated to a guest physical address at capture
1991    /// time (an unmapped page in the bucket array or a dangling
1992    /// bucket KVA). These buckets are absent from `entries`
1993    /// entirely. `0` when every non-null bucket was readable.
1994    /// Without this count `entries.len()` undercounts the live
1995    /// buckets and the gap reads as "fewer stacks present" rather
1996    /// than "stacks present but unreadable".
1997    #[serde(default, skip_serializing_if = "is_zero_u32")]
1998    pub buckets_unreadable: u32,
1999}
2000
2001/// One populated stack trace from a STACK_TRACE map.
2002#[derive(Debug, Clone, Default, Serialize, Deserialize)]
2003#[non_exhaustive]
2004pub struct FailureDumpStackTraceEntry {
2005    /// Bucket id (= stack ID returned by `bpf_get_stackid`).
2006    pub bucket_id: u32,
2007    /// Number of trace samples (kernel `stack_map_bucket.nr`).
2008    pub nr: u32,
2009    /// PC values (u64) when the map is in non-build-id mode. Empty
2010    /// when `BPF_F_STACK_BUILD_ID` is set on the map (each entry
2011    /// is then a `bpf_stack_build_id` record — its raw bytes land
2012    /// in `data_hex`).
2013    #[serde(default, skip_serializing_if = "Vec::is_empty")]
2014    pub pcs: Vec<u64>,
2015    /// Hex-encoded raw bucket data bytes. Always populated alongside
2016    /// `pcs` so the operator can decode build-id stacks or correlate
2017    /// trace samples with the wire format.
2018    pub data_hex: String,
2019}
2020
2021/// Per-FD-array snapshot of populated indices.
2022///
2023/// FD-array families store `void *` slots in `bpf_array.ptrs`; each
2024/// slot is either NULL (empty) or a kernel pointer (struct bpf_prog *,
2025/// struct file *, etc.). The dump path reads up to
2026/// `MAX_FD_ARRAY_SLOTS` slots, counts non-zero, and lists the
2027/// populated indices.
2028#[derive(Debug, Clone, Default, Serialize, Deserialize)]
2029#[non_exhaustive]
2030pub struct FailureDumpFdArray {
2031    /// Number of populated (non-zero) slots seen.
2032    pub populated: u32,
2033    /// Total slots scanned. Capped at `MAX_FD_ARRAY_SLOTS`.
2034    pub scanned: u32,
2035    /// Indices of populated slots. Truncated to
2036    /// `MAX_FD_ARRAY_INDICES` entries.
2037    #[serde(default, skip_serializing_if = "Vec::is_empty")]
2038    pub indices: Vec<u32>,
2039    /// True when iteration capped at `MAX_FD_ARRAY_SLOTS` and
2040    /// `scanned < max_entries`.
2041    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
2042    pub truncated: bool,
2043    /// True when `populated > indices.len()` because
2044    /// `MAX_FD_ARRAY_INDICES` capped the index list.
2045    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
2046    pub indices_truncated: bool,
2047    /// Count of scanned slots whose KVA could not be translated to a
2048    /// guest physical address (an unmapped page in the `ptrs` flex
2049    /// array). These slots are neither confirmed empty nor counted
2050    /// as `populated`, so `populated` is a lower bound when this is
2051    /// non-zero. `0` when every scanned slot was readable.
2052    #[serde(default, skip_serializing_if = "is_zero_u32")]
2053    pub unreadable: u32,
2054}
2055
2056/// One (key, value) pair from a hash map. Both sides are rendered via
2057/// BTF when key/value type ids are available; a `None` rendering
2058/// preserves the raw bytes.
2059#[derive(Debug, Clone, Default, Serialize, Deserialize)]
2060#[non_exhaustive]
2061pub struct FailureDumpEntry {
2062    /// Rendered key. `None` when no BTF type is available for the key.
2063    #[serde(default, skip_serializing_if = "Option::is_none")]
2064    pub key: Option<RenderedValue>,
2065    /// Hex-encoded raw key bytes. Kept alongside `key` so the operator
2066    /// can correlate rendered output with the wire format.
2067    pub key_hex: String,
2068    /// Rendered value. `None` when no BTF type is available.
2069    #[serde(default, skip_serializing_if = "Option::is_none")]
2070    pub value: Option<RenderedValue>,
2071    /// Hex-encoded raw value bytes.
2072    pub value_hex: String,
2073    /// Typed render of the per-entry sdt_alloc payload, when the value
2074    /// carries a `struct sdt_data __arena *` field that points into a
2075    /// captured arena page and a payload type id was discovered for
2076    /// the matching allocator. `None` when the entry carries no arena
2077    /// pointer to chase, no allocator metadata was found, the payload
2078    /// type was ambiguous, or the arena read failed.
2079    ///
2080    /// `value` already renders the surface struct (e.g.
2081    /// `scx_task_map_val { tid, tptr, data: 0x100000... -> sdt_data {
2082    /// tid: { idx, genn } } }`), but `sdt_data.payload[]` is a flex
2083    /// array — BTF reports its size as 0, so the per-task struct that
2084    /// actually lives in the payload bytes never decodes through the
2085    /// surface render. This field carries that decoded payload
2086    /// alongside the surface struct so the operator sees both views
2087    /// at once.
2088    #[serde(default, skip_serializing_if = "Option::is_none")]
2089    pub payload: Option<RenderedValue>,
2090}
2091
2092/// One entry of a multi-entry `BPF_MAP_TYPE_ARRAY` map: the array key
2093/// (`u32`) and its rendered value. Mirrors [`FailureDumpPercpuEntry`]'s
2094/// typed `u32` key (ARRAY keys are kernel-imposed indices, not
2095/// user-typed bytes) but carries a single value rather than a per-CPU
2096/// vector.
2097///
2098/// `value` is `None` only when the entry's guest page was unmapped at
2099/// the freeze instant; a BTF-render miss falls back to
2100/// `RenderedValue::Bytes` (hex), so `None` unambiguously means
2101/// "unreadable", not "un-rendered". ARRAY values are not
2102/// `sdt_data`-arena-chased — no in-tree sched_ext ARRAY stores arena
2103/// pointers; add a `payload` field here if one ever does.
2104#[derive(Debug, Clone, Default, Serialize, Deserialize)]
2105#[non_exhaustive]
2106pub struct FailureDumpArrayEntry {
2107    /// Array index (kernel key).
2108    pub key: u32,
2109    /// Rendered value (BTF when `btf_value_type_id` is non-zero, hex
2110    /// fallback otherwise). `None` when the entry was unreadable.
2111    #[serde(default, skip_serializing_if = "Option::is_none")]
2112    pub value: Option<RenderedValue>,
2113}
2114
2115/// One key from a per-CPU array, with one rendered value per CPU
2116/// (None for CPUs whose per-CPU page was unmapped or out-of-range).
2117#[derive(Debug, Clone, Default, Serialize, Deserialize)]
2118#[non_exhaustive]
2119pub struct FailureDumpPercpuEntry {
2120    pub key: u32,
2121    pub per_cpu: Vec<Option<RenderedValue>>,
2122}
2123
2124/// One key from a `PERCPU_HASH` / `LRU_PERCPU_HASH` map, with one
2125/// rendered value per CPU. Mirrors [`FailureDumpEntry`] for the key
2126/// side (rendered + hex) and [`FailureDumpPercpuEntry`] for the
2127/// per-CPU value vector.
2128#[derive(Debug, Clone, Default, Serialize, Deserialize)]
2129#[non_exhaustive]
2130pub struct FailureDumpPercpuHashEntry {
2131    /// Rendered key. `None` when no BTF type is available for the key.
2132    #[serde(default, skip_serializing_if = "Option::is_none")]
2133    pub key: Option<RenderedValue>,
2134    /// Hex-encoded raw key bytes.
2135    pub key_hex: String,
2136    /// One slot per CPU. `None` when the CPU's per-CPU slot was
2137    /// unmapped or out-of-range; `Some` rendered (BTF when value type
2138    /// id is non-zero) or hex bytes otherwise.
2139    pub per_cpu: Vec<Option<RenderedValue>>,
2140}
2141
/// Sanity cap (32 MiB) on a single BTF blob read.
///
/// BPF program BTF is normally <100 KB; vmlinux BTF tops out around
/// 10 MB. A bogus `data_size` (corrupted `struct btf`) shouldn't
/// pull megabytes of unrelated guest memory into the renderer or the
/// freeze coordinator. Shared between [`load_program_btf_kva`] and
/// `vmm::load_probe_bss_offset`; defining it here keeps the bound
/// in one place so a future tightening doesn't drift between sites.
pub(crate) const MAX_BTF_BLOB: usize = 32 * 1024 * 1024;
2151
/// Hard cap on the per-task enrichment loop inside [`dump_state`].
///
/// A hostile or pathologically broken guest can produce a runnable_list
/// chain whose length is bounded only by the number of
/// `task_struct`s in the kernel — tens of thousands on a busy box.
/// Each enrichment call walks task/signal/pid/upid offsets, the
/// sched_class registry, and the lock-slowpath stack matcher, so an
/// uncapped loop turns the freeze window from milliseconds into
/// minutes. 4096 is well above any healthy SCX runnable_list depth
/// (the kernel's own watchdog fires long before that many tasks
/// queue up) and still bounds the worst-case freeze cost. When the
/// cap fires, [`dump_state`] truncates without enriching the tail
/// and stamps [`FailureDumpReport::dump_truncated_at_us`] so the
/// operator knows to attribute missing tasks to truncation rather
/// than walker failure.
pub const MAX_ENRICHED_TASKS: usize = 4096;
2168
/// Bare-named ktstr framework maps to skip during enumeration.
///
/// These are declared in `src/bpf/probe.bpf.c` without a libbpf
/// `<obj>.<section>` prefix (`SEC(".maps")` declarations like
/// `func_meta_map`, `probe_data`, `probe_scratch`, `ktstr_events`);
/// the kernel registers them under the bare names listed here.
/// They're framework-internal — the user looking at a failure dump
/// for their scheduler doesn't care about ktstr's own kprobe
/// scratch — so the dump path drops them.
///
/// The framework's ringbuf is named `ktstr_events` (not `events`)
/// so a user scheduler that legitimately names its own ringbuf
/// `events` is not silently dropped from the dump.
///
/// Future ktstr probe additions need to be added here AND the
/// matching `<obj_name>.` prefix needs to be in the
/// `render_map`-internal starts_with list (see `dump_state`).
const KTSTR_INTERNAL_MAPS: &[&str] = &[
    "func_meta_map",
    "probe_data",
    "probe_scratch",
    "ktstr_events",
];
2192
2193/// All inputs the failure-dump renderer needs, bundled so future
2194/// capture sites (DSQ walker, rq->scx walker, NUMA stats, ...) can
2195/// land as new optional fields without churning every call site.
2196///
2197/// `accessor` is currently the concrete guest-memory backend. The
2198/// trait dispatch claim in [`BpfMapAccessor`]'s module-level doc
2199/// is aspirational: `dump_state` reaches through the accessor for
2200/// map enumeration AND for the sdt_alloc post-pass walk, which
2201/// needs the underlying [`super::guest::GuestKernel`] handle —
2202/// only the guest-memory backend exposes that. When the live-host
2203/// backend lands, sdt_alloc walking will move into a
2204/// backend-specific path and `accessor` here can become
2205/// `&'a dyn BpfMapAccessor`.
2206///
2207/// `arena_offsets` and `prog_capture` are both optional borrows
2208/// (uniform shape): `None` for either disables that
2209/// capture leg without affecting the rest. A scheduler running on
2210/// an older kernel without arena support lands here with
2211/// `arena_offsets: None` and the failure dump renders maps + regs
2212/// without arena pages; a setup where the BpfProgAccessor couldn't
2213/// resolve `prog_idr` lands with `prog_capture: None` and
2214/// `prog_runtime_stats` stays empty.
2215pub struct DumpContext<'a> {
2216    /// BPF map accessor. Concrete guest-memory backend today; see
2217    /// the type-level doc for why this is not `&dyn BpfMapAccessor`.
2218    pub accessor: &'a GuestMemMapAccessor<'a>,
2219    /// Host-resolved vmlinux BTF. The renderer uses it as the base
2220    /// for split-BTF parsing on programs that ship their own type
2221    /// info; it's also the fallback when a map's program BTF can't
2222    /// be loaded.
2223    pub btf: &'a Btf,
2224    /// Guest's `nr_cpu_ids`. Forwarded into per-CPU map rendering
2225    /// so PERCPU_ARRAY readers know how many slots to enumerate.
2226    /// Pass `1` for non-percpu-only dumps if the caller doesn't
2227    /// have the value handy.
2228    pub num_cpus: u32,
2229    /// BTF-resolved arena field offsets. Enables
2230    /// `BPF_MAP_TYPE_ARENA` page snapshotting via the accessor
2231    /// trait's `read_arena_pages`. `None` skips arena rendering
2232    /// (older kernel without arena support, or BTF lacking
2233    /// `struct bpf_arena`).
2234    pub arena_offsets: Option<&'a BpfArenaOffsets>,
2235    /// Per-program runtime stats capture. `None` skips
2236    /// prog-runtime capture; the dump still renders every map the
2237    /// accessor enumerates.
2238    pub prog_capture: Option<&'a ProgRuntimeCapture<'a>>,
2239    /// Per-CPU CPU-time / softirq / IRQ capture. `None` skips the
2240    /// per-CPU time walk; the rest of the dump still renders. Same
2241    /// "borrowed-only, optional" shape as
2242    /// [`Self::prog_capture`] / [`Self::arena_offsets`] so a
2243    /// future capture site lands as another optional field without
2244    /// churning the call sites already plumbed through here.
2245    pub cpu_time_capture: Option<&'a CpuTimeCapture<'a>>,
2246    /// Per-cgroup PSI-irq capture (Phase A). `None` skips the cgroup
2247    /// hierarchy walk; the rest of the dump still renders. Same
2248    /// borrowed-only/optional shape as [`Self::cpu_time_capture`].
2249    pub cgroup_psi_capture: Option<&'a CgroupPsiCapture<'a>>,
2250    /// Per-task enrichment capture. `None` skips the per-task walk
2251    /// and `task_enrichments` stays empty; the rest of the dump
2252    /// still renders.
2253    ///
2254    /// Today's freeze coordinator passes `None` because the DSQ
2255    /// and rq->scx task walkers have not yet landed dispatch. The
2256    /// `TaskEnrichmentOffsets` + `SchedClassRegistry` + the
2257    /// `walk_task_enrichment` library are wired and ready —
2258    /// the producer side just needs to populate
2259    /// [`TaskEnrichmentCapture::tasks`] from the rq->scx walker.
2260    pub task_enrichment_capture: Option<&'a TaskEnrichmentCapture<'a>>,
2261    /// SCX_EV_* event counter timeline capture. `None` skips
2262    /// timeline rendering and `event_counter_timeline` stays
2263    /// empty; the rest of the dump still renders. Same
2264    /// "borrowed-only, optional" shape as
2265    /// [`Self::cpu_time_capture`].
2266    pub event_counter_capture: Option<&'a EventCounterCapture<'a>>,
2267    /// SCX rq->scx + DSQ walker capture. `None` skips the walk;
2268    /// `rq_scx_states` / `dsq_states` / `scx_sched_state` stay
2269    /// empty/None and `scx_walker_unavailable` records why.
2270    pub scx_walker_capture: Option<&'a ScxWalkerCapture<'a>>,
2271    /// Host-side per-vCPU hardware perf counters (cycles,
2272    /// instructions, cache-misses, branch-misses) opened with
2273    /// `exclude_host=1`, so each counter only ticks during guest
2274    /// execution. `None` skips the freeze-time read; the
2275    /// [`FailureDumpReport::vcpu_perf_at_freeze`] vec stays empty.
2276    /// See [`super::perf_counters`] for the kernel-source-grounded
2277    /// rationale and capture semantics.
2278    ///
2279    /// The same capture is shared (via `Arc` in the freeze
2280    /// coordinator) with the per-tick monitor sampler; per-tick
2281    /// samples land on each [`super::CpuSnapshot::vcpu_perf`]. The
2282    /// freeze-time read here records the absolute counter values at
2283    /// the instant the failure dump fired, which lets a consumer
2284    /// diff against any earlier sample to compute IPC over a
2285    /// freeze-aligned window.
2286    pub perf_capture: Option<&'a super::perf_counters::PerfCountersCapture>,
2287    /// Soft deadline for the dump's heavy phases (per-map render
2288    /// loop, walk_rq_scx, walk_local_dsqs, walk_dsqs sched-rooted,
2289    /// walk_task_enrichment, sdt_alloc post-pass). When supplied,
2290    /// each phase boundary checks `Instant::now() > deadline`; the
2291    /// first crossing truncates remaining work and stamps
2292    /// [`FailureDumpReport::dump_truncated_at_us`]. `None` disables
2293    /// the bailout — the dump runs every phase to completion.
2294    ///
2295    /// Set by the freeze coordinator to `capture_start +
2296    /// watchdog_timeout/2` so a slow dump can't keep vCPUs parked
2297    /// past the kernel's own SCX_EXIT_ERROR_STALL emission line. The
2298    /// deadline is a soft bound: each phase that has already started
2299    /// runs to completion before checking, so the actual elapsed
2300    /// time at truncation can exceed the deadline by one phase's
2301    /// worth of work.
2302    pub deadline: Option<std::time::Instant>,
2303    /// BPF cast-analysis output for the scheduler's program object,
2304    /// produced once at builder time by parsing the scheduler
2305    /// binary's `.bpf.objs` ELF blob (no libbpf, no kernel
2306    /// interaction). Threaded into every per-map [`RenderMapCtx`]
2307    /// so the renderer's
2308    /// [`super::btf_render::MemReader::cast_lookup`] can promote
2309    /// `u64` fields the analyzer flagged into typed-pointer
2310    /// renders. `None` skips cast-driven promotion entirely (every
2311    /// `u64` renders as a plain unsigned counter, the
2312    /// pre-integration default); same effect as passing an empty
2313    /// map but cheaper to thread.
2314    pub cast_map: Option<&'a super::cast_analysis::CastMap>,
2315    /// Unique alloc_sizes captured from `scx_static_alloc_internal`
2316    /// call sites. Threaded to the renderer as a last-resort fallback.
2317    pub alloc_size_types: &'a [(u64, String)],
2318    /// Cross-BTF Fwd resolution context: every parsed embedded
2319    /// BPF object's program BTF plus a name-keyed index over
2320    /// every complete struct/union across them. Threaded into
2321    /// every per-map [`RenderMapCtx`] so the renderer's
2322    /// [`super::btf_render::MemReader::cross_btf_resolve_fwd`]
2323    /// can chase a `BTF_KIND_FWD` whose body lives in a sibling
2324    /// embedded object's BTF. Borrowed slices in the
2325    /// `(btfs, fwd_index)` pair point into the
2326    /// [`crate::vmm::cast_analysis_load::CastAnalysisOutput`] the
2327    /// freeze coordinator holds alive via `Arc` for the dump
2328    /// pass; `None` (no scheduler binary, or analyzer found no
2329    /// complete struct/union definitions) keeps the renderer's
2330    /// "forward declaration; body not in this BTF" skip path
2331    /// intact.
2332    pub cross_btf_fwd_index: Option<CrossBtfFwdIndex<'a>>,
2333}
2334
2335/// Per-dump cross-BTF Fwd resolution context: every parsed program
2336/// BTF the cast-analysis pre-pass discovered, plus a name-keyed
2337/// index over the complete (`!is_fwd`) struct/union definitions
2338/// across them.
2339///
2340/// Built once at the freeze-coordinator side from
2341/// [`crate::vmm::cast_analysis_load::CastAnalysisOutput`] and
2342/// threaded through [`DumpContext::cross_btf_fwd_index`] into
2343/// every per-map `AccessorMemReader`. The renderer's
2344/// [`super::btf_render::MemReader::cross_btf_resolve_fwd`]
2345/// override range-looks up the hit and returns a
2346/// [`super::btf_render::CrossBtfRef`] whose `btf` borrow points at
2347/// the matching `Arc<Btf>` inside `btfs`.
2348///
2349/// Empty `btfs` / empty `fwd_index` are valid (no scheduler binary,
2350/// or analyzer found no Struct/Union definitions); the bridge stays
2351/// dormant and the chase falls through to the legacy
2352/// "forward declaration" skip path.
2353pub struct CrossBtfFwdIndex<'a> {
2354    /// Every parsed program BTF in the order
2355    /// `crate::vmm::cast_analysis_load::iter_embedded_bpf_objects`
2356    /// yielded the embedded objects. Index 0 is the first object's
2357    /// BTF, etc. Empty when the scheduler binary had no parseable
2358    /// `.bpf.objs`. Borrowed from the
2359    /// [`crate::vmm::cast_analysis_load::CastAnalysisOutput`] held
2360    /// alive by the freeze coordinator's `Arc` for the dump pass.
2361    pub btfs: &'a [std::sync::Arc<Btf>],
2362    /// `name -> FwdIndexEntry` over every complete struct/union
2363    /// across `btfs`. See
2364    /// [`crate::vmm::cast_analysis_load::CastAnalysisOutput::fwd_index`]
2365    /// for the construction policy (first-write-wins on duplicate
2366    /// names, anonymous types skipped).
2367    pub fwd_index:
2368        &'a std::collections::HashMap<String, crate::vmm::cast_analysis_load::FwdIndexEntry>,
2369}
2370
2371/// Reconstruct an `ScxSchedState` from the probe BPF program's
2372/// `.bss` snapshot (`ktstr_exit_*` vars).
2373///
2374/// Used as a fallback by [`dump_state`] when
2375/// [`super::scx_walker::read_scx_sched_state`] returned `None`
2376/// because `*scx_root == 0` at freeze time. The probe's tp_btf
2377/// handler captured the same scalars BEFORE the kernel teardown
2378/// nulled `scx_root`, so this path produces a coherent view of
2379/// what the scheduler looked like AT THE INSTANT IT ERRORED OUT —
2380/// which is exactly the state an operator wants to debug.
2381///
2382/// Returns `None` when:
2383///   - the probe `.bss` map isn't loaded yet (boot-race window),
2384///   - the probe's program BTF can't be parsed,
2385///   - the snapshot's `ktstr_exit_kind_snap` is still 0 (latch
2386///     never fired this run, so the snapshot is empty defaults),
2387///   - or any individual var lookup / read fails wholesale.
2388///
2389/// Variable names match the probe BPF declarations one-for-one
2390/// (`ktstr_exit_aborting`, `ktstr_exit_bypass_depth`,
2391/// `ktstr_exit_kind_snap`, `ktstr_exit_sched_kva`,
2392/// `ktstr_exit_watchdog_timeout`); each is resolved by name via the
2393/// program-BTF Datasec walk so a future addition / reorder of `.bss`
2394/// vars does not silently misalign offsets.
2395/// Decode the probe BPF program's per-CPU counter array
2396/// (`ktstr_pcpu_counters`) and sum each slot across CPUs.
2397///
2398/// The probe declares `pcpu_counter ktstr_pcpu_counters[MAX_CPUS]
2399/// [KTSTR_PCPU_NR]` in `.bss`; each `pcpu_counter` is a single
2400/// `long` field forced to 128-byte alignment, so each per-CPU slot
2401/// occupies its own cacheline. The host walks each
2402/// `(cpu, slot)` 8-byte slice and sums into a [`ProbeBssCounters`]
2403/// — see the BPF source for the
2404/// `ktstr_pcpu_inc(KTSTR_PCPU_<NAME>)` fire sites.
2405///
2406/// Returns `None` when:
2407///   - the probe `.bss` map isn't loaded yet (boot-race window),
2408///   - the probe's program BTF can't be parsed,
2409///   - the BTF doesn't carry a `ktstr_pcpu_counters` var (probe
2410///     build that pre-dates the per-CPU conversion), or
2411///   - the array's bytes can't be read wholesale.
2412///
2413/// All values use `u64` for wire compatibility; the underlying
2414/// kernel `long` is signed but every fire site only ever
2415/// increments, so a positive cumulative count is the only outcome
2416/// in practice. Negative reads (would indicate guest-memory
2417/// corruption) saturate to 0 via `as u64`.
2418fn decode_probe_counters_snapshot(
2419    accessor: &GuestMemMapAccessor<'_>,
2420    base_btf: &Btf,
2421) -> Option<ProbeBssCounters> {
2422    use super::bpf_map::BpfMapAccessor;
2423
2424    // Slot indices must match `enum ktstr_pcpu_idx` in
2425    // src/bpf/probe.bpf.c. A reorder in the BPF source breaks
2426    // every reader; the explicit constants here keep the slot
2427    // mapping localized and reviewable.
2428    const PCPU_PROBE_COUNT: usize = 0;
2429    const PCPU_KPROBE_RETURNS: usize = 1;
2430    const PCPU_META_MISS: usize = 2;
2431    const PCPU_RINGBUF_DROPS: usize = 3;
2432    const PCPU_TIMELINE_COUNT: usize = 4;
2433    const PCPU_TIMELINE_DROPS: usize = 5;
2434    const PCPU_PI_COUNT: usize = 6;
2435    const PCPU_PI_ORPHAN_FEXITS: usize = 7;
2436    const PCPU_PI_CLASS_CHANGE_COUNT: usize = 8;
2437    const PCPU_PI_DROPS: usize = 9;
2438    const PCPU_LOCK_CONTEND_COUNT: usize = 10;
2439    const PCPU_LOCK_CONTEND_DROPS: usize = 11;
2440    const PCPU_PREEMPT_DISABLE_COUNT: usize = 12;
2441    const PCPU_PREEMPT_ENABLE_COUNT: usize = 13;
2442    const PCPU_TRIGGER_COUNT: usize = 14;
2443    const PCPU_NR: usize = 15;
2444    /// Per-CPU slot stride in bytes — `pcpu_counter` is forced to
2445    /// 128-byte alignment in the BPF source so each slot occupies
2446    /// one cacheline. Mirroring the alignment here keeps the
2447    /// host-side walk in lockstep with the BPF storage layout;
2448    /// any future change to the alignment must update both.
2449    const PCPU_SLOT_STRIDE: usize = 128;
2450    /// Per-CPU dimension. Matches `MAX_CPUS` in `src/bpf/probe.bpf.c`
2451    /// (CPU_MASK + 1 = 256). Walking every CPU slot is cheap (256
2452    /// CPUs × 15 slots × 8 bytes = 30 KB of reads); slots beyond
2453    /// the actual `nr_cpus` are zero-init `.bss` and contribute
2454    /// nothing to the sum.
2455    const MAX_CPUS: usize = 256;
2456
2457    // Locate the probe's `.bss` map. Same suffix the freeze
2458    // coordinator's lazy-discovery path uses (matched by suffix
2459    // to avoid colliding with a scheduler-under-test's own
2460    // `.bss`).
2461    let bss_map = accessor.find_array_map("probe_bp.bss")?;
2462    if bss_map.btf_kva == 0 {
2463        // Probe not yet loaded — accessor enumerated a stub.
2464        return None;
2465    }
2466
2467    // Load the probe's program BTF as split BTF on top of the
2468    // host vmlinux BTF (matches the freeze coordinator's
2469    // load_probe_bss_offset pattern). Failure is silent — the
2470    // dump path stays best-effort and falls through to None so
2471    // the caller leaves `probe_counters` as None rather than
2472    // emitting a misleading partial.
2473    let prog_btf = load_program_btf_kva(accessor, bss_map.btf_kva, base_btf)?;
2474
2475    // Resolve the array's byte offset within the `.bss` Datasec.
2476    // A missing var (e.g. probe build that pre-dates the per-CPU
2477    // conversion) means the snapshot wasn't emitted — bail.
2478    let array_off = super::btf_offsets::resolve_var_offset_in_section(
2479        &prog_btf,
2480        ".bss",
2481        "ktstr_pcpu_counters",
2482    )? as usize;
2483
2484    // Read the entire array as one slab — 256 * 15 * 128 = 480 KiB.
2485    // A single slab read is cheaper than 256 * 17 individual reads
2486    // through the page-walking accessor; the read primitive
2487    // tolerates over-large requests (truncates at the map's
2488    // value_size) so a future MAX_CPUS / PCPU_NR shrink doesn't
2489    // need a coordinated host update.
2490    let total_bytes = MAX_CPUS * PCPU_NR * PCPU_SLOT_STRIDE;
2491    let array_bytes = accessor.read_value(&bss_map, array_off, total_bytes)?;
2492    if array_bytes.len() < total_bytes {
2493        // Short read — the map's value_size bounds were tighter
2494        // than the array's compile-time shape. A future probe
2495        // build that shrinks MAX_CPUS or PCPU_NR is the expected
2496        // case; bail rather than misalign the slot indexing.
2497        return None;
2498    }
2499
2500    // Sum every CPU's slot. Each slot's `long value` lives at
2501    // offset 0 within the cacheline-aligned `pcpu_counter`
2502    // struct, so the per-(cpu, slot) byte offset is
2503    // `(cpu * PCPU_NR + slot) * PCPU_SLOT_STRIDE`.
2504    let sum_slot = |slot: usize| -> u64 {
2505        let mut total: u64 = 0;
2506        for cpu in 0..MAX_CPUS {
2507            let off = (cpu * PCPU_NR + slot) * PCPU_SLOT_STRIDE;
2508            // BPF runs in little-endian byte order on every
2509            // host arch ktstr targets (x86_64, aarch64). A future
2510            // big-endian host would need an arch gate — flagged
2511            // in the probe BPF source's byte-order section.
2512            let mut buf = [0u8; 8];
2513            buf.copy_from_slice(&array_bytes[off..off + 8]);
2514            // The kernel's `long` is signed but counters only
2515            // increment; cast through `i64` then to `u64` to
2516            // saturate any negative value (corruption signal) to 0.
2517            let v = i64::from_le_bytes(buf);
2518            if v > 0 {
2519                total = total.saturating_add(v as u64);
2520            }
2521        }
2522        total
2523    };
2524
2525    Some(ProbeBssCounters {
2526        probe_count: sum_slot(PCPU_PROBE_COUNT),
2527        kprobe_returns: sum_slot(PCPU_KPROBE_RETURNS),
2528        meta_miss: sum_slot(PCPU_META_MISS),
2529        ringbuf_drops: sum_slot(PCPU_RINGBUF_DROPS),
2530        timeline_count: sum_slot(PCPU_TIMELINE_COUNT),
2531        timeline_drops: sum_slot(PCPU_TIMELINE_DROPS),
2532        pi_count: sum_slot(PCPU_PI_COUNT),
2533        pi_orphan_fexits: sum_slot(PCPU_PI_ORPHAN_FEXITS),
2534        pi_class_change_count: sum_slot(PCPU_PI_CLASS_CHANGE_COUNT),
2535        pi_drops: sum_slot(PCPU_PI_DROPS),
2536        lock_contend_count: sum_slot(PCPU_LOCK_CONTEND_COUNT),
2537        lock_contend_drops: sum_slot(PCPU_LOCK_CONTEND_DROPS),
2538        preempt_disable_count: sum_slot(PCPU_PREEMPT_DISABLE_COUNT),
2539        preempt_enable_count: sum_slot(PCPU_PREEMPT_ENABLE_COUNT),
2540        trigger_count: sum_slot(PCPU_TRIGGER_COUNT),
2541    })
2542}
2543
2544fn decode_probe_sched_state_snapshot(
2545    accessor: &GuestMemMapAccessor<'_>,
2546    base_btf: &Btf,
2547) -> Option<super::scx_walker::ScxSchedState> {
2548    use super::bpf_map::BpfMapAccessor;
2549
2550    // Locate the probe's `.bss` map. Same suffix the freeze
2551    // coordinator's lazy-discovery path uses (matched by suffix to
2552    // avoid colliding with a scheduler-under-test's own `.bss`).
2553    let bss_map = accessor.find_array_map("probe_bp.bss")?;
2554    if bss_map.btf_kva == 0 {
2555        // Probe not yet loaded — accessor enumerated a stub. The
2556        // var offsets live in the program BTF the loader hasn't
2557        // attached yet.
2558        return None;
2559    }
2560
2561    // Load the probe's program BTF as a split BTF on top of the
2562    // host vmlinux BTF (matches the freeze coordinator's
2563    // load_probe_bss_offset pattern). Failure is silent — the dump
2564    // path stays best-effort and falls through to None so the
2565    // caller leaves `scx_sched_state` as None rather than emitting
2566    // a misleading partial.
2567    let prog_btf = load_program_btf_kva(accessor, bss_map.btf_kva, base_btf)?;
2568
2569    // Resolve each `ktstr_exit_*` var's byte offset within the
2570    // `.bss` Datasec. A missing var (e.g. probe build that pre-
2571    // dates the snapshot vars) means the snapshot wasn't emitted —
2572    // bail rather than render zero defaults that would alias as
2573    // "scheduler healthy and exited cleanly".
2574    let kind_off = super::btf_offsets::resolve_var_offset_in_section(
2575        &prog_btf,
2576        ".bss",
2577        "ktstr_exit_kind_snap",
2578    )?;
2579    let aborting_off = super::btf_offsets::resolve_var_offset_in_section(
2580        &prog_btf,
2581        ".bss",
2582        "ktstr_exit_aborting",
2583    )?;
2584    let bypass_depth_off = super::btf_offsets::resolve_var_offset_in_section(
2585        &prog_btf,
2586        ".bss",
2587        "ktstr_exit_bypass_depth",
2588    )?;
2589    let sched_kva_off = super::btf_offsets::resolve_var_offset_in_section(
2590        &prog_btf,
2591        ".bss",
2592        "ktstr_exit_sched_kva",
2593    )?;
2594    let watchdog_timeout_off = super::btf_offsets::resolve_var_offset_in_section(
2595        &prog_btf,
2596        ".bss",
2597        "ktstr_exit_watchdog_timeout",
2598    )?;
2599
2600    // Read each var's bytes via the accessor. `.bss` maps have a
2601    // single key (zero) and the value bytes ARE the section bytes,
2602    // so `read_value(map, off, size)` is the read primitive. A
2603    // failed read on any field bails the whole snapshot — partial
2604    // values would mislead the consumer.
2605    let kind_bytes = accessor.read_value(&bss_map, kind_off as usize, 4)?;
2606    let kind = u32::from_le_bytes(kind_bytes.as_slice().try_into().ok()?);
2607
2608    // The snapshot is sticky: `ktstr_exit_kind_snap` stays at 0
2609    // until the BPF tp_btf handler latches an error-class exit. A
2610    // 0 here means the latch never fired — the snapshot vars are
2611    // all at their initial 0/false defaults and the dump should
2612    // honour `*scx_root == 0` as "no scheduler state to surface"
2613    // rather than render a fake healthy-exit ScxSchedState.
2614    if kind == 0 {
2615        return None;
2616    }
2617
2618    let aborting_bytes = accessor.read_value(&bss_map, aborting_off as usize, 1)?;
2619    let aborting = aborting_bytes.first().copied()? != 0;
2620
2621    let bypass_depth_bytes = accessor.read_value(&bss_map, bypass_depth_off as usize, 4)?;
2622    let bypass_depth = i32::from_le_bytes(bypass_depth_bytes.as_slice().try_into().ok()?);
2623
2624    let sched_kva_bytes = accessor.read_value(&bss_map, sched_kva_off as usize, 8)?;
2625    let sched_kva = u64::from_le_bytes(sched_kva_bytes.as_slice().try_into().ok()?);
2626
2627    let watchdog_timeout_bytes = accessor.read_value(&bss_map, watchdog_timeout_off as usize, 8)?;
2628    let watchdog_timeout = u64::from_le_bytes(watchdog_timeout_bytes.as_slice().try_into().ok()?);
2629
2630    Some(super::scx_walker::ScxSchedState {
2631        aborting,
2632        bypass_depth,
2633        exit_kind: kind,
2634        watchdog_timeout: Some(watchdog_timeout),
2635        source: Some(super::scx_walker::SCX_SCHED_STATE_SOURCE_BSS.to_string()),
2636        // `sched_kva == 0` would mean the BPF probe handler ran
2637        // BEFORE `*scx_root` was populated (impossibly early — the
2638        // tp_btf hook is on `sched_ext_exit`, which only fires after
2639        // a sched_ext scheduler attached and ran). Surface it as
2640        // None so the consumer can distinguish "snapshot data exists
2641        // but no slab address" from "snapshot has the address" via
2642        // a single Option rather than a magic-zero check.
2643        sched_kva: if sched_kva == 0 {
2644            None
2645        } else {
2646            Some(sched_kva)
2647        },
2648    })
2649}
2650
2651/// Snapshot every BPF map visible to the host accessor.
2652///
2653/// The dump is best-effort: a map that fails to render lands in the
2654/// report with `error: Some(...)` rather than aborting the whole walk,
2655/// so a single corrupt map can't blind the operator to the rest of
2656/// the scheduler's state.
2657pub fn dump_state(ctx: DumpContext<'_>) -> FailureDumpReport {
2658    let DumpContext {
2659        accessor,
2660        btf,
2661        num_cpus,
2662        arena_offsets,
2663        prog_capture,
2664        cpu_time_capture,
2665        cgroup_psi_capture,
2666        task_enrichment_capture,
2667        event_counter_capture,
2668        scx_walker_capture,
2669        perf_capture,
2670        deadline,
2671        cast_map,
2672        cross_btf_fwd_index,
2673        alloc_size_types,
2674    } = ctx;
2675    let cross_btf_fwd_index_ref = cross_btf_fwd_index.as_ref();
2676    // Wall-clock origin for per-phase elapsed_us tracing and the
2677    // soft-deadline bailout. Each heavy phase compares
2678    // `Instant::now()` against `deadline` AFTER it finishes, so a
2679    // truncation captures the phase's data before short-circuiting
2680    // the remaining ones (consistent with the doc on
2681    // [`DumpContext::deadline`]).
2682    let dump_start = std::time::Instant::now();
2683    // Tracks the elapsed_us of the first phase to observe a deadline
2684    // crossing. Stamped onto [`FailureDumpReport::dump_truncated_at_us`]
2685    // at the end so the operator can attribute missing maps / tasks /
2686    // walker results to truncation rather than walker failure.
2687    let mut truncated_at_us: Option<u64> = None;
2688    // Helper closure: returns `true` once the deadline (if any) has
2689    // been crossed. Sets `truncated_at_us` on the FIRST crossing so
2690    // the report records WHERE truncation began, not the last phase
2691    // to short-circuit. Idempotent on repeated calls — once stamped,
2692    // every later phase sees the same elapsed_us.
2693    let deadline_exceeded = |truncated_at: &mut Option<u64>| -> bool {
2694        if let Some(deadline) = deadline {
2695            let now = std::time::Instant::now();
2696            if now > deadline {
2697                if truncated_at.is_none() {
2698                    let elapsed_us = dump_start.elapsed().as_micros() as u64;
2699                    *truncated_at = Some(elapsed_us);
2700                    tracing::warn!(
2701                        elapsed_us,
2702                        "dump_state: deadline exceeded, truncating remaining phases"
2703                    );
2704                }
2705                return true;
2706            }
2707        }
2708        false
2709    };
2710    let maps = accessor.maps();
2711    let (prog_runtime_stats, prog_runtime_stats_unavailable) = match prog_capture {
2712        Some(cap) => {
2713            let stats = cap.accessor.struct_ops_runtime_stats(cap.per_cpu_offsets);
2714            let reason = if stats.is_empty() {
2715                Some(REASON_NO_STRUCT_OPS_LOADED.to_string())
2716            } else {
2717                None
2718            };
2719            (stats, reason)
2720        }
2721        None => (
2722            Vec::new(),
2723            Some(REASON_PROG_ACCESSOR_UNAVAILABLE.to_string()),
2724        ),
2725    };
2726    let per_cpu_time = match cpu_time_capture {
2727        Some(cap) => collect_per_cpu_time(cap),
2728        None => Vec::new(),
2729    };
2730    // Per-cgroup PSI-irq for the test's workload leaves (Phase A). Empty
2731    // when no capture was supplied, the workload root isn't present, or
2732    // psi_cgroups is off — loud-absent.
2733    let cgroup_psi = match cgroup_psi_capture {
2734        Some(cap) => collect_cgroup_psi(cap),
2735        None => Vec::new(),
2736    };
2737    let task_enrichment_t0 = std::time::Instant::now();
2738    let (task_enrichments, task_enrichments_unavailable) = match task_enrichment_capture {
2739        Some(cap) => {
2740            // Cap iteration AND Vec capacity at MAX_ENRICHED_TASKS so
2741            // a hostile guest with a corrupt or absurdly long
2742            // runnable_list can't drag the freeze window into the
2743            // tens-of-seconds range.
2744            let total = cap.tasks.len();
2745            let cap_n = total.min(MAX_ENRICHED_TASKS);
2746            let mut enrichments = Vec::with_capacity(cap_n);
2747            for entry in cap.tasks.iter().take(cap_n) {
2748                if let Some(e) = super::task_enrichment::walk_task_enrichment(
2749                    cap.kernel,
2750                    entry.task_kva,
2751                    cap.offsets,
2752                    cap.sched_classes,
2753                    cap.lock_slowpaths,
2754                    entry.is_runnable_in_scx,
2755                    entry.running_pc,
2756                ) {
2757                    enrichments.push(e);
2758                }
2759            }
2760            if total > cap_n {
2761                tracing::warn!(
2762                    cap = MAX_ENRICHED_TASKS,
2763                    total,
2764                    "dump_state task_enrichment: capped at MAX_ENRICHED_TASKS, dropping tail"
2765                );
2766            }
2767            let reason = if enrichments.is_empty() {
2768                tracing::debug!(
2769                    tasks_count = total,
2770                    "dump_state task_enrichment: walker yielded zero entries — \
2771                     scx_tasks list and rq->scx.runnable_list both empty, or every \
2772                     walk_task_enrichment call returned None (translate failures)",
2773                );
2774                Some(REASON_TASK_WALKER_ZERO_TASKS.to_string())
2775            } else {
2776                None
2777            };
2778            (enrichments, reason)
2779        }
2780        None => {
2781            tracing::debug!(
2782                "dump_state task_enrichment: capture is None — \
2783                 freeze coordinator passed no TaskEnrichmentCapture \
2784                 (scx_owned, scx_walker_offsets, or task_enrichment_offsets unresolved)",
2785            );
2786            (Vec::new(), Some(REASON_NO_TASK_WALKER.to_string()))
2787        }
2788    };
2789    tracing::debug!(
2790        elapsed_us = task_enrichment_t0.elapsed().as_micros() as u64,
2791        enriched = task_enrichments.len(),
2792        "dump_state phase: walk_task_enrichment"
2793    );
2794    deadline_exceeded(&mut truncated_at_us);
2795    let event_counter_timeline = match event_counter_capture {
2796        Some(cap) => cap
2797            .samples
2798            .iter()
2799            .filter_map(EventCounterSample::from_monitor_sample)
2800            .collect(),
2801        None => Vec::new(),
2802    };
2803    let (rq_scx_states, dsq_states, scx_sched_state, scx_walker_unavailable) =
2804        match scx_walker_capture {
2805            Some(cap) => {
2806                // Sub-group offsets resolved per kernel struct;
2807                // surface the absent groups in the diagnostic so a
2808                // partial walk announces which passes were skipped.
2809                let missing = cap.offsets.missing_groups();
2810
2811                // 1. Read scalar scx_sched state and recover the
2812                //    sched_pa for the sched-rooted DSQ walker passes.
2813                //    `sched_state` is None when the BTF lacked the
2814                //    `sched` sub-group OR when *scx_root == 0
2815                //    (no scheduler attached) — both surface as a
2816                //    None scx_sched_state in the report. The
2817                //    distinction is encoded in `scx_walker_unavailable`
2818                //    via REASON_SCX_ROOT_NULL.
2819                let (sched_pa_opt, sched_state) = match super::scx_walker::read_scx_sched_state(
2820                    cap.kernel,
2821                    cap.scx_root_kva,
2822                    cap.offsets,
2823                ) {
2824                    Some((sched_kva, state)) => {
2825                        // Translate sched_kva → PA (slab/vmalloc; use
2826                        // translate_any_kva via the GuestKernel handle).
2827                        let mem = cap.kernel.mem();
2828                        let walk = cap.kernel.walk_context();
2829                        let pa = super::idr::translate_any_kva(
2830                            mem,
2831                            walk.cr3_pa,
2832                            walk.page_offset,
2833                            sched_kva,
2834                            walk.l5,
2835                            walk.tcr_el1,
2836                        );
2837                        (pa, Some(state))
2838                    }
2839                    None => {
2840                        // Live read failed — `*scx_root == 0` because
2841                        // the scheduler has already torn down by
2842                        // freeze time. Fall back to the BPF .bss
2843                        // snapshot the probe's tp_btf handler latched
2844                        // at err-exit time. The snapshot is the
2845                        // strict subset of scheduler state the host
2846                        // renderer needs; the sched_pa stays None
2847                        // because the slab page that backed the live
2848                        // `scx_sched` was freed during teardown and
2849                        // the sched-rooted DSQ passes (per-node
2850                        // global, user dsq_hash) cannot reach it any
2851                        // longer. The caller's `unavail` selector
2852                        // below now sees `Some(state)` and skips
2853                        // REASON_SCX_ROOT_NULL — the consumer reads
2854                        // `state.source = "bss_snapshot"` to
2855                        // distinguish snapshot from live.
2856                        let snap = decode_probe_sched_state_snapshot(accessor, btf);
2857                        if snap.is_some() {
2858                            tracing::debug!(
2859                                scx_root_kva = format_args!("{:#x}", cap.scx_root_kva),
2860                                "dump_state scx walker: live read returned None; \
2861                                 BPF .bss snapshot fallback populated scx_sched_state \
2862                                 (scheduler torn down before freeze, snapshot \
2863                                 captured at err-exit instant)",
2864                            );
2865                        }
2866                        (None, snap)
2867                    }
2868                };
2869
2870                // 2. Per-CPU rq->scx walk. Per-CPU runs only when the
2871                //    rq + scx_rq + task sub-groups are present;
2872                //    walk_rq_scx returns None to skip otherwise.
2873                let walk_rq_scx_t0 = std::time::Instant::now();
2874                let mut rq_states = Vec::with_capacity(cap.rq_kvas.len());
2875                if !deadline_exceeded(&mut truncated_at_us) {
2876                    for (cpu, (&rq_kva, &rq_pa)) in
2877                        cap.rq_kvas.iter().zip(cap.rq_pas.iter()).enumerate()
2878                    {
2879                        if let Some((state, _entries)) = super::scx_walker::walk_rq_scx(
2880                            cap.kernel,
2881                            cpu as u32,
2882                            rq_kva,
2883                            rq_pa,
2884                            cap.offsets,
2885                        ) {
2886                            rq_states.push(state);
2887                        }
2888                    }
2889                }
2890                tracing::debug!(
2891                    elapsed_us = walk_rq_scx_t0.elapsed().as_micros() as u64,
2892                    cpus = cap.rq_kvas.len(),
2893                    rq_states = rq_states.len(),
2894                    "dump_state phase: walk_rq_scx"
2895                );
2896
2897                // 3. Per-CPU local DSQ walk runs unconditionally —
2898                //    `rq->scx.local_dsq` is initialized at boot
2899                //    (init_dsq from kernel/sched/ext.c:4581 for every
2900                //    possible CPU) and survives scheduler teardown,
2901                //    so it produces data even when *scx_root is NULL.
2902                //    This is the data source that survives
2903                //    scx_bypass's runnable_list drain
2904                //    (kernel/sched/ext.c:5448-5548) during teardown.
2905                let walk_local_dsqs_t0 = std::time::Instant::now();
2906                let mut dsqs: Vec<super::scx_walker::DsqState> = Vec::new();
2907                if !deadline_exceeded(&mut truncated_at_us)
2908                    && let Some((local_states, _entries)) = super::scx_walker::walk_local_dsqs(
2909                        cap.kernel,
2910                        cap.rq_kvas,
2911                        cap.rq_pas,
2912                        cap.per_cpu_offsets,
2913                        cap.offsets,
2914                    )
2915                {
2916                    dsqs.extend(local_states);
2917                }
2918                tracing::debug!(
2919                    elapsed_us = walk_local_dsqs_t0.elapsed().as_micros() as u64,
2920                    local_dsqs = dsqs.len(),
2921                    "dump_state phase: walk_local_dsqs"
2922                );
2923
2924                // 4. Sched-rooted DSQ passes (per-CPU bypass, per-node
2925                //    global, user dsq_hash) require the sched_pa we
2926                //    resolved in step 1. Without it, no scheduler is
2927                //    attached and these DSQs don't exist at all.
2928                let walk_dsqs_t0 = std::time::Instant::now();
2929                if !deadline_exceeded(&mut truncated_at_us)
2930                    && let Some(sched_pa) = sched_pa_opt
2931                {
2932                    let (sched_states, _entries) = super::scx_walker::walk_dsqs(
2933                        cap.kernel,
2934                        sched_pa,
2935                        cap.per_cpu_offsets,
2936                        cap.nr_nodes,
2937                        cap.offsets,
2938                    );
2939                    dsqs.extend(sched_states);
2940                }
2941                tracing::debug!(
2942                    elapsed_us = walk_dsqs_t0.elapsed().as_micros() as u64,
2943                    total_dsqs = dsqs.len(),
2944                    "dump_state phase: walk_dsqs"
2945                );
2946
2947                // Diagnostic priority:
2948                //   1. Partial-degradation (sub-group(s) missing) —
2949                //      announces exactly which passes were skipped.
2950                //   2. *scx_root is NULL — sched/bypass/global/user
2951                //      passes blinded but rq->scx + local DSQ still
2952                //      work; surface this distinct reason so the
2953                //      operator knows the scheduler isn't attached.
2954                //   3. Walker reached no state at all — typical when
2955                //      every read fails.
2956                //   4. None — every pass had data to surface.
2957                let unavail = if !missing.is_empty() {
2958                    tracing::debug!(
2959                        missing_groups = ?missing,
2960                        rq_states_count = rq_states.len(),
2961                        dsq_count = dsqs.len(),
2962                        sched_state_some = sched_state.is_some(),
2963                        "dump_state scx walker: partial degradation — missing BTF sub-groups",
2964                    );
2965                    Some(format!(
2966                        "scx walker partial: missing offset groups [{}]",
2967                        missing.join(", ")
2968                    ))
2969                } else if sched_state.is_none() {
2970                    tracing::debug!(
2971                        scx_root_kva = format_args!("{:#x}", cap.scx_root_kva),
2972                        rq_states_count = rq_states.len(),
2973                        dsq_count = dsqs.len(),
2974                        "dump_state scx walker: scx_root is NULL — no scheduler attached; \
2975                         rq->scx and local DSQ captures populated, sched/bypass/global/user passes blinded",
2976                    );
2977                    Some(REASON_SCX_ROOT_NULL.to_string())
2978                } else if rq_states.is_empty() && dsqs.is_empty() {
2979                    tracing::debug!(
2980                        scx_root_kva = format_args!("{:#x}", cap.scx_root_kva),
2981                        "dump_state scx walker: every walker read failed — no rq->scx, no DSQ, but sched_state present",
2982                    );
2983                    Some(REASON_SCX_WALKER_NO_STATE.to_string())
2984                } else {
2985                    None
2986                };
2987                (rq_states, dsqs, sched_state, unavail)
2988            }
2989            None => {
2990                tracing::debug!(
2991                    "dump_state scx walker: capture is None — \
2992                     freeze coordinator passed no ScxWalkerCapture (offsets/symbols/per_cpu_offsets unresolved)",
2993                );
2994                (
2995                    Vec::new(),
2996                    Vec::new(),
2997                    None,
2998                    Some(REASON_NO_SCX_WALKER.to_string()),
2999                )
3000            }
3001        };
3002    // Freeze-time per-vCPU perf-counter snapshot. With `exclude_host=1`
3003    // each counter ticks only during guest execution; the freeze
3004    // coordinator has parked every vCPU before reaching this site, so
3005    // the read returns the cumulative count at the last guest exit
3006    // for each vCPU. A single per-vCPU read failure is recorded as
3007    // `None` for that entry; a failure on one vCPU does not blank the
3008    // others. When `perf_capture` is None the vec stays empty (the
3009    // host lacked perf, or `perf_event_open` failed at run start).
3010    let vcpu_perf_at_freeze: Vec<Option<super::perf_counters::VcpuPerfSample>> = match perf_capture
3011    {
3012        Some(cap) => cap.per_vcpu.iter().map(|p| p.read().ok()).collect(),
3013        None => Vec::new(),
3014    };
3015
3016    // Snapshot the probe's per-CPU diagnostic counters before the
3017    // per-map render loop walks `.bss` itself — the read goes
3018    // through the same `read_value` path the renderer uses, but
3019    // captures the array as a structured `ProbeBssCounters` rather
3020    // than the BTF Datasec render. Best-effort: a None result
3021    // (probe not loaded, BTF missing the var) leaves the report's
3022    // `probe_counters` empty and the existing `.bss` map render
3023    // still surfaces the raw bytes.
3024    let probe_counters = decode_probe_counters_snapshot(accessor, btf);
3025
3026    // Resolve the active-scheduler obj name. The prog accessor +
3027    // BpfMapOffsets pair powers the target-free `prog_idr` walker
3028    // (the primary path; needs no `scx_root`, so it works on
3029    // pre-6.16 kernels too) -- when absent, the helper degrades to the
3030    // prefix-grouping fallback over struct_ops map names.
3031    let prog_walker = prog_capture.map(|cap| {
3032        (
3033            cap.accessor as &dyn super::bpf_prog::BpfProgAccessor,
3034            accessor.offsets(),
3035        )
3036    });
3037    let (active_obj_name, active_map_kvas) =
3038        match identify_active_obj_from_struct_ops(&maps, prog_walker) {
3039            Some((name, kvas)) => (Some(name), kvas),
3040            None => (None, Vec::new()),
3041        };
3042    let mut report = FailureDumpReport {
3043        schema: SCHEMA_SINGLE.to_string(),
3044        active_map_kvas,
3045        maps: Vec::with_capacity(maps.len()),
3046        vcpu_regs: Vec::new(),
3047        sdt_allocations: Vec::new(),
3048        sdt_alloc_unavailable: None,
3049        prog_runtime_stats,
3050        prog_runtime_stats_unavailable,
3051        per_cpu_time,
3052        cgroup_psi,
3053        // Per-node NUMA wire fields: empty Vec + the well-defined
3054        // diagnostic string until the host-side walker lands.
3055        per_node_numa: Vec::new(),
3056        per_node_numa_unavailable: Some(REASON_NO_NUMA_WALKER.to_string()),
3057        task_enrichments,
3058        task_enrichments_unavailable,
3059        event_counter_timeline,
3060        rq_scx_states,
3061        dsq_states,
3062        scx_sched_state,
3063        scx_walker_unavailable,
3064        vcpu_perf_at_freeze,
3065        dump_truncated_at_us: None,
3066        maps_truncated: 0,
3067        probe_counters,
3068        scx_static_ranges: Default::default(),
3069        is_placeholder: false,
3070        active_obj_name,
3071    };
3072
3073    // Per-map program-BTF cache, keyed by `btf_kva`. Each unique
3074    // `struct btf *` lives in the kernel BTF IDR — multiple maps from
3075    // the same BPF program point at the same KVA, so caching dedupes
3076    // the heavy `Btf::from_bytes`/`from_split_bytes` parse across them
3077    // (a scheduler with N maps backed by one BPF object pays one
3078    // parse, not N). Lookups go through this cache before falling
3079    // back to the caller-supplied vmlinux `btf`.
3080    //
3081    // Populated by an explicit pre-pass below so the sdt_alloc walk
3082    // can read it before the per-map render loop runs (the renderer
3083    // needs the resulting allocator metadata via `RenderMapCtx`).
3084    let mut program_btfs: std::collections::HashMap<u64, Btf> = std::collections::HashMap::new();
3085
3086    // Pre-pass: locate the first non-internal `BPF_MAP_TYPE_ARENA`
3087    // map (skipping the same ktstr-internal name set the main loop
3088    // skips) and snapshot it once before any map renders. This lets
3089    // the per-map `MemReader` chase `__arena` pointers no matter
3090    // which slot the arena map occupies in the iteration order —
3091    // the previous design ran `snapshot_arena` lazily inside
3092    // `render_map`'s arena arm, so non-arena maps that rendered
3093    // earlier saw `arena_snapshot: None` and silently failed every
3094    // arena pointer chase. `lib/arena_map.h` declares one `__weak`
3095    // arena per BPF object so a single shared snapshot covers every
3096    // `__arena` pointer the scheduler emits; additional arena maps
3097    // (multi-object schedulers, theoretical) still get their own
3098    // snapshot inside `render_map`'s arena arm — they just don't
3099    // contribute to the cross-map pointer-chase context.
3100    let shared_arena_snapshot: Option<(BpfMapInfo, ArenaSnapshot)> =
3101        arena_offsets.and_then(|off| {
3102            for info in &maps {
3103                let name = info.name();
3104                if name.starts_with("probe_bp.")
3105                    || name.starts_with("fentry_p.")
3106                    || name == "probe_bp"
3107                    || name == "fentry_p"
3108                    || KTSTR_INTERNAL_MAPS.contains(&name.as_ref())
3109                {
3110                    continue;
3111                }
3112                if info.map_type == BPF_MAP_TYPE_ARENA {
3113                    let snap = snapshot_arena(accessor.kernel(), info, off);
3114                    return Some((info.clone(), snap));
3115                }
3116            }
3117            None
3118        });
3119    let shared_arena_ref: Option<(&ArenaSnapshot, u64)> = shared_arena_snapshot
3120        .as_ref()
3121        .map(|(info, snap)| (snap, info.map_kva));
3122
3123    // Cache `kern_vm_start` from the pre-pass snapshot for the
3124    // sdt_alloc walk. Pulling directly from `shared_arena_snapshot`
3125    // (rather than scraping each rendered map's `arena` field in the
3126    // main loop) keeps the walk gating decoupled from per-map render
3127    // order — the data the walker needs is finalized before the
3128    // loop runs.
3129    let arena_kern_vm_start: u64 = shared_arena_snapshot
3130        .as_ref()
3131        .map(|(_, snap)| snap.kern_vm_start)
3132        .unwrap_or(0);
3133
3134    // Pre-pass: load every non-internal map's program BTF and locate
3135    // the scheduler's `.bss` raw bytes. Both inputs feed the
3136    // sdt_alloc walk below — moving them out of the main render loop
3137    // means the allocator metadata that decoration in the
3138    // TASK_STORAGE arm needs (`elem_size`, `target_type_id`) is
3139    // available BEFORE any map renders, instead of getting derived
3140    // post-loop only to be unusable for per-entry payload chase.
3141    let mut sched_bss_bytes: Option<(Vec<u8>, u64)> = None; // (bytes, btf_kva)
3142    for info in &maps {
3143        let name = info.name();
3144        if name.starts_with("probe_bp.")
3145            || name.starts_with("fentry_p.")
3146            || name == "probe_bp"
3147            || name == "fentry_p"
3148            || KTSTR_INTERNAL_MAPS.contains(&name.as_ref())
3149        {
3150            continue;
3151        }
3152        if info.btf_kva != 0
3153            && !program_btfs.contains_key(&info.btf_kva)
3154            && let Some(loaded) = accessor.load_program_btf(info, btf)
3155        {
3156            program_btfs.insert(info.btf_kva, loaded);
3157        }
3158        if sched_bss_bytes.is_none()
3159            && info.map_type == BPF_MAP_TYPE_ARRAY
3160            && info.btf_kva != 0
3161            && name.ends_with(".bss")
3162            && let Some(bytes) = accessor.read_value(info, 0, info.value_size as usize)
3163        {
3164            sched_bss_bytes = Some((bytes, info.btf_kva));
3165        }
3166    }
3167
3168    // Pre-pass: walk sdt_alloc trees if all prerequisites lined up.
3169    // Runs BEFORE the main render loop so the allocator metadata it
3170    // discovers (`elem_size`, `target_type_id`,
3171    // `data_header_size`) is available to per-map decoration —
3172    // specifically, the TASK_STORAGE arm uses it to expand each
3173    // entry's `struct sdt_data __arena *` pointer into a typed
3174    // payload render via [`render_map`]'s
3175    // [`crate::monitor::dump::render_map::SdtAllocMeta`].
3176    //
3177    // The walk is best-effort and silent: any missing prerequisite
3178    // (no scheduler .bss, no arena window, no program BTF, no
3179    // `scx_allocator` type) leaves `sdt_allocations` empty rather
3180    // than failing the dump. `sdt_alloc_metas` stays empty in the
3181    // same cases, so each per-entry payload field also degrades to
3182    // `None` (the surface struct still renders).
3183    //
3184    // Build the dump-pass arena page index here too — once outside
3185    // the per-map loop so each per-map `mem_reader` borrows the
3186    // existing table instead of rebuilding it. The sdt_alloc walk
3187    // below uses the same index for its own MemReader.
3188    let arena_page_index = crate::monitor::dump::render_map::build_arena_page_index(
3189        shared_arena_snapshot.as_ref().map(|(_, snap)| snap),
3190    );
3191    let sdt_alloc_t0 = std::time::Instant::now();
3192    // Every typed allocator the program declares; the per-map
3193    // selector in [`render_map`] picks the matching entry by name
3194    // (e.g. `scx_task_allocator` matches `scx_task_map`). A
3195    // single-allocator scheduler hits the unique-candidate path —
3196    // every map gets that allocator. A multi-allocator scheduler
3197    // (per-task + per-cgroup) lets each local-storage map render
3198    // its own payload type instead of forcing the renderer to give
3199    // up.
3200    let mut sdt_alloc_metas: Vec<crate::monitor::dump::render_map::SdtAllocMeta> = Vec::new();
3201    // `slot_start → ArenaSlotInfo` lookup populated as each
3202    // allocator walk completes.
3203    // [`MemReader::resolve_arena_type`] consults this index via a
3204    // range lookup: given a chased address, find the slot whose
3205    // `[slot_start, slot_start + elem_size)` range contains it,
3206    // then route on `offset_in_slot`:
3207    //
3208    //   - `offset_in_slot == 0` (slot-start pointer, e.g. the
3209    //     `data` field of `scx_task_map_val` storing the raw
3210    //     `sdt_alloc()` return) → render the payload skipping
3211    //     `header_size` bytes of header.
3212    //   - `offset_in_slot == header_size` (payload-start pointer,
3213    //     e.g. the return of `scx_task_data(p)` cached in
3214    //     `cached_taskc_raw`) → render the payload directly.
3215    //   - Other in-slot offsets → no resolve; the renderer falls
3216    //     back to its existing skip behaviour.
3217    //
3218    // Built incrementally inside the walk loop so the
3219    // per-allocator snapshot moves into `report.sdt_allocations`
3220    // after each iteration without a clone.
3221    //
3222    // [`crate::monitor::sdt_alloc::TreeWalker::emit_leaf`]
3223    // populates each [`SdtAllocEntry::user_addr`] as
3224    // `data_ptr & 0xFFFF_FFFF` — the slot-START address windowed
3225    // to the low 32 bits. The index keys directly on this
3226    // windowed slot start, paired with an [`ArenaSlotInfo`] that
3227    // carries `elem_size`, `header_size`, and the payload BTF
3228    // type id so the [`MemReader::resolve_arena_type`] range
3229    // lookup has every value it needs to decide the chase shape.
3230    //
3231    // Slot non-overlap invariant: the kernel allocator places
3232    // slots back-to-back inside one `sdt_chunk` and never re-uses
3233    // a position while the bitmap still has it marked allocated
3234    // (see `lib/sdt_alloc.bpf.c::scx_alloc_internal`'s
3235    // bitmap-then-data ordering). Two distinct slots cannot have
3236    // overlapping `[start, start + elem_size)` ranges, so
3237    // dedup-on-exact-key here is sufficient — we cannot land on a
3238    // case where `slot_a + elem_a > slot_b > slot_a` with
3239    // `slot_b` separately keyed.
3240    //
3241    // Duplicates (two slots reporting the same slot start,
3242    // indicating a stale snapshot from a freed allocation racing
3243    // with the freeze) keep the FIRST entry; this matches the
3244    // [`build_arena_page_index`] policy on duplicate user_addr
3245    // pages and emits a `tracing::warn!` line so an operator
3246    // diagnosing a wrong-render can spot the collision.
3247    let mut arena_slot_index = crate::monitor::dump::render_map::ArenaSlotIndex::new();
3248    // 4 GiB-alignment invariant: the bridge keys on the low 32
3249    // bits of slot start. That is correct iff `user_vm_start` is
3250    // 4 GiB-aligned — `slot_full_addr - slot_low32 == user_vm_start`
3251    // and the renderer reconstructs full addresses by masking
3252    // chased values with `0xFFFF_FFFF`. Every in-tree scx scheduler
3253    // sets `map_extra` to a 4 GiB-aligned value (`1 << 32`,
3254    // `1 << 44`); the kernel auto-pick path in
3255    // `bpf_arena_map_alloc` (kernel/bpf/arena.c) rounds the user
3256    // VM area up to `SZ_4G` before mounting. The kernel does
3257    // accept arbitrary `map_extra` from userspace, so an
3258    // out-of-tree scheduler could in theory pass an unaligned
3259    // value — surface a warning and skip the index build rather
3260    // than silently misroute every chase.
3261    let user_vm_aligned = shared_arena_snapshot
3262        .as_ref()
3263        .map(|(_, snap)| snap.user_vm_start & 0xFFFF_FFFF == 0)
3264        .unwrap_or(false);
3265    if !user_vm_aligned && let Some((_, snap)) = shared_arena_snapshot.as_ref() {
3266        tracing::warn!(
3267            user_vm_start = format_args!("{:#x}", snap.user_vm_start),
3268            "sdt_alloc bridge skipped: user_vm_start is not 4 GiB-aligned; \
3269             low-32 keying would misroute every chase",
3270        );
3271    }
3272    // `user_vm_start == 0` is technically 4 GiB-aligned (and the
3273    // gate above accepts it), but the
3274    // [`super::dump::render_map::is_arena_addr_in_snapshot`] helper
3275    // rejects every address when `user_vm_start == 0` — silently
3276    // disabling the bridge. Surface a warn so an operator
3277    // diagnosing missing typed-pointer renders sees the cause
3278    // (likely a snapshot capture failure that produced an
3279    // uninitialized arena VM start).
3280    if let Some((_, snap)) = shared_arena_snapshot.as_ref()
3281        && snap.user_vm_start == 0
3282    {
3283        tracing::warn!(
3284            "sdt_alloc bridge effectively disabled: user_vm_start == 0 \
3285             (snapshot capture may have failed before resolving the \
3286             arena's user VM window); every chase will skip with \
3287             `is_arena_addr` = false",
3288        );
3289    }
3290    // Resolve the unavailable reason in the order the gate checks
3291    // run. The first failing prerequisite wins — subsequent reasons
3292    // (which would all surface for the same missing prerequisite) are
3293    // suppressed. `None` here means the pre-pass actually runs; the
3294    // sdt_alloc_unavailable field is then populated post-loop based
3295    // on whether any allocator was discovered.
3296    let sdt_alloc_skip_reason: Option<&'static str> = if deadline_exceeded(&mut truncated_at_us) {
3297        Some(REASON_SDT_ALLOC_DEADLINE_EXCEEDED)
3298    } else if !user_vm_aligned {
3299        Some(REASON_SDT_ALLOC_UNALIGNED_USER_VM)
3300    } else if sched_bss_bytes.is_none() {
3301        Some(REASON_SDT_ALLOC_NO_BSS)
3302    } else if arena_kern_vm_start == 0 {
3303        Some(REASON_SDT_ALLOC_NO_ARENA)
3304    } else if let Some((_, btf_kva)) = sched_bss_bytes.as_ref()
3305        && !program_btfs.contains_key(btf_kva)
3306    {
3307        Some(REASON_SDT_ALLOC_NO_BSS)
3308    } else if let Some((_, btf_kva)) = sched_bss_bytes.as_ref()
3309        && let Some(prog_btf) = program_btfs.get(btf_kva)
3310        && SdtAllocOffsets::from_btf(prog_btf).is_err()
3311    {
3312        Some(REASON_SDT_ALLOC_NO_TYPE)
3313    } else {
3314        None
3315    };
3316    if let Some(reason) = sdt_alloc_skip_reason {
3317        report.sdt_alloc_unavailable = Some(reason.to_string());
3318    }
3319    // Track whether the pre-pass body ran (every prerequisite
3320    // satisfied). Distinct from `sdt_alloc_skip_reason`: if the
3321    // body runs but discovers no `.bss` instance of
3322    // `struct scx_allocator`, the unavailable reason flips to
3323    // [`REASON_SDT_ALLOC_NO_INSTANCE`] AFTER the loop.
3324    let mut sdt_alloc_pre_pass_ran = false;
3325    if !deadline_exceeded(&mut truncated_at_us)
3326        && user_vm_aligned
3327        && let Some((bss_bytes, btf_kva)) = sched_bss_bytes
3328        && arena_kern_vm_start != 0
3329        && let Some(prog_btf) = program_btfs.get(&btf_kva)
3330        && let Ok(sdt_offsets) = SdtAllocOffsets::from_btf(prog_btf)
3331    {
3332        sdt_alloc_pre_pass_ran = true;
3333        // One MemReader for every leaf payload render, so an
3334        // arena pointer embedded in a per-task / per-cgroup
3335        // sdt_alloc payload chases into typed contents instead
3336        // of opaque hex.
3337        //
3338        // The arena type index is intentionally `None` on this
3339        // pre-pass reader: the walk produces the entries the
3340        // index is built from, so the index does not yet exist
3341        // when the leaf payload renders run. A nested `__arena
3342        // *` pointer inside a payload that targets a separate
3343        // allocator slot whose payload type is forward-declared
3344        // in the program BTF degrades to the existing chase
3345        // behaviour during the pre-pass; the index is wired
3346        // into the per-map renders below where the typical
3347        // bridge call site lives (TASK_STORAGE / HASH maps
3348        // holding `struct sdt_data __arena *` entry pointers).
3349        let sdt_mem = accessor.mem_reader(
3350            shared_arena_snapshot.as_ref().map(|(_, snap)| snap),
3351            &arena_page_index,
3352            num_cpus,
3353            // Threaded in from [`DumpContext::cast_map`]: same
3354            // cast-analysis output the per-map renderer below
3355            // consumes. Letting the sdt_alloc pre-pass see it
3356            // means typed-allocator payload chases (per-task /
3357            // per-cgroup contents inside arena) get the same
3358            // `u64` → typed-pointer promotion as the rest of
3359            // the dump, instead of degrading to plain counters
3360            // for fields the analyzer recovered.
3361            cast_map,
3362            None,
3363            // The sdt_alloc pre-pass populates `sdt_alloc_metas`
3364            // itself; the metas slice is empty until this loop
3365            // finishes, so pass an empty slice here. The
3366            // metas-driven fallback only fires for the per-map
3367            // renders below (after this loop has produced every
3368            // allocator's metadata).
3369            &[],
3370            // The sdt_alloc pre-pass reads BTF type metadata for
3371            // the typed allocator payload from the scheduler's
3372            // own program BTF; no cross-BTF Fwd resolution is
3373            // needed here. The per-map renders below pass the
3374            // built `cross_btf_fwd_index` where it matters.
3375            None,
3376            // The sdt_alloc pre-pass runs BEFORE the
3377            // [`crate::monitor::scx_static_alloc`] walker (the
3378            // walks are independent, but the per-allocator leaf
3379            // payload renders here happen during the sdt_alloc
3380            // walk's own loop, so the scx_static index is not yet
3381            // built). Pass `None` so the bridge stays a no-op for
3382            // these pre-pass renders; the per-map renders below
3383            // pass the built `scx_static_index` where it matters.
3384            None,
3385            // The sdt_alloc pre-pass IS the surface that produces
3386            // the rendered-slot set; pass `None` so no dedup gate
3387            // fires while the typed-allocator surface itself is
3388            // being assembled. The per-map renders below pass the
3389            // built set where it matters.
3390            None,
3391            alloc_size_types,
3392            // The sdt_alloc pre-pass walks against the scheduler's
3393            // program BTF identified by `btf_kva` (the same BTF the
3394            // allocator metadata was resolved against — see
3395            // `append_arena_slot_index_for_allocator` below). Pass
3396            // this kva as the requesting-BTF identifier so the
3397            // pre-pass's own leaf renders compare against it
3398            // (currently the index is still being built so the gate
3399            // never fires here, but the threading keeps the contract
3400            // consistent across all `mem_reader` call sites).
3401            btf_kva,
3402        );
3403        // Locate every sdt_alloc allocator instance declared in
3404        // `.bss`. The Datasec walk gives us each variable's name and
3405        // offset; we filter to types matching `struct scx_allocator`
3406        // by re-resolving the var's chained type. A scheduler may
3407        // declare more than one allocator (e.g. one per-task, one
3408        // per-cgroup) so we iterate all of them.
3409        for (var_name, var_offset, var_type_id) in iter_bss_vars_with_type(prog_btf, ".bss") {
3410            // Only walk vars whose type is `struct scx_allocator`.
3411            if !is_scx_allocator_type(prog_btf, var_type_id) {
3412                continue;
3413            }
3414            // Slice the in-bss bytes for one full `struct scx_allocator`.
3415            // The size comes from BTF (resolved into `allocator_size`
3416            // by `SdtAllocOffsets::from_btf`); using the BTF-reported
3417            // size means a future field appended to scx_allocator
3418            // doesn't silently slip past the slice end.
3419            let Some(slice_end) = var_offset.checked_add(sdt_offsets.allocator_size) else {
3420                continue;
3421            };
3422            let slice = match bss_bytes.get(var_offset..slice_end) {
3423                Some(s) => s,
3424                None => continue,
3425            };
3426
3427            // Discover the payload BTF type id from the elem_size
3428            // we'd read in the walker. We do a small read here just
3429            // to drive the heuristic; the walker re-reads it.
3430            let pool_off = sdt_offsets.allocator_pool + sdt_offsets.pool_elem_size;
3431            let elem_size = if pool_off + 8 <= slice.len() {
3432                let mut buf = [0u8; 8];
3433                buf.copy_from_slice(&slice[pool_off..pool_off + 8]);
3434                u64::from_le_bytes(buf)
3435            } else {
3436                0
3437            };
3438            let payload_size =
3439                elem_size.saturating_sub(sdt_offsets.data_header_size as u64) as usize;
3440            // `prog_btf` is split BTF: the scheduler's program types
3441            // layered on the vmlinux base. `discover_payload_btf_id`
3442            // probes only the program section's id range (via
3443            // `Btf::split`), so vmlinux base `*_ctx` structs of the
3444            // same size cannot shadow the scheduler's payload struct.
3445            let choice = discover_payload_btf_id(prog_btf, payload_size, &var_name);
3446
3447            let snap = walk_sdt_allocator(
3448                accessor.kernel(),
3449                arena_kern_vm_start,
3450                slice,
3451                &sdt_offsets,
3452                prog_btf,
3453                choice.target_type_id,
3454                choice.reason.clone(),
3455                var_name.clone(),
3456                &sdt_mem,
3457            );
3458            // Accumulate every allocator with a typed payload AND
3459            // append its live slots to the bridge index. The
3460            // per-map selector (`select_sdt_alloc_meta`) picks the
3461            // right one by matching `var_name` (the .bss symbol —
3462            // e.g. `scx_task_allocator`) against each rendered map's
3463            // name (e.g. `scx_task_map`). Schedulers that declare
3464            // multiple typed allocators no longer lose payload
3465            // expansion — each map renders against the matching
3466            // allocator's payload type. Only allocators with a
3467            // resolved payload type id contribute to the bridge
3468            // index — without a typed payload there is no useful
3469            // BTF id to surface to the renderer, and the index
3470            // would just point every chase at 0 (which the bridge
3471            // gate filters as "no payload type").
3472            if choice.target_type_id != 0 {
3473                sdt_alloc_metas.push(crate::monitor::dump::render_map::SdtAllocMeta {
3474                    allocator_name: var_name.clone(),
3475                    elem_size,
3476                    header_size: sdt_offsets.data_header_size,
3477                    target_type_id: choice.target_type_id,
3478                    kern_vm_start: arena_kern_vm_start,
3479                });
3480                // Append this allocator's slots to the bridge index.
3481                // The helper handles the size-fits-u32 check, the
3482                // dedup-on-duplicate-slot-start, and the
3483                // `tracing::warn!` collision diagnostic — see
3484                // [`append_arena_slot_index_for_allocator`] for the
3485                // full contract. Bridge gate (`target_type_id
3486                // != 0`) is encoded inside the helper as well; the
3487                // outer guard here is a fast-path bail before we
3488                // even allocate metadata for the allocator.
3489                crate::monitor::dump::render_map::append_arena_slot_index_for_allocator(
3490                    &mut arena_slot_index,
3491                    &var_name,
3492                    choice.target_type_id,
3493                    sdt_offsets.data_header_size,
3494                    elem_size,
3495                    &snap.all_slot_addrs,
3496                    // Stamp the slot with the program BTF the
3497                    // `target_type_id` was resolved against. The
3498                    // per-map renderer's
3499                    // [`MemReader::resolve_arena_type`] gate
3500                    // compares this against each requesting map's
3501                    // `btf_kva` and suppresses the hit on mismatch
3502                    // so the BTF id cannot leak into a sibling
3503                    // BTF's id space (multi-`.bpf.o` schedulers).
3504                    btf_kva,
3505                );
3506            }
3507            // Surface only allocators with a non-empty result OR a
3508            // diagnostic elem_size; an all-zero snapshot from a
3509            // never-initialized allocator is just noise.
3510            if !snap.entries.is_empty() || snap.elem_size != 0 {
3511                report.sdt_allocations.push(snap);
3512            }
3513        }
3514    }
3515    // Post-loop: if the pre-pass ran but discovered nothing, the
3516    // scheduler has program BTF + arena + bss but no `struct
3517    // scx_allocator` declared in `.bss`. Surface a distinct
3518    // diagnostic so an operator can tell the scheduler-doesn't-link
3519    // case from the link-but-no-instance case.
3520    if sdt_alloc_pre_pass_ran
3521        && report.sdt_allocations.is_empty()
3522        && report.sdt_alloc_unavailable.is_none()
3523    {
3524        report.sdt_alloc_unavailable = Some(REASON_SDT_ALLOC_NO_INSTANCE.to_string());
3525    }
3526    let arena_slot_index_ref = if arena_slot_index.is_empty() {
3527        None
3528    } else {
3529        Some(&arena_slot_index)
3530    };
3531    // Build the rendered-slot set for arena chase dedup. Keys on
3532    // each [`SdtAllocEntry::user_addr`] rendered under
3533    // `report.sdt_allocations` for a TYPED allocator — one with an
3534    // entry in `sdt_alloc_metas`, which is pushed only when
3535    // `target_type_id != 0`. Snapshots from untyped allocators are
3536    // dropped by the `header_size_by_allocator` lookup below, so
3537    // their slots never enter the set. The set is built from
3538    // `snap.entries` rather than `arena_slot_index.keys()` because
3539    // the index is fed `snap.all_slot_addrs`, while the dedup must
3540    // cover only slots whose payload was actually rendered (see
3541    // the truncation note below).
3542    //
3543    // Two address keys per entry: the slot start (`user_addr`) and
3544    // the payload start (`user_addr + header_size`). Chase targets
3545    // resolved via `scx_task_data(p)` and similar helpers point at
3546    // the payload, not the slot start; without the payload-start
3547    // key the dedup misses every chase that uses the helper-
3548    // computed pointer.
3549    //
3550    // Slots past [`super::sdt_alloc::MAX_SDT_ALLOC_ENTRIES`] are
3551    // intentionally excluded — they are NOT rendered in
3552    // `snap.entries`, so the per-map renderer must surface their
3553    // payload (otherwise the truncated tail would appear nowhere
3554    // in the dump). The walker's `truncated` flag is the operator-
3555    // visible signal that some slots are only available via the
3556    // per-map render path.
3557    //
3558    // The per-map renderer's [`MemReader::is_already_rendered`]
3559    // consults this set to skip re-rendering the same allocation
3560    // when a TASK_STORAGE / HASH map's value pointer chases back
3561    // into it. Only TYPED allocators (target_type_id != 0) enter
3562    // the set — untyped pre-pass renders (hex fallback) must not
3563    // suppress per-map chases because the cast analyzer's shape
3564    // inference may resolve a concrete target type the heuristic
3565    // missed.
3566    let header_size_by_allocator: std::collections::HashMap<&str, usize> = sdt_alloc_metas
3567        .iter()
3568        .map(|meta| (meta.allocator_name.as_str(), meta.header_size))
3569        .collect();
3570    let rendered_slot_addrs: std::collections::HashSet<u32> = report
3571        .sdt_allocations
3572        .iter()
3573        .filter_map(|snap| {
3574            // Only dedup slots from allocators with a resolved
3575            // payload type. Untyped allocators (target_type_id == 0,
3576            // rendered as hex in the pre-pass) must NOT suppress
3577            // per-map chases — the cast analyzer's shape inference
3578            // may resolve a concrete target type that the pre-pass
3579            // heuristic missed.
3580            let &header_size = header_size_by_allocator.get(snap.allocator_name.as_str())?;
3581            Some(snap.entries.iter().flat_map(move |e| {
3582                let slot_start = e.user_addr as u32;
3583                let payload_start = slot_start.wrapping_add(header_size as u32);
3584                [slot_start, payload_start]
3585            }))
3586        })
3587        .flatten()
3588        .collect();
3589    let rendered_slot_addrs_ref = if rendered_slot_addrs.is_empty() {
3590        None
3591    } else {
3592        Some(&rendered_slot_addrs)
3593    };
3594    tracing::debug!(
3595        elapsed_us = sdt_alloc_t0.elapsed().as_micros() as u64,
3596        allocations = report.sdt_allocations.len(),
3597        index_entries = arena_slot_index.len(),
3598        "dump_state phase: sdt_alloc"
3599    );
3600
3601    // Pre-pass: walk every `scx_static` bump-allocator instance in
3602    // `.bss` and surface its live-allocated range. Distinct from the
3603    // sdt_alloc per-instance allocator walk above:
3604    //
3605    //   - sdt_alloc (`struct scx_allocator`) hands out fixed-stride
3606    //     slots via a 3-level radix tree with per-slot metadata; the
3607    //     walker produces one entry per live slot keyed on slot start.
3608    //   - scx_static (`struct scx_static`) is a flat bump allocator
3609    //     with no per-slot metadata; the walker produces one entry
3610    //     per live region keyed on the region's base address.
3611    //
3612    // The walk runs only when:
3613    //   - we have a scheduler `.bss` blob to read from (re-located
3614    //     here because the sdt_alloc walk above consumed the
3615    //     pre-pass `sched_bss_bytes` Option),
3616    //   - we have a program BTF to resolve `struct scx_static`
3617    //     against,
3618    //   - the program BTF carries `struct scx_static`.
3619    //
3620    // When any prerequisite is missing, the walk leaves
3621    // `report.scx_static_ranges` empty (default) rather than failing
3622    // the dump — schedulers that don't link `lib/sdt_alloc.bpf.c` or
3623    // don't use the static allocator simply skip the walk.
3624    //
3625    // Membership-only: the walker produces an UNTYPED range index.
3626    // Per-allocation type recovery requires a per-call-site type
3627    // hook from cast analysis that does not exist today (see the
3628    // module-level doc in [`crate::monitor::scx_static_alloc`] for
3629    // why). When the renderer's deferred-resolve arena chase lands
3630    // on an address inside an scx_static range, the bridge
3631    // recognises the address as "in scx_static memory" and
3632    // fails closed (returns `None` from `resolve_arena_type`)
3633    // rather than returning a wrong type — the "no invalid data
3634    // made" contract.
3635    let scx_static_t0 = std::time::Instant::now();
3636    if !deadline_exceeded(&mut truncated_at_us)
3637        && let Some((bss_bytes, prog_btf)) = relocate_sched_bss(&maps, accessor, &program_btfs)
3638        && let Ok(scx_static_offsets) =
3639            crate::monitor::scx_static_alloc::ScxStaticOffsets::from_btf(prog_btf)
3640    {
3641        let snap = crate::monitor::scx_static_alloc::walk_scx_static(
3642            &bss_bytes,
3643            &scx_static_offsets,
3644            iter_bss_vars_with_type(prog_btf, ".bss"),
3645            |type_id| is_scx_static_type(prog_btf, type_id),
3646        );
3647        report.scx_static_ranges = snap;
3648    }
3649    let scx_static_index =
3650        crate::monitor::scx_static_alloc::build_scx_static_range_index(&report.scx_static_ranges);
3651    let scx_static_index_ref = if scx_static_index.is_empty() {
3652        None
3653    } else {
3654        Some(&scx_static_index)
3655    };
3656    tracing::debug!(
3657        elapsed_us = scx_static_t0.elapsed().as_micros() as u64,
3658        ranges = report.scx_static_ranges.ranges.len(),
3659        skipped = report.scx_static_ranges.skipped,
3660        index_entries = scx_static_index.len(),
3661        "dump_state phase: scx_static"
3662    );
3663
3664    let render_map_t0 = std::time::Instant::now();
3665    let mut maps_rendered: usize = 0;
3666    let mut maps_truncated: usize = 0;
3667    for info in maps {
3668        // Skip ktstr's own framework maps so the report only shows
3669        // the scheduler-under-test's state. Three distinct shapes
3670        // need filtering:
3671        //
3672        // 1. Global-section maps from the probe skeleton: libbpf
3673        //    composes `<obj_name>.<section>` so `probe_bp.bss`,
3674        //    `probe_bp.data`, `probe_bp.rodata` all match the
3675        //    `probe_bp.` prefix. (`probe_bp` matching the bare obj
3676        //    name covers any single-name section the kernel might
3677        //    surface, though libbpf today always adds the suffix.)
3678        // 2. Global-section maps from the fentry skeleton, named
3679        //    with the `fentry_p.` prefix following the same
3680        //    libbpf convention.
3681        // 3. Bare-named maps declared via `SEC(".maps")` in
3682        //    src/bpf/probe.bpf.c — these don't get an obj prefix
3683        //    because they're not from a global section. The
3684        //    explicit denylist [`KTSTR_INTERNAL_MAPS`] enumerates
3685        //    them.
3686        //
3687        // A future tighter filter would consult bpf_prog ownership
3688        // (the program-attachment ID list pinned to each map), but
3689        // name-based filtering is enough today and avoids loading
3690        // the full prog_idr walk on the freeze hot path.
3691        {
3692            let info_name = info.name();
3693            if info_name.starts_with("probe_bp.")
3694                || info_name.starts_with("fentry_p.")
3695                || info_name == "probe_bp"
3696                || info_name == "fentry_p"
3697                || KTSTR_INTERNAL_MAPS.contains(&info_name.as_ref())
3698            {
3699                continue;
3700            }
3701        }
3702
3703        // Deadline check before each map render — bigger maps
3704        // (large hashes, arenas) can each take a meaningful slice
3705        // of the freeze window, so we re-check between renders to
3706        // bound the worst case rather than letting one
3707        // straggler push us past the watchdog.
3708        if deadline_exceeded(&mut truncated_at_us) {
3709            maps_truncated += 1;
3710            continue;
3711        }
3712
3713        // Per-map BTF resolution.
3714        //
3715        // The map's `btf_value_type_id` / `btf_key_type_id` index
3716        // the *map's own* BTF, NOT the kernel vmlinux BTF — when
3717        // `btf_kva != 0` the type IDs are program-local and using
3718        // vmlinux BTF with them would resolve to unrelated kernel
3719        // types (or out-of-range nonsense). So:
3720        //
3721        //   - `BPF_MAP_TYPE_STRUCT_OPS`              → use vmlinux
3722        //     BTF. The wrapper struct `bpf_struct_ops_<name>` is
3723        //     declared in the kernel's vmlinux BTF and the
3724        //     wrapper type id stored on the map (in
3725        //     `btf_vmlinux_value_type_id`) indexes vmlinux. Using
3726        //     the program BTF here would fail to resolve the
3727        //     wrapper.
3728        //   - `btf_kva != 0` AND program BTF loaded by pre-pass → use it.
3729        //   - `btf_kva != 0` AND program BTF load failed in pre-pass
3730        //     → render hex-only (None map_btf), no fallback.
3731        //   - `btf_kva == 0` (kernel-builtin map)      → use the
3732        //     caller-supplied vmlinux BTF; the type IDs (if any)
3733        //     genuinely index vmlinux BTF in this case.
3734        let map_btf: Option<&Btf> = if info.map_type == super::bpf_map::BPF_MAP_TYPE_STRUCT_OPS {
3735            Some(btf)
3736        } else if info.btf_kva != 0 {
3737            program_btfs.get(&info.btf_kva)
3738        } else {
3739            Some(btf)
3740        };
3741
3742        let rendered = render_map(
3743            &RenderMapCtx {
3744                accessor,
3745                btf: map_btf,
3746                num_cpus,
3747                arena_offsets,
3748                shared_arena: shared_arena_ref,
3749                arena_page_index: &arena_page_index,
3750                sdt_alloc_metas: &sdt_alloc_metas,
3751                // Threaded in from
3752                // [`DumpContext::cast_map`]: the BPF
3753                // cast-analysis output for the scheduler's
3754                // program object. `Some(&map)` lets the
3755                // renderer promote `u64` fields the analyzer
3756                // flagged into typed-pointer renders via
3757                // [`super::btf_render::MemReader::cast_lookup`];
3758                // `None` keeps every `u64` rendered as a plain
3759                // unsigned counter (the trait default).
3760                cast_map,
3761                // Built from the sdt_alloc pre-pass above:
3762                // `slot_start → ArenaSlotInfo` for every live
3763                // allocator slot. Lets the renderer range-lookup
3764                // the slot a chased arena address falls in and
3765                // recover a `BTF_KIND_FWD` pointee's real
3766                // struct id (plus a `header_skip` byte count)
3767                // via [`MemReader::resolve_arena_type`] — a
3768                // `struct sdt_data __arena *` field (or a `data`
3769                // field caching the raw `sdt_alloc()` return)
3770                // whose pointee body lives in the sdt_alloc
3771                // library's BTF still chases as the typed
3772                // per-task / per-cgroup struct, instead of
3773                // skipping with "forward declaration; body not
3774                // in this BTF". `None` when no allocator with a
3775                // typed payload was discovered.
3776                arena_slot_index: arena_slot_index_ref,
3777                // Threaded in from
3778                // [`DumpContext::cross_btf_fwd_index`]: the
3779                // cross-BTF Fwd resolution context populated by
3780                // the cast-analysis pre-pass over every embedded
3781                // BPF object's BTF. `Some(&idx)` lets the
3782                // renderer chase a `BTF_KIND_FWD` whose body
3783                // lives in a sibling embedded object via
3784                // [`MemReader::cross_btf_resolve_fwd`]. `None`
3785                // keeps the renderer's "forward declaration;
3786                // body not in this BTF" skip path intact.
3787                cross_btf_fwd_index: cross_btf_fwd_index_ref,
3788                // Built from the scx_static pre-pass above:
3789                // `start_low32 → size` for every live
3790                // `scx_static` bump-allocator region. Lets the
3791                // renderer's
3792                // [`MemReader::resolve_arena_type`]
3793                // diagnose-and-skip on a chased arena address
3794                // that lands inside scx_static memory — the
3795                // bridge cannot recover a per-allocation type
3796                // (no per-slot header) so the chase falls through
3797                // to the historical Fwd-skip / cross-BTF
3798                // resolution path. `None` when no live
3799                // `scx_static` instance was discovered.
3800                scx_static_index: scx_static_index_ref,
3801                // Built from the sdt_alloc pre-pass above: the
3802                // low-32 windowed `slot_start` of every allocator
3803                // slot already rendered into
3804                // `report.sdt_allocations`. Lets the renderer's
3805                // [`MemReader::is_already_rendered`] short-circuit
3806                // the arena chase when a TASK_STORAGE / HASH
3807                // map's value pointer lands in a slot the
3808                // typed-allocator surface already shows — no
3809                // duplicate payload in the dump. `None` when no
3810                // allocator pre-pass produced any rendered slot.
3811                rendered_slot_addrs: rendered_slot_addrs_ref,
3812                alloc_size_types,
3813            },
3814            &info,
3815        );
3816
3817        report.maps.push(rendered);
3818        maps_rendered += 1;
3819    }
3820    tracing::debug!(
3821        elapsed_us = render_map_t0.elapsed().as_micros() as u64,
3822        rendered = maps_rendered,
3823        truncated = maps_truncated,
3824        "dump_state phase: per-map render"
3825    );
3826
3827    report.dump_truncated_at_us = truncated_at_us;
3828    report.maps_truncated = maps_truncated as u32;
3829    report
3830}
3831
3832/// Walk every CPU's `kernel_cpustat`, `kernel_stat`, and (under
3833/// NO_HZ) `tick_sched` slots and produce a [`PerCpuTimeStats`]
3834/// vector — one entry per CPU index in `cap.per_cpu_offsets`.
3835///
3836/// Reads against the supplied [`super::reader::GuestMem`]. Each CPU's
3837/// per-CPU base for a symbol `S` is `S + per_cpu_offsets[cpu]`,
3838/// converted to a guest physical offset via
3839/// [`super::symbols::kva_to_pa`] using the supplied `page_offset`
3840/// (the standard direct-mapping translation; per-CPU pages always
3841/// live in the direct mapping).
3842///
3843/// `cpustat[]` is read as 8 contiguous u64s starting at the
3844/// resolved offset (length matches the indices captured —
3845/// CPUTIME_USER through CPUTIME_STEAL — leaving CPUTIME_GUEST /
3846/// CPUTIME_GUEST_NICE / CPUTIME_FORCEIDLE unread; the dump
3847/// surfaces them as zero in the unread slots, which is acceptable
3848/// since they're virt-guest specific or kernel-config gated and
3849/// distinct from the failure-dump narrative). `softirqs[]` reads
3850/// as `NR_SOFTIRQS` u32s, widened to u64 for the report. `irqs_sum`
3851/// is `unsigned long` (read as u64 — 64-bit only kernels are the
3852/// supported configuration). `iowait_sleeptime` is `ktime_t` /
3853/// `s64`; the value is cast to u64 (the kernel never produces
3854/// negative iowait time).
3855fn collect_per_cpu_time(cap: &CpuTimeCapture<'_>) -> Vec<PerCpuTimeStats> {
3856    use super::btf_offsets::{
3857        CPUTIME_IDLE, CPUTIME_IOWAIT, CPUTIME_IRQ, CPUTIME_NICE, CPUTIME_SOFTIRQ, CPUTIME_STEAL,
3858        CPUTIME_SYSTEM, CPUTIME_USER, NR_SOFTIRQS,
3859    };
3860    let mut out = Vec::with_capacity(cap.per_cpu_offsets.len());
3861    for (cpu_idx, &per_cpu_off) in cap.per_cpu_offsets.iter().enumerate() {
3862        let cpu = cpu_idx as u32;
3863
3864        // kernel_cpustat::cpustat[N]: each slot is a u64 in nsec.
3865        // Read CPUTIME_USER through CPUTIME_STEAL (indices 0..=7).
3866        let cpustat_kva =
3867            super::symbols::per_cpu_kva(cap.kernel_cpustat_kva, cap.kaslr_offset, per_cpu_off);
3868        let cpustat_pa = super::symbols::kva_to_pa(cpustat_kva, cap.page_offset);
3869        let cpustat_base = cap.offsets.kernel_cpustat_cpustat;
3870        let read_cpustat = |idx: usize| -> u64 {
3871            // sizeof(u64) == 8.
3872            cap.mem.read_u64(cpustat_pa, cpustat_base + idx * 8)
3873        };
3874        let cpustat_user_ns = read_cpustat(CPUTIME_USER);
3875        let cpustat_nice_ns = read_cpustat(CPUTIME_NICE);
3876        let cpustat_system_ns = read_cpustat(CPUTIME_SYSTEM);
3877        let cpustat_softirq_ns = read_cpustat(CPUTIME_SOFTIRQ);
3878        let cpustat_irq_ns = read_cpustat(CPUTIME_IRQ);
3879        let cpustat_idle_ns = read_cpustat(CPUTIME_IDLE);
3880        let cpustat_iowait_ns = read_cpustat(CPUTIME_IOWAIT);
3881        let cpustat_steal_ns = read_cpustat(CPUTIME_STEAL);
3882
3883        // kernel_stat::softirqs[N]: each slot is a u32 (count).
3884        // Widen to u64 for reporting consistency with cpustat.
3885        let kstat_kva = super::symbols::per_cpu_kva(cap.kstat_kva, cap.kaslr_offset, per_cpu_off);
3886        let kstat_pa = super::symbols::kva_to_pa(kstat_kva, cap.page_offset);
3887        let mut softirqs = [0u64; NR_SOFTIRQS];
3888        for (i, slot) in softirqs.iter_mut().enumerate() {
3889            // sizeof(unsigned int) == 4.
3890            *slot = cap
3891                .mem
3892                .read_u32(kstat_pa, cap.offsets.kstat_softirqs + i * 4) as u64;
3893        }
3894
3895        // kernel_stat::irqs_sum: unsigned long. 64-bit only
3896        // kernels are supported, so read as u64.
3897        let irqs_sum = cap.mem.read_u64(kstat_pa, cap.offsets.kstat_irqs_sum);
3898
3899        // tick_sched::iowait_sleeptime: ktime_t (s64) ns,
3900        // accumulated only under NO_HZ when the CPU enters idle
3901        // with nr_iowait > 0. Skip when the symbol or BTF offset
3902        // is absent.
3903        let iowait_sleeptime_ns = cap
3904            .tick_cpu_sched_kva
3905            .zip(cap.offsets.tick_sched_iowait_sleeptime)
3906            .map(|(tick_sym_kva, off)| {
3907                let kva = super::symbols::per_cpu_kva(tick_sym_kva, cap.kaslr_offset, per_cpu_off);
3908                let pa = super::symbols::kva_to_pa(kva, cap.page_offset);
3909                cap.mem.read_u64(pa, off)
3910            });
3911
3912        out.push(PerCpuTimeStats {
3913            cpu,
3914            cpustat_user_ns,
3915            cpustat_nice_ns,
3916            cpustat_system_ns,
3917            cpustat_softirq_ns,
3918            cpustat_irq_ns,
3919            cpustat_idle_ns,
3920            cpustat_iowait_ns,
3921            cpustat_steal_ns,
3922            softirqs,
3923            irqs_sum,
3924            iowait_sleeptime_ns,
3925        });
3926    }
3927    out
3928}
3929
3930/// Walk the test's workload cgroups and read each leaf's PSI_IRQ_FULL (Phase A).
3931/// Thin adapter over
3932/// [`super::cgroup_walk::collect_workload_cgroup_psi`] that unpacks the
3933/// borrowed [`CgroupPsiCapture`]. Empty when no workload leaf has per-cgroup
3934/// PSI accounting (loud-absent).
3935fn collect_cgroup_psi(cap: &CgroupPsiCapture<'_>) -> Vec<super::cgroup_walk::CgroupPsiStat> {
3936    super::cgroup_walk::collect_workload_cgroup_psi(
3937        cap.mem,
3938        cap.cgroup_offsets,
3939        cap.psi_offsets,
3940        cap.root_cgroup_kva,
3941        cap.root_cgroup_pa,
3942        cap.workload_root_path,
3943        cap.page_offset,
3944    )
3945}
3946
3947/// Walk a Datasec section by name, yielding `(var_name, byte_offset,
3948/// type_id)` for every variable declared in it.
3949///
3950/// Used by [`dump_state`] to enumerate `.bss` variables when looking
3951/// for `scx_allocator` instances. Returns an empty iterator when the
3952/// Datasec doesn't exist or any chained Var resolution fails — the
3953/// caller treats that as "no sdt_alloc state to surface" rather than
3954/// a hard error.
3955fn iter_bss_vars_with_type(btf: &Btf, section_name: &str) -> Vec<(String, usize, u32)> {
3956    use btf_rs::BtfType;
3957    let mut out = Vec::new();
3958    let Ok(candidates) = btf.resolve_types_by_name(section_name) else {
3959        return out;
3960    };
3961    for ty in candidates {
3962        let btf_rs::Type::Datasec(ds) = ty else {
3963            continue;
3964        };
3965        for var_info in &ds.variables {
3966            let Ok(chained) = btf.resolve_chained_type(var_info) else {
3967                continue;
3968            };
3969            let btf_rs::Type::Var(var) = chained else {
3970                continue;
3971            };
3972            let Ok(name) = btf.resolve_name(&var) else {
3973                continue;
3974            };
3975            // The Var's type_id points to the variable's actual
3976            // type (e.g. struct scx_allocator). var_info.offset() is
3977            // the byte offset within the Datasec.
3978            let Some(type_id) = var.get_type_id() else {
3979                continue;
3980            };
3981            out.push((name, var_info.offset() as usize, type_id));
3982        }
3983    }
3984    out
3985}
3986
3987/// True iff `type_id` resolves to a struct named `scx_allocator`,
3988/// stripping the BTF modifier chain en route. The five modifier
3989/// kinds the loop unwraps — `Const`, `Volatile`, `Typedef`,
3990/// `Restrict`, `TypeTag` — are the complete set the kernel BPF
3991/// pipeline emits for global variable types in `.bss`. Any other
3992/// kind in the chain (Ptr, Array, etc.) terminates the lookup with
3993/// a non-match.
3994fn is_scx_allocator_type(btf: &Btf, type_id: u32) -> bool {
3995    use btf_rs::Type as T;
3996    // Mirror the modifier-chain pattern in
3997    // `btf_offsets::resolve_member_composite` — resolve the
3998    // chained type via the BtfType trait object so the type
3999    // aliases (Const = Volatile, TypeTag = Typedef) all share the
4000    // same path through the loop.
4001    let Ok(mut t) = btf.resolve_type_by_id(type_id) else {
4002        return false;
4003    };
4004    for _ in 0..20 {
4005        match t {
4006            T::Struct(s) => {
4007                return btf.resolve_name(&s).is_ok_and(|n| n == "scx_allocator");
4008            }
4009            T::Const(_) | T::Volatile(_) | T::Typedef(_) | T::Restrict(_) | T::TypeTag(_) => {
4010                let Some(btf_ty) = t.as_btf_type() else {
4011                    return false;
4012                };
4013                let Ok(next) = btf.resolve_chained_type(btf_ty) else {
4014                    return false;
4015                };
4016                t = next;
4017            }
4018            _ => return false,
4019        }
4020    }
4021    false
4022}
4023
4024/// True iff `type_id` resolves to a struct named `scx_static`,
4025/// stripping the BTF modifier chain en route. Mirrors the
4026/// modifier-handling shape of [`is_scx_allocator_type`] — the five
4027/// modifier kinds (`Const`, `Volatile`, `Typedef`, `Restrict`,
4028/// `TypeTag`) are the complete set the kernel BPF pipeline emits for
4029/// global variable types in `.bss`; any other kind terminates the
4030/// lookup with a non-match.
4031///
4032/// Used by the [`crate::monitor::scx_static_alloc::walk_scx_static`]
4033/// pre-pass to filter `.bss` Vars to only `struct scx_static`
4034/// instances. A scheduler that doesn't link `lib/sdt_alloc.bpf.c`
4035/// has no such Var; the filter rejects every candidate and the
4036/// walker produces an empty snapshot.
4037fn is_scx_static_type(btf: &Btf, type_id: u32) -> bool {
4038    use btf_rs::Type as T;
4039    let Ok(mut t) = btf.resolve_type_by_id(type_id) else {
4040        return false;
4041    };
4042    for _ in 0..20 {
4043        match t {
4044            T::Struct(s) => {
4045                return btf.resolve_name(&s).is_ok_and(|n| n == "scx_static");
4046            }
4047            T::Const(_) | T::Volatile(_) | T::Typedef(_) | T::Restrict(_) | T::TypeTag(_) => {
4048                let Some(btf_ty) = t.as_btf_type() else {
4049                    return false;
4050                };
4051                let Ok(next) = btf.resolve_chained_type(btf_ty) else {
4052                    return false;
4053                };
4054                t = next;
4055            }
4056            _ => return false,
4057        }
4058    }
4059    false
4060}
4061
4062/// Locate the scheduler's `.bss` array map and pull out (raw bytes,
4063/// program BTF) for the [`crate::monitor::scx_static_alloc`] pre-pass.
4064///
4065/// The earlier sdt_alloc pre-pass at the top of [`dump_state`]
4066/// already collected `sched_bss_bytes` once but the if-let chain at
4067/// the sdt_alloc walk consumed the `Option`. Re-locating here keeps
4068/// both walkers independent: each owns its own bss-bytes read so a
4069/// future ordering change can't accidentally leave one walker
4070/// without input. The cost is one extra map walk; small compared to
4071/// the per-map render loop that follows.
4072///
4073/// Returns `None` when:
4074///   - no `*.bss` map exists (libbpf only creates this map when the
4075///     program has any global non-const data),
4076///   - the map's `btf_kva == 0` (no program BTF — type resolution
4077///     would fail),
4078///   - the program BTF for that `btf_kva` was not loaded in the
4079///     pre-pass (parse failed earlier; the caller already logged),
4080///   - the map's value bytes can't be read.
4081fn relocate_sched_bss<'btf>(
4082    maps: &[BpfMapInfo],
4083    accessor: &GuestMemMapAccessor<'_>,
4084    program_btfs: &'btf std::collections::HashMap<u64, Btf>,
4085) -> Option<(Vec<u8>, &'btf Btf)> {
4086    for info in maps {
4087        let name = info.name();
4088        if name.starts_with("probe_bp.")
4089            || name.starts_with("fentry_p.")
4090            || name == "probe_bp"
4091            || name == "fentry_p"
4092            || KTSTR_INTERNAL_MAPS.contains(&name.as_ref())
4093        {
4094            continue;
4095        }
4096        if info.map_type == BPF_MAP_TYPE_ARRAY
4097            && info.btf_kva != 0
4098            && name.ends_with(".bss")
4099            && let Some(prog_btf) = program_btfs.get(&info.btf_kva)
4100            && let Some(bytes) = accessor.read_value(info, 0, info.value_size as usize)
4101        {
4102            return Some((bytes, prog_btf));
4103        }
4104    }
4105    None
4106}
4107
4108/// Load a BPF program's `struct btf` from guest memory at `btf_kva`.
4109///
4110/// Reads the kernel `struct btf` at `btf_kva`, follows its `data` /
4111/// `data_size` / `base_btf` fields, fetches the raw BTF blob via
4112/// page-walked vmalloc reads, and parses it. When `base_btf` is
4113/// non-NULL the program's BTF is split atop the vmlinux BTF (the
4114/// kernel's own base BTF) — pass the host's already-parsed vmlinux
4115/// `Btf` as the split base so type IDs resolve correctly.
4116///
4117/// Returns `None` when any step fails: missing offsets, untranslatable
4118/// pages, or `Btf::from_bytes` rejection (truncated / corrupted blob).
4119/// Failure is silent and the caller falls back to the host vmlinux
4120/// BTF — the dump is best-effort, a partial render still beats no
4121/// render.
4122///
4123/// Distinct from the [`super::bpf_map::BpfMapAccessor::load_program_btf`]
4124/// trait method (which dispatches across backends): this free function
4125/// is the guest-memory backend's actual KVA-based loader. The trait
4126/// method on `GuestMemMapAccessor` just forwards here.
4127pub(super) fn load_program_btf_kva(
4128    accessor: &GuestMemMapAccessor<'_>,
4129    btf_kva: u64,
4130    base_btf: &Btf,
4131) -> Option<Btf> {
4132    let kernel = accessor.kernel();
4133    let offsets = accessor.offsets();
4134    let mem = kernel.mem();
4135    let walk = kernel.walk_context();
4136
4137    // `struct btf` may be kmalloc'd (direct map) or vmalloc'd; use
4138    // translate_any_kva.
4139    let btf_pa = super::idr::translate_any_kva(
4140        mem,
4141        walk.cr3_pa,
4142        walk.page_offset,
4143        btf_kva,
4144        walk.l5,
4145        walk.tcr_el1,
4146    )?;
4147    let data_kva = mem.read_u64(btf_pa, offsets.btf_data);
4148    let data_size = mem.read_u32(btf_pa, offsets.btf_data_size) as usize;
4149    let base_kva = mem.read_u64(btf_pa, offsets.btf_base_btf);
4150
4151    if data_kva == 0 || data_size == 0 {
4152        return None;
4153    }
4154
4155    if data_size > MAX_BTF_BLOB {
4156        return None;
4157    }
4158
4159    // The BTF blob is vmalloc-backed — `btf->data` is allocated via
4160    // vmalloc / kvmalloc inside `kernel/bpf/btf.c`'s
4161    // `btf_parse_*` paths. Use the chunked vmalloc reader so a
4162    // 100 KB blob doesn't pay 100K syscalls of byte-wise translate.
4163    // The chunked reader honours all-or-nothing semantics, so a
4164    // short read returns None directly; no extra length check needed.
4165    let blob = kernel.read_kva_bytes_chunked(data_kva, data_size)?;
4166
4167    if base_kva != 0 {
4168        // Split BTF: the program's types extend the kernel's
4169        // vmlinux BTF. Pass the host's parsed vmlinux Btf as the
4170        // base so cross-base type IDs (e.g. `task_struct`) resolve.
4171        //
4172        // Uses host vmlinux BTF as split base — correct when host
4173        // kernel == guest kernel (ktstr's default and the common
4174        // CI configuration). A guest running a different kernel
4175        // version would silently mis-render cross-base type
4176        // references; flagged as a known limitation in the module
4177        // doc above.
4178        Btf::from_split_bytes(&blob, base_btf).ok()
4179    } else {
4180        Btf::from_bytes(&blob).ok()
4181    }
4182}
4183
/// Render `bytes` as lowercase two-digit hex pairs separated by a
/// single space (`[0xde, 0xad]` becomes `"de ad"`). An empty slice
/// yields an empty string; there is no leading or trailing space.
///
/// `pub(crate)` so [`super::sdt_alloc`] can reuse the same wire shape
/// for its hex-fallback payload renderings, keeping the dump's hex
/// output consistent across both renderers.
pub(crate) fn hex_dump(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    // Two digits plus one separator per byte is an upper bound on
    // the output length (the first byte has no separator).
    let mut rendered = String::with_capacity(bytes.len() * 3);
    for (idx, &byte) in bytes.iter().enumerate() {
        if idx != 0 {
            rendered.push(' ');
        }
        // High nibble first, then low nibble.
        rendered.push(char::from(DIGITS[usize::from(byte >> 4)]));
        rendered.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    rendered
}