ktstr/monitor/dump/mod.rs
1//! BPF map state dump for scheduler-failure post-mortem.
2//!
3//! [`dump_state`] is invoked by the freeze coordinator after the vCPU
4//! rendezvous succeeds (see `src/vmm/freeze_coord/mod.rs`). It enumerates every
5//! BPF map in the guest via [`BpfMapAccessor::maps`], filters out
6//! ktstr-internal probes (the framework's own probe and fentry skel
7//! maps), and dispatches per map type:
8//!
9//! - `BPF_MAP_TYPE_ARRAY` (and the `.bss` / `.data` / `.rodata`
10//! global-section maps libbpf creates as single-key arrays) — read
11//! the whole value buffer and render it via [`super::btf_render::render_value_with_mem`]
12//! so embedded `__arena` pointers chase into the captured arena pages.
13//! - `BPF_MAP_TYPE_HASH` — iterate (key, value) pairs, capped at
14//! [`MAX_HASH_ENTRIES`].
15//! - `BPF_MAP_TYPE_PERCPU_ARRAY` — read each CPU's slot for keys
16//! `0..min(max_entries, MAX_PERCPU_KEYS)`.
17//! - Other types — recorded as [`FailureDumpMap::error`] so the operator
18//! sees the gap rather than a silent omission.
19//!
20//! # BTF source — per-map program BTF loading
21//!
22//! The renderer loads each map's program BTF from guest memory at
23//! [`BpfMapInfo::btf_kva`], following the kernel `struct btf`'s
24//! `data`/`data_size`/`base_btf` fields. Split BTF (program types
25//! extending vmlinux) is parsed via [`Btf::from_split_bytes`] with
26//! the host's vmlinux BTF as the base (correct when host kernel ==
27//! guest kernel — ktstr's default and the common CI configuration).
28//! A per-`btf_kva` cache dedupes parses across maps sharing a
29//! program's BTF object. When per-map load fails (still-booting
30//! guest, untranslatable page, corrupted blob), the renderer falls
31//! back to the caller-supplied vmlinux BTF.
32//!
33//! # sdt_alloc pre-pass
34//!
35//! Before the per-map walk runs, [`dump_state`] runs a pre-pass
36//! that locates `sdt_alloc`-backed allocator instances inside the
37//! scheduler's `.bss` and surfaces every live per-task / per-cgroup
38//! allocation as structured records under
39//! [`FailureDumpReport::sdt_allocations`]. The walk runs only when
40//! every prerequisite is present:
41//! - the per-map dump deadline has not been exceeded (the
42//! pre-pass runs before every map render, and an earlier
43//! phase exhausting the budget skips the walk to keep the
44//! dump bounded),
45//! - the arena's `user_vm_start` is 4 GiB-aligned (low 32 bits
46//! zero — the bridge's address arithmetic treats slot starts as
47//! low-32 keys against a `[user_vm_start, user_vm_start + 4 GiB)`
48//! window; misalignment would silently mismap chases),
49//! - the scheduler exposes a `.bss` ARRAY map with non-zero
50//! `btf_kva` (so we can read its raw bytes and have a program
51//! BTF to resolve types against),
52//! - at least one `BPF_MAP_TYPE_ARENA` map snapshot succeeded
53//! (so we have `kern_vm_start` for arena pointer translation),
54//! - the program BTF carries `struct scx_allocator` (the scheduler
55//! links `lib/sdt_alloc.bpf.c`).
56//!
57//! When any prerequisite is missing, the pre-pass leaves
58//! `sdt_allocations` empty rather than failing the dump — the
59//! per-map page-granular [`super::arena::ArenaSnapshot`] still
60//! captures raw arena content for callers that don't need
61//! structured rendering. See [`super::sdt_alloc`] for the walker
62//! design.
63
64mod display;
65mod render_map;
66#[cfg(test)]
67mod tests;
68use render_map::*;
69
70use serde::{Deserialize, Serialize};
71
72use btf_rs::Btf;
73
74use super::arena::{ArenaSnapshot, BpfArenaOffsets, snapshot_arena};
75use super::bpf_map::{
76 BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_ARRAY, BpfMapAccessor, BpfMapInfo, GuestMemMapAccessor,
77};
78use super::btf_render::RenderedValue;
79use super::sdt_alloc::{
80 SdtAllocOffsets, SdtAllocatorSnapshot, discover_payload_btf_id, walk_sdt_allocator,
81};
82
/// Borrow-only capture context for per-program runtime stats
/// (cnt/nsecs/misses) populated alongside the BPF map dump.
///
/// Bundles the two inputs [`dump_state`] needs for this capture: a
/// borrowed [`super::bpf_prog::BpfProgAccessor`] and the per-CPU
/// offset array obtained from
/// [`super::symbols::read_per_cpu_offsets`]. [`dump_state`] calls
/// [`super::bpf_prog::BpfProgAccessor::struct_ops_runtime_stats`]
/// with the supplied offsets and stores the resulting
/// [`super::bpf_prog::ProgRuntimeStats`] vector in
/// [`FailureDumpReport::prog_runtime_stats`].
///
/// The capture is optional at the call site: pass `None` to skip it
/// (e.g. when the accessor could not be constructed because
/// `prog_idr` is missing or the BPF prog offsets did not resolve).
/// Skipping it does not affect the map dump — the dump still
/// renders every map the [`super::bpf_map::BpfMapAccessor`]
/// enumerates.
pub struct ProgRuntimeCapture<'a> {
    /// Accessor for walking `prog_idr` and reading per-program
    /// `bpf_prog_stats` slots. Trait dispatch lets the same dump
    /// site consume either the guest-memory backend or the planned
    /// live-host backend without committing to a concrete type.
    pub accessor: &'a dyn super::bpf_prog::BpfProgAccessor,
    /// Per-CPU offset array (`__per_cpu_offset[cpu]`) used to address
    /// each CPU's `bpf_prog_stats` slot for summation.
    pub per_cpu_offsets: &'a [u64],
}
108
/// Borrow-only capture context for per-CPU CPU-time / softirq / IRQ
/// counters populated alongside the BPF map dump.
///
/// Carries the BTF-resolved field offsets for `kernel_cpustat`,
/// `kernel_stat`, and `tick_sched`, the resolved `.data..percpu`
/// section offsets of the three per-CPU symbols (held here as
/// link-time KVAs — see the `*_kva` fields), and the
/// `__per_cpu_offset[cpu]` array used to address each CPU's slot.
///
/// [`dump_state`] reads each CPU's slot via direct guest-memory
/// reads against the supplied [`super::reader::GuestMem`] and
/// records the result into [`FailureDumpReport::per_cpu_time`].
/// Mirrors [`ProgRuntimeCapture`]'s "borrowed-only, optional"
/// shape — when `None`, the dump skips the per-CPU time capture
/// and leaves the field empty.
///
/// Skipped silently when any of the following holds:
/// - the resolver could not locate any of the three per-CPU
///   symbols (stripped vmlinux),
/// - the BTF offsets are not present (CPU-time accounting types
///   missing),
/// - `__per_cpu_offset` resolution returned an empty array.
///
/// The capture is best-effort diagnostic data; its absence does not
/// fail the dump.
pub struct CpuTimeCapture<'a> {
    /// Guest memory handle used to read each per-CPU slot.
    pub mem: &'a super::reader::GuestMem,
    /// BTF-resolved offsets for `kernel_cpustat::cpustat[]`,
    /// `kernel_stat::softirqs[]`, `kernel_stat::irqs_sum`, and
    /// optionally `tick_sched::iowait_sleeptime`.
    pub offsets: &'a super::btf_offsets::CpuTimeOffsets,
    /// Link-time KVA of the `kernel_cpustat` per-CPU symbol (the
    /// value `st_value` carries in the vmlinux symbol table — the
    /// template address the linker assigned). The runtime KVA on
    /// CPU `cpu` is
    /// [`super::symbols::per_cpu_kva`]`(kernel_cpustat_kva,
    /// kaslr_offset, per_cpu_offsets[cpu])`.
    pub kernel_cpustat_kva: u64,
    /// Link-time KVA of the `kstat` per-CPU symbol. See
    /// `kernel_cpustat_kva` for the runtime KVA formula.
    pub kstat_kva: u64,
    /// Link-time KVA of the `tick_cpu_sched` per-CPU symbol.
    /// `None` when the kernel was built without
    /// `CONFIG_NO_HZ_COMMON`; in that case the iowait_sleeptime
    /// capture is skipped.
    pub tick_cpu_sched_kva: Option<u64>,
    /// Per-CPU offset array (`__per_cpu_offset[cpu]`) — same array
    /// the BPF prog-stats walker uses (see
    /// [`super::symbols::read_per_cpu_offsets`]). Its length
    /// determines how many CPUs the walker visits.
    pub per_cpu_offsets: &'a [u64],
    /// Guest's `PAGE_OFFSET` (resolved via
    /// [`super::symbols::resolve_page_offset`]). Used to translate
    /// each CPU's per-CPU KVA to a guest physical address for the
    /// memory read.
    pub page_offset: u64,
    /// Virtual KASLR offset that per-CPU KVA derivation needs to
    /// bridge the link-time (`__per_cpu_start_LINK`) and runtime
    /// (`__per_cpu_start_RUNTIME`) bases.
    ///
    /// Sourced from the shared `kern_virt_kaslr` Arc, which is
    /// populated by either:
    /// - the BSP MSR_LSTAR derive
    ///   (`crate::vmm::x86_64::msr_kaslr::read_and_derive`,
    ///   x86_64-only), or
    /// - the guest-channel KERN_ADDRS `_text` subtraction
    ///   (`crate::vmm::freeze_coord::dispatch`, both arches).
    ///
    /// The 0 fallback matches KASLR-off / nokaslr-karg semantics and
    /// collapses [`super::symbols::per_cpu_kva`] to the no-slide
    /// formula. On aarch64 without `_text` in /proc/kallsyms
    /// (kptr_restrict masked) the value stays 0 and per-CPU
    /// resolution relies on the `nokaslr` karg (`src/vmm/setup.rs`)
    /// instead.
    pub kaslr_offset: u64,
}
176
/// Per-cgroup PSI-irq host-walk inputs (Phase A).
///
/// Borrowed-only/optional, mirroring [`CpuTimeCapture`]: the freeze
/// coordinator builds it only when the cgroup-walk offsets, the
/// `cgrp_dfl_root` symbol, and the `psi_group` offsets all resolve;
/// otherwise [`DumpContext::cgroup_psi_capture`] is `None` and the
/// per-cgroup axis reads loud-absent.
///
/// The walk descends `cgrp_dfl_root` → the host-held workload-root
/// path → leaf cgroups and reads each leaf's `cgroup->psi`
/// PSI_IRQ_FULL (see [`super::cgroup_walk`]).
pub struct CgroupPsiCapture<'a> {
    /// Guest memory handle used to read the cgroup hierarchy and each
    /// psi_group.
    pub mem: &'a super::reader::GuestMem,
    /// BTF-resolved cgroup-hierarchy field offsets (`cgroup.{self,kn,psi}`,
    /// `cgroup_subsys_state.{sibling,children}`, `cgroup_root.cgrp`,
    /// `kernfs_node.name`).
    pub cgroup_offsets: &'a super::btf_offsets::CgroupWalkOffsets,
    /// BTF-resolved `psi_group` offsets (shared with the system-wide walk —
    /// a per-cgroup psi_group is the same `struct psi_group`).
    pub psi_offsets: &'a super::btf_offsets::PsiGroupOffsets,
    /// RUNTIME KVA of the hierarchy root cgroup (`cgrp_dfl_root +
    /// offsetof(cgroup_root, cgrp)`, with any virtual-KASLR slide applied by
    /// the caller) — used for the children-list anchor compare at the root
    /// level. Every descendant is a direct-mapped slab object.
    pub root_cgroup_kva: u64,
    /// Guest physical address of the hierarchy root cgroup (the kernel-image
    /// translation of the link-time root KVA, done by the caller via
    /// `GuestKernel::text_kva_to_pa`) — the walk's entry read.
    pub root_cgroup_pa: u64,
    /// The test's workload-root cgroup path (host-held VM config, default
    /// `/sys/fs/cgroup/ktstr`). The walk descends ONLY this subtree, so the
    /// scheduler's separate cgroup does not confound the per-cgroup axis.
    pub workload_root_path: &'a str,
    /// Guest `PAGE_OFFSET` for the direct-map (`kva_to_pa`) hops to every
    /// descendant cgroup / kernfs_node / psi_group.
    pub page_offset: u64,
}
211
/// Borrow-only capture context for per-task enrichment.
///
/// Carries:
/// - the [`super::guest::GuestKernel`] (guest memory + symbol
///   table),
/// - the BTF-resolved task/signal/pid/upid offsets,
/// - the cached sched_class symbol KVAs (for class-name decode and
///   the PI-boost-out-of-SCX flag),
/// - the lock-slowpath symbol cache (for stack-trace pattern
///   matching),
/// - the task list itself — a pre-collected `&[TaskWalkerEntry]`
///   produced by a task walker (rq->scx walk, DSQ walk,
///   init_task→tasks enumeration).
///
/// Mirrors the [`ProgRuntimeCapture`] / [`CpuTimeCapture`]
/// borrowed-only-optional shape. When `dump_state` receives
/// `Some(TaskEnrichmentCapture)`, it iterates `tasks` and calls
/// [`super::task_enrichment::walk_task_enrichment`] for each entry,
/// pushing results into [`FailureDumpReport::task_enrichments`]. When
/// it receives `None`, the field stays empty and
/// [`FailureDumpReport::task_enrichments_unavailable`] gets a
/// "no task walker available" diagnostic.
///
/// The walker producer (rq->scx walker etc.) is responsible for
/// building this struct. Until walker dispatch lands, no walker
/// exists; the freeze coordinator passes `None` and the field is
/// plumbed but empty.
pub struct TaskEnrichmentCapture<'a> {
    /// Borrowed GuestKernel — provides memory access, page-table
    /// translation context, and the vmlinux symbol table.
    pub kernel: &'a super::guest::GuestKernel,
    /// BTF-resolved offsets for the task/signal/pid/upid walk.
    pub offsets: &'a super::btf_offsets::TaskEnrichmentOffsets,
    /// Cached sched_class symbol KVAs for class decode + PI-boost
    /// flag.
    pub sched_classes: &'a super::task_enrichment::SchedClassRegistry,
    /// Cached lock-slowpath symbol KVAs for stack-PC pattern
    /// matching.
    pub lock_slowpaths: &'a super::task_enrichment::LockSlowpathRegistry,
    /// Tasks the walker discovered, plus the per-task metadata
    /// `walk_task_enrichment` needs (see [`TaskWalkerEntry`]).
    pub tasks: &'a [TaskWalkerEntry],
}
251
/// One entry produced by a task walker (rq->scx, DSQ, etc.) for the
/// enrichment capture pipeline.
///
/// Each task walker discovers task KVAs by traversing the kernel's
/// own scheduling data structures. The walker is also the only place
/// that knows two per-task signals, so it records them here:
/// - whether the task was reachable via `rq->scx.runnable_list`
///   (used for the PI-boost-out-of-SCX flag), and
/// - which vCPU's instruction pointer matches the running task
///   (used for the lock-slowpath stack matcher).
///
/// Capturing those signals at the walker site keeps the enrichment
/// surface side-effect free — `walk_task_enrichment` only reads
/// guest memory; it does not perform discovery itself.
#[derive(Debug, Clone, Copy)]
pub struct TaskWalkerEntry {
    /// Kernel virtual address of the `task_struct`.
    pub task_kva: u64,
    /// True iff the task was reached via `rq->scx.runnable_list`.
    /// Required for the PI-boost-out-of-SCX flag — see
    /// [`super::task_enrichment::TaskEnrichment::pi_boosted_out_of_scx`].
    pub is_runnable_in_scx: bool,
    /// Optional instruction pointer for the lock-slowpath stack
    /// matcher. Pass the corresponding vCPU's
    /// [`VcpuRegSnapshot::instruction_pointer`] when this task was
    /// running on that vCPU at freeze time; pass `None` for tasks
    /// not actively running.
    pub running_pc: Option<u64>,
}
278
/// Per-CPU CPU-time / softirq / IRQ snapshot captured at freeze
/// time. One entry per CPU index visible to the host walker.
///
/// All counter fields are monotonic in the kernel — the freeze
/// captures the instantaneous value at the moment the vCPUs
/// rendezvous-park. Diffing two snapshots (or comparing against a
/// pre-test baseline) is the consumer's job; this type does not
/// derive deltas.
///
/// Field semantics match the kernel sources:
/// - `cpustat_*_ns`: ns counter from
///   `kernel_cpustat::cpustat[CPUTIME_*]`. Updated by
///   `account_user_time` / `account_system_index_time` and
///   siblings (`kernel/sched/cputime.c`). The kernel stores
///   nanoseconds; `/proc/stat` divides by `cputime_to_clock_t`.
/// - `softirqs[i]`: `kernel_stat::softirqs[i]` cumulative count
///   incremented by `kstat_incr_softirqs_this_cpu` on every
///   softirq raise. Indexed by `super::btf_offsets::SOFTIRQ_NAMES`.
/// - `irqs_sum`: `kernel_stat::irqs_sum` cumulative count
///   incremented by `kstat_incr_irq_this_cpu` on every hardirq.
/// - `iowait_sleeptime_ns`: `tick_sched::iowait_sleeptime`
///   accumulated only under NO_HZ when the CPU enters idle with
///   `nr_iowait > 0`. `None` when CONFIG_NO_HZ_COMMON is off or
///   the resolver couldn't locate `tick_cpu_sched`.
///
/// Serialization note: `iowait_sleeptime_ns` is omitted from the
/// serialized form when it is `None` and defaults to `None` when the
/// key is missing on deserialization (see its serde attribute).
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct PerCpuTimeStats {
    /// CPU index (0-based) this entry describes.
    pub cpu: u32,
    /// `cpustat[CPUTIME_USER]` (ns).
    pub cpustat_user_ns: u64,
    /// `cpustat[CPUTIME_NICE]` (ns).
    pub cpustat_nice_ns: u64,
    /// `cpustat[CPUTIME_SYSTEM]` (ns).
    pub cpustat_system_ns: u64,
    /// `cpustat[CPUTIME_SOFTIRQ]` (ns).
    pub cpustat_softirq_ns: u64,
    /// `cpustat[CPUTIME_IRQ]` (ns).
    pub cpustat_irq_ns: u64,
    /// `cpustat[CPUTIME_IDLE]` (ns).
    pub cpustat_idle_ns: u64,
    /// `cpustat[CPUTIME_IOWAIT]` (ns).
    pub cpustat_iowait_ns: u64,
    /// `cpustat[CPUTIME_STEAL]` (ns).
    pub cpustat_steal_ns: u64,
    /// `kernel_stat::softirqs[]` per-vector cumulative counts.
    /// Indexed by `super::btf_offsets::SOFTIRQ_NAMES`.
    pub softirqs: [u64; super::btf_offsets::NR_SOFTIRQS],
    /// `kernel_stat::irqs_sum` cumulative hardirq count.
    pub irqs_sum: u64,
    /// `tick_sched::iowait_sleeptime` accumulated NO_HZ idle time
    /// with outstanding IO (ns). `None` when NO_HZ disabled or
    /// `tick_cpu_sched` symbol was absent at resolve time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub iowait_sleeptime_ns: Option<u64>,
}
335
/// Per-node NUMA event counters captured from
/// `pglist_data->node_zones[]->vm_numa_event[]` at freeze time.
///
/// Each value of this type is one row of NUMA event counters summed
/// across all zones on a single node. The six counters mirror the
/// kernel's `enum numa_stat_item` (see
/// `super::btf_offsets::NUMA_HIT` etc. for the enum-stable
/// indices). All counters are monotonic-since-boot; consumers diff
/// against a baseline (or against another node's row) to extract
/// the test-window delta.
///
/// Diagnostic value for sched_ext stalls is informational only —
/// the NUMA balancer is not active for ext tasks. The rows
/// surface here so an operator triaging a NUMA-aware workload
/// (e.g. a memory-tiering test) can verify the kernel actually
/// observed the expected node-locality distribution.
///
/// **Live walker status:** the wire shape, BTF offsets
/// (`super::btf_offsets::NumaStatsOffsets`), and report field
/// are wired through. The actual host-side walker that resolves
/// `node_data[]` and reads per-zone counters is pending; until it
/// lands, the report's [`FailureDumpReport::per_node_numa`] vec
/// stays empty and
/// [`FailureDumpReport::per_node_numa_unavailable`] carries the
/// `"no NUMA walker"` reason.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct PerNodeNumaStats {
    /// NUMA node id this row describes.
    pub node: u32,
    /// `vm_numa_event[NUMA_HIT]` summed across zones — pages
    /// allocated on the requested node when local was preferred.
    pub numa_hit: u64,
    /// `vm_numa_event[NUMA_MISS]` — local node full, allocation
    /// landed on a non-local node.
    ///
    /// NOTE(review): the kernel's numastat documentation describes
    /// `numa_miss` from the receiving node's side (memory was wanted
    /// from another node but came from this one) — confirm the
    /// wording above matches that per-node perspective.
    pub numa_miss: u64,
    /// `vm_numa_event[NUMA_FOREIGN]` — process-policy targeted a
    /// different node, this node honored the policy.
    ///
    /// NOTE(review): the kernel's numastat documentation describes
    /// `numa_foreign` as allocations intended for this node that
    /// ended up on another node — the description above looks
    /// inverted; confirm and reconcile before relying on it.
    pub numa_foreign: u64,
    /// `vm_numa_event[NUMA_INTERLEAVE_HIT]` — interleave policy
    /// allocations that landed on this node.
    pub numa_interleave_hit: u64,
    /// `vm_numa_event[NUMA_LOCAL]` — allocations on this node by
    /// processes running on this node.
    pub numa_local: u64,
    /// `vm_numa_event[NUMA_OTHER]` — allocations on this node by
    /// processes running on a different node.
    pub numa_other: u64,
}
384
/// Reason string written into [`FailureDumpReport::per_node_numa_unavailable`]
/// while the per-node NUMA walker has not landed yet.
///
/// Kept distinct from other unavailable reasons so a downstream
/// consumer can tell "walker not implemented" apart from "walker ran
/// and produced no data" once the live producer ships.
pub const REASON_NO_NUMA_WALKER: &str = "no NUMA walker (host-side walker pending)";
391
/// Borrow-only capture context for the per-sample SCX event counter
/// timeline.
///
/// The freeze coordinator forwards the monitor sampler's accumulated
/// `super::MonitorSample` vec via [`Self::samples`]; the dump path
/// folds each sample's per-CPU `super::ScxEventCounters` into a
/// single cross-CPU sum and produces one [`EventCounterSample`] per
/// monitor tick (see [`EventCounterSample::from_monitor_sample`]).
///
/// `None` skips the timeline capture; the dump still renders the
/// rest of the report. Mirrors [`ProgRuntimeCapture`] /
/// [`CpuTimeCapture`]'s "borrowed-only, optional" shape.
pub struct EventCounterCapture<'a> {
    /// Periodic monitor samples gathered between VM start and the
    /// freeze trigger. Each sample carries per-CPU
    /// `super::ScxEventCounters` when scx event-stat offsets
    /// resolved; the dump folder skips samples whose CPUs all
    /// reported `event_counters: None`.
    pub samples: &'a [super::MonitorSample],
}
412
/// Borrow-only capture context for the rq->scx + DSQ walkers.
/// Mirrors [`TaskEnrichmentCapture`] / [`CpuTimeCapture`] shape —
/// `dump_state` consumes everything by reference.
///
/// Carries:
/// - `kernel`: GuestKernel handle for guest-memory reads
///   (PTE walks, symbol resolution).
/// - `offsets`: BTF-resolved
///   [`super::btf_offsets::ScxWalkerOffsets`] covering scx_rq,
///   scx_sched, scx_sched_pcpu, scx_sched_pnode, scx_dispatch_q,
///   sched_ext_entity, scx_dsq_list_node, rhashtable, bucket_table,
///   rhash_head.
/// - `scx_root_kva`: kernel-text-mapped pointer the walker
///   dereferences to find the active `scx_sched`.
/// - `rq_kvas` / `rq_pas`: per-CPU rq KVA + PA arrays; same vecs
///   the runnable_at scanner uses.
/// - `per_cpu_offsets`: `__per_cpu_offset[]` array — needed for
///   per-CPU bypass DSQ resolution.
/// - `nr_nodes`: NUMA node count, for the per-node global-DSQ
///   walk. Pass `1` on UMA / unknown configurations; the walker
///   gracefully skips slots whose pnode pointers are NULL.
///
/// When `None` is passed in [`DumpContext::scx_walker_capture`],
/// the dump emits empty `rq_scx_states` / `dsq_states` and
/// records `scx_walker_unavailable` with a diagnostic reason.
pub struct ScxWalkerCapture<'a> {
    /// Borrowed GuestKernel — provides memory access, page-table
    /// translation context, and the vmlinux symbol table.
    pub kernel: &'a super::guest::GuestKernel,
    /// BTF-resolved offsets for the scx walker.
    pub offsets: &'a super::btf_offsets::ScxWalkerOffsets,
    /// `scx_root` symbol KVA (resolved via vmlinux ELF symtab).
    /// The walker reads `*scx_root` to find the active scx_sched.
    pub scx_root_kva: u64,
    /// Per-CPU rq kernel virtual addresses (one per CPU).
    pub rq_kvas: &'a [u64],
    /// Per-CPU rq guest physical addresses (parallel to `rq_kvas`:
    /// index `i` in both slices describes the same CPU's rq).
    pub rq_pas: &'a [u64],
    /// `__per_cpu_offset[]` array, used to address each CPU's
    /// `scx_sched_pcpu.bypass_dsq`.
    pub per_cpu_offsets: &'a [u64],
    /// NUMA node count for the per-node global-DSQ walk. Pass `1`
    /// on UMA / unknown configurations.
    pub nr_nodes: u32,
}
458
/// One per-monitor-tick snapshot of the 13 SCX_EV_* event counters
/// summed across every CPU at that tick.
///
/// The kernel stores per-CPU `s64` counters in `scx_sched_pcpu`
/// (kernel/sched/ext.c); the monitor sampler reads them at every
/// tick and stores per-CPU `event_counters` on each
/// `super::CpuSnapshot`. The dump path sums across CPUs into the
/// fields here so a downstream consumer can render the run's
/// counter timeline (sparkline, delta plot, ...) without
/// re-iterating the per-CPU vec.
///
/// Field semantics match
/// `super::ScxEventCounters` one-to-one — see that struct's
/// per-field doc for kernel-source provenance. `total_*` naming
/// here echoes `super::ScxEventDeltas`'s aggregate-across-window
/// fields but with per-tick (not per-window) granularity.
///
/// NOTE(review): none of the fields below carries a `total_`
/// prefix, so the `total_*` sentence above looks stale — confirm
/// against `super::ScxEventDeltas` and update or drop it.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct EventCounterSample {
    /// Milliseconds since VM start (mirrors
    /// `super::MonitorSample::elapsed_ms`). Zero on the first
    /// sample.
    pub elapsed_ms: u64,
    /// Sum of `select_cpu_fallback` across all CPUs at this tick.
    pub select_cpu_fallback: i64,
    /// Sum of `dispatch_local_dsq_offline` across all CPUs.
    pub dispatch_local_dsq_offline: i64,
    /// Sum of `dispatch_keep_last` across all CPUs.
    pub dispatch_keep_last: i64,
    /// Sum of `enq_skip_exiting` across all CPUs.
    pub enq_skip_exiting: i64,
    /// Sum of `enq_skip_migration_disabled` across all CPUs.
    pub enq_skip_migration_disabled: i64,
    /// Sum of `reenq_immed` across all CPUs.
    pub reenq_immed: i64,
    /// Sum of `reenq_local_repeat` across all CPUs.
    pub reenq_local_repeat: i64,
    /// Sum of `refill_slice_dfl` across all CPUs.
    pub refill_slice_dfl: i64,
    /// Sum of `bypass_duration` across all CPUs (ns).
    pub bypass_duration: i64,
    /// Sum of `bypass_dispatch` across all CPUs.
    pub bypass_dispatch: i64,
    /// Sum of `bypass_activate` across all CPUs.
    pub bypass_activate: i64,
    /// Sum of `insert_not_owned` across all CPUs.
    pub insert_not_owned: i64,
    /// Sum of `sub_bypass_dispatch` across all CPUs.
    pub sub_bypass_dispatch: i64,
}
509
510impl EventCounterSample {
511 /// Construct from a `super::MonitorSample` by summing every
512 /// CPU's `super::ScxEventCounters`. CPUs whose
513 /// `event_counters` is `None` (event-stat offsets unresolved)
514 /// contribute 0 to every field.
515 ///
516 /// Returns `None` when no CPU on the sample reported event
517 /// counters — propagating that to the timeline would emit a
518 /// row of all zeros that's indistinguishable from a real
519 /// "every counter at zero" tick. Callers filter `None` out.
520 pub fn from_monitor_sample(sample: &super::MonitorSample) -> Option<Self> {
521 let mut any = false;
522 let mut out = Self {
523 elapsed_ms: sample.elapsed_ms,
524 ..Self::default()
525 };
526 for cpu in &sample.cpus {
527 if let Some(ev) = &cpu.event_counters {
528 any = true;
529 // Per-CPU SCX event counters are s64 in the kernel
530 // and originate from BPF map reads of guest memory.
531 // A corrupt counter could trip i64 addition overflow
532 // when summed across many CPUs; saturating_add pins
533 // the sum at i64::{MIN,MAX} rather than panicking
534 // (debug) or wrapping (release) into a misleading
535 // value.
536 out.select_cpu_fallback = out
537 .select_cpu_fallback
538 .saturating_add(ev.select_cpu_fallback);
539 out.dispatch_local_dsq_offline = out
540 .dispatch_local_dsq_offline
541 .saturating_add(ev.dispatch_local_dsq_offline);
542 out.dispatch_keep_last =
543 out.dispatch_keep_last.saturating_add(ev.dispatch_keep_last);
544 out.enq_skip_exiting = out.enq_skip_exiting.saturating_add(ev.enq_skip_exiting);
545 out.enq_skip_migration_disabled = out
546 .enq_skip_migration_disabled
547 .saturating_add(ev.enq_skip_migration_disabled);
548 out.reenq_immed = out.reenq_immed.saturating_add(ev.reenq_immed);
549 out.reenq_local_repeat =
550 out.reenq_local_repeat.saturating_add(ev.reenq_local_repeat);
551 out.refill_slice_dfl = out.refill_slice_dfl.saturating_add(ev.refill_slice_dfl);
552 out.bypass_duration = out.bypass_duration.saturating_add(ev.bypass_duration);
553 out.bypass_dispatch = out.bypass_dispatch.saturating_add(ev.bypass_dispatch);
554 out.bypass_activate = out.bypass_activate.saturating_add(ev.bypass_activate);
555 out.insert_not_owned = out.insert_not_owned.saturating_add(ev.insert_not_owned);
556 out.sub_bypass_dispatch = out
557 .sub_bypass_dispatch
558 .saturating_add(ev.sub_bypass_dispatch);
559 }
560 }
561 if any { Some(out) } else { None }
562 }
563}
564
/// Render a u64 counter series as a 1-line UTF-8 sparkline.
///
/// Maps each value into one of 8 unicode block-element glyphs
/// (`▁▂▃▄▅▆▇█`) by min-max scaling:
///
/// - Empty input renders as the empty string.
/// - A constant non-zero series renders as repeated mid-tier
///   glyphs (matches the "no variation" reading in the data, not
///   as misleading monotonic up-bars).
/// - A constant zero series renders as repeated lowest glyphs.
/// - Otherwise `min` maps to the lowest glyph, `max` to the
///   highest, and everything in between is scaled linearly with
///   truncating integer division.
///
/// The scaling is carried out in `u128`, so a series spanning the
/// full `u64` range (e.g. a saturated or corrupt counter next to a
/// zero) renders correctly instead of overflowing the
/// `(v - min) * 7` intermediate.
///
/// Used by the `Display` impl for the event-counter timeline. Pure
/// helper — no allocation outside the returned `String`.
pub fn render_sparkline(values: &[u64]) -> String {
    const GLYPHS: [char; 8] = ['▁', '▂', '▃', '▄', '▅', '▆', '▇', '█'];
    // Every glyph lies in U+2581..=U+2588, i.e. three UTF-8 bytes.
    const GLYPH_UTF8_LEN: usize = 3;

    // `min()` / `max()` are `None` only for an empty slice.
    let (Some(&min), Some(&max)) = (values.iter().min(), values.iter().max()) else {
        return String::new();
    };

    if min == max {
        // Flat series: there is no span to scale against.
        let glyph = if max == 0 {
            GLYPHS[0]
        } else {
            GLYPHS[GLYPHS.len() / 2]
        };
        return std::iter::repeat(glyph).take(values.len()).collect();
    }

    let span = u128::from(max - min);
    let top = (GLYPHS.len() - 1) as u128;
    let mut line = String::with_capacity(values.len().saturating_mul(GLYPH_UTF8_LEN));
    for &v in values {
        // Linear scale [min, max] → [0, GLYPHS.len()-1]. Widening to
        // u128 keeps `(v - min) * top` exact for any u64 span. Since
        // `v - min <= span`, the quotient never exceeds `top`; the
        // clamp only documents that the index is in bounds.
        let step = (u128::from(v - min) * top / span).min(top);
        line.push(GLYPHS[step as usize]);
    }
    line
}
607
608/// Saturating-cast wrapper around [`render_sparkline`] for signed
609/// (i64) counter series. Negative values clamp to 0; the kernel
610/// stores SCX_EV_* as `s64` but every counter is non-negative in
611/// practice, so the saturation only fires on a corrupt read.
612pub fn render_sparkline_i64(values: &[i64]) -> String {
613 let widened: Vec<u64> = values.iter().map(|&v| v.max(0) as u64).collect();
614 render_sparkline(&widened)
615}
616
/// Snapshot of one vCPU's instruction-pointer / stack-pointer / page-
/// table-root at freeze time.
///
/// Re-export of the freeze-side type so dump consumers don't have to
/// depend on `vmm::exit_dispatch` internals.
pub use crate::vmm::exit_dispatch::VcpuRegSnapshot;
622
/// Schema discriminant value emitted in `FailureDumpReport.schema`.
///
/// Consumers that read a `.failure-dump.json` file use the `schema`
/// field's value to choose between [`FailureDumpReport`],
/// [`DualFailureDumpReport`], and [`DegradedFailureDumpReport`]
/// before attempting deserialization. The
/// [`FailureDumpReportAny::from_json`] dispatcher handles this
/// routing for in-process consumers.
///
/// Values are stable wire constants — extending the dump pipeline
/// with a new shape adds a new constant rather than changing this
/// one.
pub const SCHEMA_SINGLE: &str = "single";

/// Schema discriminant value emitted in `DualFailureDumpReport.schema`.
/// See [`SCHEMA_SINGLE`] for the discriminant contract.
pub const SCHEMA_DUAL: &str = "dual";

/// Schema discriminant value emitted in `DegradedFailureDumpReport.schema`.
///
/// Carried by failure-dumps the freeze coordinator was unable to
/// capture as a full [`SCHEMA_SINGLE`] / [`SCHEMA_DUAL`] report — the
/// trigger fired but rendezvous, gate cross-reference, or KVA
/// translation aborted the dump path. Per the wire-format contract on
/// [`SCHEMA_SINGLE`], degraded dumps are a stable variant added as a
/// new constant, not a mutation of the existing two.
pub const SCHEMA_DEGRADED: &str = "degraded";
649
/// Reason string written into [`DegradedFailureDumpReport::reason`]
/// when the freeze coordinator's vCPU rendezvous timed out before
/// every parked acknowledgement arrived.
///
/// Wire-format-stable: matches the operator-grep contract used by
/// every other `REASON_*` constant in this module. The dynamic
/// detail appended at emit time (`<timeout_ms>` / `<parked>` /
/// `<expected>`) lets an operator see which vCPUs stalled without a
/// separate field.
pub const REASON_DEGRADED_RENDEZVOUS_TIMEOUT: &str =
    "vCPU rendezvous timed out before parked acknowledgement";

/// Reason string written into [`DegradedFailureDumpReport::reason`]
/// when the host kill signal flipped during the vCPU freeze
/// rendezvous, short-circuiting the wait before every parked
/// acknowledgement arrived.
///
/// Distinct from [`REASON_DEGRADED_RENDEZVOUS_TIMEOUT`] (which fires
/// only when the 30s deadline expires): a kill-mid-rendezvous
/// typically lands in the rendezvous loop within milliseconds, so
/// the elapsed_ms in the dynamic detail (appended at emit time)
/// reads as a small number — labelling that case "timed out" would
/// be internally contradictory. The kill sources are SCHED_EXIT
/// propagation from a vCPU thread, watchdog hard-deadline expiry,
/// and panic-hook flips.
///
/// Wire-format-stable; matches the operator-grep contract used by
/// every other `REASON_*` constant.
pub const REASON_DEGRADED_KILL_DURING_RENDEZVOUS: &str =
    "vCPU rendezvous aborted by external kill before parked acknowledgement";
675
676/// Snapshot tag used when the early-snapshot trigger fires but
677/// `freeze_and_dispatch(FreezeMode::Capture { gate_on_exit_kind: false })`
678/// returns `Degraded` (early-half rendezvous timeout). The freeze
679/// coordinator writes the degraded JSON to a sibling path named via
680/// `super::super::vmm::freeze_coord::snapshot_tagged_path` using
681/// this tag — the main `{stem}.failure-dump.json` is preserved for the
682/// subsequent late-trigger emission. Operator-readable wire-format
683/// constant: kebab-case, stable across releases.
684///
685/// MAINTENANCE: adding a new `SNAPSHOT_TAG_*` const requires updating
686/// BOTH (a) the `ALL_SNAPSHOT_TAGS` slice below AND (b) the
687/// `expected` hand-list in
688/// `all_snapshot_tags_enumerates_every_pub_const_in_module` in
689/// `src/monitor/dump/tests.rs`. The pinning test catches slice-vs-
690/// expected divergence; it does NOT catch a const added without
691/// updating either (both arrays stay at the same length).
692pub const SNAPSHOT_TAG_EARLY_DEGRADED: &str = "early-degraded";
693
694/// Snapshot tag used when dual-snapshot mode held a Captured early
695/// snapshot AND the late-trigger path returned `Degraded`. The early
696/// snapshot is written to a sibling path with this tag while the
697/// late degraded JSON occupies the main dump path. Distinguishes
698/// "early itself degraded" ([`SNAPSHOT_TAG_EARLY_DEGRADED`]) from
699/// "early captured, late degraded" (this tag) so an operator browsing
700/// the dump directory knows which case produced which file. Every
701/// captured snapshot reaches disk.
702pub const SNAPSHOT_TAG_EARLY_PRE_LATE_DEGRADED: &str = "early-pre-late-degraded";
703
704/// Snapshot tag used when dual-snapshot mode held a Captured early
705/// snapshot AND the late-trigger path ran AND returned `Suppressed`
706/// (the gate examined `*scx_root->exit_kind`, found it below
707/// SCX_EXIT_ERROR, and decided no failure dump warranted). Distinct
708/// from [`SNAPSHOT_TAG_EARLY_ONLY_LATE_NEVER_FIRED`]: this tag
709/// means the late trigger DID fire and the gate explicitly decided
710/// clean exit. Operator triage: scheduler recovered from the early
711/// stall and reached a clean shutdown via the SCX_EXIT_NONE /
712/// SCX_EXIT_DONE path. The early observation (runnable-age spike)
713/// is independently meaningful and reaches disk at the tagged
714/// sibling. Symmetric with
715/// [`SNAPSHOT_TAG_EARLY_PRE_LATE_DEGRADED`] — tagged sibling rather
716/// than main path so the main `{stem}.failure-dump.json` keeps the
717/// "scheduler had a failure-class exit" semantic.
718pub const SNAPSHOT_TAG_EARLY_ONLY_LATE_SUPPRESSED: &str = "early-only-late-suppressed";
719
720/// Snapshot tag used when dual-snapshot mode held a Captured early
721/// snapshot AND the late-trigger path NEVER FIRED for the run (no
722/// `err_exit_detected` BPF latch flip; the scheduler never reached
723/// an error-class late event). Distinct from
724/// [`SNAPSHOT_TAG_EARLY_ONLY_LATE_SUPPRESSED`]: this tag means the
725/// late trigger never ran at all, NOT that it ran and decided
726/// clean. Operator triage: scheduler crossed the half-watchdog
727/// runnable-age threshold (early-trigger fired) but then either
728/// recovered or terminated before reaching the late-trigger path —
729/// `freeze_state` stayed at `Idle` or `TookEarly` through coord
730/// exit. The end-of-coord drain emits the early observation to the
731/// tagged sibling rather than letting it drop with the closure.
732pub const SNAPSHOT_TAG_EARLY_ONLY_LATE_NEVER_FIRED: &str = "early-only-late-never-fired";
733
734/// Canonical enumeration of every `SNAPSHOT_TAG_*` constant in this
735/// module. Tests that negative-scan tag locations (e.g. asserting no
736/// file landed at any wrong tag) iterate this slice rather than
737/// hardcoding the 4-element list. Adding a new `SNAPSHOT_TAG_*`
738/// constant requires updating this slice AND the `expected` list of
739/// `all_snapshot_tags_enumerates_every_pub_const_in_module` in
740/// `src/monitor/dump/tests.rs`; that test only catches the two lists
741/// diverging, not a constant that was added to neither.
742///
743/// Order: NEVER_FIRED first because it is the default tag in
744/// `EarlySnapshotGuard::drain_to_disk`'s `unwrap_or` arm at
745/// src/vmm/freeze_coord/mod.rs (the guard struct is `pub(super)` so
746/// rustdoc cannot intra-doc-link it from this module; cite by file
747/// path matches the cross-ref convention used elsewhere in this
748/// batch). The other three follow in dispatch-arm order:
749/// SUPPRESSED (late-trigger Suppressed write-failure) → PRE_LATE_DEGRADED
750/// (late-trigger Degraded write-failure) → EARLY_DEGRADED (early-
751/// trigger Degraded direct write) — so readers can map slice index
752/// to dispatch arm by inspection.
753///
754/// `#[cfg(test)]` — production code never iterates this slice;
755/// only the negative-scan tests and the pinning canary reference it.
756/// Gated to keep the lib's dead-code lint clean.
757#[cfg(test)]
758pub const ALL_SNAPSHOT_TAGS: &[&str] = &[
759 SNAPSHOT_TAG_EARLY_ONLY_LATE_NEVER_FIRED,
760 SNAPSHOT_TAG_EARLY_ONLY_LATE_SUPPRESSED,
761 SNAPSHOT_TAG_EARLY_PRE_LATE_DEGRADED,
762 SNAPSHOT_TAG_EARLY_DEGRADED,
763];
764
765/// Reason string written into [`FailureDumpReport::prog_runtime_stats_unavailable`]
766/// when [`DumpContext::prog_capture`] was supplied but the per-program
767/// walker found no struct_ops programs in `prog_idr` at freeze time.
768/// Wire-format-stable: an operator parsing the sidecar JSON looks for
769/// this exact string to distinguish it from the prog-accessor-missing
770/// case ([`REASON_PROG_ACCESSOR_UNAVAILABLE`]).
771pub const REASON_NO_STRUCT_OPS_LOADED: &str = "no struct_ops programs loaded";
772
773/// Reason string written into [`FailureDumpReport::prog_runtime_stats_unavailable`]
774/// when [`DumpContext::prog_capture`] was `None`. Distinguishes from
775/// [`REASON_NO_STRUCT_OPS_LOADED`] — the walker never ran in this case
776/// because the accessor wasn't constructed (e.g. `prog_idr` symbol
777/// missing).
778pub const REASON_PROG_ACCESSOR_UNAVAILABLE: &str = "prog accessor unavailable";
779
780/// Reason string written into [`FailureDumpReport::task_enrichments_unavailable`]
781/// when [`DumpContext::task_enrichment_capture`] was supplied but
782/// every walker entry produced no enrichment (idle guest with no
783/// runnable scx tasks at the freeze instant).
784pub const REASON_TASK_WALKER_ZERO_TASKS: &str = "task walker yielded zero tasks";
785
786/// Reason string written into [`FailureDumpReport::task_enrichments_unavailable`]
787/// when [`DumpContext::task_enrichment_capture`] was `None`.
788/// Distinguishes from [`REASON_TASK_WALKER_ZERO_TASKS`] — the walker
789/// never ran because the capture wasn't supplied.
790pub const REASON_NO_TASK_WALKER: &str = "no task walker available";
791
792/// Reason string written into [`FailureDumpReport::scx_walker_unavailable`]
793/// when offsets resolved AND the walker found rq->scx + local DSQ
794/// data BUT `*scx_root == 0` — no scheduler attached at the freeze
795/// instant. The sched-rooted passes (bypass / global / user-hash)
796/// have nothing to walk, but the rq->scx and per-CPU local DSQ
797/// captures still produced data. Surfaces a distinct reason so the
798/// operator knows the scheduler isn't loaded vs. the walker is
799/// broken.
800pub const REASON_SCX_ROOT_NULL: &str = "scx_root is NULL (no scheduler attached)";
801
802/// Reason string written into [`FailureDumpReport::scx_walker_unavailable`]
803/// when [`DumpContext::scx_walker_capture`] was supplied AND every
804/// offset sub-group resolved BUT the walker reached no rq, no DSQ,
805/// and no scx_sched state at all (every read failed). Distinct from
806/// [`REASON_SCX_ROOT_NULL`]: that case has rq->scx + local DSQ data
807/// but no sched_state; this case has nothing.
808pub const REASON_SCX_WALKER_NO_STATE: &str = "scx walker reached no state";
809
810/// Reason string written into [`FailureDumpReport::scx_walker_unavailable`]
811/// when [`DumpContext::scx_walker_capture`] was `None`. Distinguishes
812/// from [`REASON_SCX_WALKER_NO_STATE`] — the walker never ran at all
813/// because no capture was supplied.
814pub const REASON_NO_SCX_WALKER: &str = "no scx walker capture";
815
816/// Reason string written into [`FailureDumpReport::sdt_alloc_unavailable`]
817/// when the sdt_alloc pre-pass could not run because the scheduler's
818/// arena `user_vm_start` was not 4 GiB-aligned. See the gate in
819/// [`dump_state`]: low-32 keying of the per-pass
820/// [`crate::monitor::dump::render_map::ArenaSlotIndex`] is only
821/// correct when `user_vm_start & 0xFFFF_FFFF == 0` (low 32 bits zero).
822pub const REASON_SDT_ALLOC_UNALIGNED_USER_VM: &str =
823 "user_vm_start is not 4 GiB-aligned; low-32 keying disabled";
824
825/// Reason string written into [`FailureDumpReport::sdt_alloc_unavailable`]
826/// when the dump enumerated no `*.bss` ARRAY map with a non-zero
827/// `btf_kva`. Without the scheduler's `.bss` bytes the pre-pass cannot
828/// read any allocator's in-memory state, and without `btf_kva` the
829/// program BTF that names `struct scx_allocator` is not loadable.
830pub const REASON_SDT_ALLOC_NO_BSS: &str = "no scheduler .bss map (or .bss has no program BTF)";
831
832/// Reason string written into [`FailureDumpReport::sdt_alloc_unavailable`]
833/// when no `BPF_MAP_TYPE_ARENA` map snapshot was captured: the
834/// pre-pass has no `kern_vm_start` to translate arena pointers
835/// against, so no allocator slot can be walked.
836pub const REASON_SDT_ALLOC_NO_ARENA: &str = "no arena map captured (kern_vm_start unavailable)";
837
838/// Reason string written into [`FailureDumpReport::sdt_alloc_unavailable`]
839/// when the scheduler's program BTF does not carry `struct scx_allocator`
840/// or its peer types (`sdt_pool`, `sdt_desc`, `sdt_chunk`) — the scheduler
841/// does not link `lib/sdt_alloc.bpf.c`, so there are no allocator slots
842/// to walk. [`crate::monitor::sdt_alloc::SdtAllocOffsets::from_btf`]
843/// returns the underlying `anyhow::Error` describing which struct was
844/// missing; the dump caller folds that into this reason string.
845pub const REASON_SDT_ALLOC_NO_TYPE: &str = "scheduler BTF does not declare struct scx_allocator";
846
847/// Reason string written into [`FailureDumpReport::sdt_alloc_unavailable`]
848/// when every prerequisite resolved but no `.bss` variable of type
849/// `struct scx_allocator` was discovered. The scheduler links
850/// `lib/sdt_alloc.bpf.c` for its types but has not declared a typed
851/// allocator instance.
852pub const REASON_SDT_ALLOC_NO_INSTANCE: &str = "no scx_allocator instance in .bss";
853
854/// Reason string written into [`FailureDumpReport::sdt_alloc_unavailable`]
855/// when the per-map dump deadline was exhausted before the pre-pass
856/// could run. The `dump_truncated_at_us` field also surfaces the
857/// truncation; this string disambiguates "deadline" from the other
858/// no-data causes when a consumer is scanning sdt_alloc-related
859/// diagnostics specifically.
860pub const REASON_SDT_ALLOC_DEADLINE_EXCEEDED: &str =
861 "dump deadline exceeded before sdt_alloc pre-pass could run";
862
863/// Cross-CPU sum of every per-CPU diagnostic counter slot in the
864/// probe BPF program's `.bss` `ktstr_pcpu_counters` array.
865///
866/// The probe declares one fixed-shape per-CPU array
867/// (`pcpu_counter ktstr_pcpu_counters[MAX_CPUS][KTSTR_PCPU_NR]` —
868/// see `src/bpf/probe.bpf.c`); each tracepoint / kprobe handler
869/// bumps a slot via `ktstr_pcpu_inc(KTSTR_PCPU_<NAME>)`. The host
870/// reader sums across the CPU axis to recover the cumulative count
871/// each handler reports. Field names mirror the slot names from
872/// `enum ktstr_pcpu_idx` so an operator can walk back from the
873/// failure-dump field to the probe source by exact name.
874///
875/// All counters are monotonic-since-probe-attach. Zero values (also
876/// what the derived `Default` yields for every field) indicate
877/// either "the corresponding tracepoint never fired" (the common
878/// case for `pi_*` and `lock_contend_*` on tests that don't exercise
879/// PI / lock contention) or "the tracepoint never attached" (e.g.
880/// `preempt_*` on a kernel without `CONFIG_TRACE_PREEMPT_TOGGLE`);
881/// the counter alone cannot distinguish those two cases — pair with
882/// the attach-state surface in `crate::probe::process::ProbeDiagnostics`
883/// when the distinction matters.
884#[derive(Debug, Clone, Default, Serialize, Deserialize)]
885#[non_exhaustive]
886pub struct ProbeBssCounters {
887 /// `KTSTR_PCPU_PROBE_COUNT` summed across CPUs — total kprobe
888 /// fires past the `ktstr_enabled` gate.
889 pub probe_count: u64,
890 /// `KTSTR_PCPU_KPROBE_RETURNS` summed across CPUs — kprobe fires
891 /// that committed an entry to `probe_data` (past `func_meta_map`
892 /// lookup and scratch-slot allocation).
893 pub kprobe_returns: u64,
894 /// `KTSTR_PCPU_META_MISS` summed across CPUs — kprobe fires
895 /// whose IP missed `func_meta_map`. `probe_count -
896 /// kprobe_returns` is the total bail count; `meta_miss` is the
897 /// subset whose bail came from the `func_meta_map` lookup.
898 pub meta_miss: u64,
899 /// `KTSTR_PCPU_RINGBUF_DROPS` summed across CPUs — failed
900 /// `bpf_ringbuf_reserve` calls inside the trigger handler.
901 pub ringbuf_drops: u64,
902 /// `KTSTR_PCPU_TIMELINE_COUNT` summed across CPUs — successful
903 /// timeline-event submissions across the three timeline
904 /// tracepoints (sched_switch + sched_migrate_task + sched_wakeup).
905 pub timeline_count: u64,
906 /// `KTSTR_PCPU_TIMELINE_DROPS` summed across CPUs — timeline
907 /// submissions that failed because the dedicated
908 /// `timeline_events` ringbuf was full at submit time.
909 pub timeline_drops: u64,
910 /// `KTSTR_PCPU_PI_COUNT` summed across CPUs — PI boost / unboost
911 /// records committed via `fexit/rt_mutex_setprio`.
912 pub pi_count: u64,
913 /// `KTSTR_PCPU_PI_ORPHAN_FEXITS` summed across CPUs — fexit
914 /// fires whose entry-side snapshot was never recorded (attach
915 /// race or `pi_scratch` overflow).
916 pub pi_orphan_fexits: u64,
917 /// `KTSTR_PCPU_PI_CLASS_CHANGE_COUNT` summed across CPUs —
918 /// PI events that observed a `sched_class` flip from fentry
919 /// to fexit (e.g. CFS → RT under a boost).
920 pub pi_class_change_count: u64,
921 /// `KTSTR_PCPU_PI_DROPS` summed across CPUs — TL_EVT_PI_BOOST
922 /// submissions that failed because the timeline ringbuf was
923 /// full at the PI fexit handler.
924 pub pi_drops: u64,
925 /// `KTSTR_PCPU_LOCK_CONTEND_COUNT` summed across CPUs —
926 /// `tp_btf/contention_begin` fires that committed a
927 /// TL_EVT_LOCK_CONTEND timeline record.
928 pub lock_contend_count: u64,
929 /// `KTSTR_PCPU_LOCK_CONTEND_DROPS` summed across CPUs —
930 /// TL_EVT_LOCK_CONTEND submissions that failed because the
931 /// timeline ringbuf was full.
932 pub lock_contend_drops: u64,
933 /// `KTSTR_PCPU_PREEMPT_DISABLE_COUNT` summed across CPUs —
934 /// `tp_btf/preempt_disable` outermost-transition fires.
935 pub preempt_disable_count: u64,
936 /// `KTSTR_PCPU_PREEMPT_ENABLE_COUNT` summed across CPUs —
937 /// `tp_btf/preempt_enable` outermost-transition fires.
938 pub preempt_enable_count: u64,
939 /// `KTSTR_PCPU_TRIGGER_COUNT` summed across CPUs — every
940 /// `tp_btf/sched_ext_exit` fire (including non-error
941 /// kinds like DONE / UNREG, not just error-class exits).
942 pub trigger_count: u64,
943}
944
945/// Top-level failure-dump report. One per freeze trigger.
946#[derive(Debug, Clone, Serialize, Deserialize)]
947#[non_exhaustive]
948pub struct FailureDumpReport {
949 /// Wire-format discriminant. Always `"single"` for this variant,
950 /// pinning [`SCHEMA_SINGLE`]. Consumers branch on this to
951 /// choose between [`FailureDumpReport`], [`DualFailureDumpReport`],
952 /// and [`DegradedFailureDumpReport`] before deserializing. Single
953 /// and Dual share top-level field names that would collide without
954 /// an explicit tag; Degraded carries a distinct field set
955 /// (`reason`, `watchpoint_hit`, `bss_latch_state`, `exit_kind`,
956 /// `elapsed_ms`) but still gets the tag so
957 /// [`FailureDumpReportAny::from_json`] can dispatch uniformly.
958 pub schema: String,
959 /// One entry per BPF map enumerated. Order matches the IDR walk
960 /// (i.e. allocation order); the report is otherwise unsorted so
961 /// callers that want a stable view should sort by name.
962 pub maps: Vec<FailureDumpMap>,
963 /// Per-vCPU register snapshots captured on each vCPU thread at
964 /// freeze time. Index matches vCPU id (BSP at 0, APs at 1..N).
965 /// `None` when a vCPU never parked (rendezvous timeout) or its
966 /// `KVM_GET_REGS` failed mid-shutdown. Attached to the report by
967 /// the freeze coordinator after `dump_state` returns.
968 #[serde(default, skip_serializing_if = "Vec::is_empty")]
969 pub vcpu_regs: Vec<Option<VcpuRegSnapshot>>,
970 /// Obj name of the currently-attached scheduler, identified by
971 /// matching each `BPF_MAP_TYPE_STRUCT_OPS` map's `value_kva` (the
972 /// guest-KVA of its `kvalue.data` payload) against the dereferenced
973 /// `*scx_root` value (the guest-KVA of the active `struct scx_sched`,
974 /// which is also the KVA of `scx_sched.ops` since `ops` sits at
975 /// offset 0). When the match succeeds, the struct_ops map's name
976 /// carries the obj prefix (libbpf convention: `<obj>.<struct_ops_var>`);
977 /// the prefix is split at the first `.` and stored here.
978 ///
979 /// `None` when:
980 /// - `scx_sched_state` is unavailable (no scheduler attached, BTF
981 /// missing the `scx_sched` type, or `*scx_root` could not be
982 /// resolved at capture time).
983 /// - No `BPF_MAP_TYPE_STRUCT_OPS` map had `value_kva` matching the
984 /// active sched_kva (capture race during a mid-attach window, or
985 /// the struct_ops map's value_kva was not yet populated).
986 /// - The matched map's name lacks a `<obj>.` prefix.
987 ///
988 /// [`crate::scenario::snapshot::Snapshot::active`] uses this as
989 /// the principled tiebreaker when the projection sees multiple
990 /// obj prefixes in global-section maps. On `None` the consumer
991 /// falls back to the prefix-grouping heuristic (single obj →
992 /// that one; multiple obj → `NoActiveScheduler` with a
993 /// diagnostic naming the observed obj_names + the walker's
994 /// failure cause).
995 #[serde(default, skip_serializing_if = "Option::is_none")]
996 pub active_obj_name: Option<String>,
997 /// Guest-KVAs of every `struct bpf_map` belonging to the
998 /// currently-attached scheduler's loaded BPF object, captured
999 /// alongside [`Self::active_obj_name`] from the same
1000 /// `*scx_root → struct_ops map → owning bpf_prog → used_maps`
1001 /// walk. The walker enumerates the matched struct_ops prog's
1002 /// `used_maps` array and records each entry's KVA so a
1003 /// downstream filter can identify the active scheduler's maps
1004 /// uniquely — even when two scheduler instances loaded from the
1005 /// SAME binary coexist post-[`crate::scenario::ops::Op::ReplaceScheduler`]
1006 /// (where their bss / data / rodata maps share the
1007 /// `<obj_name>.` prefix and cannot be distinguished by name
1008 /// alone).
1009 ///
1010 /// Empty when:
1011 /// - [`Self::active_obj_name`] is `None` (walker did not
1012 /// resolve the active obj — same reasons; see that field's
1013 /// doc).
1014 /// - The matched prog's `used_maps` could not be safely read
1015 /// (torn race per the kernel's `used_map_cnt` / `used_maps`
1016 /// pointer publication TOCTOU described at
1017 /// `monitor/bpf_prog.rs::find_active_struct_ops_obj_no_target`).
1018 /// - The walker ran but the kernel published an empty
1019 /// `used_maps` for the active prog (no maps registered — an
1020 /// unusual but legal sched_ext shape).
1021 ///
1022 /// **KVA aliasing caveat:** kernel BPF map allocations are
1023 /// vmalloc/slab-backed; a freed map's KVA can be reassigned to a
1024 /// new allocation across captures. [`crate::scenario::snapshot::Snapshot::active`]
1025 /// combines this set with [`Self::active_obj_name`] (both must
1026 /// match) to reject the aliasing case — a KVA hit whose owning
1027 /// map name does not share the active obj prefix is treated as
1028 /// stale and falls through to the obj-prefix heuristic.
1029 ///
1030 /// **Within-run identity ONLY.** KVAs reflect kernel address
1031 /// space allocation at capture time (subject to KASLR slide).
1032 /// Stable for the life of the map within a single VM run; NOT
1033 /// comparable across runs. Never persist or compare against
1034 /// checked-in baselines.
1035 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1036 pub active_map_kvas: Vec<u64>,
1037 /// Structured per-allocation views from sdt_alloc-backed
1038 /// allocators. One entry per discovered allocator; each carries
1039 /// every live leaf slot (capped at
1040 /// `super::sdt_alloc::MAX_SDT_ALLOC_ENTRIES`) BTF-rendered to
1041 /// named field views. Empty when no scheduler-side allocator
1042 /// could be located, when arena offsets / sdt_alloc offsets are
1043 /// absent, or when the program BTF lacks the `scx_allocator`
1044 /// type (scheduler doesn't link `lib/sdt_alloc.bpf.c`).
1045 ///
1046 /// Populated alongside the page-granular `ArenaSnapshot` in
1047 /// each map: a consumer can read either representation depending
1048 /// on whether they want raw bytes or named-field allocations.
1049 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1050 pub sdt_allocations: Vec<SdtAllocatorSnapshot>,
1051 /// Live `scx_static` bump-allocator regions discovered in
1052 /// `.bss`. One entry per `struct scx_static` instance with
1053 /// `memory != 0` and an in-range `(off, max_alloc_bytes)` pair.
1054 /// Distinct from [`Self::sdt_allocations`]: scx_static is a
1055 /// program-lifetime bump allocator (`lib/sdt_alloc.bpf.c:577`)
1056 /// with no per-allocation header, so the surfaced view is
1057 /// region-granular ranges rather than per-slot named allocations.
1058 /// Empty when the scheduler doesn't link `lib/sdt_alloc.bpf.c`,
1059 /// when the program BTF lacks `struct scx_static`, or when no
1060 /// `scx_static` instance has been initialised at freeze time.
1061 ///
1062 /// The dump pipeline uses the same ranges to populate the
1063 /// renderer's `ScxStaticRangeIndex` so a deferred-resolve arena
1064 /// chase whose target lives inside scx_static memory can
1065 /// fail-closed cleanly (no per-slot type recovery is possible
1066 /// without a per-call-site type hook from cast analysis).
1067 #[serde(
1068 default,
1069 skip_serializing_if = "super::scx_static_alloc::ScxStaticSnapshot::is_empty"
1070 )]
1071 pub scx_static_ranges: super::scx_static_alloc::ScxStaticSnapshot,
1072 /// Diagnostic reason for `sdt_allocations` being empty.
1073 ///
1074 /// - `None` → either the pre-pass ran and produced records (the vec
1075 /// is non-empty), or the pre-pass ran cleanly but the scheduler
1076 /// simply has no live allocations (the vec is empty for legitimate
1077 /// reasons that aren't worth a diagnostic). Default.
1078 /// - `Some(REASON_SDT_ALLOC_*)` → the pre-pass skipped before it
1079 /// could surface any allocator state. The string identifies which
1080 /// prerequisite was missing: deadline exhaustion, unaligned
1081 /// `user_vm_start`, missing scheduler `.bss`, missing arena
1082 /// snapshot, scheduler BTF without `struct scx_allocator`, or no
1083 /// `.bss` `scx_allocator` instance.
1084 ///
1085 /// Distinct from `dump_truncated_at_us` (which records deadline
1086 /// truncation across the whole dump) and from
1087 /// [`Self::scx_static_ranges`] (which has its own walker independent
1088 /// of the typed-allocator pre-pass). Mirrors the
1089 /// `prog_runtime_stats_unavailable` / `task_enrichments_unavailable`
1090 /// pattern.
1091 #[serde(default, skip_serializing_if = "Option::is_none")]
1092 pub sdt_alloc_unavailable: Option<String>,
1093 /// Per-program BPF runtime stats summed across CPUs at freeze
1094 /// time (cnt, nsecs, misses). One entry per discovered
1095 /// struct_ops BPF program. Empty when no struct_ops programs are
1096 /// loaded OR when the prog accessor was unavailable to
1097 /// `dump_state` — see [`Self::prog_runtime_stats_unavailable`]
1098 /// for the reason.
1099 ///
1100 /// Per-CPU offset resolution failure does NOT empty the vec —
1101 /// each program still contributes one entry, but with
1102 /// `cnt`/`nsecs`/`misses` summed only over CPUs whose per-CPU
1103 /// `bpf_prog_stats` slot translated successfully (out-of-range
1104 /// CPUs return None per `super::bpf_map::read_percpu_array_value`
1105 /// semantics).
1106 ///
1107 /// See `super::bpf_prog::ProgRuntimeStats` for field semantics
1108 /// and the kernel-source-grounded provenance of each counter.
1109 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1110 pub prog_runtime_stats: Vec<super::bpf_prog::ProgRuntimeStats>,
1111 /// Diagnostic reason for `prog_runtime_stats` being empty.
1112 ///
1113 /// Distinguishes the three causes a consumer can't otherwise tell
1114 /// apart from an empty vec:
1115 /// - `None` (field absent on wire) → vec was populated normally
1116 /// (or the dump path didn't run). Default.
1117 /// - `Some("no struct_ops programs loaded")` → walker ran, no
1118 /// struct_ops programs were in `prog_idr` at freeze time.
1119 /// - `Some("prog accessor unavailable")` → caller passed
1120 /// `prog_capture: None`. Typical causes: `prog_idr` symbol
1121 /// missing, `BpfProgOffsets` BTF parse failed, or
1122 /// `__per_cpu_offset` resolution didn't yield non-zero offsets
1123 /// yet (still-booting guest).
1124 ///
1125 /// Set by `dump_state` only when prog_runtime_stats ends up
1126 /// empty AND a definite cause is identifiable; left None
1127 /// otherwise so the field stays absent in the JSON for
1128 /// already-populated dumps.
1129 #[serde(default, skip_serializing_if = "Option::is_none")]
1130 pub prog_runtime_stats_unavailable: Option<String>,
1131 /// Per-CPU CPU-time / softirq / IRQ counters captured from
1132 /// `kernel_cpustat`, `kernel_stat`, and (under NO_HZ)
1133 /// `tick_sched`. One entry per CPU enumerated by the walker.
1134 /// Empty when the dump caller passed no `CpuTimeCapture` or
1135 /// when symbol/BTF resolution failed.
1136 ///
1137 /// See `PerCpuTimeStats` for field semantics. Surfaces the
1138 /// per-CPU interrupt and idle-time data the failure dump
1139 /// otherwise leaves implicit (the existing scx walker reads
1140 /// `rq->nr_iowait` but not the cumulative time accounting).
1141 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1142 pub per_cpu_time: Vec<PerCpuTimeStats>,
1143 /// Per-cgroup PSI-irq samples for the test's workload cgroups, host-walked
1144 /// from the cgroup hierarchy at this freeze (Phase A). One entry per
1145 /// workload-root leaf cgroup with per-cgroup PSI accounting enabled. Empty
1146 /// when the dump caller passed no `CgroupPsiCapture`, the workload root
1147 /// isn't present yet, or `psi_cgroups_enabled` is off — loud-absent. RAW
1148 /// values; decoded + folded at the metric layer. See
1149 /// `super::cgroup_walk::CgroupPsiStat`.
1150 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1151 pub cgroup_psi: Vec<super::cgroup_walk::CgroupPsiStat>,
1152 /// Per-node NUMA event counters captured from
1153 /// `pglist_data->node_zones[]->vm_numa_event[]`. One row per
1154 /// NUMA node enumerated by the walker. Empty when the live
1155 /// walker has not landed yet (the BTF offsets and wire shape
1156 /// are wired; the reader is a follow-up).
1157 ///
1158 /// See `PerNodeNumaStats` for field semantics; see
1159 /// [`Self::per_node_numa_unavailable`] for the "why empty"
1160 /// diagnostic.
1161 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1162 pub per_node_numa: Vec<PerNodeNumaStats>,
1163 /// Diagnostic reason for `per_node_numa` being empty.
1164 /// `None` when the vec was populated normally (or the dump
1165 /// path didn't run); `Some(REASON_NO_NUMA_WALKER)` until the
1166 /// host-side walker lands.
1167 #[serde(default, skip_serializing_if = "Option::is_none")]
1168 pub per_node_numa_unavailable: Option<String>,
1169 /// Per-task failure-dump enrichments — identity (pid, tgid,
1170 /// comm), process tree (group_leader, real_parent, pgid, sid,
1171 /// nr_threads), scheduling (prio family, sched_class name,
1172 /// scx.weight, core_cookie), context-switch counters, watchdog
1173 /// disambiguation flag, and lock-slowpath stack matches.
1174 ///
1175 /// One entry per task the dump path's task walker reaches —
1176 /// today's task walkers are the rq->scx walker and the DSQ
1177 /// walker; both produce task KVAs that get enriched here.
1178 /// Empty when no task walker ran (typical until walker
1179 /// dispatch lands) or when the `TaskEnrichmentCapture` was
1180 /// absent.
1181 ///
1182 /// See `super::task_enrichment::TaskEnrichment` for field
1183 /// semantics; see [`Self::task_enrichments_unavailable`] for the
1184 /// "why empty" diagnostic.
1185 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1186 pub task_enrichments: Vec<super::task_enrichment::TaskEnrichment>,
1187 /// Diagnostic reason for `task_enrichments` being empty.
1188 ///
1189 /// - `None` → vec was populated normally (or the dump path
1190 /// didn't run).
1191 /// - `Some("no task walker available")` → the
1192 /// `TaskEnrichmentCapture` was missing from
1193 /// `DumpContext`. Until DSQ + rq->scx walker dispatch
1194 /// lands, this is the expected steady state for the dump
1195 /// pipeline; the offsets + walker library is wired and
1196 /// ready to populate as soon as a task-list producer hooks
1197 /// in.
1198 /// - `Some("task walker yielded zero tasks")` → walker
1199 /// produced no task KVAs (frozen guest with no runnable /
1200 /// queued scx tasks at the dump instant — possible on a
1201 /// completely-idle stall trigger).
1202 #[serde(default, skip_serializing_if = "Option::is_none")]
1203 pub task_enrichments_unavailable: Option<String>,
1204 /// Per-monitor-tick SCX_EV_* event counter timeline. Each entry
1205 /// is the cross-CPU sum of the 13 SCX_EV_* counters at one
1206 /// monitor sample. Empty when the dump caller passed no
1207 /// `EventCounterCapture` or no sample reported event counters
1208 /// (event-stat offsets unresolved, scx_root unset). Renderers
1209 /// build sparklines / per-counter delta plots from this vec.
1210 ///
1211 /// See `EventCounterSample` for field semantics; the kernel-
1212 /// source provenance lives on
1213 /// `super::ScxEventCounters` field doc.
1214 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1215 pub event_counter_timeline: Vec<EventCounterSample>,
1216 /// Per-CPU `rq->scx` snapshots — scalar fields the kernel's
1217 /// own `scx_dump_state` reads plus the runnable_list per-task
1218 /// KVAs that fed into the per-task enrichment capture.
1219 /// One entry per CPU walked. Empty when the
1220 /// `ScxWalkerCapture` was absent or every CPU's translate
1221 /// failed.
1222 ///
1223 /// See `super::scx_walker::RqScxState` for field semantics.
1224 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1225 pub rq_scx_states: Vec<super::scx_walker::RqScxState>,
1226 /// Per-DSQ snapshots — local, bypass, global, and user DSQs
1227 /// reachable from `*scx_root`. Each entry carries `nr` (depth),
1228 /// `seq` (BPF-iter counter), and the queued task KVAs.
1229 /// Surfaces data the kernel's own `scx_dump_state` does not
1230 /// emit (per-DSQ depth enumeration), so this vec adds value
1231 /// even on a kernel that prints its own dump.
1232 ///
1233 /// Empty when the `ScxWalkerCapture` was absent.
1234 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1235 pub dsq_states: Vec<super::scx_walker::DsqState>,
1236 /// Top-level `scx_sched` state captured from `*scx_root`:
1237 /// aborting flag, bypass_depth, exit_kind. `None` when no
1238 /// scheduler is attached or `*scx_root` was unreadable.
1239 #[serde(default, skip_serializing_if = "Option::is_none")]
1240 pub scx_sched_state: Option<super::scx_walker::ScxSchedState>,
1241 /// Diagnostic reason for `rq_scx_states` / `dsq_states` /
1242 /// `scx_sched_state` being absent. Mirrors the
1243 /// `prog_runtime_stats_unavailable` / `task_enrichments_unavailable`
1244 /// pattern.
1245 #[serde(default, skip_serializing_if = "Option::is_none")]
1246 pub scx_walker_unavailable: Option<String>,
1247 /// Per-vCPU hardware perf counter snapshot captured at the
1248 /// instant the failure dump fired. One entry per vCPU; index
1249 /// matches vCPU id (0 = BSP, 1..N = APs). `None` per-entry when
1250 /// the freeze-time `read(2)` failed for that vCPU. Empty vec
1251 /// when `DumpContext::perf_capture` was None (perf
1252 /// unavailable on this host) or the read errored wholesale.
1253 ///
1254 /// `exclude_host=1` means each counter ticks only during guest
1255 /// execution; the values here record the cumulative count from
1256 /// the start of the run. Diff against any
1257 /// `super::CpuSnapshot::vcpu_perf` in the monitor timeline to
1258 /// recover the count over a freeze-aligned window. See
1259 /// `super::perf_counters::VcpuPerfSample` for field semantics
1260 /// and the multiplexing math.
1261 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1262 pub vcpu_perf_at_freeze: Vec<Option<super::perf_counters::VcpuPerfSample>>,
1263 /// Microseconds from dump_state entry to the phase that exceeded
1264 /// the soft deadline supplied via `DumpContext::deadline`. `None`
1265 /// when no deadline was supplied, when every phase finished within
1266 /// the deadline, or when the deadline check happened before the
1267 /// dump started any heavy phase. A `Some(us)` value means the dump
1268 /// truncated remaining work (skipped further maps / tasks /
1269 /// walkers) at that elapsed offset to keep the freeze window
1270 /// bounded — the freeze coordinator's parked vCPUs cannot
1271 /// service guest IRQs or MMIO traps while the dump is running,
1272 /// so unbounded dump latency stretches every guest's KVM_RUN
1273 /// pause and risks the freeze rendezvous timeout firing on the
1274 /// next iteration.
1275 #[serde(default, skip_serializing_if = "Option::is_none")]
1276 pub dump_truncated_at_us: Option<u64>,
1277 /// Count of scheduler-under-test maps the per-map render loop
1278 /// skipped because the soft deadline had already been crossed
1279 /// (`dump_truncated_at_us` records WHEN, this records HOW MANY).
1280 /// `0` on a complete dump. A skipped map is absent from `maps`
1281 /// entirely — without this count a consumer reading `maps`
1282 /// cannot tell "the scheduler has N maps" from "the scheduler
1283 /// has N+k maps but k were dropped by truncation", so a
1284 /// degraded dump would silently under-report map state. Excludes
1285 /// ktstr's own framework maps, which are filtered before the
1286 /// deadline check and never counted here.
1287 #[serde(default, skip_serializing_if = "is_zero_u32")]
1288 pub maps_truncated: u32,
1289 /// Probe BPF program's per-CPU diagnostic counter snapshot
1290 /// (see `ProbeBssCounters`). Populated by the host-side
1291 /// reader in `decode_probe_counters_snapshot` which sums
1292 /// each `KTSTR_PCPU_*` slot across CPUs. `None` when the
1293 /// probe `.bss` map isn't enumerated (probe not loaded), the
1294 /// program BTF can't be parsed, or the array's offset doesn't
1295 /// resolve.
1296 ///
1297 /// A populated `trigger_count > 0` is the structural signal
1298 /// that the BPF tp_btf/sched_ext_exit handler fired during
1299 /// the run — distinct from the boolean `trigger_fired` flag
1300 /// in `super::probe::process::ProbeDiagnostics` (which
1301 /// also records host-side observations like a watchdog
1302 /// teardown). The cross-product is the failure-dump E2E
1303 /// test's structural assertion: a stall scenario must show
1304 /// both flag=true AND `trigger_count > 0`, otherwise the
1305 /// probe attached without firing or fired without the host
1306 /// observing.
1307 #[serde(default, skip_serializing_if = "Option::is_none")]
1308 pub probe_counters: Option<ProbeBssCounters>,
1309 /// `true` when this report was produced by
1310 /// [`Self::placeholder`] — i.e. the capture pipeline could
1311 /// not produce real data (typical cause: freeze rendezvous
1312 /// timed out). Periodic-sample temporal assertions skip
1313 /// placeholder reports rather than treating their empty
1314 /// vectors as "no progress" signals; the `*_unavailable`
1315 /// fields carry the reason string for human consumers, but
1316 /// the boolean flag is the machine-checkable discriminant a
1317 /// pattern can branch on without re-deriving placeholder
1318 /// status from the absence of every field.
1319 #[serde(default, skip_serializing_if = "std::ops::Not::not")]
1320 pub is_placeholder: bool,
1321}
1322
1323impl Default for FailureDumpReport {
1324 /// Empty report with `schema = "single"`. Pinning the schema
1325 /// here keeps `FailureDumpReport::default()` and a
1326 /// freshly-constructed `FailureDumpReport { ..., schema:
1327 /// SCHEMA_SINGLE.into(), ... }` indistinguishable to consumers,
1328 /// so the schema discriminant is never quietly missing on a
1329 /// default-built report.
1330 fn default() -> Self {
1331 Self {
1332 schema: SCHEMA_SINGLE.to_string(),
1333 active_map_kvas: Vec::new(),
1334 maps: Vec::new(),
1335 vcpu_regs: Vec::new(),
1336 sdt_allocations: Vec::new(),
1337 scx_static_ranges: super::scx_static_alloc::ScxStaticSnapshot::default(),
1338 sdt_alloc_unavailable: None,
1339 prog_runtime_stats: Vec::new(),
1340 prog_runtime_stats_unavailable: None,
1341 per_cpu_time: Vec::new(),
1342 cgroup_psi: Vec::new(),
1343 per_node_numa: Vec::new(),
1344 per_node_numa_unavailable: None,
1345 task_enrichments: Vec::new(),
1346 task_enrichments_unavailable: None,
1347 event_counter_timeline: Vec::new(),
1348 rq_scx_states: Vec::new(),
1349 dsq_states: Vec::new(),
1350 scx_sched_state: None,
1351 scx_walker_unavailable: None,
1352 vcpu_perf_at_freeze: Vec::new(),
1353 dump_truncated_at_us: None,
1354 maps_truncated: 0,
1355 probe_counters: None,
1356 is_placeholder: false,
1357 active_obj_name: None,
1358 }
1359 }
1360}
1361
1362impl FailureDumpReport {
1363 /// Build a placeholder report for a capture that could not
1364 /// produce real data. Every `*_unavailable` field is set to
1365 /// `Some(reason)` so downstream consumers (`perf-delta`,
1366 /// failure-rendering tooling) can distinguish "capture
1367 /// happened, no data" from "capture path failed for reason X".
1368 /// All vector / option fields stay at their `Default` empty
1369 /// state so the report is structurally a real
1370 /// `FailureDumpReport`, not a sentinel that breaks consumer
1371 /// type contracts.
1372 ///
1373 /// Used by the freeze coordinator's user-watchpoint dispatch,
1374 /// periodic-capture drain, and final-drain teardown — every
1375 /// site that needs to publish a "capture attempted, did not
1376 /// land" entry on the snapshot bridge.
1377 pub fn placeholder(reason: impl Into<String>) -> Self {
1378 let reason = reason.into();
1379 Self {
1380 prog_runtime_stats_unavailable: Some(reason.clone()),
1381 per_node_numa_unavailable: Some(reason.clone()),
1382 task_enrichments_unavailable: Some(reason.clone()),
1383 scx_walker_unavailable: Some(reason.clone()),
1384 sdt_alloc_unavailable: Some(reason),
1385 is_placeholder: true,
1386 ..Self::default()
1387 }
1388 }
1389}
1390
1391/// Identify the obj name (libbpf `<obj>` prefix) of the
1392/// currently-attached scheduler from the captured BPF state, plus
1393/// the live scheduler's `used_maps` KVA set when available.
1394///
1395/// **No `*scx_root` dependency -- works on every supported kernel.**
1396/// `scx_root` (the global pointer to the active `struct scx_sched`)
1397/// only exists on v6.16+ (added by commit 48e126777386); pre-6.16
1398/// kernels track the active scheduler via the global `scx_ops`
1399/// (`struct sched_ext_ops`) plus the atomic enable-state instead, so
1400/// any `*scx_root`-based identification is blind on 6.14/6.15. Both
1401/// paths below read only `prog_idr` / struct_ops map names /
1402/// global-section map names, which are present and BTF-offset-stable
1403/// across the whole range, so this helper resolves the active obj
1404/// uniformly regardless of kernel version.
1405///
1406/// **PRIMARY: target-free `prog_idr` walk.** Delegates to
1407/// `prog_accessor.find_active_struct_ops_obj_no_target` (see
1408/// `monitor::bpf_prog`), which walks `prog_idr` for an alive
1409/// `BPF_PROG_TYPE_STRUCT_OPS` prog whose `aux->used_maps` carries a
1410/// sibling `<obj>.bss/.data/.rodata` global-section map, and returns
1411/// that prog's obj prefix + full `used_map_kvas` snapshot. The
1412/// returned prefix is cross-checked against the captured `maps[]`
1413/// (a `<prefix>.bss/.data/.rodata` must exist) so a torn `used_maps`
1414/// read cannot publish a garbage prefix. The walk takes no target
1415/// and reads live guest memory, so it produces the correct live
1416/// scheduler even mid-swap and on pre-6.16 kernels. Used only when
1417/// the walker returns a NON-EMPTY `used_map_kvas`; an empty
1418/// whitelist cannot disambiguate two same-prefix copies downstream.
1419///
1420/// The walker matches the FIRST such prog and does not gate on the
1421/// kernel enable-state, so uniqueness rests on ktstr's swap
1422/// sequencing (kill the outgoing scheduler and wait for its process
1423/// to exit before loading the next), NOT on a kernel
1424/// old-prog-removed-before-new-prog-added guarantee: the kernel
1425/// does not serialize those (a detached struct_ops prog leaves
1426/// `prog_idr` only when its owning map's last fd closes and an RCU
1427/// grace elapses). See `find_active_struct_ops_obj_no_target` for
1428/// the single-tenant threat model.
1429///
1430/// **FALLBACK: prefix grouping over struct_ops map names.** Runs
1431/// when `prog_walker` is `None` (the prog accessor has not published
1432/// yet) OR the walker found no global-section-bearing prog OR its
1433/// whitelist was empty. Picks the first `BPF_MAP_TYPE_STRUCT_OPS`
1434/// map whose name prefix has an UNAMBIGUOUS global-section sibling
1435/// set (each of `.bss/.data/.rodata` count <= 1) and returns
1436/// `(prefix, vec![])`. Section counts use full-name equality so they
1437/// stay in lockstep with the consumer's classifier at
1438/// [`crate::scenario::snapshot::Snapshot::active`] and the walker's
1439/// `strip_suffix` in
1440/// `monitor::bpf_prog::extract_global_section_obj_prefix` (private
1441/// helper, cited by path rather than intra-doc link). A multi-copy
1442/// prefix (the same-binary `Op::ReplaceScheduler` swap window leaves
1443/// the dying scheduler's globals beside the new scheduler's) is
1444/// skipped here: the empty-whitelist `(prefix, vec![])` cannot
1445/// disambiguate downstream, so the helper instead returns `None` and
1446/// the consumer surfaces an actionable `NoActiveScheduler`.
1447///
1448/// Returns `None` when neither path resolves: the walker was absent
1449/// / found nothing / returned an unbacked or empty result, AND no
1450/// struct_ops map had an unambiguous global-section sibling set. On
1451/// `None`, callers fall back to
1452/// [`crate::scenario::snapshot::Snapshot::active`]'s own
1453/// prefix-grouping, whose per-section-count check enforces the same
1454/// multi-copy detection, so a helper-`None` does not silently
1455/// downgrade into the prefix-only `AmbiguousVar` surface.
1456fn identify_active_obj_from_struct_ops(
1457 maps: &[super::bpf_map::BpfMapInfo],
1458 prog_walker: Option<(
1459 &dyn super::bpf_prog::BpfProgAccessor,
1460 &super::btf_offsets::BpfMapOffsets,
1461 )>,
1462) -> Option<(String, Vec<u64>)> {
1463 // PRIMARY PATH: target-free prog_idr walk. Returns the first
1464 // alive `BPF_PROG_TYPE_STRUCT_OPS` prog whose `aux->used_maps`
1465 // contains a sibling `<obj>.bss/.data/.rodata` global-section map,
1466 // with that prog's obj prefix + full `used_map_kvas` snapshot.
1467 // Reads live guest memory and takes no target, so it resolves the
1468 // live scheduler on every supported kernel -- including pre-6.16,
1469 // where `scx_root` does not exist. In ktstr scenarios only one
1470 // matching prog is alive at a time: ktstr's swap sequence kills
1471 // the outgoing scheduler and waits for its process to exit before
1472 // loading the next, which closes the outgoing struct_ops map's
1473 // last fd and lets the RCU-deferred map+prog free remove the OLD
1474 // prog from `prog_idr`. (The kernel does NOT serialize
1475 // old-prog-removal before new-prog-add; the single-alive-prog
1476 // property is ktstr's sequencing, not a kernel invariant.) The
1477 // returned prefix is cross-checked against the captured `maps[]`
1478 // so a torn `used_maps` read can't publish a garbage prefix.
1479 if let Some((prog_accessor, map_offsets)) = prog_walker
1480 && let Some(walker_match) = prog_accessor.find_active_struct_ops_obj_no_target(map_offsets)
1481 && !walker_match.used_map_kvas.is_empty()
1482 {
1483 // Defense against torn used_maps reads: the walker's returned
1484 // prefix MUST appear as `<prefix>.<section>` in the captured
1485 // `maps[]`. A walker that read garbage from a mid-mutation
1486 // used_maps window would name an obj prefix that no captured
1487 // map matches; the cross-check rejects that case.
1488 let (wb, wd, wr) = count_global_sections_for_prefix(maps, &walker_match.obj_name);
1489 if wb + wd + wr > 0 {
1490 return Some((walker_match.obj_name, walker_match.used_map_kvas));
1491 }
1492 }
1493
1494 // FALLBACK PATH: prefix grouping by struct_ops map name. Runs
1495 // when the prog walker is unavailable (`prog_walker` is `None`,
1496 // e.g. `owned_prog_accessor` hasn't published yet at boot) OR the
1497 // live STRUCT_OPS prog has no global-section maps in its
1498 // used_maps (libbpf-named struct_ops case without a `.bss/.data/
1499 // .rodata` sibling, observed when the scheduler keeps all its
1500 // state in non-libbpf-named maps).
1501 //
1502 // Pick the first STRUCT_OPS map whose prefix has an unambiguous
1503 // global-section sibling set (each section ≤ 1). Multi-copy
1504 // collisions skip -- the consumer surfaces NoActiveScheduler with
1505 // an actionable diagnostic.
1506 for active_struct_ops in maps
1507 .iter()
1508 .filter(|m| m.map_type == super::bpf_map::BPF_MAP_TYPE_STRUCT_OPS)
1509 {
1510 if active_struct_ops.map_kva == 0 {
1511 continue;
1512 }
1513 let so_name = active_struct_ops.name();
1514 let Some(prefix) = so_name.split('.').next().filter(|s| !s.is_empty()) else {
1515 continue;
1516 };
1517 let (bss_count, data_count, rodata_count) = count_global_sections_for_prefix(maps, prefix);
1518 let has_matching_global = bss_count + data_count + rodata_count > 0;
1519 let unambiguous = bss_count <= 1 && data_count <= 1 && rodata_count <= 1;
1520 if has_matching_global && unambiguous {
1521 return Some((prefix.to_string(), Vec::new()));
1522 }
1523 }
1524 None
1525}
1526
1527/// Count captured maps named exactly `<prefix>.bss`, `<prefix>.data`,
1528/// `<prefix>.rodata`. Full-name equality (not prefix matching) so it
1529/// aligns with the consumer's `name.ends_with(".bss"/...)` classifier
1530/// at [`crate::scenario::snapshot::Snapshot::active`] and with the
1531/// walker's `strip_suffix(".bss")` in
1532/// `monitor::bpf_prog::extract_global_section_obj_prefix` (private
1533/// helper, cited by path rather than intra-doc link). A hypothetical
1534/// `<prefix>.bss.shared` map would count as zero here (the walker
1535/// treats it the same way), so the counts stay in lockstep across
1536/// the three sites that classify global-section maps for a scheduler
1537/// obj.
1538///
1539/// Skips `BPF_MAP_TYPE_STRUCT_OPS` maps so the active scheduler's
1540/// own struct_ops map (which `Snapshot::active` filters by type as
1541/// well) never inflates the global-section totals.
1542fn count_global_sections_for_prefix(
1543 maps: &[super::bpf_map::BpfMapInfo],
1544 prefix: &str,
1545) -> (usize, usize, usize) {
1546 let bss_name = format!("{prefix}.bss");
1547 let data_name = format!("{prefix}.data");
1548 let rodata_name = format!("{prefix}.rodata");
1549 maps.iter()
1550 .filter(|m| m.map_type != super::bpf_map::BPF_MAP_TYPE_STRUCT_OPS)
1551 .fold((0usize, 0usize, 0usize), |(b, d, r), m| {
1552 let n = m.name();
1553 (
1554 b + usize::from(n == bss_name),
1555 d + usize::from(n == data_name),
1556 r + usize::from(n == rodata_name),
1557 )
1558 })
1559}
1560
1561/// Pair of failure-dump snapshots captured at two points in a stall.
1562///
1563/// `early` is taken when the host-side runnable_at scanner observes
1564/// any task with `jiffies - p->scx.runnable_at > watchdog_timeout/2`
1565/// (mirrors the kernel's `check_rq_for_timeouts` walk over
1566/// `rq->scx.runnable_list`). `late` is taken at the same trigger as
1567/// the single-snapshot path: the BPF probe's
1568/// `ktstr_err_exit_detected` latch flipping after a sched_ext
1569/// error-class exit.
1570///
1571/// `early == None` when the watchdog half-way threshold never
1572/// triggered before `late` fired (e.g. an immediate scheduler error
1573/// in `init_task` before any task became runnable). Diffing
1574/// `late` against `early` shows what BPF state changed during the
1575/// stall window — the value-add over the single-snapshot dump.
1576///
1577/// **No user toggle — auto-repro engages this automatically.** Only
1578/// the auto-repro VM emits this shape;
1579/// `crate::test_support::probe::attempt_auto_repro` is the
1580/// single call site flipping the builder's `dual_snapshot` flag,
1581/// and there is no public ktstr surface for asking for it from a
1582/// primary VM. Test authors don't need to know about it — when an
1583/// auto-repro fires, the file at
1584/// `<test>-<variant_hash>.repro.failure-dump.json` changes shape from
1585/// [`FailureDumpReport`] to this wrapper.
1586///
1587/// Note: there is no `Default` impl. The `late` field is required
1588/// by the doc invariant ("the freeze coordinator only writes a
1589/// `DualFailureDumpReport` after the late snapshot has been
1590/// captured"); a `Default::default()` would have produced a wrapper
1591/// with an empty late report whose `maps`/`vcpu_regs` vectors
1592/// silently lie about a successful capture. Construct via the
1593/// struct literal with an explicit `late: FailureDumpReport`.
1594#[derive(Debug, Clone, Serialize, Deserialize)]
1595#[non_exhaustive]
1596pub struct DualFailureDumpReport {
1597 /// Wire-format discriminant. Always `"dual"` for this variant,
1598 /// pinning [`SCHEMA_DUAL`]. Mirror of [`FailureDumpReport::schema`]
1599 /// — consumers branch on it before deserializing.
1600 pub schema: String,
1601 /// Snapshot at the watchdog half-way point. `None` when the
1602 /// stall fired before the half-way scanner crossed its threshold.
1603 #[serde(default, skip_serializing_if = "Option::is_none")]
1604 pub early: Option<FailureDumpReport>,
1605 /// Snapshot at the error-exit latch trigger. Always present
1606 /// (the freeze coordinator only writes a `DualFailureDumpReport`
1607 /// after the late snapshot has been captured; if the run ends
1608 /// with only an early snapshot the file is not written at all).
1609 pub late: FailureDumpReport,
1610 /// Maximum `jiffies - p->scx.runnable_at` observed by the
1611 /// runnable_at scanner at the moment the early snapshot fired.
1612 /// Zero when `early` is `None`.
1613 ///
1614 /// To recover the kernel's full `watchdog_timeout`, double
1615 /// [`Self::early_threshold_jiffies`] — the scanner trigger
1616 /// fires at half the watchdog, so the threshold field carries
1617 /// `watchdog_timeout / 2`. Diff `early_max_age_jiffies` against
1618 /// `2 * early_threshold_jiffies` to see how close the system
1619 /// was to the SCX_EXIT_ERROR_STALL emission line at the
1620 /// early-trigger point.
1621 #[serde(default, skip_serializing_if = "is_zero_u64")]
1622 pub early_max_age_jiffies: u64,
1623 /// The half-way trigger threshold the scanner compared against
1624 /// when capturing the early snapshot, expressed in guest
1625 /// jiffies. Equals `(watchdog_timeout_ms * CONFIG_HZ) / 1000 / 2`
1626 /// at the moment the snapshot fired. Zero when `early` is
1627 /// `None`.
1628 ///
1629 /// Surfaced alongside `early_max_age_jiffies` so a downstream
1630 /// consumer reading the JSON does not have to recompute the
1631 /// kernel-internal jiffies arithmetic to reproduce the
1632 /// trigger condition.
1633 #[serde(default, skip_serializing_if = "is_zero_u64")]
1634 pub early_threshold_jiffies: u64,
1635 /// Structured reason the early snapshot is absent. `None` when
1636 /// the early snapshot was captured (the [`Self::early`] field is
1637 /// `Some`). When the early field is `None`, this carries a short
1638 /// machine-friendly string identifying which of the known
1639 /// failure modes occurred:
1640 ///
1641 /// - `"scan prerequisites unavailable: <prereq>"` — the
1642 /// per-CPU `runnable_at` scan never resolved its dependencies
1643 /// (most often `<prereq>` names the missing kernel symbol /
1644 /// BTF entry).
1645 /// - `"max_age never crossed threshold (peak={peak}j,
1646 /// threshold={threshold}j)"` — the scan ran but the maximum
1647 /// observed runnable-age stayed below the half-way mark for
1648 /// the whole VM lifetime. Indicates a non-stall err-class exit
1649 /// (e.g. `scx_bpf_error()`).
1650 /// - `"scx_tick stall — no per-task runnable_at data"` — the
1651 /// stall path that drove the late capture has no per-task
1652 /// `runnable_at` to scan (the kernel's "watchdog failed to
1653 /// check in" path raises `SCX_EXIT_ERROR_STALL` from the
1654 /// scx_tick kernel side without any task on
1655 /// `rq->scx.runnable_list`).
1656 ///
1657 /// Display rendering at `super::display` surfaces this string
1658 /// directly; the previous "stall fired before half-way threshold,
1659 /// or runnable_at scan setup failed" generic text is replaced
1660 /// with the structured reason whenever this field is `Some`.
1661 #[serde(default, skip_serializing_if = "Option::is_none")]
1662 pub early_skipped_reason: Option<String>,
1663}
1664
/// `skip_serializing_if` predicate: true when the `u64` field is zero,
/// so the field is left out of the serialized output. Takes a
/// reference because serde hands the predicate `&T`.
fn is_zero_u64(v: &u64) -> bool {
    matches!(*v, 0)
}
1668
/// `skip_serializing_if` predicate: true when the `u32` field is zero,
/// so the field is left out of the serialized output. Takes a
/// reference because serde hands the predicate `&T`.
fn is_zero_u32(v: &u32) -> bool {
    matches!(*v, 0)
}
1672
1673/// Top-level degraded failure-dump report. Emitted by the freeze
1674/// coordinator when a real error-class trigger fires but the dump
1675/// path aborts before a full [`FailureDumpReport`] can be captured —
1676/// today only the vCPU rendezvous-timeout path produces this shape.
1677///
1678/// Carries the partial state the coordinator did collect (per-vCPU
1679/// registers from any vCPU that parked before timeout) plus the
1680/// observable trigger state at the moment of degradation
1681/// (watchpoint hit, BPF `.bss` latch status, live `exit_kind` if the
1682/// gate read it). An operator inspecting the JSON learns WHY the
1683/// dump degraded from the `reason` field and WHICH vCPUs stalled
1684/// from the `vcpu_regs` Vec's per-slot `None` / `Some` pattern.
1685///
1686/// Schema discriminant: [`SCHEMA_DEGRADED`]. Parsed via the same
1687/// [`FailureDumpReportAny::from_json`] dispatcher as the other two
1688/// variants.
1689#[derive(Debug, Clone, Serialize, Deserialize)]
1690#[non_exhaustive]
1691pub struct DegradedFailureDumpReport {
1692 /// Wire-format discriminant. Always `"degraded"` for this
1693 /// variant, pinning [`SCHEMA_DEGRADED`]. Mirror of
1694 /// [`FailureDumpReport::schema`] / [`DualFailureDumpReport::schema`]
1695 /// — consumers branch on it before deserializing.
1696 pub schema: String,
1697 /// Operator-readable reason the dump degraded. Carries one of
1698 /// the `REASON_DEGRADED_*` constants as the canonical prefix,
1699 /// followed by dynamic detail filled in at emit time (e.g.
1700 /// timeout milliseconds, parked-vCPU counts). Stable wire format
1701 /// per the [`SCHEMA_DEGRADED`] discriminant contract: new degraded
1702 /// causes add new `REASON_DEGRADED_*` constants rather than
1703 /// mutating the existing ones.
1704 pub reason: String,
1705 /// Per-vCPU register snapshots collected before degradation.
1706 /// Index matches vCPU id (BSP at 0, APs at 1..N). `None` entries
1707 /// identify the vCPUs that never parked (the operator's
1708 /// signal for which vCPUs stalled) — distinct from
1709 /// [`FailureDumpReport::vcpu_regs`]'s `None`, which usually
1710 /// means `KVM_GET_REGS` failed mid-shutdown.
1711 #[serde(default, skip_serializing_if = "Vec::is_empty")]
1712 pub vcpu_regs: Vec<Option<VcpuRegSnapshot>>,
1713 /// Hardware-watchpoint hit state at degradation. `true` when
1714 /// the freeze-coordinator's `*scx_root->exit_kind` watchpoint
1715 /// fired on a vCPU thread; `false` when only the BPF `.bss`
1716 /// latch fired (or the trigger source was a deferred-capture
1717 /// request).
1718 pub watchpoint_hit: bool,
1719 /// BPF probe `.bss` latch state at degradation. One of
1720 /// `"triggered"` (probe latched err exit), `"not_triggered"`
1721 /// (latch readable, value still 0), `"out_of_bounds"` (cached
1722 /// `.bss` PA no longer 4-byte-readable — probe map freed
1723 /// mid-run), or `"not_resolved"` (cached `.bss` PA was never
1724 /// populated). Mirror of `crate::vmm::freeze_coord`'s
1725 /// internal `BssReadState` enum, serialised as the snake-case
1726 /// of each variant. String-typed for wire-format stability with
1727 /// the rest of the `REASON_*` / state-name surface — see
1728 /// [`SCHEMA_DEGRADED`] for the contract.
1729 pub bss_latch_state: String,
1730 /// Live `*scx_root->exit_kind` value at degradation, when the
1731 /// gate read it. `None` when the dump path aborted before
1732 /// reaching the gate (rendezvous timed out earlier) or when the
1733 /// KVA translation failed. `Some(kind)` carries the raw `u32`
1734 /// from `enum scx_exit_kind` — operators read it against
1735 /// the kernel's `scx_exit_kind` enum definition to identify
1736 /// whether the scheduler's intended exit class matched the
1737 /// trigger that fired.
1738 #[serde(default, skip_serializing_if = "Option::is_none")]
1739 pub exit_kind: Option<u32>,
1740 /// Wall-clock milliseconds from the freeze trigger (capture
1741 /// start) to the degraded-emit decision. Lets an operator see
1742 /// how long the coordinator spent trying to capture before
1743 /// giving up. Mirrors the `elapsed_ms` field
1744 /// [`FailureDumpReport`] surfaces via the post-dump log line —
1745 /// here it's structured so consumers can read it without
1746 /// parsing the log.
1747 #[serde(default, skip_serializing_if = "is_zero_u64")]
1748 pub elapsed_ms: u64,
1749}
1750
1751/// Either-or wrapper that owns a parsed [`FailureDumpReport`],
1752/// [`DualFailureDumpReport`], or [`DegradedFailureDumpReport`]. Lets
1753/// a consumer hold and render a failure-dump file without prematurely
1754/// committing to one schema — the discriminant lives in the JSON's
1755/// `schema` field, not in the type the consumer holds.
1756///
1757/// Centralises the schema-tag dispatch logic that previously lived
1758/// inline at every read site (the auto-repro tail renderer, the
1759/// failure-dump-e2e test, any future consumer that wants to inspect
1760/// any shape). Use [`Self::from_json`] to parse an arbitrary
1761/// failure-dump JSON blob; the Display impl forwards to the
1762/// underlying report's existing Display so the rendered output is
1763/// indistinguishable from holding the unwrapped report directly.
1764///
1765/// `non_exhaustive` so a future fourth schema can be added without
1766/// breaking external pattern matches.
1767#[non_exhaustive]
1768pub enum FailureDumpReportAny {
1769 /// Single-snapshot report, schema=`"single"`. Emitted by the
1770 /// primary VM's freeze coordinator when an error-class SCX exit
1771 /// fires.
1772 Single(Box<FailureDumpReport>),
1773 /// Dual-snapshot wrapper, schema=`"dual"`. Emitted by the
1774 /// auto-repro VM when the dual-snapshot path is enabled. Carries
1775 /// optional `early` + required `late` snapshots plus jiffies
1776 /// metadata for the early-trigger condition.
1777 ///
1778 /// Boxed to keep [`FailureDumpReportAny`]'s on-stack size bounded
1779 /// — `DualFailureDumpReport` carries the early+late snapshots
1780 /// inline and is roughly 2x the size of [`FailureDumpReport`].
1781 Dual(Box<DualFailureDumpReport>),
1782 /// Degraded report, schema=`"degraded"`. Emitted when an
1783 /// error-class trigger fires but the dump path aborts before a
1784 /// full single/dual report can be captured — today only the
1785 /// vCPU rendezvous-timeout path produces this shape. Carries
1786 /// partial vCPU register data + trigger-state diagnostics
1787 /// instead of the full map / scx-walker output.
1788 ///
1789 /// Boxed for size parity with the other variants.
1790 Degraded(Box<DegradedFailureDumpReport>),
1791}
1792
1793impl FailureDumpReportAny {
1794 /// Parse a failure-dump JSON blob, choosing the variant by the
1795 /// `schema` field. Returns `None` on any of:
1796 ///
1797 /// - the blob does not parse as JSON
1798 /// - the `schema` field is absent (degraded variant requires an
1799 /// explicit discriminant; the previous "absent ⇒ single"
1800 /// fallback would silently mis-route a richer wrapper as a
1801 /// lossy single shape)
1802 /// - the `schema` field carries an unknown value
1803 /// - the typed deserialisation under the chosen schema fails
1804 pub fn from_json(json: &str) -> Option<Self> {
1805 let value: serde_json::Value = serde_json::from_str(json).ok()?;
1806 let schema = value.get("schema").and_then(|v| v.as_str())?;
1807 match schema {
1808 SCHEMA_DUAL => serde_json::from_str(json)
1809 .ok()
1810 .map(|d| Self::Dual(Box::new(d))),
1811 SCHEMA_SINGLE => serde_json::from_str(json)
1812 .ok()
1813 .map(|r| Self::Single(Box::new(r))),
1814 SCHEMA_DEGRADED => serde_json::from_str(json)
1815 .ok()
1816 .map(|d| Self::Degraded(Box::new(d))),
1817 _ => None,
1818 }
1819 }
1820}
1821
/// Rendering of one BPF map's contents.
///
/// Unifies the map-type rendering paths under a single
/// representation: single-entry ARRAY maps (incl. the
/// `.bss`/`.data`/`.rodata` global sections) populate `value`;
/// multi-entry ARRAY maps populate `array_entries`; keyed HASH maps
/// populate `entries`; PERCPU_ARRAY maps populate `percpu_entries`.
/// The remaining map kinds land in their dedicated fields
/// (`percpu_hash_entries`, `arena`, `ringbuf`, `stack_trace`,
/// `fd_array`) — see each field's doc for the map types it covers.
/// Exactly one of these is non-empty for a successful render; on
/// failure `error` is set and the rest empty. `error` may also
/// accompany a partial render (e.g. truncation at a cap), as noted on
/// the `error` and `array_entries` fields.
///
/// Every content field is `#[serde(default)]` and skipped when empty,
/// so an absent key in the JSON and an empty field are equivalent.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpMap {
    /// Map name as registered with the kernel. Truncated to
    /// `BPF_OBJ_NAME_LEN` (16) by the kernel; libbpf composes
    /// `"<obj_name>.<section>"` for global-section maps.
    pub name: String,
    /// Guest-KVA of this map's `struct bpf_map` allocation. Unique
    /// per loaded map instance — two map copies sharing the same
    /// `name` (e.g. two `<obj>.bss` maps from two scheduler
    /// instances loaded from the same binary post-
    /// [`crate::scenario::ops::Op::ReplaceScheduler`]) have distinct
    /// KVAs and are distinguishable on this field alone.
    ///
    /// Sourced from [`crate::monitor::bpf_map::BpfMapInfo::map_kva`]
    /// at capture time. Within-run stable (the kernel does not
    /// relocate `struct bpf_map`); not comparable across runs
    /// (KASLR slide differs).
    ///
    /// `0` when capture did not record a KVA (e.g., synthetic test
    /// fixtures constructed via `..Default::default()`); consumers
    /// treating `0` as "no kernel identity" gracefully fall back to
    /// name-based matching. A `0` value is omitted from the
    /// serialized form.
    #[serde(default, skip_serializing_if = "is_zero_u64")]
    pub map_kva: u64,
    /// Raw `map_type` from `struct bpf_map` (e.g. `BPF_MAP_TYPE_ARRAY`).
    /// Kept as `u32` rather than an enum to avoid bumping a serde
    /// schema each time the kernel adds a kind.
    pub map_type: u32,
    /// Declared per-entry value size. Captured even when rendering
    /// fails so the operator can see the map shape.
    pub value_size: u32,
    /// Declared maximum entry count from `struct bpf_map.max_entries`.
    /// Surfaces alongside the rendered slice so a consumer can spot
    /// when the dump shows fewer entries than the map declares
    /// (e.g. ARRAY / HASH truncated at `MAX_ARRAY_KEYS` /
    /// `MAX_HASH_ENTRIES`; PERCPU_ARRAY truncated at
    /// `MAX_PERCPU_KEYS`).
    pub max_entries: u32,
    /// Single-value render for a single-entry ARRAY map
    /// (`max_entries <= 1`, incl. the `.bss`/`.data`/`.rodata`
    /// global sections). Multi-entry ARRAY maps use `array_entries`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub value: Option<RenderedValue>,
    /// (key, value) entries for HASH maps.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub entries: Vec<FailureDumpEntry>,
    /// Per-entry values for a multi-entry `BPF_MAP_TYPE_ARRAY` map,
    /// indexed by the array key (`u32`). Populated for
    /// `max_entries > 1`; the single-entry case uses `value`, so
    /// exactly one of `value` / `array_entries` is set for an ARRAY
    /// render. Capped at `MAX_ARRAY_KEYS`; truncation and per-key
    /// read failures surface in `error`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub array_entries: Vec<FailureDumpArrayEntry>,
    /// Per-CPU slots for PERCPU_ARRAY maps. Outer Vec indexed by key,
    /// inner Vec indexed by CPU id.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub percpu_entries: Vec<FailureDumpPercpuEntry>,
    /// Per-key per-CPU slots for `PERCPU_HASH` / `LRU_PERCPU_HASH`
    /// maps. Same shape as `percpu_entries` but the outer key is
    /// arbitrary bytes (rendered via BTF when a key type id is
    /// available, hex otherwise) instead of the implicit u32 key
    /// of a per-CPU array.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub percpu_hash_entries: Vec<FailureDumpPercpuHashEntry>,
    /// Page snapshot for `BPF_MAP_TYPE_ARENA` maps. `None` for all
    /// other map types.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub arena: Option<ArenaSnapshot>,
    /// Position counters and capacity for `BPF_MAP_TYPE_RINGBUF` /
    /// `BPF_MAP_TYPE_USER_RINGBUF` maps. Surfaces stuck-consumer
    /// diagnostics — pending bytes far below the watermark plus
    /// non-zero `pending_pos` indicates a producer holding a
    /// reservation; pending bytes near capacity indicates a stalled
    /// consumer. `None` for non-ringbuf maps or when the BTF offsets
    /// for `bpf_ringbuf_map` / `bpf_ringbuf` weren't resolvable.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ringbuf: Option<FailureDumpRingbuf>,
    /// Per-bucket trace summary for `BPF_MAP_TYPE_STACK_TRACE` maps.
    /// `None` for non-STACK_TRACE maps or when the BTF offsets for
    /// `bpf_stack_map` / `stack_map_bucket` weren't resolvable.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub stack_trace: Option<FailureDumpStackTrace>,
    /// Populated-slot summary for FD-array families (`PROG_ARRAY`,
    /// `PERF_EVENT_ARRAY`, `CGROUP_ARRAY`, `ARRAY_OF_MAPS`,
    /// `HASH_OF_MAPS`, `DEVMAP*`, `SOCKMAP*`, `CPUMAP`, `XSKMAP`,
    /// `REUSEPORT_SOCKARRAY`). `None` for non-FD-array maps.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub fd_array: Option<FailureDumpFdArray>,
    /// Reason this map's contents are missing or partial. `None` on
    /// a successful, complete render.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
}
1926
/// Ringbuf occupancy snapshot read from `struct bpf_ringbuf` at the
/// freeze instant.
///
/// Capacity, consumer/producer positions, and the in-flight reservation
/// frontier (`pending_pos`) are all that's readable without walking the
/// records. Pending bytes (= `producer_pos - consumer_pos`, computed
/// with unsigned wraparound) is the operator-visible indicator: low
/// values = consumer keeping up; values approaching capacity = consumer
/// stuck or kernel producer overrunning. A non-zero gap between
/// `producer_pos` and `pending_pos` means a producer is mid-reserve
/// and the consumer can't advance past `pending_pos`.
///
/// Read via `crate::monitor::btf_offsets::BpfRingbufOffsets`; rendered
/// in `render_ringbuf_state`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpRingbuf {
    /// Ring data area capacity in bytes (= `mask + 1`). Always a
    /// power of two; matches the map's declared `max_entries`.
    pub capacity: u64,
    /// Consumer position. Byte index of the next record userspace
    /// will read. Monotonically advances; the kernel never writes
    /// here.
    pub consumer_pos: u64,
    /// Producer position. Byte index past the last reserved record.
    /// Monotonically advances; updated by the kernel on each
    /// `bpf_ringbuf_reserve`.
    pub producer_pos: u64,
    /// Pending position. Byte index of the oldest in-flight (still
    /// being filled) reservation. Records below `pending_pos` are
    /// committed and visible to the consumer; records between
    /// `pending_pos` and `producer_pos` are reserved but not yet
    /// committed.
    pub pending_pos: u64,
    /// Pending bytes (= `producer_pos.wrapping_sub(consumer_pos)`).
    /// A derived value, stored so consumers don't have to recompute
    /// it from the two positions. 0 = consumer caught up; a value
    /// equal to `capacity` = ring full / consumer stalled. Computed
    /// with unsigned wraparound to match the kernel's dispatch-path
    /// arithmetic.
    pub pending_bytes: u64,
}
1967
/// Per-bucket summary of populated stack traces in a STACK_TRACE map.
///
/// Each element of `entries` is one populated bucket whose pointer
/// was non-null at the freeze instant. Its `nr` is the number of
/// trace samples (PCs) in the bucket; its `pcs` carries the actual
/// u64 PC values when readable (build-id stacks render the raw bytes
/// as hex since the per-entry shape is `struct bpf_stack_build_id`,
/// not a u64). The dump caps per-bucket entries at
/// `MAX_STACK_TRACE_PCS` to bound memory.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpStackTrace {
    /// `bpf_stack_map.n_buckets` — the rounded-up power-of-two slot
    /// count, used as the iteration upper bound. Can differ from the
    /// map's declared `max_entries`, which the kernel rounds up to
    /// obtain this value.
    pub n_buckets: u32,
    /// One entry per non-null bucket pointer. Sorted by bucket id.
    pub entries: Vec<FailureDumpStackTraceEntry>,
    /// True when any populated bucket was truncated at
    /// `MAX_STACK_TRACE_PCS` PCs. Omitted from the serialized form
    /// when `false`.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub truncated: bool,
    /// Count of buckets whose pointer slot or bucket struct could
    /// not be translated to a guest physical address at capture
    /// time (an unmapped page in the bucket array or a dangling
    /// bucket KVA). These buckets are absent from `entries`
    /// entirely. `0` when every non-null bucket was readable.
    /// Without this count `entries.len()` undercounts the live
    /// buckets and the gap reads as "fewer stacks present" rather
    /// than "stacks present but unreadable".
    #[serde(default, skip_serializing_if = "is_zero_u32")]
    pub buckets_unreadable: u32,
}
2000
/// One populated stack trace (one non-null bucket) from a
/// STACK_TRACE map.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpStackTraceEntry {
    /// Bucket id (= stack ID returned by `bpf_get_stackid`).
    pub bucket_id: u32,
    /// Number of trace samples (kernel `stack_map_bucket.nr`).
    pub nr: u32,
    /// PC values (u64) when the map is in non-build-id mode. Empty
    /// when `BPF_F_STACK_BUILD_ID` is set on the map (each entry
    /// is then a `bpf_stack_build_id` record — its raw bytes land
    /// in `data_hex`). An empty vector is omitted from the
    /// serialized form.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub pcs: Vec<u64>,
    /// Hex-encoded raw bucket data bytes. Always populated alongside
    /// `pcs` so the operator can decode build-id stacks or correlate
    /// trace samples with the wire format.
    pub data_hex: String,
}
2020
/// Per-FD-array snapshot of populated indices.
///
/// FD-array families store `void *` slots in `bpf_array.ptrs`; each
/// slot is either NULL (empty) or a kernel pointer (struct bpf_prog *,
/// struct file *, etc.). The dump path reads up to
/// `MAX_FD_ARRAY_SLOTS` slots, counts the non-zero ones, and lists
/// the populated indices. The pointed-to objects themselves are not
/// described by any field of this struct — only slot occupancy is.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpFdArray {
    /// Number of populated (non-zero) slots seen. A lower bound when
    /// `unreadable` is non-zero.
    pub populated: u32,
    /// Total slots scanned. Capped at `MAX_FD_ARRAY_SLOTS`.
    pub scanned: u32,
    /// Indices of populated slots. Truncated to
    /// `MAX_FD_ARRAY_INDICES` entries.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub indices: Vec<u32>,
    /// True when iteration capped at `MAX_FD_ARRAY_SLOTS` and
    /// `scanned < max_entries`.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub truncated: bool,
    /// True when `populated > indices.len()` because
    /// `MAX_FD_ARRAY_INDICES` capped the index list.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub indices_truncated: bool,
    /// Count of scanned slots whose KVA could not be translated to a
    /// guest physical address (an unmapped page in the `ptrs` flex
    /// array). These slots are neither confirmed empty nor counted
    /// as `populated`, so `populated` is a lower bound when this is
    /// non-zero. `0` when every scanned slot was readable.
    #[serde(default, skip_serializing_if = "is_zero_u32")]
    pub unreadable: u32,
}
2055
/// One (key, value) pair from a hash map. Both sides are rendered via
/// BTF when key/value type ids are available; when a rendering is
/// `None`, the matching `*_hex` field still preserves the raw bytes.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpEntry {
    /// Rendered key. `None` when no BTF type is available for the key.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub key: Option<RenderedValue>,
    /// Hex-encoded raw key bytes. Kept alongside `key` so the operator
    /// can correlate rendered output with the wire format.
    pub key_hex: String,
    /// Rendered value. `None` when no BTF type is available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub value: Option<RenderedValue>,
    /// Hex-encoded raw value bytes. Kept alongside `value` for the
    /// same reason `key_hex` accompanies `key`.
    pub value_hex: String,
    /// Typed render of the per-entry sdt_alloc payload, when the value
    /// carries a `struct sdt_data __arena *` field that points into a
    /// captured arena page and a payload type id was discovered for
    /// the matching allocator. `None` when the entry carries no arena
    /// pointer to chase, no allocator metadata was found, the payload
    /// type was ambiguous, or the arena read failed.
    ///
    /// `value` already renders the surface struct (e.g.
    /// `scx_task_map_val { tid, tptr, data: 0x100000... -> sdt_data {
    /// tid: { idx, genn } } }`), but `sdt_data.payload[]` is a flex
    /// array — BTF reports its size as 0, so the per-task struct that
    /// actually lives in the payload bytes never decodes through the
    /// surface render. This field carries that decoded payload
    /// alongside the surface struct so the operator sees both views
    /// at once.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub payload: Option<RenderedValue>,
}
2091
/// One entry of a multi-entry `BPF_MAP_TYPE_ARRAY` map: the array key
/// (`u32`) and its rendered value. Mirrors [`FailureDumpPercpuEntry`]'s
/// typed `u32` key (ARRAY keys are kernel-imposed indices, not
/// user-typed bytes) but carries a single value rather than a per-CPU
/// vector.
///
/// `value` is `None` only when the entry's guest page was unmapped at
/// the freeze instant; a BTF-render miss falls back to
/// `RenderedValue::Bytes` (hex), so `None` unambiguously means
/// "unreadable", not "un-rendered". ARRAY values are not
/// `sdt_data`-arena-chased — no in-tree sched_ext ARRAY stores arena
/// pointers; add a `payload` field here if one ever does.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpArrayEntry {
    /// Array index (kernel key).
    pub key: u32,
    /// Rendered value (BTF when `btf_value_type_id` is non-zero, hex
    /// fallback otherwise). `None` when the entry was unreadable; a
    /// `None` value is omitted from the serialized form.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub value: Option<RenderedValue>,
}
2114
/// One key from a per-CPU array, with one rendered value per CPU
/// (None for CPUs whose per-CPU page was unmapped or out-of-range).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpPercpuEntry {
    /// Array index — the implicit `u32` key of the per-CPU array.
    pub key: u32,
    /// One slot per CPU, indexed by CPU id. `None` marks a CPU whose
    /// per-CPU page was unmapped or out-of-range.
    pub per_cpu: Vec<Option<RenderedValue>>,
}
2123
/// One key from a `PERCPU_HASH` / `LRU_PERCPU_HASH` map, with one
/// rendered value per CPU. Mirrors [`FailureDumpEntry`] for the key
/// side (rendered + hex) and [`FailureDumpPercpuEntry`] for the
/// per-CPU value vector.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpPercpuHashEntry {
    /// Rendered key. `None` when no BTF type is available for the key.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub key: Option<RenderedValue>,
    /// Hex-encoded raw key bytes. Present even when `key` is `None`,
    /// so the entry stays identifiable without BTF.
    pub key_hex: String,
    /// One slot per CPU. `None` when the CPU's per-CPU slot was
    /// unmapped or out-of-range; `Some` rendered (BTF when value type
    /// id is non-zero) or hex bytes otherwise.
    pub per_cpu: Vec<Option<RenderedValue>>,
}
2141
/// Sanity cap, in bytes, on a single BTF blob read (32 MiB).
///
/// BPF program BTF is normally <100 KB; vmlinux BTF tops out around
/// 10 MB, so the cap leaves headroom over both. A bogus `data_size`
/// (corrupted `struct btf`) shouldn't pull an unbounded amount of
/// unrelated guest memory into the renderer or the freeze
/// coordinator. Shared between [`load_program_btf_kva`] and
/// `vmm::load_probe_bss_offset`; defining it here keeps the bound
/// in one place so a future tightening doesn't drift between sites.
pub(crate) const MAX_BTF_BLOB: usize = 32 * 1024 * 1024;
2151
/// Hard cap on the number of tasks the per-task enrichment loop
/// inside [`dump_state`] will process.
///
/// A hostile or pathologically broken guest can produce a runnable_list
/// chain whose length is bounded only by the number of
/// `task_struct`s in the kernel — tens of thousands on a busy box.
/// Each enrichment call walks task/signal/pid/upid offsets, the
/// sched_class registry, and the lock-slowpath stack matcher, so an
/// uncapped loop turns the freeze window from milliseconds into
/// minutes. 4096 is well above any healthy SCX runnable_list depth
/// (the kernel's own watchdog fires long before that many tasks
/// queue up) and still bounds the worst-case freeze cost. When the
/// cap fires, [`dump_state`] truncates without enriching the tail
/// and stamps [`FailureDumpReport::dump_truncated_at_us`] so the
/// operator knows to attribute missing tasks to truncation rather
/// than walker failure.
pub const MAX_ENRICHED_TASKS: usize = 4096;
2168
/// Bare-named ktstr framework maps to skip during enumeration.
///
/// These are declared in `src/bpf/probe.bpf.c` without a libbpf
/// `<obj>.<section>` prefix (`SEC(".maps")` declarations like
/// `func_meta_map`, `probe_data`, `probe_scratch`, `ktstr_events`);
/// the kernel registers them under the bare names listed here.
/// They're framework-internal — the user looking at a failure dump
/// for their scheduler doesn't care about ktstr's own kprobe
/// scratch — so the dump path drops them.
///
/// The framework's ringbuf is named `ktstr_events` (not `events`)
/// so a user scheduler that legitimately names its own ringbuf
/// `events` is not silently dropped from the dump.
///
/// Maintenance: a future ktstr probe map must be added to this list,
/// AND the matching `<obj_name>.` prefix must be added to the
/// `render_map`-internal starts_with list (see `dump_state`). The
/// two lists cover different naming shapes (bare names here,
/// prefixed global-section names there), so both need updating.
const KTSTR_INTERNAL_MAPS: &[&str] = &[
    "func_meta_map",
    "probe_data",
    "probe_scratch",
    "ktstr_events",
];
2192
/// All inputs the failure-dump renderer needs, bundled so future
/// capture sites (DSQ walker, rq->scx walker, NUMA stats, ...) can
/// land as new optional fields without churning every call site.
///
/// `accessor` is currently the concrete guest-memory backend. The
/// trait dispatch claim in [`BpfMapAccessor`]'s module-level doc
/// is aspirational: `dump_state` reaches through the accessor for
/// map enumeration AND for the sdt_alloc post-pass walk, which
/// needs the underlying [`super::guest::GuestKernel`] handle —
/// only the guest-memory backend exposes that. When the live-host
/// backend lands, sdt_alloc walking will move into a
/// backend-specific path and `accessor` here can become
/// `&'a dyn BpfMapAccessor`.
///
/// `arena_offsets` and `prog_capture` are both optional borrows
/// (uniform shape): `None` for either disables that
/// capture leg without affecting the rest. A scheduler running on
/// an older kernel without arena support lands here with
/// `arena_offsets: None` and the failure dump renders maps + regs
/// without arena pages; a setup where the BpfProgAccessor couldn't
/// resolve `prog_idr` lands with `prog_capture: None` and
/// `prog_runtime_stats` stays empty. The other `Option` capture
/// fields below follow the same "`None` skips that leg" convention.
pub struct DumpContext<'a> {
    /// BPF map accessor. Concrete guest-memory backend today; see
    /// the type-level doc for why this is not `&dyn BpfMapAccessor`.
    pub accessor: &'a GuestMemMapAccessor<'a>,
    /// Host-resolved vmlinux BTF. The renderer uses it as the base
    /// for split-BTF parsing on programs that ship their own type
    /// info; it's also the fallback when a map's program BTF can't
    /// be loaded.
    pub btf: &'a Btf,
    /// Guest's `nr_cpu_ids`. Forwarded into per-CPU map rendering
    /// so PERCPU_ARRAY readers know how many slots to enumerate.
    /// Pass `1` for non-percpu-only dumps if the caller doesn't
    /// have the value handy.
    pub num_cpus: u32,
    /// BTF-resolved arena field offsets. Enables
    /// `BPF_MAP_TYPE_ARENA` page snapshotting via the accessor
    /// trait's `read_arena_pages`. `None` skips arena rendering
    /// (older kernel without arena support, or BTF lacking
    /// `struct bpf_arena`).
    pub arena_offsets: Option<&'a BpfArenaOffsets>,
    /// Per-program runtime stats capture. `None` skips
    /// prog-runtime capture; the dump still renders every map the
    /// accessor enumerates.
    pub prog_capture: Option<&'a ProgRuntimeCapture<'a>>,
    /// Per-CPU CPU-time / softirq / IRQ capture. `None` skips the
    /// per-CPU time walk; the rest of the dump still renders. Same
    /// "borrowed-only, optional" shape as
    /// [`Self::prog_capture`] / [`Self::arena_offsets`] so a
    /// future capture site lands as another optional field without
    /// churning the call sites already plumbed through here.
    pub cpu_time_capture: Option<&'a CpuTimeCapture<'a>>,
    /// Per-cgroup PSI-irq capture (Phase A). `None` skips the cgroup
    /// hierarchy walk; the rest of the dump still renders. Same
    /// borrowed-only/optional shape as [`Self::cpu_time_capture`].
    pub cgroup_psi_capture: Option<&'a CgroupPsiCapture<'a>>,
    /// Per-task enrichment capture. `None` skips the per-task walk
    /// and `task_enrichments` stays empty; the rest of the dump
    /// still renders.
    ///
    /// Today's freeze coordinator passes `None` because the DSQ
    /// and rq->scx task walkers have not yet landed dispatch. The
    /// `TaskEnrichmentOffsets` + `SchedClassRegistry` + the
    /// `walk_task_enrichment` library are wired and ready —
    /// the producer side just needs to populate
    /// [`TaskEnrichmentCapture::tasks`] from the rq->scx walker.
    pub task_enrichment_capture: Option<&'a TaskEnrichmentCapture<'a>>,
    /// SCX_EV_* event counter timeline capture. `None` skips
    /// timeline rendering and `event_counter_timeline` stays
    /// empty; the rest of the dump still renders. Same
    /// "borrowed-only, optional" shape as
    /// [`Self::cpu_time_capture`].
    pub event_counter_capture: Option<&'a EventCounterCapture<'a>>,
    /// SCX rq->scx + DSQ walker capture. `None` skips the walk;
    /// `rq_scx_states` / `dsq_states` / `scx_sched_state` stay
    /// empty/None and `scx_walker_unavailable` records why.
    pub scx_walker_capture: Option<&'a ScxWalkerCapture<'a>>,
    /// Host-side per-vCPU hardware perf counters (cycles,
    /// instructions, cache-misses, branch-misses) opened with
    /// `exclude_host=1`, so each counter only ticks during guest
    /// execution. `None` skips the freeze-time read; the
    /// [`FailureDumpReport::vcpu_perf_at_freeze`] vec stays empty.
    /// See [`super::perf_counters`] for the kernel-source-grounded
    /// rationale and capture semantics.
    ///
    /// The same capture is shared (via `Arc` in the freeze
    /// coordinator) with the per-tick monitor sampler; per-tick
    /// samples land on each [`super::CpuSnapshot::vcpu_perf`]. The
    /// freeze-time read here records the absolute counter values at
    /// the instant the failure dump fired, which lets a consumer
    /// diff against any earlier sample to compute IPC over a
    /// freeze-aligned window.
    pub perf_capture: Option<&'a super::perf_counters::PerfCountersCapture>,
    /// Soft deadline for the dump's heavy phases (per-map render
    /// loop, walk_rq_scx, walk_local_dsqs, walk_dsqs sched-rooted,
    /// walk_task_enrichment, sdt_alloc post-pass). When supplied,
    /// each phase boundary checks `Instant::now() > deadline`; the
    /// first crossing truncates remaining work and stamps
    /// [`FailureDumpReport::dump_truncated_at_us`]. `None` disables
    /// the bailout — the dump runs every phase to completion.
    ///
    /// Set by the freeze coordinator to `capture_start +
    /// watchdog_timeout/2` so a slow dump can't keep vCPUs parked
    /// past the kernel's own SCX_EXIT_ERROR_STALL emission line. The
    /// deadline is a soft bound: each phase that has already started
    /// runs to completion before checking, so the actual elapsed
    /// time at truncation can exceed the deadline by one phase's
    /// worth of work.
    pub deadline: Option<std::time::Instant>,
    /// BPF cast-analysis output for the scheduler's program object,
    /// produced once at builder time by parsing the scheduler
    /// binary's `.bpf.objs` ELF blob (no libbpf, no kernel
    /// interaction). Threaded into every per-map [`RenderMapCtx`]
    /// so the renderer's
    /// [`super::btf_render::MemReader::cast_lookup`] can promote
    /// `u64` fields the analyzer flagged into typed-pointer
    /// renders. `None` skips cast-driven promotion entirely (every
    /// `u64` renders as a plain unsigned counter, the
    /// pre-integration default); same effect as passing an empty
    /// map but cheaper to thread.
    pub cast_map: Option<&'a super::cast_analysis::CastMap>,
    /// Unique alloc_sizes captured from `scx_static_alloc_internal`
    /// call sites. Threaded to the renderer as a last-resort fallback.
    /// Unlike the optional captures above this is a plain slice, so
    /// "nothing captured" is expressed as an empty slice.
    /// NOTE(review): each tuple is presumably (alloc size in bytes,
    /// type name to render that allocation as) — confirm against the
    /// producer before relying on it.
    pub alloc_size_types: &'a [(u64, String)],
    /// Cross-BTF Fwd resolution context: every parsed embedded
    /// BPF object's program BTF plus a name-keyed index over
    /// every complete struct/union across them. Threaded into
    /// every per-map [`RenderMapCtx`] so the renderer's
    /// [`super::btf_render::MemReader::cross_btf_resolve_fwd`]
    /// can chase a `BTF_KIND_FWD` whose body lives in a sibling
    /// embedded object's BTF. Borrowed slices in the
    /// `(btfs, fwd_index)` pair point into the
    /// [`crate::vmm::cast_analysis_load::CastAnalysisOutput`] the
    /// freeze coordinator holds alive via `Arc` for the dump
    /// pass; `None` (no scheduler binary, or analyzer found no
    /// complete struct/union definitions) keeps the renderer's
    /// "forward declaration; body not in this BTF" skip path
    /// intact.
    pub cross_btf_fwd_index: Option<CrossBtfFwdIndex<'a>>,
}
2334
/// Per-dump cross-BTF Fwd resolution context: every parsed program
/// BTF the cast-analysis pre-pass discovered, plus a name-keyed
/// index over the complete (`!is_fwd`) struct/union definitions
/// across them.
///
/// Built once at the freeze-coordinator side from
/// [`crate::vmm::cast_analysis_load::CastAnalysisOutput`] and
/// threaded through [`DumpContext::cross_btf_fwd_index`] into
/// every per-map `AccessorMemReader`. The renderer's
/// [`super::btf_render::MemReader::cross_btf_resolve_fwd`]
/// override range-looks up the hit and returns a
/// [`super::btf_render::CrossBtfRef`] whose `btf` borrow points at
/// the matching `Arc<Btf>` inside `btfs`.
///
/// Empty `btfs` / empty `fwd_index` are valid (no scheduler binary,
/// or analyzer found no Struct/Union definitions); the bridge stays
/// dormant and the chase falls through to the legacy
/// "forward declaration" skip path.
///
/// Both fields are borrows with lifetime `'a`; this struct owns
/// nothing and is only valid while the backing analysis output is
/// kept alive.
pub struct CrossBtfFwdIndex<'a> {
    /// Every parsed program BTF in the order
    /// `crate::vmm::cast_analysis_load::iter_embedded_bpf_objects`
    /// yielded the embedded objects. Index 0 is the first object's
    /// BTF, etc. Empty when the scheduler binary had no parseable
    /// `.bpf.objs`. Borrowed from the
    /// [`crate::vmm::cast_analysis_load::CastAnalysisOutput`] held
    /// alive by the freeze coordinator's `Arc` for the dump pass.
    pub btfs: &'a [std::sync::Arc<Btf>],
    /// `name -> FwdIndexEntry` over every complete struct/union
    /// across `btfs`. See
    /// [`crate::vmm::cast_analysis_load::CastAnalysisOutput::fwd_index`]
    /// for the construction policy (first-write-wins on duplicate
    /// names, anonymous types skipped).
    pub fwd_index:
        &'a std::collections::HashMap<String, crate::vmm::cast_analysis_load::FwdIndexEntry>,
}
2370
// NOTE(review): the paragraph below documents an `ScxSchedState`
// reconstruction helper, not the per-CPU counter decoder that follows
// it. It was left attached to the wrong item, so it is kept here as a
// plain comment to stop it leaking into that function's rustdoc. Move
// it onto the helper it describes, or drop it if that helper no
// longer exists.
//
// Reconstruct an `ScxSchedState` from the probe BPF program's
// `.bss` snapshot (`ktstr_exit_*` vars).
//
// Used as a fallback by [`dump_state`] when
// [`super::scx_walker::read_scx_sched_state`] returned `None`
// because `*scx_root == 0` at freeze time. The probe's tp_btf
// handler captured the same scalars BEFORE the kernel teardown
// nulled `scx_root`, so this path produces a coherent view of
// what the scheduler looked like AT THE INSTANT IT ERRORED OUT —
// which is exactly the state an operator wants to debug.
//
// Returns `None` when:
// - the probe `.bss` map isn't loaded yet (boot-race window),
// - the probe's program BTF can't be parsed,
// - the snapshot's `ktstr_exit_kind_snap` is still 0 (latch
//   never fired this run, so the snapshot is empty defaults),
// - or any individual var lookup / read fails wholesale.
//
// Variable names match the probe BPF declarations one-for-one
// (`ktstr_exit_aborting`, `ktstr_exit_bypass_depth`,
// `ktstr_exit_kind_snap`, `ktstr_exit_sched_kva`,
// `ktstr_exit_watchdog_timeout`); each is resolved by name via the
// program-BTF Datasec walk so a future addition / reorder of `.bss`
// vars does not silently misalign offsets.
2395/// Decode the probe BPF program's per-CPU counter array
2396/// (`ktstr_pcpu_counters`) and sum each slot across CPUs.
2397///
2398/// The probe declares `pcpu_counter ktstr_pcpu_counters[MAX_CPUS]
2399/// [KTSTR_PCPU_NR]` in `.bss`; each `pcpu_counter` is a single
2400/// `long` field forced to 128-byte alignment, so each per-CPU slot
2401/// occupies its own cacheline. The host walks each
2402/// `(cpu, slot)` 8-byte slice and sums into a [`ProbeBssCounters`]
2403/// — see the BPF source for the
2404/// `ktstr_pcpu_inc(KTSTR_PCPU_<NAME>)` fire sites.
2405///
2406/// Returns `None` when:
2407/// - the probe `.bss` map isn't loaded yet (boot-race window),
2408/// - the probe's program BTF can't be parsed,
2409/// - the BTF doesn't carry a `ktstr_pcpu_counters` var (probe
2410/// build that pre-dates the per-CPU conversion), or
2411/// - the array's bytes can't be read wholesale.
2412///
2413/// All values use `u64` for wire compatibility; the underlying
2414/// kernel `long` is signed but every fire site only ever
2415/// increments, so a positive cumulative count is the only outcome
2416/// in practice. Negative reads (would indicate guest-memory
2417/// corruption) saturate to 0 via `as u64`.
2418fn decode_probe_counters_snapshot(
2419 accessor: &GuestMemMapAccessor<'_>,
2420 base_btf: &Btf,
2421) -> Option<ProbeBssCounters> {
2422 use super::bpf_map::BpfMapAccessor;
2423
2424 // Slot indices must match `enum ktstr_pcpu_idx` in
2425 // src/bpf/probe.bpf.c. A reorder in the BPF source breaks
2426 // every reader; the explicit constants here keep the slot
2427 // mapping localized and reviewable.
2428 const PCPU_PROBE_COUNT: usize = 0;
2429 const PCPU_KPROBE_RETURNS: usize = 1;
2430 const PCPU_META_MISS: usize = 2;
2431 const PCPU_RINGBUF_DROPS: usize = 3;
2432 const PCPU_TIMELINE_COUNT: usize = 4;
2433 const PCPU_TIMELINE_DROPS: usize = 5;
2434 const PCPU_PI_COUNT: usize = 6;
2435 const PCPU_PI_ORPHAN_FEXITS: usize = 7;
2436 const PCPU_PI_CLASS_CHANGE_COUNT: usize = 8;
2437 const PCPU_PI_DROPS: usize = 9;
2438 const PCPU_LOCK_CONTEND_COUNT: usize = 10;
2439 const PCPU_LOCK_CONTEND_DROPS: usize = 11;
2440 const PCPU_PREEMPT_DISABLE_COUNT: usize = 12;
2441 const PCPU_PREEMPT_ENABLE_COUNT: usize = 13;
2442 const PCPU_TRIGGER_COUNT: usize = 14;
2443 const PCPU_NR: usize = 15;
2444 /// Per-CPU slot stride in bytes — `pcpu_counter` is forced to
2445 /// 128-byte alignment in the BPF source so each slot occupies
2446 /// one cacheline. Mirroring the alignment here keeps the
2447 /// host-side walk in lockstep with the BPF storage layout;
2448 /// any future change to the alignment must update both.
2449 const PCPU_SLOT_STRIDE: usize = 128;
2450 /// Per-CPU dimension. Matches `MAX_CPUS` in `src/bpf/probe.bpf.c`
2451 /// (CPU_MASK + 1 = 256). Walking every CPU slot is cheap (256
2452 /// CPUs × 15 slots × 8 bytes = 30 KB of reads); slots beyond
2453 /// the actual `nr_cpus` are zero-init `.bss` and contribute
2454 /// nothing to the sum.
2455 const MAX_CPUS: usize = 256;
2456
2457 // Locate the probe's `.bss` map. Same suffix the freeze
2458 // coordinator's lazy-discovery path uses (matched by suffix
2459 // to avoid colliding with a scheduler-under-test's own
2460 // `.bss`).
2461 let bss_map = accessor.find_array_map("probe_bp.bss")?;
2462 if bss_map.btf_kva == 0 {
2463 // Probe not yet loaded — accessor enumerated a stub.
2464 return None;
2465 }
2466
2467 // Load the probe's program BTF as split BTF on top of the
2468 // host vmlinux BTF (matches the freeze coordinator's
2469 // load_probe_bss_offset pattern). Failure is silent — the
2470 // dump path stays best-effort and falls through to None so
2471 // the caller leaves `probe_counters` as None rather than
2472 // emitting a misleading partial.
2473 let prog_btf = load_program_btf_kva(accessor, bss_map.btf_kva, base_btf)?;
2474
2475 // Resolve the array's byte offset within the `.bss` Datasec.
2476 // A missing var (e.g. probe build that pre-dates the per-CPU
2477 // conversion) means the snapshot wasn't emitted — bail.
2478 let array_off = super::btf_offsets::resolve_var_offset_in_section(
2479 &prog_btf,
2480 ".bss",
2481 "ktstr_pcpu_counters",
2482 )? as usize;
2483
2484 // Read the entire array as one slab — 256 * 15 * 128 = 480 KiB.
2485 // A single slab read is cheaper than 256 * 17 individual reads
2486 // through the page-walking accessor; the read primitive
2487 // tolerates over-large requests (truncates at the map's
2488 // value_size) so a future MAX_CPUS / PCPU_NR shrink doesn't
2489 // need a coordinated host update.
2490 let total_bytes = MAX_CPUS * PCPU_NR * PCPU_SLOT_STRIDE;
2491 let array_bytes = accessor.read_value(&bss_map, array_off, total_bytes)?;
2492 if array_bytes.len() < total_bytes {
2493 // Short read — the map's value_size bounds were tighter
2494 // than the array's compile-time shape. A future probe
2495 // build that shrinks MAX_CPUS or PCPU_NR is the expected
2496 // case; bail rather than misalign the slot indexing.
2497 return None;
2498 }
2499
2500 // Sum every CPU's slot. Each slot's `long value` lives at
2501 // offset 0 within the cacheline-aligned `pcpu_counter`
2502 // struct, so the per-(cpu, slot) byte offset is
2503 // `(cpu * PCPU_NR + slot) * PCPU_SLOT_STRIDE`.
2504 let sum_slot = |slot: usize| -> u64 {
2505 let mut total: u64 = 0;
2506 for cpu in 0..MAX_CPUS {
2507 let off = (cpu * PCPU_NR + slot) * PCPU_SLOT_STRIDE;
2508 // BPF runs in little-endian byte order on every
2509 // host arch ktstr targets (x86_64, aarch64). A future
2510 // big-endian host would need an arch gate — flagged
2511 // in the probe BPF source's byte-order section.
2512 let mut buf = [0u8; 8];
2513 buf.copy_from_slice(&array_bytes[off..off + 8]);
2514 // The kernel's `long` is signed but counters only
2515 // increment; cast through `i64` then to `u64` to
2516 // saturate any negative value (corruption signal) to 0.
2517 let v = i64::from_le_bytes(buf);
2518 if v > 0 {
2519 total = total.saturating_add(v as u64);
2520 }
2521 }
2522 total
2523 };
2524
2525 Some(ProbeBssCounters {
2526 probe_count: sum_slot(PCPU_PROBE_COUNT),
2527 kprobe_returns: sum_slot(PCPU_KPROBE_RETURNS),
2528 meta_miss: sum_slot(PCPU_META_MISS),
2529 ringbuf_drops: sum_slot(PCPU_RINGBUF_DROPS),
2530 timeline_count: sum_slot(PCPU_TIMELINE_COUNT),
2531 timeline_drops: sum_slot(PCPU_TIMELINE_DROPS),
2532 pi_count: sum_slot(PCPU_PI_COUNT),
2533 pi_orphan_fexits: sum_slot(PCPU_PI_ORPHAN_FEXITS),
2534 pi_class_change_count: sum_slot(PCPU_PI_CLASS_CHANGE_COUNT),
2535 pi_drops: sum_slot(PCPU_PI_DROPS),
2536 lock_contend_count: sum_slot(PCPU_LOCK_CONTEND_COUNT),
2537 lock_contend_drops: sum_slot(PCPU_LOCK_CONTEND_DROPS),
2538 preempt_disable_count: sum_slot(PCPU_PREEMPT_DISABLE_COUNT),
2539 preempt_enable_count: sum_slot(PCPU_PREEMPT_ENABLE_COUNT),
2540 trigger_count: sum_slot(PCPU_TRIGGER_COUNT),
2541 })
2542}
2543
2544fn decode_probe_sched_state_snapshot(
2545 accessor: &GuestMemMapAccessor<'_>,
2546 base_btf: &Btf,
2547) -> Option<super::scx_walker::ScxSchedState> {
2548 use super::bpf_map::BpfMapAccessor;
2549
2550 // Locate the probe's `.bss` map. Same suffix the freeze
2551 // coordinator's lazy-discovery path uses (matched by suffix to
2552 // avoid colliding with a scheduler-under-test's own `.bss`).
2553 let bss_map = accessor.find_array_map("probe_bp.bss")?;
2554 if bss_map.btf_kva == 0 {
2555 // Probe not yet loaded — accessor enumerated a stub. The
2556 // var offsets live in the program BTF the loader hasn't
2557 // attached yet.
2558 return None;
2559 }
2560
2561 // Load the probe's program BTF as a split BTF on top of the
2562 // host vmlinux BTF (matches the freeze coordinator's
2563 // load_probe_bss_offset pattern). Failure is silent — the dump
2564 // path stays best-effort and falls through to None so the
2565 // caller leaves `scx_sched_state` as None rather than emitting
2566 // a misleading partial.
2567 let prog_btf = load_program_btf_kva(accessor, bss_map.btf_kva, base_btf)?;
2568
2569 // Resolve each `ktstr_exit_*` var's byte offset within the
2570 // `.bss` Datasec. A missing var (e.g. probe build that pre-
2571 // dates the snapshot vars) means the snapshot wasn't emitted —
2572 // bail rather than render zero defaults that would alias as
2573 // "scheduler healthy and exited cleanly".
2574 let kind_off = super::btf_offsets::resolve_var_offset_in_section(
2575 &prog_btf,
2576 ".bss",
2577 "ktstr_exit_kind_snap",
2578 )?;
2579 let aborting_off = super::btf_offsets::resolve_var_offset_in_section(
2580 &prog_btf,
2581 ".bss",
2582 "ktstr_exit_aborting",
2583 )?;
2584 let bypass_depth_off = super::btf_offsets::resolve_var_offset_in_section(
2585 &prog_btf,
2586 ".bss",
2587 "ktstr_exit_bypass_depth",
2588 )?;
2589 let sched_kva_off = super::btf_offsets::resolve_var_offset_in_section(
2590 &prog_btf,
2591 ".bss",
2592 "ktstr_exit_sched_kva",
2593 )?;
2594 let watchdog_timeout_off = super::btf_offsets::resolve_var_offset_in_section(
2595 &prog_btf,
2596 ".bss",
2597 "ktstr_exit_watchdog_timeout",
2598 )?;
2599
2600 // Read each var's bytes via the accessor. `.bss` maps have a
2601 // single key (zero) and the value bytes ARE the section bytes,
2602 // so `read_value(map, off, size)` is the read primitive. A
2603 // failed read on any field bails the whole snapshot — partial
2604 // values would mislead the consumer.
2605 let kind_bytes = accessor.read_value(&bss_map, kind_off as usize, 4)?;
2606 let kind = u32::from_le_bytes(kind_bytes.as_slice().try_into().ok()?);
2607
2608 // The snapshot is sticky: `ktstr_exit_kind_snap` stays at 0
2609 // until the BPF tp_btf handler latches an error-class exit. A
2610 // 0 here means the latch never fired — the snapshot vars are
2611 // all at their initial 0/false defaults and the dump should
2612 // honour `*scx_root == 0` as "no scheduler state to surface"
2613 // rather than render a fake healthy-exit ScxSchedState.
2614 if kind == 0 {
2615 return None;
2616 }
2617
2618 let aborting_bytes = accessor.read_value(&bss_map, aborting_off as usize, 1)?;
2619 let aborting = aborting_bytes.first().copied()? != 0;
2620
2621 let bypass_depth_bytes = accessor.read_value(&bss_map, bypass_depth_off as usize, 4)?;
2622 let bypass_depth = i32::from_le_bytes(bypass_depth_bytes.as_slice().try_into().ok()?);
2623
2624 let sched_kva_bytes = accessor.read_value(&bss_map, sched_kva_off as usize, 8)?;
2625 let sched_kva = u64::from_le_bytes(sched_kva_bytes.as_slice().try_into().ok()?);
2626
2627 let watchdog_timeout_bytes = accessor.read_value(&bss_map, watchdog_timeout_off as usize, 8)?;
2628 let watchdog_timeout = u64::from_le_bytes(watchdog_timeout_bytes.as_slice().try_into().ok()?);
2629
2630 Some(super::scx_walker::ScxSchedState {
2631 aborting,
2632 bypass_depth,
2633 exit_kind: kind,
2634 watchdog_timeout: Some(watchdog_timeout),
2635 source: Some(super::scx_walker::SCX_SCHED_STATE_SOURCE_BSS.to_string()),
2636 // `sched_kva == 0` would mean the BPF probe handler ran
2637 // BEFORE `*scx_root` was populated (impossibly early — the
2638 // tp_btf hook is on `sched_ext_exit`, which only fires after
2639 // a sched_ext scheduler attached and ran). Surface it as
2640 // None so the consumer can distinguish "snapshot data exists
2641 // but no slab address" from "snapshot has the address" via
2642 // a single Option rather than a magic-zero check.
2643 sched_kva: if sched_kva == 0 {
2644 None
2645 } else {
2646 Some(sched_kva)
2647 },
2648 })
2649}
2650
2651/// Snapshot every BPF map visible to the host accessor.
2652///
2653/// The dump is best-effort: a map that fails to render lands in the
2654/// report with `error: Some(...)` rather than aborting the whole walk,
2655/// so a single corrupt map can't blind the operator to the rest of
2656/// the scheduler's state.
2657pub fn dump_state(ctx: DumpContext<'_>) -> FailureDumpReport {
2658 let DumpContext {
2659 accessor,
2660 btf,
2661 num_cpus,
2662 arena_offsets,
2663 prog_capture,
2664 cpu_time_capture,
2665 cgroup_psi_capture,
2666 task_enrichment_capture,
2667 event_counter_capture,
2668 scx_walker_capture,
2669 perf_capture,
2670 deadline,
2671 cast_map,
2672 cross_btf_fwd_index,
2673 alloc_size_types,
2674 } = ctx;
2675 let cross_btf_fwd_index_ref = cross_btf_fwd_index.as_ref();
2676 // Wall-clock origin for per-phase elapsed_us tracing and the
2677 // soft-deadline bailout. Each heavy phase compares
2678 // `Instant::now()` against `deadline` AFTER it finishes, so a
2679 // truncation captures the phase's data before short-circuiting
2680 // the remaining ones (consistent with the doc on
2681 // [`DumpContext::deadline`]).
2682 let dump_start = std::time::Instant::now();
2683 // Tracks the elapsed_us of the first phase to observe a deadline
2684 // crossing. Stamped onto [`FailureDumpReport::dump_truncated_at_us`]
2685 // at the end so the operator can attribute missing maps / tasks /
2686 // walker results to truncation rather than walker failure.
2687 let mut truncated_at_us: Option<u64> = None;
2688 // Helper closure: returns `true` once the deadline (if any) has
2689 // been crossed. Sets `truncated_at_us` on the FIRST crossing so
2690 // the report records WHERE truncation began, not the last phase
2691 // to short-circuit. Idempotent on repeated calls — once stamped,
2692 // every later phase sees the same elapsed_us.
2693 let deadline_exceeded = |truncated_at: &mut Option<u64>| -> bool {
2694 if let Some(deadline) = deadline {
2695 let now = std::time::Instant::now();
2696 if now > deadline {
2697 if truncated_at.is_none() {
2698 let elapsed_us = dump_start.elapsed().as_micros() as u64;
2699 *truncated_at = Some(elapsed_us);
2700 tracing::warn!(
2701 elapsed_us,
2702 "dump_state: deadline exceeded, truncating remaining phases"
2703 );
2704 }
2705 return true;
2706 }
2707 }
2708 false
2709 };
2710 let maps = accessor.maps();
2711 let (prog_runtime_stats, prog_runtime_stats_unavailable) = match prog_capture {
2712 Some(cap) => {
2713 let stats = cap.accessor.struct_ops_runtime_stats(cap.per_cpu_offsets);
2714 let reason = if stats.is_empty() {
2715 Some(REASON_NO_STRUCT_OPS_LOADED.to_string())
2716 } else {
2717 None
2718 };
2719 (stats, reason)
2720 }
2721 None => (
2722 Vec::new(),
2723 Some(REASON_PROG_ACCESSOR_UNAVAILABLE.to_string()),
2724 ),
2725 };
2726 let per_cpu_time = match cpu_time_capture {
2727 Some(cap) => collect_per_cpu_time(cap),
2728 None => Vec::new(),
2729 };
2730 // Per-cgroup PSI-irq for the test's workload leaves (Phase A). Empty
2731 // when no capture was supplied, the workload root isn't present, or
2732 // psi_cgroups is off — loud-absent.
2733 let cgroup_psi = match cgroup_psi_capture {
2734 Some(cap) => collect_cgroup_psi(cap),
2735 None => Vec::new(),
2736 };
2737 let task_enrichment_t0 = std::time::Instant::now();
2738 let (task_enrichments, task_enrichments_unavailable) = match task_enrichment_capture {
2739 Some(cap) => {
2740 // Cap iteration AND Vec capacity at MAX_ENRICHED_TASKS so
2741 // a hostile guest with a corrupt or absurdly long
2742 // runnable_list can't drag the freeze window into the
2743 // tens-of-seconds range.
2744 let total = cap.tasks.len();
2745 let cap_n = total.min(MAX_ENRICHED_TASKS);
2746 let mut enrichments = Vec::with_capacity(cap_n);
2747 for entry in cap.tasks.iter().take(cap_n) {
2748 if let Some(e) = super::task_enrichment::walk_task_enrichment(
2749 cap.kernel,
2750 entry.task_kva,
2751 cap.offsets,
2752 cap.sched_classes,
2753 cap.lock_slowpaths,
2754 entry.is_runnable_in_scx,
2755 entry.running_pc,
2756 ) {
2757 enrichments.push(e);
2758 }
2759 }
2760 if total > cap_n {
2761 tracing::warn!(
2762 cap = MAX_ENRICHED_TASKS,
2763 total,
2764 "dump_state task_enrichment: capped at MAX_ENRICHED_TASKS, dropping tail"
2765 );
2766 }
2767 let reason = if enrichments.is_empty() {
2768 tracing::debug!(
2769 tasks_count = total,
2770 "dump_state task_enrichment: walker yielded zero entries — \
2771 scx_tasks list and rq->scx.runnable_list both empty, or every \
2772 walk_task_enrichment call returned None (translate failures)",
2773 );
2774 Some(REASON_TASK_WALKER_ZERO_TASKS.to_string())
2775 } else {
2776 None
2777 };
2778 (enrichments, reason)
2779 }
2780 None => {
2781 tracing::debug!(
2782 "dump_state task_enrichment: capture is None — \
2783 freeze coordinator passed no TaskEnrichmentCapture \
2784 (scx_owned, scx_walker_offsets, or task_enrichment_offsets unresolved)",
2785 );
2786 (Vec::new(), Some(REASON_NO_TASK_WALKER.to_string()))
2787 }
2788 };
2789 tracing::debug!(
2790 elapsed_us = task_enrichment_t0.elapsed().as_micros() as u64,
2791 enriched = task_enrichments.len(),
2792 "dump_state phase: walk_task_enrichment"
2793 );
2794 deadline_exceeded(&mut truncated_at_us);
2795 let event_counter_timeline = match event_counter_capture {
2796 Some(cap) => cap
2797 .samples
2798 .iter()
2799 .filter_map(EventCounterSample::from_monitor_sample)
2800 .collect(),
2801 None => Vec::new(),
2802 };
2803 let (rq_scx_states, dsq_states, scx_sched_state, scx_walker_unavailable) =
2804 match scx_walker_capture {
2805 Some(cap) => {
2806 // Sub-group offsets resolved per kernel struct;
2807 // surface the absent groups in the diagnostic so a
2808 // partial walk announces which passes were skipped.
2809 let missing = cap.offsets.missing_groups();
2810
2811 // 1. Read scalar scx_sched state and recover the
2812 // sched_pa for the sched-rooted DSQ walker passes.
2813 // `sched_state` is None when the BTF lacked the
2814 // `sched` sub-group OR when *scx_root == 0
2815 // (no scheduler attached) — both surface as a
2816 // None scx_sched_state in the report. The
2817 // distinction is encoded in `scx_walker_unavailable`
2818 // via REASON_SCX_ROOT_NULL.
2819 let (sched_pa_opt, sched_state) = match super::scx_walker::read_scx_sched_state(
2820 cap.kernel,
2821 cap.scx_root_kva,
2822 cap.offsets,
2823 ) {
2824 Some((sched_kva, state)) => {
2825 // Translate sched_kva → PA (slab/vmalloc; use
2826 // translate_any_kva via the GuestKernel handle).
2827 let mem = cap.kernel.mem();
2828 let walk = cap.kernel.walk_context();
2829 let pa = super::idr::translate_any_kva(
2830 mem,
2831 walk.cr3_pa,
2832 walk.page_offset,
2833 sched_kva,
2834 walk.l5,
2835 walk.tcr_el1,
2836 );
2837 (pa, Some(state))
2838 }
2839 None => {
2840 // Live read failed — `*scx_root == 0` because
2841 // the scheduler has already torn down by
2842 // freeze time. Fall back to the BPF .bss
2843 // snapshot the probe's tp_btf handler latched
2844 // at err-exit time. The snapshot is the
2845 // strict subset of scheduler state the host
2846 // renderer needs; the sched_pa stays None
2847 // because the slab page that backed the live
2848 // `scx_sched` was freed during teardown and
2849 // the sched-rooted DSQ passes (per-node
2850 // global, user dsq_hash) cannot reach it any
2851 // longer. The caller's `unavail` selector
2852 // below now sees `Some(state)` and skips
2853 // REASON_SCX_ROOT_NULL — the consumer reads
2854 // `state.source = "bss_snapshot"` to
2855 // distinguish snapshot from live.
2856 let snap = decode_probe_sched_state_snapshot(accessor, btf);
2857 if snap.is_some() {
2858 tracing::debug!(
2859 scx_root_kva = format_args!("{:#x}", cap.scx_root_kva),
2860 "dump_state scx walker: live read returned None; \
2861 BPF .bss snapshot fallback populated scx_sched_state \
2862 (scheduler torn down before freeze, snapshot \
2863 captured at err-exit instant)",
2864 );
2865 }
2866 (None, snap)
2867 }
2868 };
2869
2870 // 2. Per-CPU rq->scx walk. Per-CPU runs only when the
2871 // rq + scx_rq + task sub-groups are present;
2872 // walk_rq_scx returns None to skip otherwise.
2873 let walk_rq_scx_t0 = std::time::Instant::now();
2874 let mut rq_states = Vec::with_capacity(cap.rq_kvas.len());
2875 if !deadline_exceeded(&mut truncated_at_us) {
2876 for (cpu, (&rq_kva, &rq_pa)) in
2877 cap.rq_kvas.iter().zip(cap.rq_pas.iter()).enumerate()
2878 {
2879 if let Some((state, _entries)) = super::scx_walker::walk_rq_scx(
2880 cap.kernel,
2881 cpu as u32,
2882 rq_kva,
2883 rq_pa,
2884 cap.offsets,
2885 ) {
2886 rq_states.push(state);
2887 }
2888 }
2889 }
2890 tracing::debug!(
2891 elapsed_us = walk_rq_scx_t0.elapsed().as_micros() as u64,
2892 cpus = cap.rq_kvas.len(),
2893 rq_states = rq_states.len(),
2894 "dump_state phase: walk_rq_scx"
2895 );
2896
2897 // 3. Per-CPU local DSQ walk runs unconditionally —
2898 // `rq->scx.local_dsq` is initialized at boot
2899 // (init_dsq from kernel/sched/ext.c:4581 for every
2900 // possible CPU) and survives scheduler teardown,
2901 // so it produces data even when *scx_root is NULL.
2902 // This is the data source that survives
2903 // scx_bypass's runnable_list drain
2904 // (kernel/sched/ext.c:5448-5548) during teardown.
2905 let walk_local_dsqs_t0 = std::time::Instant::now();
2906 let mut dsqs: Vec<super::scx_walker::DsqState> = Vec::new();
2907 if !deadline_exceeded(&mut truncated_at_us)
2908 && let Some((local_states, _entries)) = super::scx_walker::walk_local_dsqs(
2909 cap.kernel,
2910 cap.rq_kvas,
2911 cap.rq_pas,
2912 cap.per_cpu_offsets,
2913 cap.offsets,
2914 )
2915 {
2916 dsqs.extend(local_states);
2917 }
2918 tracing::debug!(
2919 elapsed_us = walk_local_dsqs_t0.elapsed().as_micros() as u64,
2920 local_dsqs = dsqs.len(),
2921 "dump_state phase: walk_local_dsqs"
2922 );
2923
2924 // 4. Sched-rooted DSQ passes (per-CPU bypass, per-node
2925 // global, user dsq_hash) require the sched_pa we
2926 // resolved in step 1. Without it, no scheduler is
2927 // attached and these DSQs don't exist at all.
2928 let walk_dsqs_t0 = std::time::Instant::now();
2929 if !deadline_exceeded(&mut truncated_at_us)
2930 && let Some(sched_pa) = sched_pa_opt
2931 {
2932 let (sched_states, _entries) = super::scx_walker::walk_dsqs(
2933 cap.kernel,
2934 sched_pa,
2935 cap.per_cpu_offsets,
2936 cap.nr_nodes,
2937 cap.offsets,
2938 );
2939 dsqs.extend(sched_states);
2940 }
2941 tracing::debug!(
2942 elapsed_us = walk_dsqs_t0.elapsed().as_micros() as u64,
2943 total_dsqs = dsqs.len(),
2944 "dump_state phase: walk_dsqs"
2945 );
2946
2947 // Diagnostic priority:
2948 // 1. Partial-degradation (sub-group(s) missing) —
2949 // announces exactly which passes were skipped.
2950 // 2. *scx_root is NULL — sched/bypass/global/user
2951 // passes blinded but rq->scx + local DSQ still
2952 // work; surface this distinct reason so the
2953 // operator knows the scheduler isn't attached.
2954 // 3. Walker reached no state at all — typical when
2955 // every read fails.
2956 // 4. None — every pass had data to surface.
2957 let unavail = if !missing.is_empty() {
2958 tracing::debug!(
2959 missing_groups = ?missing,
2960 rq_states_count = rq_states.len(),
2961 dsq_count = dsqs.len(),
2962 sched_state_some = sched_state.is_some(),
2963 "dump_state scx walker: partial degradation — missing BTF sub-groups",
2964 );
2965 Some(format!(
2966 "scx walker partial: missing offset groups [{}]",
2967 missing.join(", ")
2968 ))
2969 } else if sched_state.is_none() {
2970 tracing::debug!(
2971 scx_root_kva = format_args!("{:#x}", cap.scx_root_kva),
2972 rq_states_count = rq_states.len(),
2973 dsq_count = dsqs.len(),
2974 "dump_state scx walker: scx_root is NULL — no scheduler attached; \
2975 rq->scx and local DSQ captures populated, sched/bypass/global/user passes blinded",
2976 );
2977 Some(REASON_SCX_ROOT_NULL.to_string())
2978 } else if rq_states.is_empty() && dsqs.is_empty() {
2979 tracing::debug!(
2980 scx_root_kva = format_args!("{:#x}", cap.scx_root_kva),
2981 "dump_state scx walker: every walker read failed — no rq->scx, no DSQ, but sched_state present",
2982 );
2983 Some(REASON_SCX_WALKER_NO_STATE.to_string())
2984 } else {
2985 None
2986 };
2987 (rq_states, dsqs, sched_state, unavail)
2988 }
2989 None => {
2990 tracing::debug!(
2991 "dump_state scx walker: capture is None — \
2992 freeze coordinator passed no ScxWalkerCapture (offsets/symbols/per_cpu_offsets unresolved)",
2993 );
2994 (
2995 Vec::new(),
2996 Vec::new(),
2997 None,
2998 Some(REASON_NO_SCX_WALKER.to_string()),
2999 )
3000 }
3001 };
3002 // Freeze-time per-vCPU perf-counter snapshot. With `exclude_host=1`
3003 // each counter ticks only during guest execution; the freeze
3004 // coordinator has parked every vCPU before reaching this site, so
3005 // the read returns the cumulative count at the last guest exit
3006 // for each vCPU. A single per-vCPU read failure is recorded as
3007 // `None` for that entry; a failure on one vCPU does not blank the
3008 // others. When `perf_capture` is None the vec stays empty (the
3009 // host lacked perf, or `perf_event_open` failed at run start).
3010 let vcpu_perf_at_freeze: Vec<Option<super::perf_counters::VcpuPerfSample>> = match perf_capture
3011 {
3012 Some(cap) => cap.per_vcpu.iter().map(|p| p.read().ok()).collect(),
3013 None => Vec::new(),
3014 };
3015
3016 // Snapshot the probe's per-CPU diagnostic counters before the
3017 // per-map render loop walks `.bss` itself — the read goes
3018 // through the same `read_value` path the renderer uses, but
3019 // captures the array as a structured `ProbeBssCounters` rather
3020 // than the BTF Datasec render. Best-effort: a None result
3021 // (probe not loaded, BTF missing the var) leaves the report's
3022 // `probe_counters` empty and the existing `.bss` map render
3023 // still surfaces the raw bytes.
3024 let probe_counters = decode_probe_counters_snapshot(accessor, btf);
3025
3026 // Resolve the active-scheduler obj name. The prog accessor +
3027 // BpfMapOffsets pair powers the target-free `prog_idr` walker
3028 // (the primary path; needs no `scx_root`, so it works on
3029 // pre-6.16 kernels too) -- when absent, the helper degrades to the
3030 // prefix-grouping fallback over struct_ops map names.
3031 let prog_walker = prog_capture.map(|cap| {
3032 (
3033 cap.accessor as &dyn super::bpf_prog::BpfProgAccessor,
3034 accessor.offsets(),
3035 )
3036 });
3037 let (active_obj_name, active_map_kvas) =
3038 match identify_active_obj_from_struct_ops(&maps, prog_walker) {
3039 Some((name, kvas)) => (Some(name), kvas),
3040 None => (None, Vec::new()),
3041 };
3042 let mut report = FailureDumpReport {
3043 schema: SCHEMA_SINGLE.to_string(),
3044 active_map_kvas,
3045 maps: Vec::with_capacity(maps.len()),
3046 vcpu_regs: Vec::new(),
3047 sdt_allocations: Vec::new(),
3048 sdt_alloc_unavailable: None,
3049 prog_runtime_stats,
3050 prog_runtime_stats_unavailable,
3051 per_cpu_time,
3052 cgroup_psi,
3053 // Per-node NUMA wire fields: empty Vec + the well-defined
3054 // diagnostic string until the host-side walker lands.
3055 per_node_numa: Vec::new(),
3056 per_node_numa_unavailable: Some(REASON_NO_NUMA_WALKER.to_string()),
3057 task_enrichments,
3058 task_enrichments_unavailable,
3059 event_counter_timeline,
3060 rq_scx_states,
3061 dsq_states,
3062 scx_sched_state,
3063 scx_walker_unavailable,
3064 vcpu_perf_at_freeze,
3065 dump_truncated_at_us: None,
3066 maps_truncated: 0,
3067 probe_counters,
3068 scx_static_ranges: Default::default(),
3069 is_placeholder: false,
3070 active_obj_name,
3071 };
3072
3073 // Per-map program-BTF cache, keyed by `btf_kva`. Each unique
3074 // `struct btf *` lives in the kernel BTF IDR — multiple maps from
3075 // the same BPF program point at the same KVA, so caching dedupes
3076 // the heavy `Btf::from_bytes`/`from_split_bytes` parse across them
3077 // (a scheduler with N maps backed by one BPF object pays one
3078 // parse, not N). Lookups go through this cache before falling
3079 // back to the caller-supplied vmlinux `btf`.
3080 //
3081 // Populated by an explicit pre-pass below so the sdt_alloc walk
3082 // can read it before the per-map render loop runs (the renderer
3083 // needs the resulting allocator metadata via `RenderMapCtx`).
3084 let mut program_btfs: std::collections::HashMap<u64, Btf> = std::collections::HashMap::new();
3085
3086 // Pre-pass: locate the first non-internal `BPF_MAP_TYPE_ARENA`
3087 // map (skipping the same ktstr-internal name set the main loop
3088 // skips) and snapshot it once before any map renders. This lets
3089 // the per-map `MemReader` chase `__arena` pointers no matter
3090 // which slot the arena map occupies in the iteration order —
3091 // the previous design ran `snapshot_arena` lazily inside
3092 // `render_map`'s arena arm, so non-arena maps that rendered
3093 // earlier saw `arena_snapshot: None` and silently failed every
3094 // arena pointer chase. `lib/arena_map.h` declares one `__weak`
3095 // arena per BPF object so a single shared snapshot covers every
3096 // `__arena` pointer the scheduler emits; additional arena maps
3097 // (multi-object schedulers, theoretical) still get their own
3098 // snapshot inside `render_map`'s arena arm — they just don't
3099 // contribute to the cross-map pointer-chase context.
3100 let shared_arena_snapshot: Option<(BpfMapInfo, ArenaSnapshot)> =
3101 arena_offsets.and_then(|off| {
3102 for info in &maps {
3103 let name = info.name();
3104 if name.starts_with("probe_bp.")
3105 || name.starts_with("fentry_p.")
3106 || name == "probe_bp"
3107 || name == "fentry_p"
3108 || KTSTR_INTERNAL_MAPS.contains(&name.as_ref())
3109 {
3110 continue;
3111 }
3112 if info.map_type == BPF_MAP_TYPE_ARENA {
3113 let snap = snapshot_arena(accessor.kernel(), info, off);
3114 return Some((info.clone(), snap));
3115 }
3116 }
3117 None
3118 });
3119 let shared_arena_ref: Option<(&ArenaSnapshot, u64)> = shared_arena_snapshot
3120 .as_ref()
3121 .map(|(info, snap)| (snap, info.map_kva));
3122
3123 // Cache `kern_vm_start` from the pre-pass snapshot for the
3124 // sdt_alloc walk. Pulling directly from `shared_arena_snapshot`
3125 // (rather than scraping each rendered map's `arena` field in the
3126 // main loop) keeps the walk gating decoupled from per-map render
3127 // order — the data the walker needs is finalized before the
3128 // loop runs.
3129 let arena_kern_vm_start: u64 = shared_arena_snapshot
3130 .as_ref()
3131 .map(|(_, snap)| snap.kern_vm_start)
3132 .unwrap_or(0);
3133
3134 // Pre-pass: load every non-internal map's program BTF and locate
3135 // the scheduler's `.bss` raw bytes. Both inputs feed the
3136 // sdt_alloc walk below — moving them out of the main render loop
3137 // means the allocator metadata that decoration in the
3138 // TASK_STORAGE arm needs (`elem_size`, `target_type_id`) is
3139 // available BEFORE any map renders, instead of getting derived
3140 // post-loop only to be unusable for per-entry payload chase.
3141 let mut sched_bss_bytes: Option<(Vec<u8>, u64)> = None; // (bytes, btf_kva)
3142 for info in &maps {
3143 let name = info.name();
3144 if name.starts_with("probe_bp.")
3145 || name.starts_with("fentry_p.")
3146 || name == "probe_bp"
3147 || name == "fentry_p"
3148 || KTSTR_INTERNAL_MAPS.contains(&name.as_ref())
3149 {
3150 continue;
3151 }
3152 if info.btf_kva != 0
3153 && !program_btfs.contains_key(&info.btf_kva)
3154 && let Some(loaded) = accessor.load_program_btf(info, btf)
3155 {
3156 program_btfs.insert(info.btf_kva, loaded);
3157 }
3158 if sched_bss_bytes.is_none()
3159 && info.map_type == BPF_MAP_TYPE_ARRAY
3160 && info.btf_kva != 0
3161 && name.ends_with(".bss")
3162 && let Some(bytes) = accessor.read_value(info, 0, info.value_size as usize)
3163 {
3164 sched_bss_bytes = Some((bytes, info.btf_kva));
3165 }
3166 }
3167
3168 // Pre-pass: walk sdt_alloc trees if all prerequisites lined up.
3169 // Runs BEFORE the main render loop so the allocator metadata it
3170 // discovers (`elem_size`, `target_type_id`,
3171 // `data_header_size`) is available to per-map decoration —
3172 // specifically, the TASK_STORAGE arm uses it to expand each
3173 // entry's `struct sdt_data __arena *` pointer into a typed
3174 // payload render via [`render_map`]'s
3175 // [`crate::monitor::dump::render_map::SdtAllocMeta`].
3176 //
3177 // The walk is best-effort and silent: any missing prerequisite
3178 // (no scheduler .bss, no arena window, no program BTF, no
3179 // `scx_allocator` type) leaves `sdt_allocations` empty rather
3180 // than failing the dump. `sdt_alloc_metas` stays empty in the
3181 // same cases, so each per-entry payload field also degrades to
3182 // `None` (the surface struct still renders).
3183 //
3184 // Build the dump-pass arena page index here too — once outside
3185 // the per-map loop so each per-map `mem_reader` borrows the
3186 // existing table instead of rebuilding it. The sdt_alloc walk
3187 // below uses the same index for its own MemReader.
3188 let arena_page_index = crate::monitor::dump::render_map::build_arena_page_index(
3189 shared_arena_snapshot.as_ref().map(|(_, snap)| snap),
3190 );
3191 let sdt_alloc_t0 = std::time::Instant::now();
3192 // Every typed allocator the program declares; the per-map
3193 // selector in [`render_map`] picks the matching entry by name
3194 // (e.g. `scx_task_allocator` matches `scx_task_map`). A
3195 // single-allocator scheduler hits the unique-candidate path —
3196 // every map gets that allocator. A multi-allocator scheduler
3197 // (per-task + per-cgroup) lets each local-storage map render
3198 // its own payload type instead of forcing the renderer to give
3199 // up.
3200 let mut sdt_alloc_metas: Vec<crate::monitor::dump::render_map::SdtAllocMeta> = Vec::new();
3201 // `slot_start → ArenaSlotInfo` lookup populated as each
3202 // allocator walk completes.
3203 // [`MemReader::resolve_arena_type`] consults this index via a
3204 // range lookup: given a chased address, find the slot whose
3205 // `[slot_start, slot_start + elem_size)` range contains it,
3206 // then route on `offset_in_slot`:
3207 //
3208 // - `offset_in_slot == 0` (slot-start pointer, e.g. the
3209 // `data` field of `scx_task_map_val` storing the raw
3210 // `sdt_alloc()` return) → render the payload skipping
3211 // `header_size` bytes of header.
3212 // - `offset_in_slot == header_size` (payload-start pointer,
3213 // e.g. the return of `scx_task_data(p)` cached in
3214 // `cached_taskc_raw`) → render the payload directly.
3215 // - Other in-slot offsets → no resolve; the renderer falls
3216 // back to its existing skip behaviour.
3217 //
3218 // Built incrementally inside the walk loop so the
3219 // per-allocator snapshot moves into `report.sdt_allocations`
3220 // after each iteration without a clone.
3221 //
3222 // [`crate::monitor::sdt_alloc::TreeWalker::emit_leaf`]
3223 // populates each [`SdtAllocEntry::user_addr`] as
3224 // `data_ptr & 0xFFFF_FFFF` — the slot-START address windowed
3225 // to the low 32 bits. The index keys directly on this
3226 // windowed slot start, paired with an [`ArenaSlotInfo`] that
3227 // carries `elem_size`, `header_size`, and the payload BTF
3228 // type id so the [`MemReader::resolve_arena_type`] range
3229 // lookup has every value it needs to decide the chase shape.
3230 //
3231 // Slot non-overlap invariant: the kernel allocator places
3232 // slots back-to-back inside one `sdt_chunk` and never re-uses
3233 // a position while the bitmap still has it marked allocated
3234 // (see `lib/sdt_alloc.bpf.c::scx_alloc_internal`'s
3235 // bitmap-then-data ordering). Two distinct slots cannot have
3236 // overlapping `[start, start + elem_size)` ranges, so
3237 // dedup-on-exact-key here is sufficient — we cannot land on a
3238 // case where `slot_a + elem_a > slot_b > slot_a` with
3239 // `slot_b` separately keyed.
3240 //
3241 // Duplicates (two slots reporting the same slot start,
3242 // indicating a stale snapshot from a freed allocation racing
3243 // with the freeze) keep the FIRST entry; this matches the
3244 // [`build_arena_page_index`] policy on duplicate user_addr
3245 // pages and emits a `tracing::warn!` line so an operator
3246 // diagnosing a wrong-render can spot the collision.
3247 let mut arena_slot_index = crate::monitor::dump::render_map::ArenaSlotIndex::new();
3248 // 4 GiB-alignment invariant: the bridge keys on the low 32
3249 // bits of slot start. That is correct iff `user_vm_start` is
3250 // 4 GiB-aligned — `slot_full_addr - slot_low32 == user_vm_start`
3251 // and the renderer reconstructs full addresses by masking
3252 // chased values with `0xFFFF_FFFF`. Every in-tree scx scheduler
3253 // sets `map_extra` to a 4 GiB-aligned value (`1 << 32`,
3254 // `1 << 44`); the kernel auto-pick path in
3255 // `bpf_arena_map_alloc` (kernel/bpf/arena.c) rounds the user
3256 // VM area up to `SZ_4G` before mounting. The kernel does
3257 // accept arbitrary `map_extra` from userspace, so an
3258 // out-of-tree scheduler could in theory pass an unaligned
3259 // value — surface a warning and skip the index build rather
3260 // than silently misroute every chase.
3261 let user_vm_aligned = shared_arena_snapshot
3262 .as_ref()
3263 .map(|(_, snap)| snap.user_vm_start & 0xFFFF_FFFF == 0)
3264 .unwrap_or(false);
3265 if !user_vm_aligned && let Some((_, snap)) = shared_arena_snapshot.as_ref() {
3266 tracing::warn!(
3267 user_vm_start = format_args!("{:#x}", snap.user_vm_start),
3268 "sdt_alloc bridge skipped: user_vm_start is not 4 GiB-aligned; \
3269 low-32 keying would misroute every chase",
3270 );
3271 }
3272 // `user_vm_start == 0` is technically 4 GiB-aligned (and the
3273 // gate above accepts it), but the
3274 // [`super::dump::render_map::is_arena_addr_in_snapshot`] helper
3275 // rejects every address when `user_vm_start == 0` — silently
3276 // disabling the bridge. Surface a warn so an operator
3277 // diagnosing missing typed-pointer renders sees the cause
3278 // (likely a snapshot capture failure that produced an
3279 // uninitialized arena VM start).
3280 if let Some((_, snap)) = shared_arena_snapshot.as_ref()
3281 && snap.user_vm_start == 0
3282 {
3283 tracing::warn!(
3284 "sdt_alloc bridge effectively disabled: user_vm_start == 0 \
3285 (snapshot capture may have failed before resolving the \
3286 arena's user VM window); every chase will skip with \
3287 `is_arena_addr` = false",
3288 );
3289 }
3290 // Resolve the unavailable reason in the order the gate checks
3291 // run. The first failing prerequisite wins — subsequent reasons
3292 // (which would all surface for the same missing prerequisite) are
3293 // suppressed. `None` here means the pre-pass actually runs; the
3294 // sdt_alloc_unavailable field is then populated post-loop based
3295 // on whether any allocator was discovered.
3296 let sdt_alloc_skip_reason: Option<&'static str> = if deadline_exceeded(&mut truncated_at_us) {
3297 Some(REASON_SDT_ALLOC_DEADLINE_EXCEEDED)
3298 } else if !user_vm_aligned {
3299 Some(REASON_SDT_ALLOC_UNALIGNED_USER_VM)
3300 } else if sched_bss_bytes.is_none() {
3301 Some(REASON_SDT_ALLOC_NO_BSS)
3302 } else if arena_kern_vm_start == 0 {
3303 Some(REASON_SDT_ALLOC_NO_ARENA)
3304 } else if let Some((_, btf_kva)) = sched_bss_bytes.as_ref()
3305 && !program_btfs.contains_key(btf_kva)
3306 {
3307 Some(REASON_SDT_ALLOC_NO_BSS)
3308 } else if let Some((_, btf_kva)) = sched_bss_bytes.as_ref()
3309 && let Some(prog_btf) = program_btfs.get(btf_kva)
3310 && SdtAllocOffsets::from_btf(prog_btf).is_err()
3311 {
3312 Some(REASON_SDT_ALLOC_NO_TYPE)
3313 } else {
3314 None
3315 };
3316 if let Some(reason) = sdt_alloc_skip_reason {
3317 report.sdt_alloc_unavailable = Some(reason.to_string());
3318 }
3319 // Track whether the pre-pass body ran (every prerequisite
3320 // satisfied). Distinct from `sdt_alloc_skip_reason`: if the
3321 // body runs but discovers no `.bss` instance of
3322 // `struct scx_allocator`, the unavailable reason flips to
3323 // [`REASON_SDT_ALLOC_NO_INSTANCE`] AFTER the loop.
3324 let mut sdt_alloc_pre_pass_ran = false;
3325 if !deadline_exceeded(&mut truncated_at_us)
3326 && user_vm_aligned
3327 && let Some((bss_bytes, btf_kva)) = sched_bss_bytes
3328 && arena_kern_vm_start != 0
3329 && let Some(prog_btf) = program_btfs.get(&btf_kva)
3330 && let Ok(sdt_offsets) = SdtAllocOffsets::from_btf(prog_btf)
3331 {
3332 sdt_alloc_pre_pass_ran = true;
3333 // One MemReader for every leaf payload render, so an
3334 // arena pointer embedded in a per-task / per-cgroup
3335 // sdt_alloc payload chases into typed contents instead
3336 // of opaque hex.
3337 //
3338 // The arena type index is intentionally `None` on this
3339 // pre-pass reader: the walk produces the entries the
3340 // index is built from, so the index does not yet exist
3341 // when the leaf payload renders run. A nested `__arena
3342 // *` pointer inside a payload that targets a separate
3343 // allocator slot whose payload type is forward-declared
3344 // in the program BTF degrades to the existing chase
3345 // behaviour during the pre-pass; the index is wired
3346 // into the per-map renders below where the typical
3347 // bridge call site lives (TASK_STORAGE / HASH maps
3348 // holding `struct sdt_data __arena *` entry pointers).
3349 let sdt_mem = accessor.mem_reader(
3350 shared_arena_snapshot.as_ref().map(|(_, snap)| snap),
3351 &arena_page_index,
3352 num_cpus,
3353 // Threaded in from [`DumpContext::cast_map`]: same
3354 // cast-analysis output the per-map renderer below
3355 // consumes. Letting the sdt_alloc pre-pass see it
3356 // means typed-allocator payload chases (per-task /
3357 // per-cgroup contents inside arena) get the same
3358 // `u64` → typed-pointer promotion as the rest of
3359 // the dump, instead of degrading to plain counters
3360 // for fields the analyzer recovered.
3361 cast_map,
3362 None,
3363 // The sdt_alloc pre-pass populates `sdt_alloc_metas`
3364 // itself; the metas slice is empty until this loop
3365 // finishes, so pass an empty slice here. The
3366 // metas-driven fallback only fires for the per-map
3367 // renders below (after this loop has produced every
3368 // allocator's metadata).
3369 &[],
3370 // The sdt_alloc pre-pass reads BTF type metadata for
3371 // the typed allocator payload from the scheduler's
3372 // own program BTF; no cross-BTF Fwd resolution is
3373 // needed here. The per-map renders below pass the
3374 // built `cross_btf_fwd_index` where it matters.
3375 None,
3376 // The sdt_alloc pre-pass runs BEFORE the
3377 // [`crate::monitor::scx_static_alloc`] walker (the
3378 // walks are independent, but the per-allocator leaf
3379 // payload renders here happen during the sdt_alloc
3380 // walk's own loop, so the scx_static index is not yet
3381 // built). Pass `None` so the bridge stays a no-op for
3382 // these pre-pass renders; the per-map renders below
3383 // pass the built `scx_static_index` where it matters.
3384 None,
3385 // The sdt_alloc pre-pass IS the surface that produces
3386 // the rendered-slot set; pass `None` so no dedup gate
3387 // fires while the typed-allocator surface itself is
3388 // being assembled. The per-map renders below pass the
3389 // built set where it matters.
3390 None,
3391 alloc_size_types,
3392 // The sdt_alloc pre-pass walks against the scheduler's
3393 // program BTF identified by `btf_kva` (the same BTF the
3394 // allocator metadata was resolved against — see
3395 // `append_arena_slot_index_for_allocator` below). Pass
3396 // this kva as the requesting-BTF identifier so the
3397 // pre-pass's own leaf renders compare against it
3398 // (currently the index is still being built so the gate
3399 // never fires here, but the threading keeps the contract
3400 // consistent across all `mem_reader` call sites).
3401 btf_kva,
3402 );
3403 // Locate every sdt_alloc allocator instance declared in
3404 // `.bss`. The Datasec walk gives us each variable's name and
3405 // offset; we filter to types matching `struct scx_allocator`
3406 // by re-resolving the var's chained type. A scheduler may
3407 // declare more than one allocator (e.g. one per-task, one
3408 // per-cgroup) so we iterate all of them.
3409 for (var_name, var_offset, var_type_id) in iter_bss_vars_with_type(prog_btf, ".bss") {
3410 // Only walk vars whose type is `struct scx_allocator`.
3411 if !is_scx_allocator_type(prog_btf, var_type_id) {
3412 continue;
3413 }
3414 // Slice the in-bss bytes for one full `struct scx_allocator`.
3415 // The size comes from BTF (resolved into `allocator_size`
3416 // by `SdtAllocOffsets::from_btf`); using the BTF-reported
3417 // size means a future field appended to scx_allocator
3418 // doesn't silently slip past the slice end.
3419 let Some(slice_end) = var_offset.checked_add(sdt_offsets.allocator_size) else {
3420 continue;
3421 };
3422 let slice = match bss_bytes.get(var_offset..slice_end) {
3423 Some(s) => s,
3424 None => continue,
3425 };
3426
3427 // Discover the payload BTF type id from the elem_size
3428 // we'd read in the walker. We do a small read here just
3429 // to drive the heuristic; the walker re-reads it.
3430 let pool_off = sdt_offsets.allocator_pool + sdt_offsets.pool_elem_size;
3431 let elem_size = if pool_off + 8 <= slice.len() {
3432 let mut buf = [0u8; 8];
3433 buf.copy_from_slice(&slice[pool_off..pool_off + 8]);
3434 u64::from_le_bytes(buf)
3435 } else {
3436 0
3437 };
3438 let payload_size =
3439 elem_size.saturating_sub(sdt_offsets.data_header_size as u64) as usize;
3440 // `prog_btf` is split BTF: the scheduler's program types
3441 // layered on the vmlinux base. `discover_payload_btf_id`
3442 // probes only the program section's id range (via
3443 // `Btf::split`), so vmlinux base `*_ctx` structs of the
3444 // same size cannot shadow the scheduler's payload struct.
3445 let choice = discover_payload_btf_id(prog_btf, payload_size, &var_name);
3446
3447 let snap = walk_sdt_allocator(
3448 accessor.kernel(),
3449 arena_kern_vm_start,
3450 slice,
3451 &sdt_offsets,
3452 prog_btf,
3453 choice.target_type_id,
3454 choice.reason.clone(),
3455 var_name.clone(),
3456 &sdt_mem,
3457 );
3458 // Accumulate every allocator with a typed payload AND
3459 // append its live slots to the bridge index. The
3460 // per-map selector (`select_sdt_alloc_meta`) picks the
3461 // right one by matching `var_name` (the .bss symbol —
3462 // e.g. `scx_task_allocator`) against each rendered map's
3463 // name (e.g. `scx_task_map`). Schedulers that declare
3464 // multiple typed allocators no longer lose payload
3465 // expansion — each map renders against the matching
3466 // allocator's payload type. Only allocators with a
3467 // resolved payload type id contribute to the bridge
3468 // index — without a typed payload there is no useful
3469 // BTF id to surface to the renderer, and the index
3470 // would just point every chase at 0 (which the bridge
3471 // gate filters as "no payload type").
3472 if choice.target_type_id != 0 {
3473 sdt_alloc_metas.push(crate::monitor::dump::render_map::SdtAllocMeta {
3474 allocator_name: var_name.clone(),
3475 elem_size,
3476 header_size: sdt_offsets.data_header_size,
3477 target_type_id: choice.target_type_id,
3478 kern_vm_start: arena_kern_vm_start,
3479 });
3480 // Append this allocator's slots to the bridge index.
3481 // The helper handles the size-fits-u32 check, the
3482 // dedup-on-duplicate-slot-start, and the
3483 // `tracing::warn!` collision diagnostic — see
3484 // [`append_arena_slot_index_for_allocator`] for the
3485 // full contract. Bridge gate (`target_type_id
3486 // != 0`) is encoded inside the helper as well; the
3487 // outer guard here is a fast-path bail before we
3488 // even allocate metadata for the allocator.
3489 crate::monitor::dump::render_map::append_arena_slot_index_for_allocator(
3490 &mut arena_slot_index,
3491 &var_name,
3492 choice.target_type_id,
3493 sdt_offsets.data_header_size,
3494 elem_size,
3495 &snap.all_slot_addrs,
3496 // Stamp the slot with the program BTF the
3497 // `target_type_id` was resolved against. The
3498 // per-map renderer's
3499 // [`MemReader::resolve_arena_type`] gate
3500 // compares this against each requesting map's
3501 // `btf_kva` and suppresses the hit on mismatch
3502 // so the BTF id cannot leak into a sibling
3503 // BTF's id space (multi-`.bpf.o` schedulers).
3504 btf_kva,
3505 );
3506 }
3507 // Surface only allocators with a non-empty result OR a
3508 // diagnostic elem_size; an all-zero snapshot from a
3509 // never-initialized allocator is just noise.
3510 if !snap.entries.is_empty() || snap.elem_size != 0 {
3511 report.sdt_allocations.push(snap);
3512 }
3513 }
3514 }
3515 // Post-loop: if the pre-pass ran but discovered nothing, the
3516 // scheduler has program BTF + arena + bss but no `struct
3517 // scx_allocator` declared in `.bss`. Surface a distinct
3518 // diagnostic so an operator can tell the schedule-doesn't-link
3519 // case from the link-but-no-instance case.
3520 if sdt_alloc_pre_pass_ran
3521 && report.sdt_allocations.is_empty()
3522 && report.sdt_alloc_unavailable.is_none()
3523 {
3524 report.sdt_alloc_unavailable = Some(REASON_SDT_ALLOC_NO_INSTANCE.to_string());
3525 }
3526 let arena_slot_index_ref = if arena_slot_index.is_empty() {
3527 None
3528 } else {
3529 Some(&arena_slot_index)
3530 };
3531 // Build the rendered-slot set for arena chase dedup. Keys on
3532 // every slot that is actually rendered under
3533 // `report.sdt_allocations` — typed AND untyped allocators both
3534 // contribute via [`SdtAllocEntry::user_addr`] (emit_leaf records
3535 // an entry on every leaf it visits, regardless of whether
3536 // payload BTF resolution succeeded). Keying on
3537 // `arena_slot_index.keys()` alone would have missed slots from
3538 // allocators whose `target_type_id == 0` — those allocators
3539 // never reach `append_arena_slot_index_for_allocator` (the
3540 // helper short-circuits on a zero target id) yet their slots
3541 // ARE rendered in `report.sdt_allocations`.
3542 //
3543 // Two address keys per entry: the slot start (`user_addr`) and
3544 // the payload start (`user_addr + header_size`). Chase targets
3545 // resolved via `scx_task_data(p)` and similar helpers point at
3546 // the payload, not the slot start; without the payload-start
3547 // key the dedup misses every chase that uses the helper-
3548 // computed pointer.
3549 //
3550 // Slots past [`super::sdt_alloc::MAX_SDT_ALLOC_ENTRIES`] are
3551 // intentionally excluded — they are NOT rendered in
3552 // `snap.entries`, so the per-map renderer must surface their
3553 // payload (otherwise the truncated tail would appear nowhere
3554 // in the dump). The walker's `truncated` flag is the operator-
3555 // visible signal that some slots are only available via the
3556 // per-map render path.
3557 //
3558 // The per-map renderer's [`MemReader::is_already_rendered`]
3559 // consults this set to skip re-rendering the same allocation
3560 // when a TASK_STORAGE / HASH map's value pointer chases back
3561 // into it. Only TYPED allocators (target_type_id != 0) enter
3562 // the set — untyped pre-pass renders (hex fallback) must not
3563 // suppress per-map chases because the cast analyzer's shape
3564 // inference may resolve a concrete target type the heuristic
3565 // missed.
3566 let header_size_by_allocator: std::collections::HashMap<&str, usize> = sdt_alloc_metas
3567 .iter()
3568 .map(|meta| (meta.allocator_name.as_str(), meta.header_size))
3569 .collect();
3570 let rendered_slot_addrs: std::collections::HashSet<u32> = report
3571 .sdt_allocations
3572 .iter()
3573 .filter_map(|snap| {
3574 // Only dedup slots from allocators with a resolved
3575 // payload type. Untyped allocators (target_type_id == 0,
3576 // rendered as hex in the pre-pass) must NOT suppress
3577 // per-map chases — the cast analyzer's shape inference
3578 // may resolve a concrete target type that the pre-pass
3579 // heuristic missed.
3580 let &header_size = header_size_by_allocator.get(snap.allocator_name.as_str())?;
3581 Some(snap.entries.iter().flat_map(move |e| {
3582 let slot_start = e.user_addr as u32;
3583 let payload_start = slot_start.wrapping_add(header_size as u32);
3584 [slot_start, payload_start]
3585 }))
3586 })
3587 .flatten()
3588 .collect();
3589 let rendered_slot_addrs_ref = if rendered_slot_addrs.is_empty() {
3590 None
3591 } else {
3592 Some(&rendered_slot_addrs)
3593 };
3594 tracing::debug!(
3595 elapsed_us = sdt_alloc_t0.elapsed().as_micros() as u64,
3596 allocations = report.sdt_allocations.len(),
3597 index_entries = arena_slot_index.len(),
3598 "dump_state phase: sdt_alloc"
3599 );
3600
3601 // Pre-pass: walk every `scx_static` bump-allocator instance in
3602 // `.bss` and surface its live-allocated range. Distinct from the
3603 // sdt_alloc per-instance allocator walk above:
3604 //
3605 // - sdt_alloc (`struct scx_allocator`) hands out fixed-stride
3606 // slots via a 3-level radix tree with per-slot metadata; the
3607 // walker produces one entry per live slot keyed on slot start.
3608 // - scx_static (`struct scx_static`) is a flat bump allocator
3609 // with no per-slot metadata; the walker produces one entry
3610 // per live region keyed on the region's base address.
3611 //
3612 // The walk runs only when:
3613 // - we have a scheduler `.bss` blob to read from (re-located
3614 // here because the sdt_alloc walk above consumed the
3615 // pre-pass `sched_bss_bytes` Option),
3616 // - we have a program BTF to resolve `struct scx_static`
3617 // against,
3618 // - the program BTF carries `struct scx_static`.
3619 //
3620 // When any prerequisite is missing, the walk leaves
3621 // `report.scx_static_ranges` empty (default) rather than failing
3622 // the dump — schedulers that don't link `lib/sdt_alloc.bpf.c` or
3623 // don't use the static allocator simply skip the walk.
3624 //
3625 // Membership-only: the walker produces an UNTYPED range index.
3626 // Per-allocation type recovery requires a per-call-site type
3627 // hook from cast analysis that does not exist today (see the
3628 // module-level doc in [`crate::monitor::scx_static_alloc`] for
3629 // why). When the renderer's deferred-resolve arena chase lands
3630 // on an address inside an scx_static range, the bridge
3631 // recognises the address as "in scx_static memory" and
3632 // fails closed (returns `None` from `resolve_arena_type`)
3633 // rather than returning a wrong type — the "no invalid data
3634 // made" contract.
3635 let scx_static_t0 = std::time::Instant::now();
3636 if !deadline_exceeded(&mut truncated_at_us)
3637 && let Some((bss_bytes, prog_btf)) = relocate_sched_bss(&maps, accessor, &program_btfs)
3638 && let Ok(scx_static_offsets) =
3639 crate::monitor::scx_static_alloc::ScxStaticOffsets::from_btf(prog_btf)
3640 {
3641 let snap = crate::monitor::scx_static_alloc::walk_scx_static(
3642 &bss_bytes,
3643 &scx_static_offsets,
3644 iter_bss_vars_with_type(prog_btf, ".bss"),
3645 |type_id| is_scx_static_type(prog_btf, type_id),
3646 );
3647 report.scx_static_ranges = snap;
3648 }
3649 let scx_static_index =
3650 crate::monitor::scx_static_alloc::build_scx_static_range_index(&report.scx_static_ranges);
3651 let scx_static_index_ref = if scx_static_index.is_empty() {
3652 None
3653 } else {
3654 Some(&scx_static_index)
3655 };
3656 tracing::debug!(
3657 elapsed_us = scx_static_t0.elapsed().as_micros() as u64,
3658 ranges = report.scx_static_ranges.ranges.len(),
3659 skipped = report.scx_static_ranges.skipped,
3660 index_entries = scx_static_index.len(),
3661 "dump_state phase: scx_static"
3662 );
3663
3664 let render_map_t0 = std::time::Instant::now();
3665 let mut maps_rendered: usize = 0;
3666 let mut maps_truncated: usize = 0;
3667 for info in maps {
3668 // Skip ktstr's own framework maps so the report only shows
3669 // the scheduler-under-test's state. Three distinct shapes
3670 // need filtering:
3671 //
3672 // 1. Global-section maps from the probe skeleton: libbpf
3673 // composes `<obj_name>.<section>` so `probe_bp.bss`,
3674 // `probe_bp.data`, `probe_bp.rodata` all match the
3675 // `probe_bp.` prefix. (`probe_bp` matching the bare obj
3676 // name covers any single-name section the kernel might
3677 // surface, though libbpf today always adds the suffix.)
3678 // 2. Global-section maps from the fentry skeleton, named
3679 // with the `fentry_p.` prefix following the same
3680 // libbpf convention.
3681 // 3. Bare-named maps declared via `SEC(".maps")` in
3682 // src/bpf/probe.bpf.c — these don't get an obj prefix
3683 // because they're not from a global section. The
3684 // explicit denylist [`KTSTR_INTERNAL_MAPS`] enumerates
3685 // them.
3686 //
3687 // A future tighter filter would consult bpf_prog ownership
3688 // (the program-attachment ID list pinned to each map), but
3689 // name-based filtering is enough today and avoids loading
3690 // the full prog_idr walk on the freeze hot path.
3691 {
3692 let info_name = info.name();
3693 if info_name.starts_with("probe_bp.")
3694 || info_name.starts_with("fentry_p.")
3695 || info_name == "probe_bp"
3696 || info_name == "fentry_p"
3697 || KTSTR_INTERNAL_MAPS.contains(&info_name.as_ref())
3698 {
3699 continue;
3700 }
3701 }
3702
3703 // Deadline check before each map render — bigger maps
3704 // (large hashes, arenas) can each take a meaningful slice
3705 // of the freeze window, so we re-check between renders to
3706 // bound the worst case rather than letting one
3707 // straggler push us past the watchdog.
3708 if deadline_exceeded(&mut truncated_at_us) {
3709 maps_truncated += 1;
3710 continue;
3711 }
3712
3713 // Per-map BTF resolution.
3714 //
3715 // The map's `btf_value_type_id` / `btf_key_type_id` index
3716 // the *map's own* BTF, NOT the kernel vmlinux BTF — when
3717 // `btf_kva != 0` the type IDs are program-local and using
3718 // vmlinux BTF with them would resolve to unrelated kernel
3719 // types (or out-of-range nonsense). So:
3720 //
3721 // - `BPF_MAP_TYPE_STRUCT_OPS` → use vmlinux
3722 // BTF. The wrapper struct `bpf_struct_ops_<name>` is
3723 // declared in the kernel's vmlinux BTF and the
3724 // wrapper type id stored on the map (in
3725 // `btf_vmlinux_value_type_id`) indexes vmlinux. Using
3726 // the program BTF here would fail to resolve the
3727 // wrapper.
3728 // - `btf_kva != 0` AND program BTF loaded by pre-pass → use it.
3729 // - `btf_kva != 0` AND program BTF load failed in pre-pass
3730 // → render hex-only (None map_btf), no fallback.
3731 // - `btf_kva == 0` (kernel-builtin map) → use the
3732 // caller-supplied vmlinux BTF; the type IDs (if any)
3733 // genuinely index vmlinux BTF in this case.
3734 let map_btf: Option<&Btf> = if info.map_type == super::bpf_map::BPF_MAP_TYPE_STRUCT_OPS {
3735 Some(btf)
3736 } else if info.btf_kva != 0 {
3737 program_btfs.get(&info.btf_kva)
3738 } else {
3739 Some(btf)
3740 };
3741
3742 let rendered = render_map(
3743 &RenderMapCtx {
3744 accessor,
3745 btf: map_btf,
3746 num_cpus,
3747 arena_offsets,
3748 shared_arena: shared_arena_ref,
3749 arena_page_index: &arena_page_index,
3750 sdt_alloc_metas: &sdt_alloc_metas,
3751 // Threaded in from
3752 // [`DumpContext::cast_map`]: the BPF
3753 // cast-analysis output for the scheduler's
3754 // program object. `Some(&map)` lets the
3755 // renderer promote `u64` fields the analyzer
3756 // flagged into typed-pointer renders via
3757 // [`super::btf_render::MemReader::cast_lookup`];
3758 // `None` keeps every `u64` rendered as a plain
3759 // unsigned counter (the trait default).
3760 cast_map,
3761 // Built from the sdt_alloc pre-pass above:
3762 // `slot_start → ArenaSlotInfo` for every live
3763 // allocator slot. Lets the renderer range-lookup
3764 // the slot a chased arena address falls in and
3765 // recover a `BTF_KIND_FWD` pointee's real
3766 // struct id (plus a `header_skip` byte count)
3767 // via [`MemReader::resolve_arena_type`] — a
3768 // `struct sdt_data __arena *` field (or a `data`
3769 // field caching the raw `sdt_alloc()` return)
3770 // whose pointee body lives in the sdt_alloc
3771 // library's BTF still chases as the typed
3772 // per-task / per-cgroup struct, instead of
3773 // skipping with "forward declaration; body not
3774 // in this BTF". `None` when no allocator with a
3775 // typed payload was discovered.
3776 arena_slot_index: arena_slot_index_ref,
3777 // Threaded in from
3778 // [`DumpContext::cross_btf_fwd_index`]: the
3779 // cross-BTF Fwd resolution context populated by
3780 // the cast-analysis pre-pass over every embedded
3781 // BPF object's BTF. `Some(&idx)` lets the
3782 // renderer chase a `BTF_KIND_FWD` whose body
3783 // lives in a sibling embedded object via
3784 // [`MemReader::cross_btf_resolve_fwd`]. `None`
3785 // keeps the renderer's "forward declaration;
3786 // body not in this BTF" skip path intact.
3787 cross_btf_fwd_index: cross_btf_fwd_index_ref,
3788 // Built from the scx_static pre-pass above:
3789 // `start_low32 → size` for every live
3790 // `scx_static` bump-allocator region. Lets the
3791 // renderer's
3792 // [`MemReader::resolve_arena_type`]
3793 // diagnose-and-skip on a chased arena address
3794 // that lands inside scx_static memory — the
3795 // bridge cannot recover a per-allocation type
3796 // (no per-slot header) so the chase falls through
3797 // to the historical Fwd-skip / cross-BTF
3798 // resolution path. `None` when no live
3799 // `scx_static` instance was discovered.
3800 scx_static_index: scx_static_index_ref,
3801 // Built from the sdt_alloc pre-pass above: the
3802 // low-32 windowed `slot_start` of every allocator
3803 // slot already rendered into
3804 // `report.sdt_allocations`. Lets the renderer's
3805 // [`MemReader::is_already_rendered`] short-circuit
3806 // the arena chase when a TASK_STORAGE / HASH
3807 // map's value pointer lands in a slot the
3808 // typed-allocator surface already shows — no
3809 // duplicate payload in the dump. `None` when no
3810 // allocator pre-pass produced any rendered slot.
3811 rendered_slot_addrs: rendered_slot_addrs_ref,
3812 alloc_size_types,
3813 },
3814 &info,
3815 );
3816
3817 report.maps.push(rendered);
3818 maps_rendered += 1;
3819 }
3820 tracing::debug!(
3821 elapsed_us = render_map_t0.elapsed().as_micros() as u64,
3822 rendered = maps_rendered,
3823 truncated = maps_truncated,
3824 "dump_state phase: per-map render"
3825 );
3826
3827 report.dump_truncated_at_us = truncated_at_us;
3828 report.maps_truncated = maps_truncated as u32;
3829 report
3830}
3831
3832/// Walk every CPU's `kernel_cpustat`, `kernel_stat`, and (under
3833/// NO_HZ) `tick_sched` slots and produce a [`PerCpuTimeStats`]
3834/// vector — one entry per CPU index in `cap.per_cpu_offsets`.
3835///
3836/// Reads against the supplied [`super::reader::GuestMem`]. Each CPU's
3837/// per-CPU base for a symbol `S` is `S + per_cpu_offsets[cpu]`,
3838/// converted to a guest physical offset via
3839/// [`super::symbols::kva_to_pa`] using the supplied `page_offset`
3840/// (the standard direct-mapping translation; per-CPU pages always
3841/// live in the direct mapping).
3842///
3843/// `cpustat[]` is read as 8 contiguous u64s starting at the
3844/// resolved offset (length matches the indices captured —
3845/// CPUTIME_USER through CPUTIME_STEAL — leaving CPUTIME_GUEST /
3846/// CPUTIME_GUEST_NICE / CPUTIME_FORCEIDLE unread; the dump
3847/// surfaces them as zero in the unread slots, which is acceptable
3848/// since they're virt-guest specific or kernel-config gated and
3849/// distinct from the failure-dump narrative). `softirqs[]` reads
3850/// as `NR_SOFTIRQS` u32s, widened to u64 for the report. `irqs_sum`
3851/// is `unsigned long` (read as u64 — 64-bit only kernels are the
3852/// supported configuration). `iowait_sleeptime` is `ktime_t` /
3853/// `s64`; the value is cast to u64 (the kernel never produces
3854/// negative iowait time).
3855fn collect_per_cpu_time(cap: &CpuTimeCapture<'_>) -> Vec<PerCpuTimeStats> {
3856 use super::btf_offsets::{
3857 CPUTIME_IDLE, CPUTIME_IOWAIT, CPUTIME_IRQ, CPUTIME_NICE, CPUTIME_SOFTIRQ, CPUTIME_STEAL,
3858 CPUTIME_SYSTEM, CPUTIME_USER, NR_SOFTIRQS,
3859 };
3860 let mut out = Vec::with_capacity(cap.per_cpu_offsets.len());
3861 for (cpu_idx, &per_cpu_off) in cap.per_cpu_offsets.iter().enumerate() {
3862 let cpu = cpu_idx as u32;
3863
3864 // kernel_cpustat::cpustat[N]: each slot is a u64 in nsec.
3865 // Read CPUTIME_USER through CPUTIME_STEAL (indices 0..=7).
3866 let cpustat_kva =
3867 super::symbols::per_cpu_kva(cap.kernel_cpustat_kva, cap.kaslr_offset, per_cpu_off);
3868 let cpustat_pa = super::symbols::kva_to_pa(cpustat_kva, cap.page_offset);
3869 let cpustat_base = cap.offsets.kernel_cpustat_cpustat;
3870 let read_cpustat = |idx: usize| -> u64 {
3871 // sizeof(u64) == 8.
3872 cap.mem.read_u64(cpustat_pa, cpustat_base + idx * 8)
3873 };
3874 let cpustat_user_ns = read_cpustat(CPUTIME_USER);
3875 let cpustat_nice_ns = read_cpustat(CPUTIME_NICE);
3876 let cpustat_system_ns = read_cpustat(CPUTIME_SYSTEM);
3877 let cpustat_softirq_ns = read_cpustat(CPUTIME_SOFTIRQ);
3878 let cpustat_irq_ns = read_cpustat(CPUTIME_IRQ);
3879 let cpustat_idle_ns = read_cpustat(CPUTIME_IDLE);
3880 let cpustat_iowait_ns = read_cpustat(CPUTIME_IOWAIT);
3881 let cpustat_steal_ns = read_cpustat(CPUTIME_STEAL);
3882
3883 // kernel_stat::softirqs[N]: each slot is a u32 (count).
3884 // Widen to u64 for reporting consistency with cpustat.
3885 let kstat_kva = super::symbols::per_cpu_kva(cap.kstat_kva, cap.kaslr_offset, per_cpu_off);
3886 let kstat_pa = super::symbols::kva_to_pa(kstat_kva, cap.page_offset);
3887 let mut softirqs = [0u64; NR_SOFTIRQS];
3888 for (i, slot) in softirqs.iter_mut().enumerate() {
3889 // sizeof(unsigned int) == 4.
3890 *slot = cap
3891 .mem
3892 .read_u32(kstat_pa, cap.offsets.kstat_softirqs + i * 4) as u64;
3893 }
3894
3895 // kernel_stat::irqs_sum: unsigned long. 64-bit only
3896 // kernels are supported, so read as u64.
3897 let irqs_sum = cap.mem.read_u64(kstat_pa, cap.offsets.kstat_irqs_sum);
3898
3899 // tick_sched::iowait_sleeptime: ktime_t (s64) ns,
3900 // accumulated only under NO_HZ when the CPU enters idle
3901 // with nr_iowait > 0. Skip when the symbol or BTF offset
3902 // is absent.
3903 let iowait_sleeptime_ns = cap
3904 .tick_cpu_sched_kva
3905 .zip(cap.offsets.tick_sched_iowait_sleeptime)
3906 .map(|(tick_sym_kva, off)| {
3907 let kva = super::symbols::per_cpu_kva(tick_sym_kva, cap.kaslr_offset, per_cpu_off);
3908 let pa = super::symbols::kva_to_pa(kva, cap.page_offset);
3909 cap.mem.read_u64(pa, off)
3910 });
3911
3912 out.push(PerCpuTimeStats {
3913 cpu,
3914 cpustat_user_ns,
3915 cpustat_nice_ns,
3916 cpustat_system_ns,
3917 cpustat_softirq_ns,
3918 cpustat_irq_ns,
3919 cpustat_idle_ns,
3920 cpustat_iowait_ns,
3921 cpustat_steal_ns,
3922 softirqs,
3923 irqs_sum,
3924 iowait_sleeptime_ns,
3925 });
3926 }
3927 out
3928}
3929
3930/// Walk the test's workload cgroups and read each leaf's PSI_IRQ_FULL (Phase A).
3931/// Thin adapter over
3932/// [`super::cgroup_walk::collect_workload_cgroup_psi`] that unpacks the
3933/// borrowed [`CgroupPsiCapture`]. Empty when no workload leaf has per-cgroup
3934/// PSI accounting (loud-absent).
3935fn collect_cgroup_psi(cap: &CgroupPsiCapture<'_>) -> Vec<super::cgroup_walk::CgroupPsiStat> {
3936 super::cgroup_walk::collect_workload_cgroup_psi(
3937 cap.mem,
3938 cap.cgroup_offsets,
3939 cap.psi_offsets,
3940 cap.root_cgroup_kva,
3941 cap.root_cgroup_pa,
3942 cap.workload_root_path,
3943 cap.page_offset,
3944 )
3945}
3946
3947/// Walk a Datasec section by name, yielding `(var_name, byte_offset,
3948/// type_id)` for every variable declared in it.
3949///
3950/// Used by [`dump_state`] to enumerate `.bss` variables when looking
3951/// for `scx_allocator` instances. Returns an empty iterator when the
3952/// Datasec doesn't exist or any chained Var resolution fails — the
3953/// caller treats that as "no sdt_alloc state to surface" rather than
3954/// a hard error.
3955fn iter_bss_vars_with_type(btf: &Btf, section_name: &str) -> Vec<(String, usize, u32)> {
3956 use btf_rs::BtfType;
3957 let mut out = Vec::new();
3958 let Ok(candidates) = btf.resolve_types_by_name(section_name) else {
3959 return out;
3960 };
3961 for ty in candidates {
3962 let btf_rs::Type::Datasec(ds) = ty else {
3963 continue;
3964 };
3965 for var_info in &ds.variables {
3966 let Ok(chained) = btf.resolve_chained_type(var_info) else {
3967 continue;
3968 };
3969 let btf_rs::Type::Var(var) = chained else {
3970 continue;
3971 };
3972 let Ok(name) = btf.resolve_name(&var) else {
3973 continue;
3974 };
3975 // The Var's type_id points to the variable's actual
3976 // type (e.g. struct scx_allocator). var_info.offset() is
3977 // the byte offset within the Datasec.
3978 let Some(type_id) = var.get_type_id() else {
3979 continue;
3980 };
3981 out.push((name, var_info.offset() as usize, type_id));
3982 }
3983 }
3984 out
3985}
3986
3987/// True iff `type_id` resolves to a struct named `scx_allocator`,
3988/// stripping the BTF modifier chain en route. The five modifier
3989/// kinds the loop unwraps — `Const`, `Volatile`, `Typedef`,
3990/// `Restrict`, `TypeTag` — are the complete set the kernel BPF
3991/// pipeline emits for global variable types in `.bss`. Any other
3992/// kind in the chain (Ptr, Array, etc.) terminates the lookup with
3993/// a non-match.
3994fn is_scx_allocator_type(btf: &Btf, type_id: u32) -> bool {
3995 use btf_rs::Type as T;
3996 // Mirror the modifier-chain pattern in
3997 // `btf_offsets::resolve_member_composite` — resolve the
3998 // chained type via the BtfType trait object so the type
3999 // aliases (Const = Volatile, TypeTag = Typedef) all share the
4000 // same path through the loop.
4001 let Ok(mut t) = btf.resolve_type_by_id(type_id) else {
4002 return false;
4003 };
4004 for _ in 0..20 {
4005 match t {
4006 T::Struct(s) => {
4007 return btf.resolve_name(&s).is_ok_and(|n| n == "scx_allocator");
4008 }
4009 T::Const(_) | T::Volatile(_) | T::Typedef(_) | T::Restrict(_) | T::TypeTag(_) => {
4010 let Some(btf_ty) = t.as_btf_type() else {
4011 return false;
4012 };
4013 let Ok(next) = btf.resolve_chained_type(btf_ty) else {
4014 return false;
4015 };
4016 t = next;
4017 }
4018 _ => return false,
4019 }
4020 }
4021 false
4022}
4023
4024/// True iff `type_id` resolves to a struct named `scx_static`,
4025/// stripping the BTF modifier chain en route. Mirrors the
4026/// modifier-handling shape of [`is_scx_allocator_type`] — the five
4027/// modifier kinds (`Const`, `Volatile`, `Typedef`, `Restrict`,
4028/// `TypeTag`) are the complete set the kernel BPF pipeline emits for
4029/// global variable types in `.bss`; any other kind terminates the
4030/// lookup with a non-match.
4031///
4032/// Used by the [`crate::monitor::scx_static_alloc::walk_scx_static`]
4033/// pre-pass to filter `.bss` Vars to only `struct scx_static`
4034/// instances. A scheduler that doesn't link `lib/sdt_alloc.bpf.c`
4035/// has no such Var; the filter rejects every candidate and the
4036/// walker produces an empty snapshot.
4037fn is_scx_static_type(btf: &Btf, type_id: u32) -> bool {
4038 use btf_rs::Type as T;
4039 let Ok(mut t) = btf.resolve_type_by_id(type_id) else {
4040 return false;
4041 };
4042 for _ in 0..20 {
4043 match t {
4044 T::Struct(s) => {
4045 return btf.resolve_name(&s).is_ok_and(|n| n == "scx_static");
4046 }
4047 T::Const(_) | T::Volatile(_) | T::Typedef(_) | T::Restrict(_) | T::TypeTag(_) => {
4048 let Some(btf_ty) = t.as_btf_type() else {
4049 return false;
4050 };
4051 let Ok(next) = btf.resolve_chained_type(btf_ty) else {
4052 return false;
4053 };
4054 t = next;
4055 }
4056 _ => return false,
4057 }
4058 }
4059 false
4060}
4061
4062/// Locate the scheduler's `.bss` array map and pull out (raw bytes,
4063/// program BTF) for the [`crate::monitor::scx_static_alloc`] pre-pass.
4064///
4065/// The earlier sdt_alloc pre-pass at the top of [`dump_state`]
4066/// already collected `sched_bss_bytes` once but the if-let chain at
4067/// the sdt_alloc walk consumed the `Option`. Re-locating here keeps
4068/// both walkers independent: each owns its own bss-bytes read so a
4069/// future ordering change can't accidentally leave one walker
4070/// without input. The cost is one extra map walk; small compared to
4071/// the per-map render loop that follows.
4072///
4073/// Returns `None` when:
4074/// - no `*.bss` map exists (libbpf only creates this map when the
4075/// program has any global non-const data),
4076/// - the map's `btf_kva == 0` (no program BTF — type resolution
4077/// would fail),
4078/// - the program BTF for that `btf_kva` was not loaded in the
4079/// pre-pass (parse failed earlier; the caller already logged),
4080/// - the map's value bytes can't be read.
4081fn relocate_sched_bss<'btf>(
4082 maps: &[BpfMapInfo],
4083 accessor: &GuestMemMapAccessor<'_>,
4084 program_btfs: &'btf std::collections::HashMap<u64, Btf>,
4085) -> Option<(Vec<u8>, &'btf Btf)> {
4086 for info in maps {
4087 let name = info.name();
4088 if name.starts_with("probe_bp.")
4089 || name.starts_with("fentry_p.")
4090 || name == "probe_bp"
4091 || name == "fentry_p"
4092 || KTSTR_INTERNAL_MAPS.contains(&name.as_ref())
4093 {
4094 continue;
4095 }
4096 if info.map_type == BPF_MAP_TYPE_ARRAY
4097 && info.btf_kva != 0
4098 && name.ends_with(".bss")
4099 && let Some(prog_btf) = program_btfs.get(&info.btf_kva)
4100 && let Some(bytes) = accessor.read_value(info, 0, info.value_size as usize)
4101 {
4102 return Some((bytes, prog_btf));
4103 }
4104 }
4105 None
4106}
4107
4108/// Load a BPF program's `struct btf` from guest memory at `btf_kva`.
4109///
4110/// Reads the kernel `struct btf` at `btf_kva`, follows its `data` /
4111/// `data_size` / `base_btf` fields, fetches the raw BTF blob via
4112/// page-walked vmalloc reads, and parses it. When `base_btf` is
4113/// non-NULL the program's BTF is split atop the vmlinux BTF (the
4114/// kernel's own base BTF) — pass the host's already-parsed vmlinux
4115/// `Btf` as the split base so type IDs resolve correctly.
4116///
4117/// Returns `None` when any step fails: missing offsets, untranslatable
4118/// pages, or `Btf::from_bytes` rejection (truncated / corrupted blob).
4119/// Failure is silent and the caller falls back to the host vmlinux
4120/// BTF — the dump is best-effort, a partial render still beats no
4121/// render.
4122///
4123/// Distinct from the [`super::bpf_map::BpfMapAccessor::load_program_btf`]
4124/// trait method (which dispatches across backends): this free function
4125/// is the guest-memory backend's actual KVA-based loader. The trait
4126/// method on `GuestMemMapAccessor` just forwards here.
4127pub(super) fn load_program_btf_kva(
4128 accessor: &GuestMemMapAccessor<'_>,
4129 btf_kva: u64,
4130 base_btf: &Btf,
4131) -> Option<Btf> {
4132 let kernel = accessor.kernel();
4133 let offsets = accessor.offsets();
4134 let mem = kernel.mem();
4135 let walk = kernel.walk_context();
4136
4137 // `struct btf` may be kmalloc'd (direct map) or vmalloc'd; use
4138 // translate_any_kva.
4139 let btf_pa = super::idr::translate_any_kva(
4140 mem,
4141 walk.cr3_pa,
4142 walk.page_offset,
4143 btf_kva,
4144 walk.l5,
4145 walk.tcr_el1,
4146 )?;
4147 let data_kva = mem.read_u64(btf_pa, offsets.btf_data);
4148 let data_size = mem.read_u32(btf_pa, offsets.btf_data_size) as usize;
4149 let base_kva = mem.read_u64(btf_pa, offsets.btf_base_btf);
4150
4151 if data_kva == 0 || data_size == 0 {
4152 return None;
4153 }
4154
4155 if data_size > MAX_BTF_BLOB {
4156 return None;
4157 }
4158
4159 // The BTF blob is vmalloc-backed — `btf->data` is allocated via
4160 // vmalloc / kvmalloc inside `kernel/bpf/btf.c`'s
4161 // `btf_parse_*` paths. Use the chunked vmalloc reader so a
4162 // 100 KB blob doesn't pay 100K syscalls of byte-wise translate.
4163 // The chunked reader honours all-or-nothing semantics, so a
4164 // short read returns None directly; no extra length check needed.
4165 let blob = kernel.read_kva_bytes_chunked(data_kva, data_size)?;
4166
4167 if base_kva != 0 {
4168 // Split BTF: the program's types extend the kernel's
4169 // vmlinux BTF. Pass the host's parsed vmlinux Btf as the
4170 // base so cross-base type IDs (e.g. `task_struct`) resolve.
4171 //
4172 // Uses host vmlinux BTF as split base — correct when host
4173 // kernel == guest kernel (ktstr's default and the common
4174 // CI configuration). A guest running a different kernel
4175 // version would silently mis-render cross-base type
4176 // references; flagged as a known limitation in the module
4177 // doc above.
4178 Btf::from_split_bytes(&blob, base_btf).ok()
4179 } else {
4180 Btf::from_bytes(&blob).ok()
4181 }
4182}
4183
/// Format `bytes` as lowercase two-digit hex pairs separated by single
/// spaces (e.g. `[0x0a, 0xff]` becomes `"0a ff"`). An empty slice yields an
/// empty string; there is no leading or trailing separator.
///
/// `pub(crate)` so [`super::sdt_alloc`] can reuse the same wire shape
/// for its hex-fallback payload renderings — keeps the dump's hex
/// output consistent across both renderers.
pub(crate) fn hex_dump(bytes: &[u8]) -> String {
    use std::fmt::Write;

    // Two digits plus one separator per byte: a slight over-estimate that
    // avoids reallocation while rendering.
    let mut rendered = String::with_capacity(bytes.len() * 3);
    let mut remaining = bytes.iter();

    // Formatting into a `String` cannot fail, so the `fmt::Result` is
    // deliberately discarded.
    if let Some(first) = remaining.next() {
        let _ = write!(rendered, "{first:02x}");
    }
    // Every later byte carries its own leading separator.
    for byte in remaining {
        let _ = write!(rendered, " {byte:02x}");
    }
    rendered
}