ktstr/monitor/
bpf_prog.rs

1//! Host-side BPF program enumeration via guest physical memory.
2//!
3//! Walks the kernel's `prog_idr` xarray from the host to discover
4//! loaded BPF programs and read verifier stats from `bpf_prog_aux`.
5//! No guest cooperation is needed — all reads go through the guest
6//! physical memory mapping.
7
8use super::btf_offsets::{BpfMapOffsets, BpfProgOffsets};
9use super::idr::{translate_any_kva, xa_load};
10use super::reader::{GuestMem, WalkContext};
11use super::symbols::text_kva_to_pa_with_base;
12
/// Numeric value of `BPF_PROG_TYPE_STRUCT_OPS` in the kernel's
/// `enum bpf_prog_type` (include/uapi/linux/bpf.h). Compared against
/// `bpf_prog->type` to select struct_ops programs.
const BPF_PROG_TYPE_STRUCT_OPS: u32 = 27;

/// Upper bound on the `used_map_cnt` value the walker will iterate.
///
/// The kernel limits a program to 64 used maps (`MAX_USED_MAPS = 64`,
/// enforced in `kernel/bpf/verifier.c`). A larger count observed from
/// the host therefore means the read raced with
/// `bpf_prog_bind_map`'s "bump cnt, then swap pointer" sequence and
/// paired the new count with the stale pointer. Clamping to the
/// kernel's own limit bounds how far such a torn read can walk past
/// the old allocation, while still covering every count a healthy
/// prog can reach.
pub const MAX_USED_MAPS: u32 = 64;

/// `BPF_OBJ_NAME_LEN` from include/uapi/linux/bpf.h: byte length of the
/// fixed-width name buffers this module reads for programs and maps.
const BPF_OBJ_NAME_LEN: usize = 16;
28
29/// Iterate every alive `BPF_PROG_TYPE_STRUCT_OPS` prog in the
30/// kernel's `prog_idr`, invoking `payload` with each prog's
31/// `(prog_pa, aux_pa, aux_kva)`. The closure returns `Option<T>`;
32/// `Some(value)` is appended to the result vector, `None` skips.
33///
34/// Encapsulates the `prog_idr` walk shared by every per-struct-ops-
35/// prog reader in this module: translate `prog_idr_kva` → `idr_pa`,
36/// read the xarray head, iterate ids 0..idr_next (capped at 65536
37/// for safety against corrupted reads — a real kernel never
38/// approaches that limit), `xa_load` each entry, translate to
39/// `prog_pa`, filter on `prog_type == BPF_PROG_TYPE_STRUCT_OPS`,
40/// then translate `aux_kva` → `aux_pa`. Translation failures or
41/// zero pointers cause the entry to be skipped silently — matches
42/// the prior per-walker behavior and is the right policy under
43/// race conditions (torn reads from slab recycling) where the
44/// alternative is to publish garbage.
45fn for_each_struct_ops_prog<T, F>(
46    mem: &GuestMem,
47    walk: WalkContext,
48    prog_idr_kva: u64,
49    offsets: &BpfProgOffsets,
50    start_kernel_map: u64,
51    phys_base: u64,
52    mut payload: F,
53) -> Vec<T>
54where
55    F: FnMut(u64, u64, u64) -> Option<T>,
56{
57    let idr_pa = text_kva_to_pa_with_base(prog_idr_kva, start_kernel_map, phys_base);
58
59    let xa_head = mem.read_u64(idr_pa, offsets.idr_xa_head);
60    if xa_head == 0 {
61        return Vec::new();
62    }
63    // Cap at 64K entries. A real kernel never has millions of BPF
64    // programs; a larger `idr_next` means the PA is wrong or the
65    // IDR is corrupt. Bounds runaway loops on garbage reads.
66    let idr_next = mem.read_u32(idr_pa, offsets.idr_next).min(65536);
67
68    let mut out = Vec::new();
69    for id in 0..idr_next {
70        let Some(entry) = xa_load(
71            mem,
72            walk.page_offset,
73            xa_head,
74            id as u64,
75            offsets.xa_node_slots,
76            offsets.xa_node_shift,
77        ) else {
78            continue;
79        };
80        if entry == 0 {
81            continue;
82        }
83        let Some(prog_pa) = translate_any_kva(
84            mem,
85            walk.cr3_pa,
86            walk.page_offset,
87            entry,
88            walk.l5,
89            walk.tcr_el1,
90        ) else {
91            continue;
92        };
93        let prog_type = mem.read_u32(prog_pa, offsets.prog_type);
94        if prog_type != BPF_PROG_TYPE_STRUCT_OPS {
95            continue;
96        }
97        let aux_kva = mem.read_u64(prog_pa, offsets.prog_aux);
98        if aux_kva == 0 {
99            continue;
100        }
101        let Some(aux_pa) = translate_any_kva(
102            mem,
103            walk.cr3_pa,
104            walk.page_offset,
105            aux_kva,
106            walk.l5,
107            walk.tcr_el1,
108        ) else {
109            continue;
110        };
111        if let Some(value) = payload(prog_pa, aux_pa, aux_kva) {
112            out.push(value);
113        }
114    }
115    out
116}
117
118/// Per-program BPF verifier statistics collected from the host.
119#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
120pub struct ProgVerifierStats {
121    /// Program name as registered with the kernel.
122    pub name: String,
123    /// Instructions processed by the verifier (path-exploration count,
124    /// not static program size), from `bpf_prog_aux->verified_insns`.
125    pub verified_insns: u32,
126}
127
128/// Enumerate struct_ops BPF programs from the kernel's `prog_idr`.
129///
130/// Reads `prog_idr` from guest memory, walks the xarray, and for
131/// each `bpf_prog` with `type == BPF_PROG_TYPE_STRUCT_OPS`, reads
132/// `aux->verified_insns` and `aux->name`. `start_kernel_map` is the
133/// runtime kernel image base used to translate `prog_idr_kva` to a
134/// guest physical address.
135pub(crate) fn find_struct_ops_progs(
136    mem: &GuestMem,
137    walk: WalkContext,
138    prog_idr_kva: u64,
139    offsets: &BpfProgOffsets,
140    start_kernel_map: u64,
141    phys_base: u64,
142) -> Vec<ProgVerifierStats> {
143    for_each_struct_ops_prog(
144        mem,
145        walk,
146        prog_idr_kva,
147        offsets,
148        start_kernel_map,
149        phys_base,
150        |_prog_pa, aux_pa, _aux_kva| {
151            let verified_insns = mem.read_u32(aux_pa, offsets.aux_verified_insns);
152            let mut name_buf = [0u8; BPF_OBJ_NAME_LEN];
153            mem.read_bytes(aux_pa + offsets.aux_name as u64, &mut name_buf);
154            let name_len = name_buf
155                .iter()
156                .position(|&b| b == 0)
157                .unwrap_or(BPF_OBJ_NAME_LEN);
158            let name = String::from_utf8_lossy(&name_buf[..name_len]).to_string();
159            Some(ProgVerifierStats {
160                name,
161                verified_insns,
162            })
163        },
164    )
165}
166
167/// Target-free active-scheduler walker. See trait method
168/// [`BpfProgAccessor::find_active_struct_ops_obj_no_target`] for the
169/// motivation (Phase 0 sched_kva==value_kva equality is broken on
170/// kernels where `struct scx_sched` allocates fresh and copies
171/// `sched_ext_ops` into its embedded `ops` field).
172///
173/// Walks `prog_idr` for the FIRST `BPF_PROG_TYPE_STRUCT_OPS` prog
174/// whose `aux->used_maps` carries a sibling `<obj>.bss/.data/.rodata`
175/// global-section map. Returns that prog's obj prefix + full
176/// used_map_kvas snapshot. Returns `None` when no such prog exists
177/// (no scheduler attached, or only non-libbpf STRUCT_OPS subsystems
178/// active).
179///
180/// **Threat model: ktstr guest VM is single-tenant.** This walker
181/// returns the FIRST match in `prog_idr` iteration order — it does
182/// not assert uniqueness. ktstr-loaded guests are minimal (only
183/// scx-ktstr runs), so in practice only the live sched_ext
184/// scheduler's prog satisfies the filter. Two reinforcing reasons:
185///
186/// 1. Sched_ext is the only struct_ops subsystem ktstr loads.
187/// 2. The kernel enforces single-ENABLE for sched_ext: the enable
188///    path rejects a second scheduler while one is already enabled
189///    (pre-6.16: `scx_ops_enable_state() != SCX_OPS_DISABLED` ->
190///    `-EBUSY`; the accessor and states were renamed on later
191///    kernels -- 6.16's `scx_enable_state()`/`SCX_DISABLED`), so at
192///    most one sched_ext STRUCT_OPS prog is ENABLED at a time.
193///    This does NOT by itself guarantee the OLD prog has left
194///    `prog_idr` before the NEW prog is added: a detached struct_ops
195///    prog leaves `prog_idr` only when its owning struct_ops MAP's
196///    last userspace fd closes AND an RCU grace elapses (map free is
197///    RCU-deferred, see `kernel/bpf/bpf_struct_ops.c`), and the
198///    kernel does not serialize old-removal before new-add. The
199///    single-alive-prog property this walker relies on is therefore
200///    ktstr's swap sequencing -- `Op::ReplaceScheduler` kills the
201///    outgoing scheduler and waits for its process to exit (closing
202///    the outgoing map's fds) before loading the next -- not a kernel
203///    ordering invariant.
204///
205/// If a future setup loads non-sched_ext libbpf-named STRUCT_OPS
206/// progs (e.g. `tcp_congestion_ops`), this filter would need to also
207/// gate on `aux->btf` matching the sched_ext_ops btf type id.
208///
209/// Standard `prog_idr` walk: read xa_head → iterate ids 0..idr_next
210/// → translate each prog kva → filter to STRUCT_OPS → read aux's
211/// used_maps → derive obj prefix from any global-section sibling
212/// map.
213pub(crate) fn find_active_struct_ops_obj_no_target(
214    mem: &GuestMem,
215    walk: WalkContext,
216    prog_idr_kva: u64,
217    prog_offsets: &BpfProgOffsets,
218    map_offsets: &BpfMapOffsets,
219    start_kernel_map: u64,
220    phys_base: u64,
221) -> Option<ActiveObjMatch> {
222    // Returns the FIRST matching prog's ActiveObjMatch via Some,
223    // skips non-matching progs with None. `into_iter().next()`
224    // extracts that single match below.
225    for_each_struct_ops_prog(
226        mem,
227        walk,
228        prog_idr_kva,
229        prog_offsets,
230        start_kernel_map,
231        phys_base,
232        |_prog_pa, aux_pa, _aux_kva| {
233            // Read used_maps pointer FIRST, then cnt — pairs with
234            // bpf_prog_bind_map's cnt-then-pointer mutation order
235            // (kernel/bpf/syscall.c): the kernel bumps cnt before
236            // swapping the pointer, so cnt-then-pointer reads
237            // would index past the old allocation on a
238            // mid-mutation read. pointer-then-cnt observes cnt ≤
239            // pointer's slot count. Safe under freeze-rendezvous
240            // (vCPUs paused) but the protocol is defense-in-depth
241            // for any out-of-freeze caller.
242            let used_maps_kva = mem.read_u64(aux_pa, prog_offsets.aux_used_maps);
243            if used_maps_kva == 0 {
244                return None;
245            }
246            let used_map_cnt = mem
247                .read_u32(aux_pa, prog_offsets.aux_used_map_cnt)
248                .min(MAX_USED_MAPS);
249            if used_map_cnt == 0 {
250                return None;
251            }
252            let used_maps_pa = translate_any_kva(
253                mem,
254                walk.cr3_pa,
255                walk.page_offset,
256                used_maps_kva,
257                walk.l5,
258                walk.tcr_el1,
259            )?;
260
261            // Snapshot every non-zero used_maps entry (downstream
262            // disambiguation needs the full set as the KVA
263            // whitelist).
264            let mut entries: Vec<u64> = Vec::with_capacity(used_map_cnt as usize);
265            for i in 0..used_map_cnt {
266                let entry_kva = mem.read_u64(used_maps_pa, (i as usize) * 8);
267                if entry_kva != 0 {
268                    entries.push(entry_kva);
269                }
270            }
271            // Find a global-section map in the snapshot and derive
272            // the obj prefix. If none, this isn't a libbpf-loaded
273            // scheduler prog (could be a different struct_ops
274            // subsystem like tcp_congestion_ops without libbpf-
275            // named global maps) — return None to skip.
276            for &map_kva in &entries {
277                let Some(map_pa) = translate_any_kva(
278                    mem,
279                    walk.cr3_pa,
280                    walk.page_offset,
281                    map_kva,
282                    walk.l5,
283                    walk.tcr_el1,
284                ) else {
285                    continue;
286                };
287                let mut name_buf = [0u8; BPF_OBJ_NAME_LEN];
288                mem.read_bytes(map_pa + map_offsets.map_name as u64, &mut name_buf);
289                let name_len = name_buf
290                    .iter()
291                    .position(|&b| b == 0)
292                    .unwrap_or(BPF_OBJ_NAME_LEN);
293                let Ok(name) = std::str::from_utf8(&name_buf[..name_len]) else {
294                    continue;
295                };
296                if let Some(obj) = extract_global_section_obj_prefix(name) {
297                    return Some(ActiveObjMatch {
298                        obj_name: obj.to_string(),
299                        used_map_kvas: entries,
300                    });
301                }
302            }
303            None
304        },
305    )
306    .into_iter()
307    .next()
308}
309
/// Output of [`find_active_struct_ops_obj_no_target`]: the obj prefix
/// of the matched scheduler together with every `used_maps` KVA taken
/// from the matched prog's aux table.
///
/// The KVA set is what allows a consumer to tell apart two scheduler
/// instances loaded from the SAME binary: their maps share an obj
/// prefix but sit at different kernel addresses. See
/// [`crate::scenario::snapshot::Snapshot::active`] for the downstream
/// filter, which requires both an obj-name match AND KVA membership in
/// this set to defend against KVA aliasing across captures.
#[derive(Debug, Clone)]
pub(crate) struct ActiveObjMatch {
    pub obj_name: String,
    pub used_map_kvas: Vec<u64>,
}
324
/// Return `<obj>` when `map_name` has the libbpf global-section shape
/// `<obj>.bss`, `<obj>.data` or `<obj>.rodata`; return `None` for
/// every other name (struct_ops maps such as `ktstr_ops`, hashtables,
/// and so on) and for a bare suffix with an empty `<obj>`.
///
/// The active-obj walker uses this to derive a scheduler obj prefix
/// from the names of a prog's `used_maps` entries.
///
/// The prefix seen here has already been shortened so that prefix plus
/// section suffix fit in `BPF_OBJ_NAME_LEN - 1` bytes (see libbpf's
/// `internal_map_name` in tools/lib/bpf/libbpf.c). Callers that
/// cross-reference captured global-section maps must compare against
/// the same truncated prefix.
fn extract_global_section_obj_prefix(map_name: &str) -> Option<&str> {
    const GLOBAL_SECTION_SUFFIXES: [&str; 3] = [".bss", ".data", ".rodata"];

    GLOBAL_SECTION_SUFFIXES
        .iter()
        .filter_map(|&suffix| map_name.strip_suffix(suffix))
        .find(|obj| !obj.is_empty())
}
347
#[cfg(test)]
mod extract_global_section_obj_prefix_tests {
    use super::extract_global_section_obj_prefix as obj_prefix;

    #[test]
    fn extracts_bss_prefix() {
        assert_eq!(obj_prefix("ktstr.bss"), Some("ktstr"));
    }

    #[test]
    fn extracts_data_prefix() {
        assert_eq!(obj_prefix("scx_layered.data"), Some("scx_layered"));
    }

    #[test]
    fn extracts_rodata_prefix() {
        assert_eq!(obj_prefix("mitosis.rodata"), Some("mitosis"));
    }

    #[test]
    fn rejects_struct_ops_map_name() {
        // struct_ops map names carry no global-section suffix.
        for name in ["ktstr_ops", "mitosis_ops"] {
            assert_eq!(obj_prefix(name), None, "unexpected prefix for {name}");
        }
    }

    #[test]
    fn rejects_unrelated_map_name() {
        for name in ["scx_per_task", "bpf_runq"] {
            assert_eq!(obj_prefix(name), None, "unexpected prefix for {name}");
        }
    }

    #[test]
    fn rejects_empty_prefix_before_suffix() {
        // A bare ".bss" has no obj part; such a degenerate name is skipped.
        assert_eq!(obj_prefix(".bss"), None);
    }
}
394
395/// Per-program runtime stats summed across all CPUs.
396///
397/// Mirrors the kernel's `struct bpf_prog_stats` (include/linux/filter.h):
398/// `cnt` (invocations), `nsecs` (cumulative runtime), `misses` (recursion
399/// re-entries skipped via `bpf_prog_inc_misses_counter`,
400/// kernel/bpf/syscall.c). All three counters are u64 monotonics summed
401/// across the program's per-CPU `bpf_prog_stats` slots.
402#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
403pub struct ProgRuntimeStats {
404    /// Program name as registered with the kernel.
405    pub name: String,
406    /// Total invocation count across all CPUs.
407    pub cnt: u64,
408    /// Total CPU time in nanoseconds across all CPUs.
409    pub nsecs: u64,
410    /// Total recursion misses across all CPUs. A miss is a re-entry
411    /// attempt blocked by the program's per-CPU recursion guard.
412    pub misses: u64,
413}
414
415impl ProgRuntimeStats {
416    /// Mean nanoseconds per invocation: `nsecs / cnt`. Returns
417    /// `0.0` when `cnt == 0` (program never ran or counter not
418    /// running) so the result never propagates `NaN` / `Infinity`
419    /// into downstream `finite_or_zero` filters. Method-only access
420    /// (no stored shadow) — recomputed every call from the raw
421    /// fields, matching the [`super::super::assert::CgroupStats::wake_latency_tail_ratio`]
422    /// derived-ratio convention.
423    ///
424    /// Unitless-from-bpftop's perspective: bpftop-style triage
425    /// reads "ns/call" as the primary cost-per-invocation metric;
426    /// surfacing it here lets a failure-dump consumer compare two
427    /// programs' per-call cost without dividing the wire counters
428    /// manually.
429    pub fn ns_per_call(&self) -> f64 {
430        if self.cnt > 0 {
431            self.nsecs as f64 / self.cnt as f64
432        } else {
433            0.0
434        }
435    }
436
437    /// Fraction of invocation attempts blocked by the per-CPU
438    /// recursion guard: `misses / (cnt + misses)`. Returns `0.0`
439    /// when both counters are zero (no signal); never produces
440    /// `NaN` / `Infinity` even on a saturated `cnt + misses`
441    /// overflow because `saturating_add` floors at `u64::MAX` and
442    /// the resulting denominator is non-zero.
443    ///
444    /// A non-trivial miss rate signals lock contention or a
445    /// misconfigured recursion guard — bpftop-style triage flags
446    /// any program with `miss_rate > 0.01` as a hot recursion
447    /// path. Method-only access (no stored shadow); the wire
448    /// format carries `cnt` and `misses` separately so consumers
449    /// who want the raw counts can recover them.
450    pub fn miss_rate(&self) -> f64 {
451        let total = self.cnt.saturating_add(self.misses);
452        if total > 0 {
453            self.misses as f64 / total as f64
454        } else {
455            0.0
456        }
457    }
458}
459
460impl std::fmt::Display for ProgRuntimeStats {
461    /// One-line summary used by [`super::dump::FailureDumpReport`]'s
462    /// human-readable rendering: name + the three counter sums plus
463    /// the bpftop-style derived metrics (ns/call, miss-rate fraction).
464    /// Derived metrics elide when their guards fire (cnt==0 or
465    /// cnt+misses==0) so a program that never ran renders without
466    /// misleading "0.000 ns/call" noise.
467    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
468        write!(
469            f,
470            "{}: cnt={} nsecs={} misses={}",
471            self.name, self.cnt, self.nsecs, self.misses
472        )?;
473        if self.cnt > 0 {
474            // Three decimals on ns/call: bpftop uses two; we add
475            // one for sub-microsecond precision since scheduler
476            // BPF ops typically run in tens of nanoseconds.
477            write!(f, " ns/call={:.3}", self.ns_per_call())?;
478        }
479        if self.cnt.saturating_add(self.misses) > 0 && self.misses > 0 {
480            // Render miss_rate only when there were actual misses
481            // — `0.000` would just be noise on healthy programs.
482            // Four decimals: a 0.0001 (= 1 in 10K) miss rate is
483            // already actionable for a hot scheduler op.
484            write!(f, " miss_rate={:.4}", self.miss_rate())?;
485        }
486        Ok(())
487    }
488}
489
490/// Walk `prog_idr` and produce per-program runtime stats in a single
491/// IDR pass.
492///
493/// Folds the previous discover-then-read split into one visitor: for
494/// each struct_ops program reached via `xa_load`, read
495/// `bpf_prog->stats` (per-CPU base) and `bpf_prog_aux->name` and then
496/// sum `cnt`/`nsecs`/`misses` across `per_cpu_offsets`. Halves the
497/// per-prog kernel-memory reads relative to the prior split (one
498/// `prog_idr` walk and one `bpf_prog`/`aux` translate per program
499/// instead of two of each).
500///
501/// `cnt`/`nsecs`/`misses` are u64 monotonic counters per the kernel's
502/// `struct bpf_prog_stats` (include/linux/filter.h) — see
503/// [`ProgRuntimeStats`] for provenance and the saturation contract.
504/// Address translation uses [`translate_any_kva`] so per-CPU pages
505/// served from vmalloc'd memory (`pcpu_get_vm_areas`) translate
506/// correctly alongside direct-mapping percpu allocations.
507pub(crate) fn walk_struct_ops_runtime_stats(
508    mem: &GuestMem,
509    walk: WalkContext,
510    prog_idr_kva: u64,
511    offsets: &BpfProgOffsets,
512    per_cpu_offsets: &[u64],
513    start_kernel_map: u64,
514    phys_base: u64,
515) -> Vec<ProgRuntimeStats> {
516    for_each_struct_ops_prog(
517        mem,
518        walk,
519        prog_idr_kva,
520        offsets,
521        start_kernel_map,
522        phys_base,
523        |prog_pa, aux_pa, _aux_kva| {
524            let mut name_buf = [0u8; BPF_OBJ_NAME_LEN];
525            mem.read_bytes(aux_pa + offsets.aux_name as u64, &mut name_buf);
526            let name_len = name_buf
527                .iter()
528                .position(|&b| b == 0)
529                .unwrap_or(BPF_OBJ_NAME_LEN);
530            let name = String::from_utf8_lossy(&name_buf[..name_len]).to_string();
531
532            let stats_percpu_kva = mem.read_u64(prog_pa, offsets.prog_stats);
533            if stats_percpu_kva == 0 {
534                return None;
535            }
536
537            // Per-CPU sum. saturating_add prevents the
538            // `attempt to add with overflow` panic that's been
539            // observed when uninitialized / scrambled per-CPU pages
540            // yield near-u64::MAX values; see `ProgRuntimeStats`.
541            let mut cnt: u64 = 0;
542            let mut nsecs: u64 = 0;
543            let mut misses: u64 = 0;
544            for (cpu_index, &cpu_off) in per_cpu_offsets.iter().enumerate() {
545                // Out-of-range CPU detection: kernel `setup_per_cpu_areas`
546                // only writes `__per_cpu_offset[cpu]` for CPUs in
547                // `for_each_possible_cpu`, leaving slots beyond
548                // `nr_cpu_ids` at the BSS-initialized 0. Real SMP
549                // kernels assign each possible CPU a strictly-positive
550                // offset for `cpu > 0`; only the BSP (cpu_index == 0)
551                // can legitimately observe a zero offset. Skip
552                // `cpu_off == 0 && cpu_index > 0` to avoid double-
553                // counting CPU 0's stats for every BSS-zero tail slot.
554                // Mirrors the guard in
555                // [`super::bpf_map::read_percpu_array_value`].
556                if cpu_off == 0 && cpu_index > 0 {
557                    continue;
558                }
559                let stats_kva = stats_percpu_kva.wrapping_add(cpu_off);
560                if let Some(stats_pa) = translate_any_kva(
561                    mem,
562                    walk.cr3_pa,
563                    walk.page_offset,
564                    stats_kva,
565                    walk.l5,
566                    walk.tcr_el1,
567                ) && stats_pa < mem.size()
568                {
569                    // Batch the three u64 stat reads into one bulk
570                    // `read_bytes` covering the contiguous span from
571                    // `min(cnt, nsecs, misses)` to `max(...) + 8`. The
572                    // kernel's `struct bpf_prog_stats` packs `cnt`,
573                    // `nsecs`, and `misses` as adjacent u64_stats_t
574                    // (8 bytes each) and the BTF resolver accepts only
575                    // layouts where the three fields land in 24
576                    // contiguous bytes. The bulk read pays one bounds
577                    // check + region resolve instead of three per CPU,
578                    // and parses the values from the local buffer
579                    // without further volatile loads.
580                    let lo = offsets
581                        .stats_cnt
582                        .min(offsets.stats_nsecs)
583                        .min(offsets.stats_misses);
584                    let hi = offsets
585                        .stats_cnt
586                        .max(offsets.stats_nsecs)
587                        .max(offsets.stats_misses)
588                        + 8;
589                    let span = hi - lo;
590                    if span <= 64 {
591                        let mut buf = [0u8; 64];
592                        let n = mem.read_bytes(stats_pa + lo as u64, &mut buf[..span]);
593                        if n == span {
594                            let parse = |off: usize| -> u64 {
595                                let i = off - lo;
596                                u64::from_ne_bytes(buf[i..i + 8].try_into().unwrap())
597                            };
598                            cnt = cnt.saturating_add(parse(offsets.stats_cnt));
599                            nsecs = nsecs.saturating_add(parse(offsets.stats_nsecs));
600                            misses = misses.saturating_add(parse(offsets.stats_misses));
601                        } else {
602                            // Partial copy (page straddle / end-of-DRAM)
603                            // — fall back to scalar reads to retain the
604                            // original semantics.
605                            cnt = cnt.saturating_add(mem.read_u64(stats_pa, offsets.stats_cnt));
606                            nsecs =
607                                nsecs.saturating_add(mem.read_u64(stats_pa, offsets.stats_nsecs));
608                            misses =
609                                misses.saturating_add(mem.read_u64(stats_pa, offsets.stats_misses));
610                        }
611                    } else {
612                        // Span exceeds the inline buffer. Should be
613                        // unreachable for the production
614                        // `bpf_prog_stats` layout (24 bytes), but
615                        // tolerate exotic layouts via the scalar path
616                        // rather than panicking.
617                        cnt = cnt.saturating_add(mem.read_u64(stats_pa, offsets.stats_cnt));
618                        nsecs = nsecs.saturating_add(mem.read_u64(stats_pa, offsets.stats_nsecs));
619                        misses =
620                            misses.saturating_add(mem.read_u64(stats_pa, offsets.stats_misses));
621                    }
622                }
623            }
624
625            Some(ProgRuntimeStats {
626                name,
627                cnt,
628                nsecs,
629                misses,
630            })
631        },
632    )
633}
634
635/// Read-only abstraction over BPF program enumeration and per-program
636/// stats reads across data sources. Mirror of
637/// [`super::bpf_map::BpfMapAccessor`] for the program side.
638///
639/// Currently one implementation: [`GuestMemProgAccessor`] (PTE-walks a
640/// frozen guest's `prog_idr`). The planned live-host backend
641/// will walk loaded programs via `BPF_PROG_GET_NEXT_ID` /
642/// `BPF_OBJ_GET_INFO_BY_FD` and produce the same
643/// `Vec<ProgVerifierStats>` / `Vec<ProgRuntimeStats>` shapes, so the
644/// failure-dump renderer stays data-source-agnostic.
645pub trait BpfProgAccessor {
646    /// Enumerate struct_ops BPF programs and collect verifier stats.
647    fn struct_ops_progs(&self) -> Vec<ProgVerifierStats>;
648
649    /// Snapshot per-program runtime stats (`cnt`, `nsecs`, `misses`)
650    /// for every struct_ops BPF program, summed across all CPUs.
651    ///
652    /// `per_cpu_offsets` is the kernel's `__per_cpu_offset[]` array,
653    /// typically obtained via [`super::symbols::read_per_cpu_offsets`].
654    /// The live-host backend will ignore this argument (the kernel
655    /// provides per-CPU sums via `BPF_OBJ_GET_INFO_BY_FD`).
656    fn struct_ops_runtime_stats(&self, per_cpu_offsets: &[u64]) -> Vec<ProgRuntimeStats>;
657
658    /// Target-free active-scheduler walker: find the FIRST alive
659    /// `BPF_PROG_TYPE_STRUCT_OPS` prog whose `aux->used_maps` carries
660    /// a sibling `<obj>.bss/.data/.rodata` global-section map, and
661    /// return that prog's obj prefix + full used_map_kvas set.
662    ///
663    /// **Why this exists.** The prior `value_kva == *scx_root`
664    /// equality approach in `identify_active_obj_from_struct_ops`
665    /// required identifying the active struct_ops map first. That
666    /// equality breaks on kernels where `struct scx_sched` allocates
667    /// a fresh kernel-side struct and COPIES the user's
668    /// `sched_ext_ops` into its embedded `ops` field (offset 0) —
669    /// `*scx_root` then points at the kernel-allocated `scx_sched`
670    /// (whose address equals `&scx_sched.ops`), NOT at the struct_ops
671    /// map's `kvalue.data` buffer (the user's source ops table at a
672    /// separate address). Without a target, this walker iterates
673    /// `prog_idr` and uses the "prog has a global-section map" signal
674    /// directly.
675    ///
676    /// **Uniqueness via ktstr threat model.** This walker returns
677    /// the FIRST match -- it does not assert uniqueness. ktstr's guest
678    /// VM is single-tenant (only scx-ktstr runs), and the kernel
679    /// enforces single-ENABLE for sched_ext: the enable path rejects a
680    /// second scheduler while one is already enabled (pre-6.16:
681    /// `scx_ops_enable_state() != SCX_OPS_DISABLED` -> `-EBUSY`;
682    /// renamed to `scx_enable_state()`/`SCX_DISABLED` on 6.16+), so at
683    /// most one sched_ext prog is ENABLED at a time. That does NOT
684    /// guarantee one prog alive in `prog_idr`: a detached prog lingers
685    /// until its owning struct_ops map's last fd closes and an RCU
686    /// grace elapses, so the single-alive-in-`prog_idr` property this
687    /// walker's FIRST-match relies on rests on ktstr's swap sequencing
688    /// (kill the outgoing scheduler, wait for its process to exit
689    /// before loading the next), not a kernel invariant. No other
690    /// struct_ops subsystem (e.g. `tcp_congestion_ops`) ever loads in
691    /// ktstr-managed guests. If a future setup loads non-sched_ext
692    /// libbpf STRUCT_OPS progs, this filter would need to also gate on
693    /// `aux->btf` matching the sched_ext_ops btf type id.
694    ///
695    /// Returns `None` when no live STRUCT_OPS prog has global-section
696    /// maps (no scheduler attached, or only non-sched_ext struct_ops
697    /// subsystems are running). The caller's prefix-grouping fallback
698    /// handles the no-match case.
699    fn find_active_struct_ops_obj_no_target(
700        &self,
701        map_offsets: &BpfMapOffsets,
702    ) -> Option<ActiveObjMatch>;
703}
704
/// Host-side BPF program accessor backed by direct guest physical-memory
/// reads. PTE-walks a frozen guest's `prog_idr` to enumerate loaded
/// programs and reads `bpf_prog_stats` per-CPU slots inline.
///
/// The accessor holds two borrows and one cached address; every
/// [`BpfProgAccessor`] method forwards them to the module's free
/// walker functions.
pub struct GuestMemProgAccessor<'a> {
    /// Borrowed guest-kernel handle. Supplies the guest memory view,
    /// the walk context, `start_kernel_map` and `phys_base` that the
    /// trait methods pass on to the walkers.
    kernel: &'a super::guest::GuestKernel,
    /// Kernel virtual address of the `prog_idr` symbol, looked up by
    /// name when the accessor is built from a `GuestKernel` (or copied
    /// from an owning accessor's cached value).
    prog_idr_kva: u64,
    /// Borrowed from the caller. Mirrors the
    /// [`super::bpf_map::GuestMemMapAccessor`] pattern:
    /// `BpfProgOffsets` is a ~112-byte POD built once from the
    /// vmlinux BTF, and every hot-path method reads it by reference,
    /// so owning it in the accessor would charge a clone that serves
    /// no purpose.
    offsets: &'a BpfProgOffsets,
}
719
720impl<'a> GuestMemProgAccessor<'a> {
721    /// Create from an existing [`GuestKernel`](super::guest::GuestKernel)
722    /// and a caller-owned [`BpfProgOffsets`]. The accessor borrows both
723    /// for its lifetime — build `offsets` once via
724    /// [`BpfProgOffsets::from_vmlinux`] and reuse across calls.
725    pub fn from_guest_kernel(
726        kernel: &'a super::guest::GuestKernel,
727        offsets: &'a BpfProgOffsets,
728    ) -> anyhow::Result<Self> {
729        let prog_idr_kva = kernel
730            .symbol_kva("prog_idr")
731            .ok_or_else(|| anyhow::anyhow!("prog_idr symbol not found in vmlinux"))?;
732
733        Ok(Self {
734            kernel,
735            prog_idr_kva,
736            offsets,
737        })
738    }
739}
740
741impl BpfProgAccessor for GuestMemProgAccessor<'_> {
742    fn struct_ops_progs(&self) -> Vec<ProgVerifierStats> {
743        find_struct_ops_progs(
744            self.kernel.mem(),
745            self.kernel.walk_context(),
746            self.prog_idr_kva,
747            self.offsets,
748            self.kernel.start_kernel_map(),
749            self.kernel.phys_base(),
750        )
751    }
752
753    /// Mirrors the kernel-side per-CPU accumulation: `cnt` is
754    /// bumped via `u64_stats_inc` and `nsecs` is bumped via
755    /// `u64_stats_add(&stats->nsecs, duration)` inside
756    /// `__bpf_prog_run` (include/linux/filter.h), invoked through
757    /// the JIT-emitted entry path on every program invocation.
758    /// `misses` is bumped by `bpf_prog_inc_misses_counter`
759    /// (defined in `kernel/bpf/syscall.c`) called from
760    /// `kernel/bpf/trampoline.c::__bpf_prog_enter_recur` when a
761    /// program re-enters and the recursion guard rejects it.
762    fn struct_ops_runtime_stats(&self, per_cpu_offsets: &[u64]) -> Vec<ProgRuntimeStats> {
763        walk_struct_ops_runtime_stats(
764            self.kernel.mem(),
765            self.kernel.walk_context(),
766            self.prog_idr_kva,
767            self.offsets,
768            per_cpu_offsets,
769            self.kernel.start_kernel_map(),
770            self.kernel.phys_base(),
771        )
772    }
773
774    fn find_active_struct_ops_obj_no_target(
775        &self,
776        map_offsets: &BpfMapOffsets,
777    ) -> Option<ActiveObjMatch> {
778        find_active_struct_ops_obj_no_target(
779            self.kernel.mem(),
780            self.kernel.walk_context(),
781            self.prog_idr_kva,
782            self.offsets,
783            map_offsets,
784            self.kernel.start_kernel_map(),
785            self.kernel.phys_base(),
786        )
787    }
788}
789
/// Owns a [`super::guest::GuestKernel`] and a [`BpfProgOffsets`],
/// providing BPF program access through a borrowed
/// [`GuestMemProgAccessor`].
///
/// Mirrors [`super::bpf_map::GuestMemMapAccessorOwned`] for the
/// program-side surface: callers that don't already hold a
/// `GuestKernel` + `BpfProgOffsets` pair (e.g. the freeze
/// coordinator) construct one of these once at start, retain it
/// across the run, and borrow [`Self::as_accessor`] for each
/// read. Owning the offsets here keeps the BTF parse to once per
/// VM run rather than once per dump.
pub struct GuestMemProgAccessorOwned {
    /// Owned guest-kernel handle; lent out by [`Self::as_accessor`]
    /// and [`Self::guest_kernel`].
    kernel: super::guest::GuestKernel,
    /// KVA of the `prog_idr` symbol, resolved once in
    /// [`Self::finish`] and copied into every borrowed accessor.
    prog_idr_kva: u64,
    /// Offsets parsed once in [`Self::finish`]; borrowed accessors
    /// read them by reference.
    offsets: BpfProgOffsets,
}
806
807impl GuestMemProgAccessorOwned {
808    pub fn finish(
809        kernel: super::guest::GuestKernel,
810        elf: &goblin::elf::Elf<'_>,
811        data: &[u8],
812        vmlinux: &std::path::Path,
813    ) -> anyhow::Result<Self> {
814        let offsets = BpfProgOffsets::from_elf(elf, data, vmlinux)?;
815        let prog_idr_kva = kernel
816            .symbol_kva("prog_idr")
817            .ok_or_else(|| anyhow::anyhow!("prog_idr symbol not found in vmlinux"))?;
818        Ok(Self {
819            kernel,
820            prog_idr_kva,
821            offsets,
822        })
823    }
824
825    /// Borrow as a [`GuestMemProgAccessor`] for program operations.
826    ///
827    /// Infallible — `finish` already resolved `prog_idr_kva` and the
828    /// borrow returns the cached KVA directly. Mirrors
829    /// [`super::bpf_map::GuestMemMapAccessorOwned::as_accessor`].
830    pub fn as_accessor(&self) -> GuestMemProgAccessor<'_> {
831        GuestMemProgAccessor {
832            kernel: &self.kernel,
833            prog_idr_kva: self.prog_idr_kva,
834            offsets: &self.offsets,
835        }
836    }
837
838    /// Access the underlying [`super::guest::GuestKernel`] for
839    /// callers that need symbol resolution / page-walk primitives
840    /// outside the prog-discovery surface (e.g. resolving
841    /// `__per_cpu_offset` for `struct_ops_runtime_stats`).
842    #[allow(dead_code)]
843    pub fn guest_kernel(&self) -> &super::guest::GuestKernel {
844        &self.kernel
845    }
846}
847
848#[cfg(test)]
849mod tests {
850    use super::*;
851    use crate::monitor::symbols::START_KERNEL_MAP;
852
853    #[test]
854    fn prog_verifier_stats_serde_roundtrip() {
855        let info = ProgVerifierStats {
856            name: "dispatch".to_string(),
857            verified_insns: 42000,
858        };
859        let json = serde_json::to_string(&info).unwrap();
860        let loaded: ProgVerifierStats = serde_json::from_str(&json).unwrap();
861        assert_eq!(loaded.name, "dispatch");
862        assert_eq!(loaded.verified_insns, 42000);
863    }
864
865    #[test]
866    fn prog_verifier_stats_vec_serde_roundtrip() {
867        let stats = vec![
868            ProgVerifierStats {
869                name: "dispatch".to_string(),
870                verified_insns: 100000,
871            },
872            ProgVerifierStats {
873                name: "enqueue".to_string(),
874                verified_insns: 50000,
875            },
876        ];
877        let json = serde_json::to_vec(&stats).unwrap();
878        let loaded: Vec<ProgVerifierStats> = serde_json::from_slice(&json).unwrap();
879        assert_eq!(loaded.len(), 2);
880        assert_eq!(loaded[0].name, "dispatch");
881        assert_eq!(loaded[0].verified_insns, 100000);
882        assert_eq!(loaded[1].name, "enqueue");
883        assert_eq!(loaded[1].verified_insns, 50000);
884    }
885
886    #[test]
887    fn prog_verifier_stats_empty_name() {
888        let info = ProgVerifierStats {
889            name: String::new(),
890            verified_insns: 0,
891        };
892        let json = serde_json::to_string(&info).unwrap();
893        let loaded: ProgVerifierStats = serde_json::from_str(&json).unwrap();
894        assert_eq!(loaded.name, "");
895        assert_eq!(loaded.verified_insns, 0);
896    }
897
898    #[test]
899    fn prog_verifier_stats_max_values() {
900        let info = ProgVerifierStats {
901            name: "x".repeat(16),
902            verified_insns: u32::MAX,
903        };
904        let json = serde_json::to_string(&info).unwrap();
905        let loaded: ProgVerifierStats = serde_json::from_str(&json).unwrap();
906        assert_eq!(loaded.verified_insns, u32::MAX);
907        assert_eq!(loaded.name.len(), 16);
908    }
909
910    #[test]
911    fn prog_runtime_stats_serde_roundtrip() {
912        let info = ProgRuntimeStats {
913            name: "ktstr_dispatch".to_string(),
914            cnt: 12345,
915            nsecs: 9_876_543,
916            misses: 7,
917        };
918        let json = serde_json::to_string(&info).unwrap();
919        let loaded: ProgRuntimeStats = serde_json::from_str(&json).unwrap();
920        assert_eq!(loaded.name, "ktstr_dispatch");
921        assert_eq!(loaded.cnt, 12345);
922        assert_eq!(loaded.nsecs, 9_876_543);
923        assert_eq!(loaded.misses, 7);
924    }
925
926    /// All three counters use `saturating_add` in
927    /// [`walk_struct_ops_runtime_stats`] when summing per-CPU slots, so a
928    /// long-running guest with a hot BPF program (or scrambled
929    /// per-CPU pages from an unmapped slot) can produce a `u64::MAX`
930    /// sum instead of wrapping. Pinning the wire shape here proves
931    /// the serde codec preserves the saturated value end-to-end —
932    /// any future migration that swaps the field type would surface
933    /// here before bleeding into the failure-dump consumers.
934    #[test]
935    fn prog_runtime_stats_max_u64_saturation_roundtrip() {
936        let info = ProgRuntimeStats {
937            name: "saturated".to_string(),
938            cnt: u64::MAX,
939            nsecs: u64::MAX,
940            misses: u64::MAX,
941        };
942        let json = serde_json::to_string(&info).unwrap();
943        let loaded: ProgRuntimeStats = serde_json::from_str(&json).unwrap();
944        assert_eq!(loaded.cnt, u64::MAX);
945        assert_eq!(loaded.nsecs, u64::MAX);
946        assert_eq!(loaded.misses, u64::MAX);
947    }
948
949    #[test]
950    fn prog_runtime_stats_default_zero() {
951        let info = ProgRuntimeStats::default();
952        assert_eq!(info.name, "");
953        assert_eq!(info.cnt, 0);
954        assert_eq!(info.nsecs, 0);
955        assert_eq!(info.misses, 0);
956    }
957
958    /// The Display impl is the entry point used by
959    /// [`super::dump::FailureDumpReport`]'s human-readable rendering;
960    /// pin the format so a downstream change to the impl is caught
961    /// before the failure-dump output silently changes shape.
962    ///
963    /// Two derived metrics surface on the line when their guards
964    /// pass: `ns/call` whenever `cnt > 0`, and `miss_rate`
965    /// whenever there are any misses. A program that never ran
966    /// (cnt=0) elides both — `prog_runtime_stats_display_zero_counters_elides_derived`
967    /// covers that branch.
968    #[test]
969    fn prog_runtime_stats_display_format() {
970        let info = ProgRuntimeStats {
971            name: "ktstr_enqueue".to_string(),
972            cnt: 100,
973            nsecs: 200,
974            misses: 3,
975        };
976        // cnt=100, nsecs=200 → ns/call = 2.000.
977        // misses=3, cnt+misses=103 → miss_rate = 3/103 ≈ 0.0291.
978        assert_eq!(
979            format!("{info}"),
980            "ktstr_enqueue: cnt=100 nsecs=200 misses=3 ns/call=2.000 miss_rate=0.0291",
981        );
982    }
983
984    /// A program that never ran (cnt=0) renders only the four
985    /// raw counters — both derived metrics are guarded out.
986    /// Pin the elision so a regression that strips the guard and
987    /// emits "ns/call=0.000 miss_rate=0.0000" surfaces here.
988    #[test]
989    fn prog_runtime_stats_display_zero_counters_elides_derived() {
990        let info = ProgRuntimeStats {
991            name: "never_ran".to_string(),
992            cnt: 0,
993            nsecs: 0,
994            misses: 0,
995        };
996        let s = format!("{info}");
997        assert_eq!(s, "never_ran: cnt=0 nsecs=0 misses=0");
998        assert!(!s.contains("ns/call"), "ns/call must elide when cnt=0: {s}");
999        assert!(
1000            !s.contains("miss_rate"),
1001            "miss_rate must elide when total=0: {s}"
1002        );
1003    }
1004
1005    /// Healthy program with no recursion misses — `ns/call`
1006    /// surfaces but `miss_rate` elides (since misses=0).
1007    /// A regression that flipped the gate and rendered a
1008    /// "miss_rate=0.0000" line on every healthy program would
1009    /// trip here.
1010    #[test]
1011    fn prog_runtime_stats_display_no_misses_elides_miss_rate() {
1012        let info = ProgRuntimeStats {
1013            name: "healthy".to_string(),
1014            cnt: 1000,
1015            nsecs: 50_000,
1016            misses: 0,
1017        };
1018        let s = format!("{info}");
1019        assert!(s.contains("ns/call=50.000"), "ns/call must render: {s}");
1020        assert!(
1021            !s.contains("miss_rate"),
1022            "miss_rate must elide when misses=0: {s}",
1023        );
1024    }
1025
1026    /// `ns_per_call` derived accessor: pin happy-path math + zero-
1027    /// divisor guard. Mirrors the `CgroupStats::wake_latency_tail_ratio`
1028    /// test pattern from assert.rs.
1029    #[test]
1030    fn prog_runtime_stats_ns_per_call_derived() {
1031        // Happy path: 1000 cnt + 50000 nsecs = 50 ns/call.
1032        let info = ProgRuntimeStats {
1033            name: "x".to_string(),
1034            cnt: 1000,
1035            nsecs: 50_000,
1036            misses: 0,
1037        };
1038        assert_eq!(info.ns_per_call(), 50.0);
1039        assert!(info.ns_per_call().is_finite());
1040
1041        // Zero divisor: cnt=0 → 0.0 (not NaN).
1042        let info = ProgRuntimeStats {
1043            name: "x".to_string(),
1044            cnt: 0,
1045            nsecs: 999_999,
1046            misses: 0,
1047        };
1048        assert_eq!(info.ns_per_call(), 0.0);
1049        assert!(info.ns_per_call().is_finite());
1050    }
1051
1052    /// `miss_rate` derived accessor: pin happy-path math + zero-
1053    /// divisor guard + saturating_add edge.
1054    #[test]
1055    fn prog_runtime_stats_miss_rate_derived() {
1056        // Happy path: 9 misses / (1 cnt + 9 misses) = 0.9.
1057        let info = ProgRuntimeStats {
1058            name: "x".to_string(),
1059            cnt: 1,
1060            nsecs: 0,
1061            misses: 9,
1062        };
1063        assert!((info.miss_rate() - 0.9).abs() < 1e-12);
1064        assert!(info.miss_rate().is_finite());
1065
1066        // Zero divisor: both counters zero → 0.0 (not NaN).
1067        let info = ProgRuntimeStats::default();
1068        assert_eq!(info.miss_rate(), 0.0);
1069        assert!(info.miss_rate().is_finite());
1070
1071        // Saturating-add edge: cnt at u64::MAX, misses also non-
1072        // trivial — `saturating_add` floors at u64::MAX, so the
1073        // denominator stays non-zero and the rate is finite.
1074        let info = ProgRuntimeStats {
1075            name: "saturated".to_string(),
1076            cnt: u64::MAX,
1077            nsecs: 0,
1078            misses: 1000,
1079        };
1080        assert!(info.miss_rate().is_finite());
1081        // Result is essentially 0 (1000 / u64::MAX) but the
1082        // important contract is finiteness — a regression that
1083        // overflowed and produced inf/NaN trips here.
1084        assert!(info.miss_rate() >= 0.0);
1085    }
1086
1087    /// Wire format must NOT carry the derived ratios — they are
1088    /// method-only and recomputed on read. Pin so a regression
1089    /// that re-introduces a stored shadow trips here.
1090    #[test]
1091    fn prog_runtime_stats_wire_format_omits_derived_keys() {
1092        let info = ProgRuntimeStats {
1093            name: "x".to_string(),
1094            cnt: 100,
1095            nsecs: 200,
1096            misses: 3,
1097        };
1098        let json = serde_json::to_value(&info).unwrap();
1099        let map = match json {
1100            serde_json::Value::Object(m) => m,
1101            other => panic!("expected object, got {other:?}"),
1102        };
1103        assert!(
1104            !map.contains_key("ns_per_call"),
1105            "derived methods must NOT appear as wire fields: {map:#?}",
1106        );
1107        assert!(
1108            !map.contains_key("miss_rate"),
1109            "derived methods must NOT appear as wire fields: {map:#?}",
1110        );
1111        // Cross-check: methods still compute correctly.
1112        assert_eq!(info.ns_per_call(), 2.0);
1113        assert!((info.miss_rate() - 3.0_f64 / 103.0).abs() < 1e-12);
1114    }
1115
1116    /// Build a minimal `BpfProgOffsets` keyed for the synthetic
1117    /// chain test below. The exact field offsets are arbitrary —
1118    /// they only need to be consistent with how the test buffer
1119    /// is laid out — but `stats_cnt`/`stats_nsecs`/`stats_misses`
1120    /// MUST sit within a 24-byte window so the bulk-read path
1121    /// fires (`span <= 64`). Drift in these three offsets would
1122    /// silently switch the walker to the scalar fallback and
1123    /// the bulk-read assertion below would still pass for the
1124    /// wrong reason.
1125    fn synthetic_prog_offsets() -> BpfProgOffsets {
1126        BpfProgOffsets {
1127            prog_type: 0,
1128            prog_aux: 8,
1129            aux_verified_insns: 0,
1130            aux_name: 8,
1131            aux_used_maps: 24,
1132            aux_used_map_cnt: 32,
1133            xa_node_slots: 16,
1134            xa_node_shift: 0,
1135            idr_xa_head: 0,
1136            idr_next: 8,
1137            prog_stats: 16,
1138            stats_cnt: 0,
1139            stats_nsecs: 8,
1140            stats_misses: 16,
1141        }
1142    }
1143
1144    /// Run the bulk-24-byte-read end-to-end chain at a caller-
1145    /// supplied `page_offset`. Both the x86_64 and aarch64 wrapper
1146    /// tests call this with their respective `PAGE_OFFSET` baselines
1147    /// so the bulk-read fast path is exercised on both arches.
1148    fn walk_struct_ops_runtime_stats_bulk_chain_at_page_offset(page_offset: u64) {
1149        use crate::monitor::reader::{GuestMem, WalkContext};
1150
1151        // Layout (all PAs offset by `page_offset` to form KVAs in
1152        // the direct-mapping range, except `prog_idr_kva` which
1153        // sits in the kernel-text range and translates via
1154        // `text_kva_to_pa_with_base`):
1155        //
1156        //   0x0000  prog_idr (xa_head + idr_next)
1157        //   0x1000  bpf_prog (prog_type, prog_aux, prog_stats)
1158        //   0x2000  bpf_prog_aux (verified_insns, name)
1159        //   0x3000  per-CPU bpf_prog_stats (cnt, nsecs, misses)
1160        let total: usize = 0x4000;
1161        let mut buf = vec![0u8; total];
1162
1163        let pa_to_kva = |pa: u64| -> u64 { page_offset.wrapping_add(pa) };
1164
1165        let idr_pa: u64 = 0x0000;
1166        let prog_pa: u64 = 0x1000;
1167        let aux_pa: u64 = 0x2000;
1168        let stats_pa: u64 = 0x3000;
1169
1170        // Single-entry xarray: `xa_head` IS the prog KVA with
1171        // bit 1 clear (leaf marker). `pa_to_kva(prog_pa)` has
1172        // bit 1 clear because prog_pa is 4 KiB-aligned.
1173        let prog_kva = pa_to_kva(prog_pa);
1174        assert_eq!(prog_kva & 2, 0, "prog_kva must be a leaf entry");
1175
1176        let offsets = synthetic_prog_offsets();
1177        // Sanity: the bulk-read fast path requires
1178        // `span = hi - lo <= 64`. With offsets {0, 8, 16}:
1179        // lo = 0, hi = 16 + 8 = 24, span = 24. Pinning here so
1180        // a future offset change that pushed `span > 64`
1181        // (forcing the scalar fallback) trips the assert
1182        // before the test runs.
1183        let lo = offsets
1184            .stats_cnt
1185            .min(offsets.stats_nsecs)
1186            .min(offsets.stats_misses);
1187        let hi = offsets
1188            .stats_cnt
1189            .max(offsets.stats_nsecs)
1190            .max(offsets.stats_misses)
1191            + 8;
1192        assert!(
1193            hi - lo <= 64,
1194            "test premise: stats span must be small enough for the bulk path"
1195        );
1196
1197        let write_u64 = |buf: &mut Vec<u8>, pa: u64, val: u64| {
1198            let off = pa as usize;
1199            buf[off..off + 8].copy_from_slice(&val.to_ne_bytes());
1200        };
1201        let write_u32 = |buf: &mut Vec<u8>, pa: u64, val: u32| {
1202            let off = pa as usize;
1203            buf[off..off + 4].copy_from_slice(&val.to_ne_bytes());
1204        };
1205
1206        // IDR: xa_head = prog_kva, idr_next = 1.
1207        write_u64(&mut buf, idr_pa + offsets.idr_xa_head as u64, prog_kva);
1208        write_u32(&mut buf, idr_pa + offsets.idr_next as u64, 1);
1209
1210        // bpf_prog: type = STRUCT_OPS, aux = aux_kva, stats = stats_kva.
1211        write_u32(
1212            &mut buf,
1213            prog_pa + offsets.prog_type as u64,
1214            BPF_PROG_TYPE_STRUCT_OPS,
1215        );
1216        write_u64(
1217            &mut buf,
1218            prog_pa + offsets.prog_aux as u64,
1219            pa_to_kva(aux_pa),
1220        );
1221        write_u64(
1222            &mut buf,
1223            prog_pa + offsets.prog_stats as u64,
1224            pa_to_kva(stats_pa),
1225        );
1226
1227        // bpf_prog_aux: verified_insns + name. Name must NUL-
1228        // terminate within BPF_OBJ_NAME_LEN so the walker's
1229        // `position(|&b| b == 0)` finds the end.
1230        write_u32(&mut buf, aux_pa + offsets.aux_verified_insns as u64, 12_345);
1231        let name = b"bulk_test";
1232        let name_pa = (aux_pa + offsets.aux_name as u64) as usize;
1233        buf[name_pa..name_pa + name.len()].copy_from_slice(name);
1234
1235        // Stats: write the three u64 counters at the synthetic
1236        // offsets. These are the bytes the bulk read MUST surface
1237        // through the parse closure.
1238        let known_cnt: u64 = 0x1111_1111_1111_1111;
1239        let known_nsecs: u64 = 0x2222_2222_2222_2222;
1240        let known_misses: u64 = 0x3333_3333_3333_3333;
1241        write_u64(&mut buf, stats_pa + offsets.stats_cnt as u64, known_cnt);
1242        write_u64(&mut buf, stats_pa + offsets.stats_nsecs as u64, known_nsecs);
1243        write_u64(
1244            &mut buf,
1245            stats_pa + offsets.stats_misses as u64,
1246            known_misses,
1247        );
1248
1249        // SAFETY: buf is a live local Vec<u8> whose backing storage
1250        // outlives the GuestMem use.
1251        let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
1252        let walk = WalkContext {
1253            cr3_pa: 0,
1254            page_offset,
1255            l5: false,
1256            tcr_el1: 0,
1257        };
1258        // One CPU. `cpu_off == 0` is allowed at `cpu_index == 0`
1259        // (BSP). `stats_kva + 0 = stats_kva`, which translates
1260        // through the direct mapping to `stats_pa`.
1261        let per_cpu_offsets = vec![0u64];
1262
1263        let prog_idr_kva = idr_pa + START_KERNEL_MAP;
1264        let stats = walk_struct_ops_runtime_stats(
1265            &mem,
1266            walk,
1267            prog_idr_kva,
1268            &offsets,
1269            &per_cpu_offsets,
1270            START_KERNEL_MAP,
1271            0,
1272        );
1273
1274        assert_eq!(stats.len(), 1, "single STRUCT_OPS prog must surface");
1275        assert_eq!(stats[0].name, "bulk_test");
1276        assert_eq!(
1277            stats[0].cnt, known_cnt,
1278            "bulk read must parse cnt at offsets.stats_cnt within the 24-byte window",
1279        );
1280        assert_eq!(
1281            stats[0].nsecs, known_nsecs,
1282            "bulk read must parse nsecs at offsets.stats_nsecs within the 24-byte window",
1283        );
1284        assert_eq!(
1285            stats[0].misses, known_misses,
1286            "bulk read must parse misses at offsets.stats_misses within the 24-byte window",
1287        );
1288    }
1289
1290    /// End-to-end chain test for the bulk 24-byte
1291    /// `bpf_prog_stats` read on x86_64. The walker reads `cnt`,
1292    /// `nsecs`, and `misses` (three adjacent u64s in the kernel
1293    /// `struct bpf_prog_stats`) via one `read_bytes` over the
1294    /// `[lo, hi)` span and parses each value from the local
1295    /// buffer. The aarch64 wrapper below pins the same chain
1296    /// against the aarch64 `PAGE_OFFSET` baseline.
1297    #[test]
1298    #[cfg(target_arch = "x86_64")]
1299    fn walk_struct_ops_runtime_stats_bulk_24byte_read_parses_three_offsets() {
1300        // x86_64 PAGE_OFFSET (4-level paging, non-KASLR baseline).
1301        walk_struct_ops_runtime_stats_bulk_chain_at_page_offset(0xFFFF_8880_0000_0000);
1302    }
1303
1304    /// End-to-end chain test for the bulk 24-byte
1305    /// `bpf_prog_stats` read on aarch64. Mirrors the x86_64
1306    /// wrapper above against the aarch64 direct-mapping
1307    /// `PAGE_OFFSET` baseline so the bulk-read fast path is
1308    /// pinned on both arches.
1309    #[test]
1310    #[cfg(target_arch = "aarch64")]
1311    fn walk_struct_ops_runtime_stats_bulk_24byte_read_parses_three_offsets() {
1312        // aarch64 PAGE_OFFSET baseline (48-bit VA, 4 KiB granule).
1313        walk_struct_ops_runtime_stats_bulk_chain_at_page_offset(0xFFFF_0000_0000_0000);
1314    }
1315
1316    /// Format chain integration: the `ProgRuntimeStats` Display
1317    /// output must appear verbatim inside `FailureDumpReport`'s
1318    /// Display output. Pins the chain
1319    /// `ProgRuntimeStats::fmt` (bpf_prog.rs) →
1320    /// `FailureDumpReport::fmt::std::fmt::Display::fmt(stats, f)`
1321    /// (dump/display.rs `prog_runtime_stats:` arm).
1322    ///
1323    /// The standalone `prog_runtime_stats_display_format` test pins
1324    /// the inner Display in isolation; the dump-side
1325    /// `report_display_renders_prog_runtime_stats` test pins the
1326    /// outer section header. Neither catches a regression that
1327    /// SUBSTITUTED the inner Display call (e.g. introducing a
1328    /// custom rendering branch in the outer formatter that bypasses
1329    /// `ProgRuntimeStats::fmt`). This test catches that drift by
1330    /// asserting BOTH layers render identically and the inner
1331    /// string appears as a substring of the outer — a substitution
1332    /// would break either equality.
1333    #[test]
1334    fn prog_runtime_stats_format_chain_inner_appears_in_outer() {
1335        use crate::monitor::dump::{FailureDumpReport, SCHEMA_SINGLE};
1336        let info = ProgRuntimeStats {
1337            name: "chain_test".to_string(),
1338            cnt: 7,
1339            nsecs: 42,
1340            misses: 1,
1341        };
1342        let inner = format!("{info}");
1343        // Direct Display on ProgRuntimeStats: pinned shape includes
1344        // the bpftop-style derived metrics. cnt=7 nsecs=42 →
1345        // ns/call=6.000; misses=1 → miss_rate=1/8=0.1250.
1346        assert_eq!(
1347            inner,
1348            "chain_test: cnt=7 nsecs=42 misses=1 ns/call=6.000 miss_rate=0.1250",
1349        );
1350
1351        let report = FailureDumpReport {
1352            schema: SCHEMA_SINGLE.to_string(),
1353            prog_runtime_stats: vec![info],
1354            ..Default::default()
1355        };
1356        let outer = format!("{report}");
1357        // The outer's `prog_runtime_stats:` section calls
1358        // `std::fmt::Display::fmt(stats, f)` on each entry; that
1359        // call dispatches through THIS module's Display impl. If a
1360        // future regression replaced the dispatch with a custom
1361        // formatter, the inner string would no longer appear in
1362        // the outer output — surfacing as substring failure.
1363        assert!(
1364            outer.contains(&inner),
1365            "FailureDumpReport's Display chain must dispatch through \
1366             ProgRuntimeStats::fmt — inner {inner:?} must appear \
1367             verbatim inside outer:\n{outer}",
1368        );
1369        // Sanity: the outer also wraps with the expected section
1370        // header, so the substring match is finding the chain
1371        // through the correct arm of FailureDumpReport's fmt and
1372        // not (e.g.) a coincidence in the schema marker.
1373        assert!(
1374            outer.contains("prog_runtime_stats:"),
1375            "outer Display must carry the prog_runtime_stats section \
1376             header; without it the chain test could pass even when the \
1377             inner string matched a different format arm:\n{outer}",
1378        );
1379    }
1380
1381    // -- prog_idr chain fixtures: synthetic guest memory that drives
1382    //    the full for_each_struct_ops_prog walk on the host with no
1383    //    VM and no live kernel. The layout mirrors the existing
1384    //    `walk_struct_ops_runtime_stats_bulk_chain_at_page_offset`
1385    //    helper above (idr@0x0, prog@0x1000, aux@0x2000) and reuses
1386    //    `synthetic_prog_offsets()`. PAs are offset by `page_offset`
1387    //    to form direct-map KVAs that `translate_any_kva` resolves
1388    //    via `kva_to_pa` (the direct-map fast path), and the
1389    //    `prog_idr_kva` sits in kernel-text range translated via
1390    //    `text_kva_to_pa_with_base`.
1391
    /// x86_64 PAGE_OFFSET baseline (4-level paging, non-KASLR) used
    /// by the host-only chain fixtures below. The exact value only
    /// has to keep `page_offset + pa` outside `[0, buf.len())` so the
    /// formed KVAs are unambiguously in the direct-map range and
    /// translate back to `pa` via `kva_to_pa`.
    ///
    /// `ProgChainFixture::new` installs this value as the fixture's
    /// `page_offset`, so it is both the base used to form KVAs and the
    /// base reported in the fixture's `WalkContext`.
    const FIXTURE_PAGE_OFFSET: u64 = 0xFFFF_8880_0000_0000;
1398
    /// Mutable byte buffer wrapped as guest DRAM for the prog_idr
    /// chain fixtures. Owns the backing `Vec` so the unsafe
    /// [`GuestMem::new`] pointer stays valid for the fixture's life.
    struct ProgChainFixture {
        /// Backing bytes of the fake guest DRAM; index `i` is
        /// physical address `i`. Starts zero-filled.
        buf: Vec<u8>,
        /// Direct-map base: added to a PA to form a KVA, and handed
        /// to the walker through the fixture's `WalkContext`.
        page_offset: u64,
    }
1406
1407    impl ProgChainFixture {
1408        fn new(size: usize) -> Self {
1409            Self {
1410                buf: vec![0u8; size],
1411                page_offset: FIXTURE_PAGE_OFFSET,
1412            }
1413        }
1414
1415        /// Direct-map KVA for a DRAM offset.
1416        fn pa_to_kva(&self, pa: u64) -> u64 {
1417            self.page_offset.wrapping_add(pa)
1418        }
1419
1420        fn write_u64(&mut self, pa: u64, val: u64) {
1421            let off = pa as usize;
1422            self.buf[off..off + 8].copy_from_slice(&val.to_ne_bytes());
1423        }
1424
1425        fn write_u32(&mut self, pa: u64, val: u32) {
1426            let off = pa as usize;
1427            self.buf[off..off + 4].copy_from_slice(&val.to_ne_bytes());
1428        }
1429
1430        /// Write a NUL-terminated (within `BPF_OBJ_NAME_LEN`) name
1431        /// blob at `pa`. Panics if the name needs a terminator but
1432        /// fills the whole field — every fixture name here is short.
1433        fn write_name(&mut self, pa: u64, name: &[u8]) {
1434            assert!(
1435                name.len() < BPF_OBJ_NAME_LEN,
1436                "fixture name must leave room for the NUL terminator",
1437            );
1438            let off = pa as usize;
1439            self.buf[off..off + name.len()].copy_from_slice(name);
1440        }
1441
1442        fn mem(&self) -> GuestMem {
1443            // SAFETY: `self.buf` is a live Vec<u8> owned by the
1444            // fixture; it outlives every GuestMem read in the test
1445            // because the fixture is dropped after the assertions.
1446            unsafe { GuestMem::new(self.buf.as_ptr() as *mut u8, self.buf.len() as u64) }
1447        }
1448
1449        fn walk(&self) -> WalkContext {
1450            WalkContext {
1451                cr3_pa: 0,
1452                page_offset: self.page_offset,
1453                l5: false,
1454                tcr_el1: 0,
1455            }
1456        }
1457    }
1458
    /// PA constants shared by the chain fixtures.
    ///
    /// Physical address of the `prog_idr` image (xarray head and
    /// `idr_next`); the walk's `prog_idr_kva` is this plus
    /// `START_KERNEL_MAP`.
    const FIX_IDR_PA: u64 = 0x0000;
    /// Physical address of the single `bpf_prog` image (type and aux
    /// pointer).
    const FIX_PROG_PA: u64 = 0x1000;
    /// Physical address the prog's `aux` pointer is wired to; the aux
    /// body itself is populated by each test.
    const FIX_AUX_PA: u64 = 0x2000;
1463
1464    /// Build a fixture whose prog_idr holds a single STRUCT_OPS prog
1465    /// (single-entry xarray: `xa_head` IS the prog KVA, `idr_next=1`).
1466    /// `prog_type` lets a caller override the type to exercise the
1467    /// non-struct_ops skip arm. The prog's `aux` pointer is wired but
1468    /// the aux body (name, verified_insns, used_maps, stats) is left
1469    /// for the caller to populate. Returns the fixture; the
1470    /// `prog_idr_kva` for the walk is `FIX_IDR_PA + START_KERNEL_MAP`.
1471    fn single_prog_fixture(
1472        size: usize,
1473        prog_type: u32,
1474        offsets: &BpfProgOffsets,
1475    ) -> ProgChainFixture {
1476        let mut fx = ProgChainFixture::new(size);
1477        let prog_kva = fx.pa_to_kva(FIX_PROG_PA);
1478        // Single-entry xarray leaf marker: prog_kva must have bits
1479        // 0-1 clear so `xa_is_node` treats it as a direct entry.
1480        assert_eq!(prog_kva & 3, 0, "prog_kva must be a leaf entry");
1481
1482        // IDR: xa_head = prog_kva, idr_next = 1.
1483        fx.write_u64(FIX_IDR_PA + offsets.idr_xa_head as u64, prog_kva);
1484        fx.write_u32(FIX_IDR_PA + offsets.idr_next as u64, 1);
1485
1486        // bpf_prog: type + aux pointer.
1487        fx.write_u32(FIX_PROG_PA + offsets.prog_type as u64, prog_type);
1488        fx.write_u64(
1489            FIX_PROG_PA + offsets.prog_aux as u64,
1490            fx.pa_to_kva(FIX_AUX_PA),
1491        );
1492        fx
1493    }
1494
1495    // ---- find_struct_ops_progs ----
1496
1497    /// Happy path: a single STRUCT_OPS prog whose aux carries a name
1498    /// and verified_insns count surfaces with both fields read from
1499    /// the synthetic offsets. Covers the `find_struct_ops_progs`
1500    /// payload closure (aux_verified_insns read + aux_name NUL-scan +
1501    /// String build) and the `for_each_struct_ops_prog` happy path.
1502    #[test]
1503    fn find_struct_ops_progs_single_prog_reads_name_and_verified_insns() {
1504        let offsets = synthetic_prog_offsets();
1505        let mut fx = single_prog_fixture(0x3000, BPF_PROG_TYPE_STRUCT_OPS, &offsets);
1506        fx.write_u32(FIX_AUX_PA + offsets.aux_verified_insns as u64, 12_345);
1507        fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"dispatch");
1508
1509        let progs = find_struct_ops_progs(
1510            &fx.mem(),
1511            fx.walk(),
1512            FIX_IDR_PA + START_KERNEL_MAP,
1513            &offsets,
1514            START_KERNEL_MAP,
1515            0,
1516        );
1517        assert_eq!(progs.len(), 1);
1518        assert_eq!(progs[0].name, "dispatch");
1519        assert_eq!(progs[0].verified_insns, 12_345u32);
1520    }
1521
1522    /// The `prog_type != BPF_PROG_TYPE_STRUCT_OPS { continue }` filter
1523    /// at `for_each_struct_ops_prog` line ~94: a prog whose type is
1524    /// not 27 (here KPROBE=2) is skipped before its aux is read, so
1525    /// the result is empty.
1526    #[test]
1527    fn find_struct_ops_progs_skips_non_struct_ops_type() {
1528        const BPF_PROG_TYPE_KPROBE: u32 = 2;
1529        let offsets = synthetic_prog_offsets();
1530        // Populate the aux body too, to prove the skip happens at the
1531        // type filter and not because aux was unreadable.
1532        let mut fx = single_prog_fixture(0x3000, BPF_PROG_TYPE_KPROBE, &offsets);
1533        fx.write_u32(FIX_AUX_PA + offsets.aux_verified_insns as u64, 999);
1534        fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"kprobe_prog");
1535
1536        let progs = find_struct_ops_progs(
1537            &fx.mem(),
1538            fx.walk(),
1539            FIX_IDR_PA + START_KERNEL_MAP,
1540            &offsets,
1541            START_KERNEL_MAP,
1542            0,
1543        );
1544        assert_eq!(progs.len(), 0);
1545        assert!(progs.is_empty());
1546    }
1547
1548    /// An all-zero IDR (xa_head left unwritten) yields an empty result:
1549    /// `for_each_struct_ops_prog` short-circuits on `xa_head == 0` before
1550    /// the id loop, but even without that guard an empty xarray surfaces
1551    /// no progs — so this pins the empty-IDR→empty outcome, not the
1552    /// short-circuit in isolation. Reached via `find_struct_ops_progs`.
1553    #[test]
1554    fn for_each_struct_ops_prog_empty_xa_head_returns_empty() {
1555        let offsets = synthetic_prog_offsets();
1556        // Default-zero buffer: do NOT write xa_head. idr_next is
1557        // irrelevant because the xa_head guard fires first.
1558        let fx = ProgChainFixture::new(0x3000);
1559
1560        let progs = find_struct_ops_progs(
1561            &fx.mem(),
1562            fx.walk(),
1563            FIX_IDR_PA + START_KERNEL_MAP,
1564            &offsets,
1565            START_KERNEL_MAP,
1566            0,
1567        );
1568        assert_eq!(progs.len(), 0);
1569        assert!(progs.is_empty());
1570    }
1571
1572    /// A corrupt `idr_next` of `u32::MAX` still returns the correct
1573    /// result — the one real prog at id 0 — and terminates, because the
1574    /// `.min(65536)` clamp on `idr_next` in `for_each_struct_ops_prog`
1575    /// bounds the loop. The single-entry xarray returns the prog for id 0
1576    /// and `Some(0)` for every id > 0 (see `idr::xa_load`). This pins the
1577    /// RESULT under a corrupt count; a clamp regression would surface as
1578    /// a slow run rather than a failed assertion (pinning the exact 65536
1579    /// boundary would need a multi-level xarray with an entry past the
1580    /// cap — out of scope here).
1581    #[test]
1582    fn for_each_struct_ops_prog_caps_idr_next_at_65536() {
1583        let offsets = synthetic_prog_offsets();
1584        let mut fx = single_prog_fixture(0x3000, BPF_PROG_TYPE_STRUCT_OPS, &offsets);
1585        // Overwrite idr_next with u32::MAX. Without the .min(65536)
1586        // clamp this loop would attempt ~4 billion iterations.
1587        fx.write_u32(FIX_IDR_PA + offsets.idr_next as u64, u32::MAX);
1588        fx.write_u32(FIX_AUX_PA + offsets.aux_verified_insns as u64, 7);
1589        fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"capped");
1590
1591        let progs = find_struct_ops_progs(
1592            &fx.mem(),
1593            fx.walk(),
1594            FIX_IDR_PA + START_KERNEL_MAP,
1595            &offsets,
1596            START_KERNEL_MAP,
1597            0,
1598        );
1599        assert_eq!(progs.len(), 1);
1600        assert_eq!(progs[0].name, "capped");
1601        assert_eq!(progs[0].verified_insns, 7u32);
1602    }
1603
1604    // ---- walk_struct_ops_runtime_stats ----
1605
1606    /// The `if stats_percpu_kva == 0 { return None }` skip at
1607    /// `walk_struct_ops_runtime_stats` line ~533: a prog whose
1608    /// `prog_stats` per-CPU base is NULL is dropped from the result
1609    /// (closure returns None -> not pushed). prog_stats is left
1610    /// unwritten (zero) on the single STRUCT_OPS prog.
1611    #[test]
1612    fn walk_runtime_stats_skips_prog_with_null_stats_pointer() {
1613        let offsets = synthetic_prog_offsets();
1614        let mut fx = single_prog_fixture(0x3000, BPF_PROG_TYPE_STRUCT_OPS, &offsets);
1615        fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"no_stats");
1616        // Deliberately leave prog_stats == 0 (do not write it).
1617
1618        let per_cpu_offsets = vec![0u64];
1619        let stats = walk_struct_ops_runtime_stats(
1620            &fx.mem(),
1621            fx.walk(),
1622            FIX_IDR_PA + START_KERNEL_MAP,
1623            &offsets,
1624            &per_cpu_offsets,
1625            START_KERNEL_MAP,
1626            0,
1627        );
1628        assert_eq!(stats.len(), 0);
1629        assert!(stats.is_empty());
1630    }
1631
1632    /// Multi-CPU accumulation: with two distinct, non-zero per-CPU
1633    /// offsets the walker translates both per-CPU `bpf_prog_stats`
1634    /// blocks and `saturating_add`s `cnt`/`nsecs`/`misses` across
1635    /// them. Covers the per-CPU sum loop at
1636    /// `walk_struct_ops_runtime_stats` lines ~544-623 for >1 CPU.
1637    #[test]
1638    fn walk_runtime_stats_sums_across_two_cpus() {
1639        let offsets = synthetic_prog_offsets();
1640        // Buffer must hold both stats blocks. stats0 @0x3000,
1641        // stats1 @0x3800 — both within a 0x4000 buffer.
1642        let stats_pa0: u64 = 0x3000;
1643        let stats_pa1: u64 = 0x3800;
1644        let mut fx = single_prog_fixture(0x4000, BPF_PROG_TYPE_STRUCT_OPS, &offsets);
1645        fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"two_cpu");
1646        // prog_stats per-CPU base = stats_pa0's KVA. cpu_off shifts it.
1647        fx.write_u64(
1648            FIX_PROG_PA + offsets.prog_stats as u64,
1649            fx.pa_to_kva(stats_pa0),
1650        );
1651
1652        // Distinct small counters per block.
1653        let cnt0: u64 = 10;
1654        let nsecs0: u64 = 100;
1655        let misses0: u64 = 1;
1656        let cnt1: u64 = 7;
1657        let nsecs1: u64 = 70;
1658        let misses1: u64 = 3;
1659        fx.write_u64(stats_pa0 + offsets.stats_cnt as u64, cnt0);
1660        fx.write_u64(stats_pa0 + offsets.stats_nsecs as u64, nsecs0);
1661        fx.write_u64(stats_pa0 + offsets.stats_misses as u64, misses0);
1662        fx.write_u64(stats_pa1 + offsets.stats_cnt as u64, cnt1);
1663        fx.write_u64(stats_pa1 + offsets.stats_nsecs as u64, nsecs1);
1664        fx.write_u64(stats_pa1 + offsets.stats_misses as u64, misses1);
1665
1666        // cpu0: cpu_off=0 reads stats_pa0 (BSP). cpu1: cpu_off=delta
1667        // reads stats_pa0+delta = stats_pa1.
1668        let per_cpu_offsets = vec![0u64, stats_pa1 - stats_pa0];
1669        let stats = walk_struct_ops_runtime_stats(
1670            &fx.mem(),
1671            fx.walk(),
1672            FIX_IDR_PA + START_KERNEL_MAP,
1673            &offsets,
1674            &per_cpu_offsets,
1675            START_KERNEL_MAP,
1676            0,
1677        );
1678        assert_eq!(stats.len(), 1);
1679        assert_eq!(stats[0].cnt, cnt0 + cnt1);
1680        assert_eq!(stats[0].nsecs, nsecs0 + nsecs1);
1681        assert_eq!(stats[0].misses, misses0 + misses1);
1682    }
1683
1684    /// The `if cpu_off == 0 && cpu_index > 0 { continue }` BSS-zero-
1685    /// tail guard at `walk_struct_ops_runtime_stats` lines ~556-558:
1686    /// a trailing `__per_cpu_offset[]=0` slot (cpu_index > 0) must be
1687    /// skipped so CPU 0's stats are NOT double-counted. With
1688    /// `per_cpu_offsets = [0, 0]` the summed fields equal the single
1689    /// block's values, not twice them — a regression that dropped the
1690    /// `cpu_index > 0` guard would double them.
1691    #[test]
1692    fn walk_runtime_stats_skips_zero_offset_tail_cpu() {
1693        let offsets = synthetic_prog_offsets();
1694        let stats_pa: u64 = 0x3000;
1695        let mut fx = single_prog_fixture(0x4000, BPF_PROG_TYPE_STRUCT_OPS, &offsets);
1696        fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"bss_tail");
1697        fx.write_u64(
1698            FIX_PROG_PA + offsets.prog_stats as u64,
1699            fx.pa_to_kva(stats_pa),
1700        );
1701
1702        let cnt0: u64 = 42;
1703        let nsecs0: u64 = 4200;
1704        let misses0: u64 = 5;
1705        fx.write_u64(stats_pa + offsets.stats_cnt as u64, cnt0);
1706        fx.write_u64(stats_pa + offsets.stats_nsecs as u64, nsecs0);
1707        fx.write_u64(stats_pa + offsets.stats_misses as u64, misses0);
1708
1709        // Two slots: cpu0 (cpu_off=0, BSP, allowed) reads stats_pa;
1710        // cpu1 (cpu_off=0, cpu_index=1) is skipped by the guard.
1711        let per_cpu_offsets = vec![0u64, 0u64];
1712        let stats = walk_struct_ops_runtime_stats(
1713            &fx.mem(),
1714            fx.walk(),
1715            FIX_IDR_PA + START_KERNEL_MAP,
1716            &offsets,
1717            &per_cpu_offsets,
1718            START_KERNEL_MAP,
1719            0,
1720        );
1721        assert_eq!(stats.len(), 1);
1722        assert_eq!(stats[0].cnt, cnt0);
1723        assert_eq!(stats[0].nsecs, nsecs0);
1724        assert_eq!(stats[0].misses, misses0);
1725    }
1726
1727    // ---- find_active_struct_ops_obj_no_target ----
1728
    /// PA constants for the active-obj fixture's used_maps array and
    /// map structs.
    ///
    /// Offset of the `used_maps` pointer array that the tests point
    /// `aux->used_maps` at; entries are written as consecutive
    /// 8-byte KVAs starting here.
    const FIX_USED_MAPS_PA: u64 = 0x3000;
    /// Offset of the first synthetic map struct (entry 0 of the array).
    const FIX_MAP0_PA: u64 = 0x4000;
    /// Offset of the second synthetic map struct (entry 1 of the array).
    const FIX_MAP1_PA: u64 = 0x5000;
1734
1735    /// Happy path: a STRUCT_OPS prog whose aux->used_maps holds two
1736    /// map pointers — a struct_ops map (no global-section suffix) and
1737    /// a `<obj>.bss` global-section map — resolves to the obj prefix
1738    /// and the full used_map_kvas snapshot. Covers
1739    /// `find_active_struct_ops_obj_no_target`: used_maps!=0,
1740    /// used_map_cnt!=0, the entries snapshot loop, the per-map name
1741    /// read + `extract_global_section_obj_prefix` match.
1742    #[test]
1743    fn find_active_struct_ops_obj_returns_obj_prefix_from_bss_map() {
1744        let prog_offsets = synthetic_prog_offsets();
1745        let map_offsets = BpfMapOffsets {
1746            map_name: 0,
1747            ..BpfMapOffsets::EMPTY
1748        };
1749        let mut fx = single_prog_fixture(0x6000, BPF_PROG_TYPE_STRUCT_OPS, &prog_offsets);
1750
1751        // aux->used_maps = used_maps array KVA, used_map_cnt = 2.
1752        let map0_kva = fx.pa_to_kva(FIX_MAP0_PA);
1753        let map1_kva = fx.pa_to_kva(FIX_MAP1_PA);
1754        fx.write_u64(
1755            FIX_AUX_PA + prog_offsets.aux_used_maps as u64,
1756            fx.pa_to_kva(FIX_USED_MAPS_PA),
1757        );
1758        fx.write_u32(FIX_AUX_PA + prog_offsets.aux_used_map_cnt as u64, 2);
1759        fx.write_u64(FIX_USED_MAPS_PA, map0_kva);
1760        fx.write_u64(FIX_USED_MAPS_PA + 8, map1_kva);
1761
1762        // map0: struct_ops map name (no suffix -> no match).
1763        fx.write_name(FIX_MAP0_PA + map_offsets.map_name as u64, b"ktstr_ops");
1764        // map1: global-section .bss map (matches -> obj "bpf_bpf").
1765        fx.write_name(FIX_MAP1_PA + map_offsets.map_name as u64, b"bpf_bpf.bss");
1766
1767        let result = find_active_struct_ops_obj_no_target(
1768            &fx.mem(),
1769            fx.walk(),
1770            FIX_IDR_PA + START_KERNEL_MAP,
1771            &prog_offsets,
1772            &map_offsets,
1773            START_KERNEL_MAP,
1774            0,
1775        );
1776        let m = result.unwrap();
1777        assert_eq!(m.obj_name, "bpf_bpf");
1778        assert_eq!(m.used_map_kvas, vec![map0_kva, map1_kva]);
1779    }
1780
1781    /// The closure-returns-None path: every map in the snapshot is
1782    /// scanned but none matches a global-section suffix, so the
1783    /// closure returns None and `.into_iter().next()` yields None.
1784    /// Distinct from the null-used_maps and zero-cnt early skips.
1785    #[test]
1786    fn find_active_struct_ops_obj_none_when_no_global_section_map() {
1787        let prog_offsets = synthetic_prog_offsets();
1788        let map_offsets = BpfMapOffsets {
1789            map_name: 0,
1790            ..BpfMapOffsets::EMPTY
1791        };
1792        let mut fx = single_prog_fixture(0x6000, BPF_PROG_TYPE_STRUCT_OPS, &prog_offsets);
1793
1794        fx.write_u64(
1795            FIX_AUX_PA + prog_offsets.aux_used_maps as u64,
1796            fx.pa_to_kva(FIX_USED_MAPS_PA),
1797        );
1798        fx.write_u32(FIX_AUX_PA + prog_offsets.aux_used_map_cnt as u64, 2);
1799        fx.write_u64(FIX_USED_MAPS_PA, fx.pa_to_kva(FIX_MAP0_PA));
1800        fx.write_u64(FIX_USED_MAPS_PA + 8, fx.pa_to_kva(FIX_MAP1_PA));
1801
1802        // Both maps lack any .bss/.data/.rodata suffix.
1803        fx.write_name(FIX_MAP0_PA + map_offsets.map_name as u64, b"ktstr_ops");
1804        fx.write_name(FIX_MAP1_PA + map_offsets.map_name as u64, b"bpf_runq");
1805
1806        let result = find_active_struct_ops_obj_no_target(
1807            &fx.mem(),
1808            fx.walk(),
1809            FIX_IDR_PA + START_KERNEL_MAP,
1810            &prog_offsets,
1811            &map_offsets,
1812            START_KERNEL_MAP,
1813            0,
1814        );
1815        assert!(result.is_none());
1816    }
1817
1818    /// A STRUCT_OPS prog whose aux->used_maps pointer is NULL yields no
1819    /// active obj: `find_active_struct_ops_obj` returns None on the
1820    /// `used_maps_kva == 0` skip. The all-zero fixture also has
1821    /// used_map_cnt == 0 (whose guard would likewise return None), so
1822    /// this pins the NULL-used_maps→None outcome, not that one guard in
1823    /// isolation. used_maps is left unwritten (zero).
1824    #[test]
1825    fn find_active_struct_ops_obj_none_when_used_maps_null() {
1826        let prog_offsets = synthetic_prog_offsets();
1827        let map_offsets = BpfMapOffsets {
1828            map_name: 0,
1829            ..BpfMapOffsets::EMPTY
1830        };
1831        // aux->used_maps left 0; used_map_cnt irrelevant.
1832        let fx = single_prog_fixture(0x6000, BPF_PROG_TYPE_STRUCT_OPS, &prog_offsets);
1833
1834        let result = find_active_struct_ops_obj_no_target(
1835            &fx.mem(),
1836            fx.walk(),
1837            FIX_IDR_PA + START_KERNEL_MAP,
1838            &prog_offsets,
1839            &map_offsets,
1840            START_KERNEL_MAP,
1841            0,
1842        );
1843        assert!(result.is_none());
1844    }
1845
1846    /// The `.min(MAX_USED_MAPS)` clamp on used_map_cnt: a corrupt
1847    /// used_map_cnt of 70 (> MAX_USED_MAPS=64) must bound the snapshot
1848    /// loop to exactly 64 reads. Entry index 1 is a global-section
1849    /// .bss map so the prefix still resolves, but the captured
1850    /// used_map_kvas vector is capped at 64. The entries beyond index
1851    /// 1 are never translated (the match returns at index 1), so only
1852    /// maps 0 and 1 need a real backing struct; the rest are non-zero
1853    /// KVAs that only enter the snapshot.
1854    #[test]
1855    fn find_active_struct_ops_obj_caps_used_map_cnt_at_max_used_maps() {
1856        let prog_offsets = synthetic_prog_offsets();
1857        let map_offsets = BpfMapOffsets {
1858            map_name: 0,
1859            ..BpfMapOffsets::EMPTY
1860        };
1861        let mut fx = single_prog_fixture(0x6000, BPF_PROG_TYPE_STRUCT_OPS, &prog_offsets);
1862
1863        // used_maps array of 70 non-zero entries (> MAX_USED_MAPS).
1864        const CORRUPT_CNT: u32 = 70;
1865        let map0_kva = fx.pa_to_kva(FIX_MAP0_PA);
1866        let map1_kva = fx.pa_to_kva(FIX_MAP1_PA);
1867        fx.write_u64(
1868            FIX_AUX_PA + prog_offsets.aux_used_maps as u64,
1869            fx.pa_to_kva(FIX_USED_MAPS_PA),
1870        );
1871        fx.write_u32(
1872            FIX_AUX_PA + prog_offsets.aux_used_map_cnt as u64,
1873            CORRUPT_CNT,
1874        );
1875        fx.write_u64(FIX_USED_MAPS_PA, map0_kva);
1876        fx.write_u64(FIX_USED_MAPS_PA + 8, map1_kva);
1877        // Entries 2..70: arbitrary non-zero KVAs (never translated —
1878        // the match returns at index 1). They only populate the
1879        // snapshot, so the clamp is what bounds the loop.
1880        for i in 2..CORRUPT_CNT as u64 {
1881            fx.write_u64(FIX_USED_MAPS_PA + i * 8, 0xDEAD_0000 + i);
1882        }
1883
1884        // map0: struct_ops name (no match). map1: .bss (match).
1885        fx.write_name(FIX_MAP0_PA + map_offsets.map_name as u64, b"ktstr_ops");
1886        fx.write_name(FIX_MAP1_PA + map_offsets.map_name as u64, b"bpf_bpf.bss");
1887
1888        let result = find_active_struct_ops_obj_no_target(
1889            &fx.mem(),
1890            fx.walk(),
1891            FIX_IDR_PA + START_KERNEL_MAP,
1892            &prog_offsets,
1893            &map_offsets,
1894            START_KERNEL_MAP,
1895            0,
1896        );
1897        let m = result.unwrap();
1898        assert_eq!(m.obj_name, "bpf_bpf");
1899        assert!(m.used_map_kvas.len() <= MAX_USED_MAPS as usize);
1900        assert_eq!(m.used_map_kvas.len(), 64);
1901    }
1902}
ktstr/monitor/bpf_prog.rs

ktstr/monitor/
bpf_prog.rs