ktstr/monitor/bpf_prog.rs
1//! Host-side BPF program enumeration via guest physical memory.
2//!
3//! Walks the kernel's `prog_idr` xarray from the host to discover
4//! loaded BPF programs and read verifier stats from `bpf_prog_aux`.
5//! No guest cooperation is needed — all reads go through the guest
6//! physical memory mapping.
7
8use super::btf_offsets::{BpfMapOffsets, BpfProgOffsets};
9use super::idr::{translate_any_kva, xa_load};
10use super::reader::{GuestMem, WalkContext};
11use super::symbols::text_kva_to_pa_with_base;
12
/// `BPF_PROG_TYPE_STRUCT_OPS` from include/uapi/linux/bpf.h.
///
/// The `prog_idr` walk in this module compares each program's
/// `prog_type` field against this value and skips everything else.
const BPF_PROG_TYPE_STRUCT_OPS: u32 = 27;
15
/// Maximum `used_map_cnt` the walker will iterate. The kernel
/// enforces a per-prog limit of 64 used_maps in `kernel/bpf/verifier.c`
/// (`MAX_USED_MAPS = 64`), so a larger count read from guest memory
/// means the read raced against `bpf_prog_bind_map`'s "increment cnt,
/// then swap pointer" sequence and got a stale-pointer + new-cnt
/// observation. Clamping the count to the kernel's own limit bounds
/// how far the walk can run past the old allocation, and matches the
/// upper bound a healthy prog can ever reach.
pub const MAX_USED_MAPS: u32 = 64;
25
/// `BPF_OBJ_NAME_LEN` from include/uapi/linux/bpf.h.
///
/// Size in bytes of the fixed-width name buffers this module reads
/// from `bpf_prog_aux` and `bpf_map`. Readers stop at the first NUL
/// byte and fall back to the full buffer when none is present.
const BPF_OBJ_NAME_LEN: usize = 16;
28
29/// Iterate every alive `BPF_PROG_TYPE_STRUCT_OPS` prog in the
30/// kernel's `prog_idr`, invoking `payload` with each prog's
31/// `(prog_pa, aux_pa, aux_kva)`. The closure returns `Option<T>`;
32/// `Some(value)` is appended to the result vector, `None` skips.
33///
34/// Encapsulates the `prog_idr` walk shared by every per-struct-ops-
35/// prog reader in this module: translate `prog_idr_kva` → `idr_pa`,
36/// read the xarray head, iterate ids 0..idr_next (capped at 65536
37/// for safety against corrupted reads — a real kernel never
38/// approaches that limit), `xa_load` each entry, translate to
39/// `prog_pa`, filter on `prog_type == BPF_PROG_TYPE_STRUCT_OPS`,
40/// then translate `aux_kva` → `aux_pa`. Translation failures or
41/// zero pointers cause the entry to be skipped silently — matches
42/// the prior per-walker behavior and is the right policy under
43/// race conditions (torn reads from slab recycling) where the
44/// alternative is to publish garbage.
45fn for_each_struct_ops_prog<T, F>(
46 mem: &GuestMem,
47 walk: WalkContext,
48 prog_idr_kva: u64,
49 offsets: &BpfProgOffsets,
50 start_kernel_map: u64,
51 phys_base: u64,
52 mut payload: F,
53) -> Vec<T>
54where
55 F: FnMut(u64, u64, u64) -> Option<T>,
56{
57 let idr_pa = text_kva_to_pa_with_base(prog_idr_kva, start_kernel_map, phys_base);
58
59 let xa_head = mem.read_u64(idr_pa, offsets.idr_xa_head);
60 if xa_head == 0 {
61 return Vec::new();
62 }
63 // Cap at 64K entries. A real kernel never has millions of BPF
64 // programs; a larger `idr_next` means the PA is wrong or the
65 // IDR is corrupt. Bounds runaway loops on garbage reads.
66 let idr_next = mem.read_u32(idr_pa, offsets.idr_next).min(65536);
67
68 let mut out = Vec::new();
69 for id in 0..idr_next {
70 let Some(entry) = xa_load(
71 mem,
72 walk.page_offset,
73 xa_head,
74 id as u64,
75 offsets.xa_node_slots,
76 offsets.xa_node_shift,
77 ) else {
78 continue;
79 };
80 if entry == 0 {
81 continue;
82 }
83 let Some(prog_pa) = translate_any_kva(
84 mem,
85 walk.cr3_pa,
86 walk.page_offset,
87 entry,
88 walk.l5,
89 walk.tcr_el1,
90 ) else {
91 continue;
92 };
93 let prog_type = mem.read_u32(prog_pa, offsets.prog_type);
94 if prog_type != BPF_PROG_TYPE_STRUCT_OPS {
95 continue;
96 }
97 let aux_kva = mem.read_u64(prog_pa, offsets.prog_aux);
98 if aux_kva == 0 {
99 continue;
100 }
101 let Some(aux_pa) = translate_any_kva(
102 mem,
103 walk.cr3_pa,
104 walk.page_offset,
105 aux_kva,
106 walk.l5,
107 walk.tcr_el1,
108 ) else {
109 continue;
110 };
111 if let Some(value) = payload(prog_pa, aux_pa, aux_kva) {
112 out.push(value);
113 }
114 }
115 out
116}
117
/// Per-program BPF verifier statistics collected from the host.
///
/// One value is produced per struct_ops program found by the
/// `prog_idr` walk. Serializable so it can travel with other
/// collected reports.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ProgVerifierStats {
    /// Program name as registered with the kernel, read from
    /// `bpf_prog_aux->name` (NUL-trimmed, lossily decoded as UTF-8).
    pub name: String,
    /// Instructions processed by the verifier (path-exploration count,
    /// not static program size), from `bpf_prog_aux->verified_insns`.
    pub verified_insns: u32,
}
127
128/// Enumerate struct_ops BPF programs from the kernel's `prog_idr`.
129///
130/// Reads `prog_idr` from guest memory, walks the xarray, and for
131/// each `bpf_prog` with `type == BPF_PROG_TYPE_STRUCT_OPS`, reads
132/// `aux->verified_insns` and `aux->name`. `start_kernel_map` is the
133/// runtime kernel image base used to translate `prog_idr_kva` to a
134/// guest physical address.
135pub(crate) fn find_struct_ops_progs(
136 mem: &GuestMem,
137 walk: WalkContext,
138 prog_idr_kva: u64,
139 offsets: &BpfProgOffsets,
140 start_kernel_map: u64,
141 phys_base: u64,
142) -> Vec<ProgVerifierStats> {
143 for_each_struct_ops_prog(
144 mem,
145 walk,
146 prog_idr_kva,
147 offsets,
148 start_kernel_map,
149 phys_base,
150 |_prog_pa, aux_pa, _aux_kva| {
151 let verified_insns = mem.read_u32(aux_pa, offsets.aux_verified_insns);
152 let mut name_buf = [0u8; BPF_OBJ_NAME_LEN];
153 mem.read_bytes(aux_pa + offsets.aux_name as u64, &mut name_buf);
154 let name_len = name_buf
155 .iter()
156 .position(|&b| b == 0)
157 .unwrap_or(BPF_OBJ_NAME_LEN);
158 let name = String::from_utf8_lossy(&name_buf[..name_len]).to_string();
159 Some(ProgVerifierStats {
160 name,
161 verified_insns,
162 })
163 },
164 )
165}
166
167/// Target-free active-scheduler walker. See trait method
168/// [`BpfProgAccessor::find_active_struct_ops_obj_no_target`] for the
169/// motivation (Phase 0 sched_kva==value_kva equality is broken on
170/// kernels where `struct scx_sched` allocates fresh and copies
171/// `sched_ext_ops` into its embedded `ops` field).
172///
173/// Walks `prog_idr` for the FIRST `BPF_PROG_TYPE_STRUCT_OPS` prog
174/// whose `aux->used_maps` carries a sibling `<obj>.bss/.data/.rodata`
175/// global-section map. Returns that prog's obj prefix + full
176/// used_map_kvas snapshot. Returns `None` when no such prog exists
177/// (no scheduler attached, or only non-libbpf STRUCT_OPS subsystems
178/// active).
179///
180/// **Threat model: ktstr guest VM is single-tenant.** This walker
181/// returns the FIRST match in `prog_idr` iteration order — it does
182/// not assert uniqueness. ktstr-loaded guests are minimal (only
183/// scx-ktstr runs), so in practice only the live sched_ext
184/// scheduler's prog satisfies the filter. Two reinforcing reasons:
185///
186/// 1. Sched_ext is the only struct_ops subsystem ktstr loads.
187/// 2. The kernel enforces single-ENABLE for sched_ext: the enable
188/// path rejects a second scheduler while one is already enabled
189/// (pre-6.16: `scx_ops_enable_state() != SCX_OPS_DISABLED` ->
190/// `-EBUSY`; the accessor and states were renamed on later
191/// kernels -- 6.16's `scx_enable_state()`/`SCX_DISABLED`), so at
192/// most one sched_ext STRUCT_OPS prog is ENABLED at a time.
193/// This does NOT by itself guarantee the OLD prog has left
194/// `prog_idr` before the NEW prog is added: a detached struct_ops
195/// prog leaves `prog_idr` only when its owning struct_ops MAP's
196/// last userspace fd closes AND an RCU grace elapses (map free is
197/// RCU-deferred, see `kernel/bpf/bpf_struct_ops.c`), and the
198/// kernel does not serialize old-removal before new-add. The
199/// single-alive-prog property this walker relies on is therefore
200/// ktstr's swap sequencing -- `Op::ReplaceScheduler` kills the
201/// outgoing scheduler and waits for its process to exit (closing
202/// the outgoing map's fds) before loading the next -- not a kernel
203/// ordering invariant.
204///
205/// If a future setup loads non-sched_ext libbpf-named STRUCT_OPS
206/// progs (e.g. `tcp_congestion_ops`), this filter would need to also
207/// gate on `aux->btf` matching the sched_ext_ops btf type id.
208///
209/// Standard `prog_idr` walk: read xa_head → iterate ids 0..idr_next
210/// → translate each prog kva → filter to STRUCT_OPS → read aux's
211/// used_maps → derive obj prefix from any global-section sibling
212/// map.
213pub(crate) fn find_active_struct_ops_obj_no_target(
214 mem: &GuestMem,
215 walk: WalkContext,
216 prog_idr_kva: u64,
217 prog_offsets: &BpfProgOffsets,
218 map_offsets: &BpfMapOffsets,
219 start_kernel_map: u64,
220 phys_base: u64,
221) -> Option<ActiveObjMatch> {
222 // Returns the FIRST matching prog's ActiveObjMatch via Some,
223 // skips non-matching progs with None. `into_iter().next()`
224 // extracts that single match below.
225 for_each_struct_ops_prog(
226 mem,
227 walk,
228 prog_idr_kva,
229 prog_offsets,
230 start_kernel_map,
231 phys_base,
232 |_prog_pa, aux_pa, _aux_kva| {
233 // Read used_maps pointer FIRST, then cnt — pairs with
234 // bpf_prog_bind_map's cnt-then-pointer mutation order
235 // (kernel/bpf/syscall.c): the kernel bumps cnt before
236 // swapping the pointer, so cnt-then-pointer reads
237 // would index past the old allocation on a
238 // mid-mutation read. pointer-then-cnt observes cnt ≤
239 // pointer's slot count. Safe under freeze-rendezvous
240 // (vCPUs paused) but the protocol is defense-in-depth
241 // for any out-of-freeze caller.
242 let used_maps_kva = mem.read_u64(aux_pa, prog_offsets.aux_used_maps);
243 if used_maps_kva == 0 {
244 return None;
245 }
246 let used_map_cnt = mem
247 .read_u32(aux_pa, prog_offsets.aux_used_map_cnt)
248 .min(MAX_USED_MAPS);
249 if used_map_cnt == 0 {
250 return None;
251 }
252 let used_maps_pa = translate_any_kva(
253 mem,
254 walk.cr3_pa,
255 walk.page_offset,
256 used_maps_kva,
257 walk.l5,
258 walk.tcr_el1,
259 )?;
260
261 // Snapshot every non-zero used_maps entry (downstream
262 // disambiguation needs the full set as the KVA
263 // whitelist).
264 let mut entries: Vec<u64> = Vec::with_capacity(used_map_cnt as usize);
265 for i in 0..used_map_cnt {
266 let entry_kva = mem.read_u64(used_maps_pa, (i as usize) * 8);
267 if entry_kva != 0 {
268 entries.push(entry_kva);
269 }
270 }
271 // Find a global-section map in the snapshot and derive
272 // the obj prefix. If none, this isn't a libbpf-loaded
273 // scheduler prog (could be a different struct_ops
274 // subsystem like tcp_congestion_ops without libbpf-
275 // named global maps) — return None to skip.
276 for &map_kva in &entries {
277 let Some(map_pa) = translate_any_kva(
278 mem,
279 walk.cr3_pa,
280 walk.page_offset,
281 map_kva,
282 walk.l5,
283 walk.tcr_el1,
284 ) else {
285 continue;
286 };
287 let mut name_buf = [0u8; BPF_OBJ_NAME_LEN];
288 mem.read_bytes(map_pa + map_offsets.map_name as u64, &mut name_buf);
289 let name_len = name_buf
290 .iter()
291 .position(|&b| b == 0)
292 .unwrap_or(BPF_OBJ_NAME_LEN);
293 let Ok(name) = std::str::from_utf8(&name_buf[..name_len]) else {
294 continue;
295 };
296 if let Some(obj) = extract_global_section_obj_prefix(name) {
297 return Some(ActiveObjMatch {
298 obj_name: obj.to_string(),
299 used_map_kvas: entries,
300 });
301 }
302 }
303 None
304 },
305 )
306 .into_iter()
307 .next()
308}
309
/// Result of [`find_active_struct_ops_obj_no_target`]: the matched
/// scheduler's obj prefix plus the full set of used_maps KVAs from
/// the matched prog's aux table. The KVA set lets the consumer
/// distinguish two scheduler instances loaded from the SAME binary
/// (whose maps share an obj prefix but live at distinct kernel
/// addresses) — see
/// [`crate::scenario::snapshot::Snapshot::active`] for the
/// downstream filter that combines (obj-name match AND KVA-in-set)
/// to defend against KVA aliasing across captures.
#[derive(Debug, Clone)]
pub(crate) struct ActiveObjMatch {
    /// Obj prefix derived from the first `<obj>.bss` / `<obj>.data` /
    /// `<obj>.rodata` map name found in the prog's `used_maps`.
    pub obj_name: String,
    /// Kernel virtual addresses of every non-zero `used_maps` entry of
    /// the matched prog, in table order.
    pub used_map_kvas: Vec<u64>,
}
324
/// Return `<obj>` when `map_name` has the libbpf global-section shape
/// `<obj>.bss`, `<obj>.data` or `<obj>.rodata`; return `None` for any
/// other map name (struct_ops `ktstr_ops`, libbpf-named kfunc helpers,
/// hashtables, etc.) and for a bare suffix with nothing in front of
/// it. Used by the active-obj walker to derive a scheduler obj prefix
/// from a prog's `used_maps` entries.
///
/// The returned prefix is whatever precedes the suffix in the map
/// name, so it may already be shortened: libbpf's `internal_map_name`
/// (tools/lib/bpf/libbpf.c) truncates the obj name so that prefix plus
/// section suffix fits within `BPF_OBJ_NAME_LEN - 1` bytes. Callers
/// must match against that same truncated prefix when
/// cross-referencing the captured global-section maps.
fn extract_global_section_obj_prefix(map_name: &str) -> Option<&str> {
    // Checked in this order; the first suffix that matches with a
    // non-empty prefix wins.
    const GLOBAL_SECTION_SUFFIXES: [&str; 3] = [".bss", ".data", ".rodata"];

    GLOBAL_SECTION_SUFFIXES.iter().find_map(|&suffix| {
        map_name
            .strip_suffix(suffix)
            .filter(|obj| !obj.is_empty())
    })
}
347
/// Unit tests for `extract_global_section_obj_prefix`: one positive
/// case per recognised suffix plus the rejection cases.
#[cfg(test)]
mod extract_global_section_obj_prefix_tests {
    use super::extract_global_section_obj_prefix as obj_prefix;

    /// `<obj>.bss` yields `<obj>`.
    #[test]
    fn extracts_bss_prefix() {
        let got = obj_prefix("ktstr.bss");
        assert_eq!(got, Some("ktstr"));
    }

    /// `<obj>.data` yields `<obj>`, underscores included.
    #[test]
    fn extracts_data_prefix() {
        let got = obj_prefix("scx_layered.data");
        assert_eq!(got, Some("scx_layered"));
    }

    /// `<obj>.rodata` yields `<obj>`.
    #[test]
    fn extracts_rodata_prefix() {
        let got = obj_prefix("mitosis.rodata");
        assert_eq!(got, Some("mitosis"));
    }

    /// struct_ops map names carry no section suffix.
    #[test]
    fn rejects_struct_ops_map_name() {
        for name in ["ktstr_ops", "mitosis_ops"] {
            assert!(obj_prefix(name).is_none());
        }
    }

    /// Ordinary map names carry no section suffix either.
    #[test]
    fn rejects_unrelated_map_name() {
        for name in ["scx_per_task", "bpf_runq"] {
            assert!(obj_prefix(name).is_none());
        }
    }

    /// A bare ".bss" has no obj in front of it — degenerate, skipped.
    #[test]
    fn rejects_empty_prefix_before_suffix() {
        assert!(obj_prefix(".bss").is_none());
    }
}
394
/// Per-program runtime stats summed across all CPUs.
///
/// Mirrors the kernel's `struct bpf_prog_stats` (include/linux/filter.h):
/// `cnt` (invocations), `nsecs` (cumulative runtime), `misses` (recursion
/// re-entries skipped via `bpf_prog_inc_misses_counter`,
/// kernel/bpf/syscall.c). All three counters are u64 monotonics summed
/// across the program's per-CPU `bpf_prog_stats` slots.
///
/// Saturation contract: the guest-memory walker accumulates the
/// per-CPU values with `saturating_add`, so a sum that would exceed
/// `u64::MAX` (e.g. from scrambled per-CPU pages) is pinned at
/// `u64::MAX` instead of wrapping or panicking.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct ProgRuntimeStats {
    /// Program name as registered with the kernel.
    pub name: String,
    /// Total invocation count across all CPUs.
    pub cnt: u64,
    /// Total CPU time in nanoseconds across all CPUs.
    pub nsecs: u64,
    /// Total recursion misses across all CPUs. A miss is a re-entry
    /// attempt blocked by the program's per-CPU recursion guard.
    pub misses: u64,
}
414
415impl ProgRuntimeStats {
416 /// Mean nanoseconds per invocation: `nsecs / cnt`. Returns
417 /// `0.0` when `cnt == 0` (program never ran or counter not
418 /// running) so the result never propagates `NaN` / `Infinity`
419 /// into downstream `finite_or_zero` filters. Method-only access
420 /// (no stored shadow) — recomputed every call from the raw
421 /// fields, matching the [`super::super::assert::CgroupStats::wake_latency_tail_ratio`]
422 /// derived-ratio convention.
423 ///
424 /// Unitless-from-bpftop's perspective: bpftop-style triage
425 /// reads "ns/call" as the primary cost-per-invocation metric;
426 /// surfacing it here lets a failure-dump consumer compare two
427 /// programs' per-call cost without dividing the wire counters
428 /// manually.
429 pub fn ns_per_call(&self) -> f64 {
430 if self.cnt > 0 {
431 self.nsecs as f64 / self.cnt as f64
432 } else {
433 0.0
434 }
435 }
436
437 /// Fraction of invocation attempts blocked by the per-CPU
438 /// recursion guard: `misses / (cnt + misses)`. Returns `0.0`
439 /// when both counters are zero (no signal); never produces
440 /// `NaN` / `Infinity` even on a saturated `cnt + misses`
441 /// overflow because `saturating_add` floors at `u64::MAX` and
442 /// the resulting denominator is non-zero.
443 ///
444 /// A non-trivial miss rate signals lock contention or a
445 /// misconfigured recursion guard — bpftop-style triage flags
446 /// any program with `miss_rate > 0.01` as a hot recursion
447 /// path. Method-only access (no stored shadow); the wire
448 /// format carries `cnt` and `misses` separately so consumers
449 /// who want the raw counts can recover them.
450 pub fn miss_rate(&self) -> f64 {
451 let total = self.cnt.saturating_add(self.misses);
452 if total > 0 {
453 self.misses as f64 / total as f64
454 } else {
455 0.0
456 }
457 }
458}
459
460impl std::fmt::Display for ProgRuntimeStats {
461 /// One-line summary used by [`super::dump::FailureDumpReport`]'s
462 /// human-readable rendering: name + the three counter sums plus
463 /// the bpftop-style derived metrics (ns/call, miss-rate fraction).
464 /// Derived metrics elide when their guards fire (cnt==0 or
465 /// cnt+misses==0) so a program that never ran renders without
466 /// misleading "0.000 ns/call" noise.
467 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
468 write!(
469 f,
470 "{}: cnt={} nsecs={} misses={}",
471 self.name, self.cnt, self.nsecs, self.misses
472 )?;
473 if self.cnt > 0 {
474 // Three decimals on ns/call: bpftop uses two; we add
475 // one for sub-microsecond precision since scheduler
476 // BPF ops typically run in tens of nanoseconds.
477 write!(f, " ns/call={:.3}", self.ns_per_call())?;
478 }
479 if self.cnt.saturating_add(self.misses) > 0 && self.misses > 0 {
480 // Render miss_rate only when there were actual misses
481 // — `0.000` would just be noise on healthy programs.
482 // Four decimals: a 0.0001 (= 1 in 10K) miss rate is
483 // already actionable for a hot scheduler op.
484 write!(f, " miss_rate={:.4}", self.miss_rate())?;
485 }
486 Ok(())
487 }
488}
489
490/// Walk `prog_idr` and produce per-program runtime stats in a single
491/// IDR pass.
492///
493/// Folds the previous discover-then-read split into one visitor: for
494/// each struct_ops program reached via `xa_load`, read
495/// `bpf_prog->stats` (per-CPU base) and `bpf_prog_aux->name` and then
496/// sum `cnt`/`nsecs`/`misses` across `per_cpu_offsets`. Halves the
497/// per-prog kernel-memory reads relative to the prior split (one
498/// `prog_idr` walk and one `bpf_prog`/`aux` translate per program
499/// instead of two of each).
500///
501/// `cnt`/`nsecs`/`misses` are u64 monotonic counters per the kernel's
502/// `struct bpf_prog_stats` (include/linux/filter.h) — see
503/// [`ProgRuntimeStats`] for provenance and the saturation contract.
504/// Address translation uses [`translate_any_kva`] so per-CPU pages
505/// served from vmalloc'd memory (`pcpu_get_vm_areas`) translate
506/// correctly alongside direct-mapping percpu allocations.
507pub(crate) fn walk_struct_ops_runtime_stats(
508 mem: &GuestMem,
509 walk: WalkContext,
510 prog_idr_kva: u64,
511 offsets: &BpfProgOffsets,
512 per_cpu_offsets: &[u64],
513 start_kernel_map: u64,
514 phys_base: u64,
515) -> Vec<ProgRuntimeStats> {
516 for_each_struct_ops_prog(
517 mem,
518 walk,
519 prog_idr_kva,
520 offsets,
521 start_kernel_map,
522 phys_base,
523 |prog_pa, aux_pa, _aux_kva| {
524 let mut name_buf = [0u8; BPF_OBJ_NAME_LEN];
525 mem.read_bytes(aux_pa + offsets.aux_name as u64, &mut name_buf);
526 let name_len = name_buf
527 .iter()
528 .position(|&b| b == 0)
529 .unwrap_or(BPF_OBJ_NAME_LEN);
530 let name = String::from_utf8_lossy(&name_buf[..name_len]).to_string();
531
532 let stats_percpu_kva = mem.read_u64(prog_pa, offsets.prog_stats);
533 if stats_percpu_kva == 0 {
534 return None;
535 }
536
537 // Per-CPU sum. saturating_add prevents the
538 // `attempt to add with overflow` panic that's been
539 // observed when uninitialized / scrambled per-CPU pages
540 // yield near-u64::MAX values; see `ProgRuntimeStats`.
541 let mut cnt: u64 = 0;
542 let mut nsecs: u64 = 0;
543 let mut misses: u64 = 0;
544 for (cpu_index, &cpu_off) in per_cpu_offsets.iter().enumerate() {
545 // Out-of-range CPU detection: kernel `setup_per_cpu_areas`
546 // only writes `__per_cpu_offset[cpu]` for CPUs in
547 // `for_each_possible_cpu`, leaving slots beyond
548 // `nr_cpu_ids` at the BSS-initialized 0. Real SMP
549 // kernels assign each possible CPU a strictly-positive
550 // offset for `cpu > 0`; only the BSP (cpu_index == 0)
551 // can legitimately observe a zero offset. Skip
552 // `cpu_off == 0 && cpu_index > 0` to avoid double-
553 // counting CPU 0's stats for every BSS-zero tail slot.
554 // Mirrors the guard in
555 // [`super::bpf_map::read_percpu_array_value`].
556 if cpu_off == 0 && cpu_index > 0 {
557 continue;
558 }
559 let stats_kva = stats_percpu_kva.wrapping_add(cpu_off);
560 if let Some(stats_pa) = translate_any_kva(
561 mem,
562 walk.cr3_pa,
563 walk.page_offset,
564 stats_kva,
565 walk.l5,
566 walk.tcr_el1,
567 ) && stats_pa < mem.size()
568 {
569 // Batch the three u64 stat reads into one bulk
570 // `read_bytes` covering the contiguous span from
571 // `min(cnt, nsecs, misses)` to `max(...) + 8`. The
572 // kernel's `struct bpf_prog_stats` packs `cnt`,
573 // `nsecs`, and `misses` as adjacent u64_stats_t
574 // (8 bytes each) and the BTF resolver accepts only
575 // layouts where the three fields land in 24
576 // contiguous bytes. The bulk read pays one bounds
577 // check + region resolve instead of three per CPU,
578 // and parses the values from the local buffer
579 // without further volatile loads.
580 let lo = offsets
581 .stats_cnt
582 .min(offsets.stats_nsecs)
583 .min(offsets.stats_misses);
584 let hi = offsets
585 .stats_cnt
586 .max(offsets.stats_nsecs)
587 .max(offsets.stats_misses)
588 + 8;
589 let span = hi - lo;
590 if span <= 64 {
591 let mut buf = [0u8; 64];
592 let n = mem.read_bytes(stats_pa + lo as u64, &mut buf[..span]);
593 if n == span {
594 let parse = |off: usize| -> u64 {
595 let i = off - lo;
596 u64::from_ne_bytes(buf[i..i + 8].try_into().unwrap())
597 };
598 cnt = cnt.saturating_add(parse(offsets.stats_cnt));
599 nsecs = nsecs.saturating_add(parse(offsets.stats_nsecs));
600 misses = misses.saturating_add(parse(offsets.stats_misses));
601 } else {
602 // Partial copy (page straddle / end-of-DRAM)
603 // — fall back to scalar reads to retain the
604 // original semantics.
605 cnt = cnt.saturating_add(mem.read_u64(stats_pa, offsets.stats_cnt));
606 nsecs =
607 nsecs.saturating_add(mem.read_u64(stats_pa, offsets.stats_nsecs));
608 misses =
609 misses.saturating_add(mem.read_u64(stats_pa, offsets.stats_misses));
610 }
611 } else {
612 // Span exceeds the inline buffer. Should be
613 // unreachable for the production
614 // `bpf_prog_stats` layout (24 bytes), but
615 // tolerate exotic layouts via the scalar path
616 // rather than panicking.
617 cnt = cnt.saturating_add(mem.read_u64(stats_pa, offsets.stats_cnt));
618 nsecs = nsecs.saturating_add(mem.read_u64(stats_pa, offsets.stats_nsecs));
619 misses =
620 misses.saturating_add(mem.read_u64(stats_pa, offsets.stats_misses));
621 }
622 }
623 }
624
625 Some(ProgRuntimeStats {
626 name,
627 cnt,
628 nsecs,
629 misses,
630 })
631 },
632 )
633}
634
/// Read-only abstraction over BPF program enumeration and per-program
/// stats reads across data sources. Mirror of
/// [`super::bpf_map::BpfMapAccessor`] for the program side.
///
/// Currently one implementation: [`GuestMemProgAccessor`] (PTE-walks a
/// frozen guest's `prog_idr`). The planned live-host backend
/// will walk loaded programs via `BPF_PROG_GET_NEXT_ID` /
/// `BPF_OBJ_GET_INFO_BY_FD` and produce the same
/// `Vec<ProgVerifierStats>` / `Vec<ProgRuntimeStats>` shapes, so the
/// failure-dump renderer stays data-source-agnostic.
pub trait BpfProgAccessor {
    /// Enumerate struct_ops BPF programs and collect verifier stats
    /// (program name and `verified_insns`), one entry per program.
    fn struct_ops_progs(&self) -> Vec<ProgVerifierStats>;

    /// Snapshot per-program runtime stats (`cnt`, `nsecs`, `misses`)
    /// for every struct_ops BPF program, summed across all CPUs.
    ///
    /// `per_cpu_offsets` is the kernel's `__per_cpu_offset[]` array,
    /// typically obtained via [`super::symbols::read_per_cpu_offsets`].
    /// The live-host backend will ignore this argument (the kernel
    /// provides per-CPU sums via `BPF_OBJ_GET_INFO_BY_FD`).
    fn struct_ops_runtime_stats(&self, per_cpu_offsets: &[u64]) -> Vec<ProgRuntimeStats>;

    /// Target-free active-scheduler walker: find the FIRST alive
    /// `BPF_PROG_TYPE_STRUCT_OPS` prog whose `aux->used_maps` carries
    /// a sibling `<obj>.bss/.data/.rodata` global-section map, and
    /// return that prog's obj prefix + full used_map_kvas set.
    ///
    /// **Why this exists.** The prior `value_kva == *scx_root`
    /// equality approach in `identify_active_obj_from_struct_ops`
    /// required identifying the active struct_ops map first. That
    /// equality breaks on kernels where `struct scx_sched` allocates
    /// a fresh kernel-side struct and COPIES the user's
    /// `sched_ext_ops` into its embedded `ops` field (offset 0) —
    /// `*scx_root` then points at the kernel-allocated `scx_sched`
    /// (whose address equals `&scx_sched.ops`), NOT at the struct_ops
    /// map's `kvalue.data` buffer (the user's source ops table at a
    /// separate address). Without a target, this walker iterates
    /// `prog_idr` and uses the "prog has a global-section map" signal
    /// directly.
    ///
    /// **Uniqueness via ktstr threat model.** This walker returns
    /// the FIRST match -- it does not assert uniqueness. ktstr's guest
    /// VM is single-tenant (only scx-ktstr runs), and the kernel
    /// enforces single-ENABLE for sched_ext: the enable path rejects a
    /// second scheduler while one is already enabled (pre-6.16:
    /// `scx_ops_enable_state() != SCX_OPS_DISABLED` -> `-EBUSY`;
    /// renamed to `scx_enable_state()`/`SCX_DISABLED` on 6.16+), so at
    /// most one sched_ext prog is ENABLED at a time. That does NOT
    /// guarantee one prog alive in `prog_idr`: a detached prog lingers
    /// until its owning struct_ops map's last fd closes and an RCU
    /// grace elapses, so the single-alive-in-`prog_idr` property this
    /// walker's FIRST-match relies on rests on ktstr's swap sequencing
    /// (kill the outgoing scheduler, wait for its process to exit
    /// before loading the next), not a kernel invariant. No other
    /// struct_ops subsystem (e.g. `tcp_congestion_ops`) ever loads in
    /// ktstr-managed guests. If a future setup loads non-sched_ext
    /// libbpf STRUCT_OPS progs, this filter would need to also gate on
    /// `aux->btf` matching the sched_ext_ops btf type id.
    ///
    /// `map_offsets` supplies the `bpf_map` field offsets needed to
    /// read each candidate map's name.
    ///
    /// Returns `None` when no live STRUCT_OPS prog has global-section
    /// maps (no scheduler attached, or only non-sched_ext struct_ops
    /// subsystems are running). The caller's prefix-grouping fallback
    /// handles the no-match case.
    fn find_active_struct_ops_obj_no_target(
        &self,
        map_offsets: &BpfMapOffsets,
    ) -> Option<ActiveObjMatch>;
}
704
/// Host-side BPF program accessor backed by direct guest physical-memory
/// reads. PTE-walks a frozen guest's `prog_idr` to enumerate loaded
/// programs and reads `bpf_prog_stats` per-CPU slots inline.
pub struct GuestMemProgAccessor<'a> {
    /// Guest kernel handle; supplies the guest memory mapping, the
    /// page-walk context, and the kernel-image base / physical base
    /// used by every read.
    kernel: &'a super::guest::GuestKernel,
    /// Kernel virtual address of the `prog_idr` symbol, resolved once
    /// at construction.
    prog_idr_kva: u64,
    /// Borrowed from the caller. Mirrors the
    /// [`super::bpf_map::GuestMemMapAccessor`] pattern:
    /// `BpfProgOffsets` is a ~112-byte POD built once from the
    /// vmlinux BTF, and every hot-path method reads it by reference,
    /// so owning it in the accessor would charge a clone that serves
    /// no purpose.
    offsets: &'a BpfProgOffsets,
}
719
720impl<'a> GuestMemProgAccessor<'a> {
721 /// Create from an existing [`GuestKernel`](super::guest::GuestKernel)
722 /// and a caller-owned [`BpfProgOffsets`]. The accessor borrows both
723 /// for its lifetime — build `offsets` once via
724 /// [`BpfProgOffsets::from_vmlinux`] and reuse across calls.
725 pub fn from_guest_kernel(
726 kernel: &'a super::guest::GuestKernel,
727 offsets: &'a BpfProgOffsets,
728 ) -> anyhow::Result<Self> {
729 let prog_idr_kva = kernel
730 .symbol_kva("prog_idr")
731 .ok_or_else(|| anyhow::anyhow!("prog_idr symbol not found in vmlinux"))?;
732
733 Ok(Self {
734 kernel,
735 prog_idr_kva,
736 offsets,
737 })
738 }
739}
740
741impl BpfProgAccessor for GuestMemProgAccessor<'_> {
742 fn struct_ops_progs(&self) -> Vec<ProgVerifierStats> {
743 find_struct_ops_progs(
744 self.kernel.mem(),
745 self.kernel.walk_context(),
746 self.prog_idr_kva,
747 self.offsets,
748 self.kernel.start_kernel_map(),
749 self.kernel.phys_base(),
750 )
751 }
752
753 /// Mirrors the kernel-side per-CPU accumulation: `cnt` is
754 /// bumped via `u64_stats_inc` and `nsecs` is bumped via
755 /// `u64_stats_add(&stats->nsecs, duration)` inside
756 /// `__bpf_prog_run` (include/linux/filter.h), invoked through
757 /// the JIT-emitted entry path on every program invocation.
758 /// `misses` is bumped by `bpf_prog_inc_misses_counter`
759 /// (defined in `kernel/bpf/syscall.c`) called from
760 /// `kernel/bpf/trampoline.c::__bpf_prog_enter_recur` when a
761 /// program re-enters and the recursion guard rejects it.
762 fn struct_ops_runtime_stats(&self, per_cpu_offsets: &[u64]) -> Vec<ProgRuntimeStats> {
763 walk_struct_ops_runtime_stats(
764 self.kernel.mem(),
765 self.kernel.walk_context(),
766 self.prog_idr_kva,
767 self.offsets,
768 per_cpu_offsets,
769 self.kernel.start_kernel_map(),
770 self.kernel.phys_base(),
771 )
772 }
773
774 fn find_active_struct_ops_obj_no_target(
775 &self,
776 map_offsets: &BpfMapOffsets,
777 ) -> Option<ActiveObjMatch> {
778 find_active_struct_ops_obj_no_target(
779 self.kernel.mem(),
780 self.kernel.walk_context(),
781 self.prog_idr_kva,
782 self.offsets,
783 map_offsets,
784 self.kernel.start_kernel_map(),
785 self.kernel.phys_base(),
786 )
787 }
788}
789
/// Owns a [`super::guest::GuestKernel`] and a [`BpfProgOffsets`],
/// providing BPF program access through a borrowed
/// [`GuestMemProgAccessor`].
///
/// Mirrors [`super::bpf_map::GuestMemMapAccessorOwned`] for the
/// program-side surface: callers that don't already hold a
/// `GuestKernel` + `BpfProgOffsets` pair (e.g. the freeze
/// coordinator) construct one of these once at start, retain it
/// across the run, and borrow [`Self::as_accessor`] for each
/// read. Owning the offsets here keeps the BTF parse to once per
/// VM run rather than once per dump.
pub struct GuestMemProgAccessorOwned {
    /// Owned guest kernel handle, lent to each borrowed accessor.
    kernel: super::guest::GuestKernel,
    /// Kernel virtual address of the `prog_idr` symbol, resolved once
    /// in `finish`.
    prog_idr_kva: u64,
    /// Owned `bpf_prog` / `bpf_prog_aux` field offsets, parsed once in
    /// `finish`.
    offsets: BpfProgOffsets,
}
806
807impl GuestMemProgAccessorOwned {
808 pub fn finish(
809 kernel: super::guest::GuestKernel,
810 elf: &goblin::elf::Elf<'_>,
811 data: &[u8],
812 vmlinux: &std::path::Path,
813 ) -> anyhow::Result<Self> {
814 let offsets = BpfProgOffsets::from_elf(elf, data, vmlinux)?;
815 let prog_idr_kva = kernel
816 .symbol_kva("prog_idr")
817 .ok_or_else(|| anyhow::anyhow!("prog_idr symbol not found in vmlinux"))?;
818 Ok(Self {
819 kernel,
820 prog_idr_kva,
821 offsets,
822 })
823 }
824
825 /// Borrow as a [`GuestMemProgAccessor`] for program operations.
826 ///
827 /// Infallible — `finish` already resolved `prog_idr_kva` and the
828 /// borrow returns the cached KVA directly. Mirrors
829 /// [`super::bpf_map::GuestMemMapAccessorOwned::as_accessor`].
830 pub fn as_accessor(&self) -> GuestMemProgAccessor<'_> {
831 GuestMemProgAccessor {
832 kernel: &self.kernel,
833 prog_idr_kva: self.prog_idr_kva,
834 offsets: &self.offsets,
835 }
836 }
837
838 /// Access the underlying [`super::guest::GuestKernel`] for
839 /// callers that need symbol resolution / page-walk primitives
840 /// outside the prog-discovery surface (e.g. resolving
841 /// `__per_cpu_offset` for `struct_ops_runtime_stats`).
842 #[allow(dead_code)]
843 pub fn guest_kernel(&self) -> &super::guest::GuestKernel {
844 &self.kernel
845 }
846}
847
848#[cfg(test)]
849mod tests {
850 use super::*;
851 use crate::monitor::symbols::START_KERNEL_MAP;
852
853 #[test]
854 fn prog_verifier_stats_serde_roundtrip() {
855 let info = ProgVerifierStats {
856 name: "dispatch".to_string(),
857 verified_insns: 42000,
858 };
859 let json = serde_json::to_string(&info).unwrap();
860 let loaded: ProgVerifierStats = serde_json::from_str(&json).unwrap();
861 assert_eq!(loaded.name, "dispatch");
862 assert_eq!(loaded.verified_insns, 42000);
863 }
864
865 #[test]
866 fn prog_verifier_stats_vec_serde_roundtrip() {
867 let stats = vec![
868 ProgVerifierStats {
869 name: "dispatch".to_string(),
870 verified_insns: 100000,
871 },
872 ProgVerifierStats {
873 name: "enqueue".to_string(),
874 verified_insns: 50000,
875 },
876 ];
877 let json = serde_json::to_vec(&stats).unwrap();
878 let loaded: Vec<ProgVerifierStats> = serde_json::from_slice(&json).unwrap();
879 assert_eq!(loaded.len(), 2);
880 assert_eq!(loaded[0].name, "dispatch");
881 assert_eq!(loaded[0].verified_insns, 100000);
882 assert_eq!(loaded[1].name, "enqueue");
883 assert_eq!(loaded[1].verified_insns, 50000);
884 }
885
886 #[test]
887 fn prog_verifier_stats_empty_name() {
888 let info = ProgVerifierStats {
889 name: String::new(),
890 verified_insns: 0,
891 };
892 let json = serde_json::to_string(&info).unwrap();
893 let loaded: ProgVerifierStats = serde_json::from_str(&json).unwrap();
894 assert_eq!(loaded.name, "");
895 assert_eq!(loaded.verified_insns, 0);
896 }
897
898 #[test]
899 fn prog_verifier_stats_max_values() {
900 let info = ProgVerifierStats {
901 name: "x".repeat(16),
902 verified_insns: u32::MAX,
903 };
904 let json = serde_json::to_string(&info).unwrap();
905 let loaded: ProgVerifierStats = serde_json::from_str(&json).unwrap();
906 assert_eq!(loaded.verified_insns, u32::MAX);
907 assert_eq!(loaded.name.len(), 16);
908 }
909
910 #[test]
911 fn prog_runtime_stats_serde_roundtrip() {
912 let info = ProgRuntimeStats {
913 name: "ktstr_dispatch".to_string(),
914 cnt: 12345,
915 nsecs: 9_876_543,
916 misses: 7,
917 };
918 let json = serde_json::to_string(&info).unwrap();
919 let loaded: ProgRuntimeStats = serde_json::from_str(&json).unwrap();
920 assert_eq!(loaded.name, "ktstr_dispatch");
921 assert_eq!(loaded.cnt, 12345);
922 assert_eq!(loaded.nsecs, 9_876_543);
923 assert_eq!(loaded.misses, 7);
924 }
925
926 /// All three counters use `saturating_add` in
927 /// [`walk_struct_ops_runtime_stats`] when summing per-CPU slots, so a
928 /// long-running guest with a hot BPF program (or scrambled
929 /// per-CPU pages from an unmapped slot) can produce a `u64::MAX`
930 /// sum instead of wrapping. Pinning the wire shape here proves
931 /// the serde codec preserves the saturated value end-to-end —
932 /// any future migration that swaps the field type would surface
933 /// here before bleeding into the failure-dump consumers.
934 #[test]
935 fn prog_runtime_stats_max_u64_saturation_roundtrip() {
936 let info = ProgRuntimeStats {
937 name: "saturated".to_string(),
938 cnt: u64::MAX,
939 nsecs: u64::MAX,
940 misses: u64::MAX,
941 };
942 let json = serde_json::to_string(&info).unwrap();
943 let loaded: ProgRuntimeStats = serde_json::from_str(&json).unwrap();
944 assert_eq!(loaded.cnt, u64::MAX);
945 assert_eq!(loaded.nsecs, u64::MAX);
946 assert_eq!(loaded.misses, u64::MAX);
947 }
948
949 #[test]
950 fn prog_runtime_stats_default_zero() {
951 let info = ProgRuntimeStats::default();
952 assert_eq!(info.name, "");
953 assert_eq!(info.cnt, 0);
954 assert_eq!(info.nsecs, 0);
955 assert_eq!(info.misses, 0);
956 }
957
958 /// The Display impl is the entry point used by
959 /// [`super::dump::FailureDumpReport`]'s human-readable rendering;
960 /// pin the format so a downstream change to the impl is caught
961 /// before the failure-dump output silently changes shape.
962 ///
963 /// Two derived metrics surface on the line when their guards
964 /// pass: `ns/call` whenever `cnt > 0`, and `miss_rate`
965 /// whenever there are any misses. A program that never ran
966 /// (cnt=0) elides both — `prog_runtime_stats_display_zero_counters_elides_derived`
967 /// covers that branch.
968 #[test]
969 fn prog_runtime_stats_display_format() {
970 let info = ProgRuntimeStats {
971 name: "ktstr_enqueue".to_string(),
972 cnt: 100,
973 nsecs: 200,
974 misses: 3,
975 };
976 // cnt=100, nsecs=200 → ns/call = 2.000.
977 // misses=3, cnt+misses=103 → miss_rate = 3/103 ≈ 0.0291.
978 assert_eq!(
979 format!("{info}"),
980 "ktstr_enqueue: cnt=100 nsecs=200 misses=3 ns/call=2.000 miss_rate=0.0291",
981 );
982 }
983
984 /// A program that never ran (cnt=0) renders only the four
985 /// raw counters — both derived metrics are guarded out.
986 /// Pin the elision so a regression that strips the guard and
987 /// emits "ns/call=0.000 miss_rate=0.0000" surfaces here.
988 #[test]
989 fn prog_runtime_stats_display_zero_counters_elides_derived() {
990 let info = ProgRuntimeStats {
991 name: "never_ran".to_string(),
992 cnt: 0,
993 nsecs: 0,
994 misses: 0,
995 };
996 let s = format!("{info}");
997 assert_eq!(s, "never_ran: cnt=0 nsecs=0 misses=0");
998 assert!(!s.contains("ns/call"), "ns/call must elide when cnt=0: {s}");
999 assert!(
1000 !s.contains("miss_rate"),
1001 "miss_rate must elide when total=0: {s}"
1002 );
1003 }
1004
1005 /// Healthy program with no recursion misses — `ns/call`
1006 /// surfaces but `miss_rate` elides (since misses=0).
1007 /// A regression that flipped the gate and rendered a
1008 /// "miss_rate=0.0000" line on every healthy program would
1009 /// trip here.
1010 #[test]
1011 fn prog_runtime_stats_display_no_misses_elides_miss_rate() {
1012 let info = ProgRuntimeStats {
1013 name: "healthy".to_string(),
1014 cnt: 1000,
1015 nsecs: 50_000,
1016 misses: 0,
1017 };
1018 let s = format!("{info}");
1019 assert!(s.contains("ns/call=50.000"), "ns/call must render: {s}");
1020 assert!(
1021 !s.contains("miss_rate"),
1022 "miss_rate must elide when misses=0: {s}",
1023 );
1024 }
1025
1026 /// `ns_per_call` derived accessor: pin happy-path math + zero-
1027 /// divisor guard. Mirrors the `CgroupStats::wake_latency_tail_ratio`
1028 /// test pattern from assert.rs.
1029 #[test]
1030 fn prog_runtime_stats_ns_per_call_derived() {
1031 // Happy path: 1000 cnt + 50000 nsecs = 50 ns/call.
1032 let info = ProgRuntimeStats {
1033 name: "x".to_string(),
1034 cnt: 1000,
1035 nsecs: 50_000,
1036 misses: 0,
1037 };
1038 assert_eq!(info.ns_per_call(), 50.0);
1039 assert!(info.ns_per_call().is_finite());
1040
1041 // Zero divisor: cnt=0 → 0.0 (not NaN).
1042 let info = ProgRuntimeStats {
1043 name: "x".to_string(),
1044 cnt: 0,
1045 nsecs: 999_999,
1046 misses: 0,
1047 };
1048 assert_eq!(info.ns_per_call(), 0.0);
1049 assert!(info.ns_per_call().is_finite());
1050 }
1051
1052 /// `miss_rate` derived accessor: pin happy-path math + zero-
1053 /// divisor guard + saturating_add edge.
1054 #[test]
1055 fn prog_runtime_stats_miss_rate_derived() {
1056 // Happy path: 9 misses / (1 cnt + 9 misses) = 0.9.
1057 let info = ProgRuntimeStats {
1058 name: "x".to_string(),
1059 cnt: 1,
1060 nsecs: 0,
1061 misses: 9,
1062 };
1063 assert!((info.miss_rate() - 0.9).abs() < 1e-12);
1064 assert!(info.miss_rate().is_finite());
1065
1066 // Zero divisor: both counters zero → 0.0 (not NaN).
1067 let info = ProgRuntimeStats::default();
1068 assert_eq!(info.miss_rate(), 0.0);
1069 assert!(info.miss_rate().is_finite());
1070
1071 // Saturating-add edge: cnt at u64::MAX, misses also non-
1072 // trivial — `saturating_add` floors at u64::MAX, so the
1073 // denominator stays non-zero and the rate is finite.
1074 let info = ProgRuntimeStats {
1075 name: "saturated".to_string(),
1076 cnt: u64::MAX,
1077 nsecs: 0,
1078 misses: 1000,
1079 };
1080 assert!(info.miss_rate().is_finite());
1081 // Result is essentially 0 (1000 / u64::MAX) but the
1082 // important contract is finiteness — a regression that
1083 // overflowed and produced inf/NaN trips here.
1084 assert!(info.miss_rate() >= 0.0);
1085 }
1086
1087 /// Wire format must NOT carry the derived ratios — they are
1088 /// method-only and recomputed on read. Pin so a regression
1089 /// that re-introduces a stored shadow trips here.
1090 #[test]
1091 fn prog_runtime_stats_wire_format_omits_derived_keys() {
1092 let info = ProgRuntimeStats {
1093 name: "x".to_string(),
1094 cnt: 100,
1095 nsecs: 200,
1096 misses: 3,
1097 };
1098 let json = serde_json::to_value(&info).unwrap();
1099 let map = match json {
1100 serde_json::Value::Object(m) => m,
1101 other => panic!("expected object, got {other:?}"),
1102 };
1103 assert!(
1104 !map.contains_key("ns_per_call"),
1105 "derived methods must NOT appear as wire fields: {map:#?}",
1106 );
1107 assert!(
1108 !map.contains_key("miss_rate"),
1109 "derived methods must NOT appear as wire fields: {map:#?}",
1110 );
1111 // Cross-check: methods still compute correctly.
1112 assert_eq!(info.ns_per_call(), 2.0);
1113 assert!((info.miss_rate() - 3.0_f64 / 103.0).abs() < 1e-12);
1114 }
1115
1116 /// Build a minimal `BpfProgOffsets` keyed for the synthetic
1117 /// chain test below. The exact field offsets are arbitrary —
1118 /// they only need to be consistent with how the test buffer
1119 /// is laid out — but `stats_cnt`/`stats_nsecs`/`stats_misses`
1120 /// MUST sit within a 24-byte window so the bulk-read path
1121 /// fires (`span <= 64`). Drift in these three offsets would
1122 /// silently switch the walker to the scalar fallback and
1123 /// the bulk-read assertion below would still pass for the
1124 /// wrong reason.
1125 fn synthetic_prog_offsets() -> BpfProgOffsets {
1126 BpfProgOffsets {
1127 prog_type: 0,
1128 prog_aux: 8,
1129 aux_verified_insns: 0,
1130 aux_name: 8,
1131 aux_used_maps: 24,
1132 aux_used_map_cnt: 32,
1133 xa_node_slots: 16,
1134 xa_node_shift: 0,
1135 idr_xa_head: 0,
1136 idr_next: 8,
1137 prog_stats: 16,
1138 stats_cnt: 0,
1139 stats_nsecs: 8,
1140 stats_misses: 16,
1141 }
1142 }
1143
1144 /// Run the bulk-24-byte-read end-to-end chain at a caller-
1145 /// supplied `page_offset`. Both the x86_64 and aarch64 wrapper
1146 /// tests call this with their respective `PAGE_OFFSET` baselines
1147 /// so the bulk-read fast path is exercised on both arches.
1148 fn walk_struct_ops_runtime_stats_bulk_chain_at_page_offset(page_offset: u64) {
1149 use crate::monitor::reader::{GuestMem, WalkContext};
1150
1151 // Layout (all PAs offset by `page_offset` to form KVAs in
1152 // the direct-mapping range, except `prog_idr_kva` which
1153 // sits in the kernel-text range and translates via
1154 // `text_kva_to_pa_with_base`):
1155 //
1156 // 0x0000 prog_idr (xa_head + idr_next)
1157 // 0x1000 bpf_prog (prog_type, prog_aux, prog_stats)
1158 // 0x2000 bpf_prog_aux (verified_insns, name)
1159 // 0x3000 per-CPU bpf_prog_stats (cnt, nsecs, misses)
1160 let total: usize = 0x4000;
1161 let mut buf = vec![0u8; total];
1162
1163 let pa_to_kva = |pa: u64| -> u64 { page_offset.wrapping_add(pa) };
1164
1165 let idr_pa: u64 = 0x0000;
1166 let prog_pa: u64 = 0x1000;
1167 let aux_pa: u64 = 0x2000;
1168 let stats_pa: u64 = 0x3000;
1169
1170 // Single-entry xarray: `xa_head` IS the prog KVA with
1171 // bit 1 clear (leaf marker). `pa_to_kva(prog_pa)` has
1172 // bit 1 clear because prog_pa is 4 KiB-aligned.
1173 let prog_kva = pa_to_kva(prog_pa);
1174 assert_eq!(prog_kva & 2, 0, "prog_kva must be a leaf entry");
1175
1176 let offsets = synthetic_prog_offsets();
1177 // Sanity: the bulk-read fast path requires
1178 // `span = hi - lo <= 64`. With offsets {0, 8, 16}:
1179 // lo = 0, hi = 16 + 8 = 24, span = 24. Pinning here so
1180 // a future offset change that pushed `span > 64`
1181 // (forcing the scalar fallback) trips the assert
1182 // before the test runs.
1183 let lo = offsets
1184 .stats_cnt
1185 .min(offsets.stats_nsecs)
1186 .min(offsets.stats_misses);
1187 let hi = offsets
1188 .stats_cnt
1189 .max(offsets.stats_nsecs)
1190 .max(offsets.stats_misses)
1191 + 8;
1192 assert!(
1193 hi - lo <= 64,
1194 "test premise: stats span must be small enough for the bulk path"
1195 );
1196
1197 let write_u64 = |buf: &mut Vec<u8>, pa: u64, val: u64| {
1198 let off = pa as usize;
1199 buf[off..off + 8].copy_from_slice(&val.to_ne_bytes());
1200 };
1201 let write_u32 = |buf: &mut Vec<u8>, pa: u64, val: u32| {
1202 let off = pa as usize;
1203 buf[off..off + 4].copy_from_slice(&val.to_ne_bytes());
1204 };
1205
1206 // IDR: xa_head = prog_kva, idr_next = 1.
1207 write_u64(&mut buf, idr_pa + offsets.idr_xa_head as u64, prog_kva);
1208 write_u32(&mut buf, idr_pa + offsets.idr_next as u64, 1);
1209
1210 // bpf_prog: type = STRUCT_OPS, aux = aux_kva, stats = stats_kva.
1211 write_u32(
1212 &mut buf,
1213 prog_pa + offsets.prog_type as u64,
1214 BPF_PROG_TYPE_STRUCT_OPS,
1215 );
1216 write_u64(
1217 &mut buf,
1218 prog_pa + offsets.prog_aux as u64,
1219 pa_to_kva(aux_pa),
1220 );
1221 write_u64(
1222 &mut buf,
1223 prog_pa + offsets.prog_stats as u64,
1224 pa_to_kva(stats_pa),
1225 );
1226
1227 // bpf_prog_aux: verified_insns + name. Name must NUL-
1228 // terminate within BPF_OBJ_NAME_LEN so the walker's
1229 // `position(|&b| b == 0)` finds the end.
1230 write_u32(&mut buf, aux_pa + offsets.aux_verified_insns as u64, 12_345);
1231 let name = b"bulk_test";
1232 let name_pa = (aux_pa + offsets.aux_name as u64) as usize;
1233 buf[name_pa..name_pa + name.len()].copy_from_slice(name);
1234
1235 // Stats: write the three u64 counters at the synthetic
1236 // offsets. These are the bytes the bulk read MUST surface
1237 // through the parse closure.
1238 let known_cnt: u64 = 0x1111_1111_1111_1111;
1239 let known_nsecs: u64 = 0x2222_2222_2222_2222;
1240 let known_misses: u64 = 0x3333_3333_3333_3333;
1241 write_u64(&mut buf, stats_pa + offsets.stats_cnt as u64, known_cnt);
1242 write_u64(&mut buf, stats_pa + offsets.stats_nsecs as u64, known_nsecs);
1243 write_u64(
1244 &mut buf,
1245 stats_pa + offsets.stats_misses as u64,
1246 known_misses,
1247 );
1248
1249 // SAFETY: buf is a live local Vec<u8> whose backing storage
1250 // outlives the GuestMem use.
1251 let mem = unsafe { GuestMem::new(buf.as_ptr() as *mut u8, buf.len() as u64) };
1252 let walk = WalkContext {
1253 cr3_pa: 0,
1254 page_offset,
1255 l5: false,
1256 tcr_el1: 0,
1257 };
1258 // One CPU. `cpu_off == 0` is allowed at `cpu_index == 0`
1259 // (BSP). `stats_kva + 0 = stats_kva`, which translates
1260 // through the direct mapping to `stats_pa`.
1261 let per_cpu_offsets = vec![0u64];
1262
1263 let prog_idr_kva = idr_pa + START_KERNEL_MAP;
1264 let stats = walk_struct_ops_runtime_stats(
1265 &mem,
1266 walk,
1267 prog_idr_kva,
1268 &offsets,
1269 &per_cpu_offsets,
1270 START_KERNEL_MAP,
1271 0,
1272 );
1273
1274 assert_eq!(stats.len(), 1, "single STRUCT_OPS prog must surface");
1275 assert_eq!(stats[0].name, "bulk_test");
1276 assert_eq!(
1277 stats[0].cnt, known_cnt,
1278 "bulk read must parse cnt at offsets.stats_cnt within the 24-byte window",
1279 );
1280 assert_eq!(
1281 stats[0].nsecs, known_nsecs,
1282 "bulk read must parse nsecs at offsets.stats_nsecs within the 24-byte window",
1283 );
1284 assert_eq!(
1285 stats[0].misses, known_misses,
1286 "bulk read must parse misses at offsets.stats_misses within the 24-byte window",
1287 );
1288 }
1289
1290 /// End-to-end chain test for the bulk 24-byte
1291 /// `bpf_prog_stats` read on x86_64. The walker reads `cnt`,
1292 /// `nsecs`, and `misses` (three adjacent u64s in the kernel
1293 /// `struct bpf_prog_stats`) via one `read_bytes` over the
1294 /// `[lo, hi)` span and parses each value from the local
1295 /// buffer. The aarch64 wrapper below pins the same chain
1296 /// against the aarch64 `PAGE_OFFSET` baseline.
1297 #[test]
1298 #[cfg(target_arch = "x86_64")]
1299 fn walk_struct_ops_runtime_stats_bulk_24byte_read_parses_three_offsets() {
1300 // x86_64 PAGE_OFFSET (4-level paging, non-KASLR baseline).
1301 walk_struct_ops_runtime_stats_bulk_chain_at_page_offset(0xFFFF_8880_0000_0000);
1302 }
1303
1304 /// End-to-end chain test for the bulk 24-byte
1305 /// `bpf_prog_stats` read on aarch64. Mirrors the x86_64
1306 /// wrapper above against the aarch64 direct-mapping
1307 /// `PAGE_OFFSET` baseline so the bulk-read fast path is
1308 /// pinned on both arches.
1309 #[test]
1310 #[cfg(target_arch = "aarch64")]
1311 fn walk_struct_ops_runtime_stats_bulk_24byte_read_parses_three_offsets() {
1312 // aarch64 PAGE_OFFSET baseline (48-bit VA, 4 KiB granule).
1313 walk_struct_ops_runtime_stats_bulk_chain_at_page_offset(0xFFFF_0000_0000_0000);
1314 }
1315
1316 /// Format chain integration: the `ProgRuntimeStats` Display
1317 /// output must appear verbatim inside `FailureDumpReport`'s
1318 /// Display output. Pins the chain
1319 /// `ProgRuntimeStats::fmt` (bpf_prog.rs) →
1320 /// `FailureDumpReport::fmt::std::fmt::Display::fmt(stats, f)`
1321 /// (dump/display.rs `prog_runtime_stats:` arm).
1322 ///
1323 /// The standalone `prog_runtime_stats_display_format` test pins
1324 /// the inner Display in isolation; the dump-side
1325 /// `report_display_renders_prog_runtime_stats` test pins the
1326 /// outer section header. Neither catches a regression that
1327 /// SUBSTITUTED the inner Display call (e.g. introducing a
1328 /// custom rendering branch in the outer formatter that bypasses
1329 /// `ProgRuntimeStats::fmt`). This test catches that drift by
1330 /// asserting BOTH layers render identically and the inner
1331 /// string appears as a substring of the outer — a substitution
1332 /// would break either equality.
1333 #[test]
1334 fn prog_runtime_stats_format_chain_inner_appears_in_outer() {
1335 use crate::monitor::dump::{FailureDumpReport, SCHEMA_SINGLE};
1336 let info = ProgRuntimeStats {
1337 name: "chain_test".to_string(),
1338 cnt: 7,
1339 nsecs: 42,
1340 misses: 1,
1341 };
1342 let inner = format!("{info}");
1343 // Direct Display on ProgRuntimeStats: pinned shape includes
1344 // the bpftop-style derived metrics. cnt=7 nsecs=42 →
1345 // ns/call=6.000; misses=1 → miss_rate=1/8=0.1250.
1346 assert_eq!(
1347 inner,
1348 "chain_test: cnt=7 nsecs=42 misses=1 ns/call=6.000 miss_rate=0.1250",
1349 );
1350
1351 let report = FailureDumpReport {
1352 schema: SCHEMA_SINGLE.to_string(),
1353 prog_runtime_stats: vec![info],
1354 ..Default::default()
1355 };
1356 let outer = format!("{report}");
1357 // The outer's `prog_runtime_stats:` section calls
1358 // `std::fmt::Display::fmt(stats, f)` on each entry; that
1359 // call dispatches through THIS module's Display impl. If a
1360 // future regression replaced the dispatch with a custom
1361 // formatter, the inner string would no longer appear in
1362 // the outer output — surfacing as substring failure.
1363 assert!(
1364 outer.contains(&inner),
1365 "FailureDumpReport's Display chain must dispatch through \
1366 ProgRuntimeStats::fmt — inner {inner:?} must appear \
1367 verbatim inside outer:\n{outer}",
1368 );
1369 // Sanity: the outer also wraps with the expected section
1370 // header, so the substring match is finding the chain
1371 // through the correct arm of FailureDumpReport's fmt and
1372 // not (e.g.) a coincidence in the schema marker.
1373 assert!(
1374 outer.contains("prog_runtime_stats:"),
1375 "outer Display must carry the prog_runtime_stats section \
1376 header; without it the chain test could pass even when the \
1377 inner string matched a different format arm:\n{outer}",
1378 );
1379 }
1380
1381 // -- prog_idr chain fixtures: synthetic guest memory that drives
1382 // the full for_each_struct_ops_prog walk on the host with no
1383 // VM and no live kernel. The layout mirrors the existing
1384 // `walk_struct_ops_runtime_stats_bulk_chain_at_page_offset`
1385 // helper above (idr@0x0, prog@0x1000, aux@0x2000) and reuses
1386 // `synthetic_prog_offsets()`. PAs are offset by `page_offset`
1387 // to form direct-map KVAs that `translate_any_kva` resolves
1388 // via `kva_to_pa` (the direct-map fast path), and the
1389 // `prog_idr_kva` sits in kernel-text range translated via
1390 // `text_kva_to_pa_with_base`.
1391
    /// x86_64 PAGE_OFFSET baseline (4-level paging, non-KASLR) used
    /// by the host-only chain fixtures below. The exact value only
    /// has to keep `page_offset + pa` outside `[0, buf.len())` so the
    /// formed KVAs are unambiguously in the direct-map range and
    /// translate back to `pa` via `kva_to_pa`.
    ///
    /// `ProgChainFixture::new` installs this as every fixture's
    /// `page_offset`. It is the same literal the x86_64 bulk-chain
    /// wrapper test passes to its helper.
    const FIXTURE_PAGE_OFFSET: u64 = 0xFFFF_8880_0000_0000;
1398
    /// Mutable byte buffer wrapped as guest DRAM for the prog_idr
    /// chain fixtures. Owns the backing `Vec` so the unsafe
    /// [`GuestMem::new`] pointer stays valid for the fixture's life.
    struct ProgChainFixture {
        /// Synthetic DRAM; a "PA" in the fixtures is a byte index
        /// into this buffer.
        buf: Vec<u8>,
        /// Direct-map base added to a PA to form a KVA, and handed
        /// to the walker through `WalkContext::page_offset`.
        page_offset: u64,
    }
1406
1407 impl ProgChainFixture {
1408 fn new(size: usize) -> Self {
1409 Self {
1410 buf: vec![0u8; size],
1411 page_offset: FIXTURE_PAGE_OFFSET,
1412 }
1413 }
1414
1415 /// Direct-map KVA for a DRAM offset.
1416 fn pa_to_kva(&self, pa: u64) -> u64 {
1417 self.page_offset.wrapping_add(pa)
1418 }
1419
1420 fn write_u64(&mut self, pa: u64, val: u64) {
1421 let off = pa as usize;
1422 self.buf[off..off + 8].copy_from_slice(&val.to_ne_bytes());
1423 }
1424
1425 fn write_u32(&mut self, pa: u64, val: u32) {
1426 let off = pa as usize;
1427 self.buf[off..off + 4].copy_from_slice(&val.to_ne_bytes());
1428 }
1429
1430 /// Write a NUL-terminated (within `BPF_OBJ_NAME_LEN`) name
1431 /// blob at `pa`. Panics if the name needs a terminator but
1432 /// fills the whole field — every fixture name here is short.
1433 fn write_name(&mut self, pa: u64, name: &[u8]) {
1434 assert!(
1435 name.len() < BPF_OBJ_NAME_LEN,
1436 "fixture name must leave room for the NUL terminator",
1437 );
1438 let off = pa as usize;
1439 self.buf[off..off + name.len()].copy_from_slice(name);
1440 }
1441
1442 fn mem(&self) -> GuestMem {
1443 // SAFETY: `self.buf` is a live Vec<u8> owned by the
1444 // fixture; it outlives every GuestMem read in the test
1445 // because the fixture is dropped after the assertions.
1446 unsafe { GuestMem::new(self.buf.as_ptr() as *mut u8, self.buf.len() as u64) }
1447 }
1448
1449 fn walk(&self) -> WalkContext {
1450 WalkContext {
1451 cr3_pa: 0,
1452 page_offset: self.page_offset,
1453 l5: false,
1454 tcr_el1: 0,
1455 }
1456 }
1457 }
1458
    /// PA constants shared by the chain fixtures.
    ///
    /// `FIX_IDR_PA` is where `single_prog_fixture` writes the
    /// `prog_idr` fields (`xa_head`, `idr_next`).
    const FIX_IDR_PA: u64 = 0x0000;
    /// PA of the fixture's single `bpf_prog` (type, aux pointer and,
    /// in the runtime-stats tests, the per-CPU stats pointer).
    const FIX_PROG_PA: u64 = 0x1000;
    /// PA of the `bpf_prog_aux` body that the prog's aux pointer is
    /// wired to; tests write the name and verifier count here.
    const FIX_AUX_PA: u64 = 0x2000;
1463
1464 /// Build a fixture whose prog_idr holds a single STRUCT_OPS prog
1465 /// (single-entry xarray: `xa_head` IS the prog KVA, `idr_next=1`).
1466 /// `prog_type` lets a caller override the type to exercise the
1467 /// non-struct_ops skip arm. The prog's `aux` pointer is wired but
1468 /// the aux body (name, verified_insns, used_maps, stats) is left
1469 /// for the caller to populate. Returns the fixture; the
1470 /// `prog_idr_kva` for the walk is `FIX_IDR_PA + START_KERNEL_MAP`.
1471 fn single_prog_fixture(
1472 size: usize,
1473 prog_type: u32,
1474 offsets: &BpfProgOffsets,
1475 ) -> ProgChainFixture {
1476 let mut fx = ProgChainFixture::new(size);
1477 let prog_kva = fx.pa_to_kva(FIX_PROG_PA);
1478 // Single-entry xarray leaf marker: prog_kva must have bits
1479 // 0-1 clear so `xa_is_node` treats it as a direct entry.
1480 assert_eq!(prog_kva & 3, 0, "prog_kva must be a leaf entry");
1481
1482 // IDR: xa_head = prog_kva, idr_next = 1.
1483 fx.write_u64(FIX_IDR_PA + offsets.idr_xa_head as u64, prog_kva);
1484 fx.write_u32(FIX_IDR_PA + offsets.idr_next as u64, 1);
1485
1486 // bpf_prog: type + aux pointer.
1487 fx.write_u32(FIX_PROG_PA + offsets.prog_type as u64, prog_type);
1488 fx.write_u64(
1489 FIX_PROG_PA + offsets.prog_aux as u64,
1490 fx.pa_to_kva(FIX_AUX_PA),
1491 );
1492 fx
1493 }
1494
1495 // ---- find_struct_ops_progs ----
1496
1497 /// Happy path: a single STRUCT_OPS prog whose aux carries a name
1498 /// and verified_insns count surfaces with both fields read from
1499 /// the synthetic offsets. Covers the `find_struct_ops_progs`
1500 /// payload closure (aux_verified_insns read + aux_name NUL-scan +
1501 /// String build) and the `for_each_struct_ops_prog` happy path.
1502 #[test]
1503 fn find_struct_ops_progs_single_prog_reads_name_and_verified_insns() {
1504 let offsets = synthetic_prog_offsets();
1505 let mut fx = single_prog_fixture(0x3000, BPF_PROG_TYPE_STRUCT_OPS, &offsets);
1506 fx.write_u32(FIX_AUX_PA + offsets.aux_verified_insns as u64, 12_345);
1507 fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"dispatch");
1508
1509 let progs = find_struct_ops_progs(
1510 &fx.mem(),
1511 fx.walk(),
1512 FIX_IDR_PA + START_KERNEL_MAP,
1513 &offsets,
1514 START_KERNEL_MAP,
1515 0,
1516 );
1517 assert_eq!(progs.len(), 1);
1518 assert_eq!(progs[0].name, "dispatch");
1519 assert_eq!(progs[0].verified_insns, 12_345u32);
1520 }
1521
1522 /// The `prog_type != BPF_PROG_TYPE_STRUCT_OPS { continue }` filter
1523 /// at `for_each_struct_ops_prog` line ~94: a prog whose type is
1524 /// not 27 (here KPROBE=2) is skipped before its aux is read, so
1525 /// the result is empty.
1526 #[test]
1527 fn find_struct_ops_progs_skips_non_struct_ops_type() {
1528 const BPF_PROG_TYPE_KPROBE: u32 = 2;
1529 let offsets = synthetic_prog_offsets();
1530 // Populate the aux body too, to prove the skip happens at the
1531 // type filter and not because aux was unreadable.
1532 let mut fx = single_prog_fixture(0x3000, BPF_PROG_TYPE_KPROBE, &offsets);
1533 fx.write_u32(FIX_AUX_PA + offsets.aux_verified_insns as u64, 999);
1534 fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"kprobe_prog");
1535
1536 let progs = find_struct_ops_progs(
1537 &fx.mem(),
1538 fx.walk(),
1539 FIX_IDR_PA + START_KERNEL_MAP,
1540 &offsets,
1541 START_KERNEL_MAP,
1542 0,
1543 );
1544 assert_eq!(progs.len(), 0);
1545 assert!(progs.is_empty());
1546 }
1547
1548 /// An all-zero IDR (xa_head left unwritten) yields an empty result:
1549 /// `for_each_struct_ops_prog` short-circuits on `xa_head == 0` before
1550 /// the id loop, but even without that guard an empty xarray surfaces
1551 /// no progs — so this pins the empty-IDR→empty outcome, not the
1552 /// short-circuit in isolation. Reached via `find_struct_ops_progs`.
1553 #[test]
1554 fn for_each_struct_ops_prog_empty_xa_head_returns_empty() {
1555 let offsets = synthetic_prog_offsets();
1556 // Default-zero buffer: do NOT write xa_head. idr_next is
1557 // irrelevant because the xa_head guard fires first.
1558 let fx = ProgChainFixture::new(0x3000);
1559
1560 let progs = find_struct_ops_progs(
1561 &fx.mem(),
1562 fx.walk(),
1563 FIX_IDR_PA + START_KERNEL_MAP,
1564 &offsets,
1565 START_KERNEL_MAP,
1566 0,
1567 );
1568 assert_eq!(progs.len(), 0);
1569 assert!(progs.is_empty());
1570 }
1571
1572 /// A corrupt `idr_next` of `u32::MAX` still returns the correct
1573 /// result — the one real prog at id 0 — and terminates, because the
1574 /// `.min(65536)` clamp on `idr_next` in `for_each_struct_ops_prog`
1575 /// bounds the loop. The single-entry xarray returns the prog for id 0
1576 /// and `Some(0)` for every id > 0 (see `idr::xa_load`). This pins the
1577 /// RESULT under a corrupt count; a clamp regression would surface as
1578 /// a slow run rather than a failed assertion (pinning the exact 65536
1579 /// boundary would need a multi-level xarray with an entry past the
1580 /// cap — out of scope here).
1581 #[test]
1582 fn for_each_struct_ops_prog_caps_idr_next_at_65536() {
1583 let offsets = synthetic_prog_offsets();
1584 let mut fx = single_prog_fixture(0x3000, BPF_PROG_TYPE_STRUCT_OPS, &offsets);
1585 // Overwrite idr_next with u32::MAX. Without the .min(65536)
1586 // clamp this loop would attempt ~4 billion iterations.
1587 fx.write_u32(FIX_IDR_PA + offsets.idr_next as u64, u32::MAX);
1588 fx.write_u32(FIX_AUX_PA + offsets.aux_verified_insns as u64, 7);
1589 fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"capped");
1590
1591 let progs = find_struct_ops_progs(
1592 &fx.mem(),
1593 fx.walk(),
1594 FIX_IDR_PA + START_KERNEL_MAP,
1595 &offsets,
1596 START_KERNEL_MAP,
1597 0,
1598 );
1599 assert_eq!(progs.len(), 1);
1600 assert_eq!(progs[0].name, "capped");
1601 assert_eq!(progs[0].verified_insns, 7u32);
1602 }
1603
1604 // ---- walk_struct_ops_runtime_stats ----
1605
1606 /// The `if stats_percpu_kva == 0 { return None }` skip at
1607 /// `walk_struct_ops_runtime_stats` line ~533: a prog whose
1608 /// `prog_stats` per-CPU base is NULL is dropped from the result
1609 /// (closure returns None -> not pushed). prog_stats is left
1610 /// unwritten (zero) on the single STRUCT_OPS prog.
1611 #[test]
1612 fn walk_runtime_stats_skips_prog_with_null_stats_pointer() {
1613 let offsets = synthetic_prog_offsets();
1614 let mut fx = single_prog_fixture(0x3000, BPF_PROG_TYPE_STRUCT_OPS, &offsets);
1615 fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"no_stats");
1616 // Deliberately leave prog_stats == 0 (do not write it).
1617
1618 let per_cpu_offsets = vec![0u64];
1619 let stats = walk_struct_ops_runtime_stats(
1620 &fx.mem(),
1621 fx.walk(),
1622 FIX_IDR_PA + START_KERNEL_MAP,
1623 &offsets,
1624 &per_cpu_offsets,
1625 START_KERNEL_MAP,
1626 0,
1627 );
1628 assert_eq!(stats.len(), 0);
1629 assert!(stats.is_empty());
1630 }
1631
1632 /// Multi-CPU accumulation: with two distinct, non-zero per-CPU
1633 /// offsets the walker translates both per-CPU `bpf_prog_stats`
1634 /// blocks and `saturating_add`s `cnt`/`nsecs`/`misses` across
1635 /// them. Covers the per-CPU sum loop at
1636 /// `walk_struct_ops_runtime_stats` lines ~544-623 for >1 CPU.
1637 #[test]
1638 fn walk_runtime_stats_sums_across_two_cpus() {
1639 let offsets = synthetic_prog_offsets();
1640 // Buffer must hold both stats blocks. stats0 @0x3000,
1641 // stats1 @0x3800 — both within a 0x4000 buffer.
1642 let stats_pa0: u64 = 0x3000;
1643 let stats_pa1: u64 = 0x3800;
1644 let mut fx = single_prog_fixture(0x4000, BPF_PROG_TYPE_STRUCT_OPS, &offsets);
1645 fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"two_cpu");
1646 // prog_stats per-CPU base = stats_pa0's KVA. cpu_off shifts it.
1647 fx.write_u64(
1648 FIX_PROG_PA + offsets.prog_stats as u64,
1649 fx.pa_to_kva(stats_pa0),
1650 );
1651
1652 // Distinct small counters per block.
1653 let cnt0: u64 = 10;
1654 let nsecs0: u64 = 100;
1655 let misses0: u64 = 1;
1656 let cnt1: u64 = 7;
1657 let nsecs1: u64 = 70;
1658 let misses1: u64 = 3;
1659 fx.write_u64(stats_pa0 + offsets.stats_cnt as u64, cnt0);
1660 fx.write_u64(stats_pa0 + offsets.stats_nsecs as u64, nsecs0);
1661 fx.write_u64(stats_pa0 + offsets.stats_misses as u64, misses0);
1662 fx.write_u64(stats_pa1 + offsets.stats_cnt as u64, cnt1);
1663 fx.write_u64(stats_pa1 + offsets.stats_nsecs as u64, nsecs1);
1664 fx.write_u64(stats_pa1 + offsets.stats_misses as u64, misses1);
1665
1666 // cpu0: cpu_off=0 reads stats_pa0 (BSP). cpu1: cpu_off=delta
1667 // reads stats_pa0+delta = stats_pa1.
1668 let per_cpu_offsets = vec![0u64, stats_pa1 - stats_pa0];
1669 let stats = walk_struct_ops_runtime_stats(
1670 &fx.mem(),
1671 fx.walk(),
1672 FIX_IDR_PA + START_KERNEL_MAP,
1673 &offsets,
1674 &per_cpu_offsets,
1675 START_KERNEL_MAP,
1676 0,
1677 );
1678 assert_eq!(stats.len(), 1);
1679 assert_eq!(stats[0].cnt, cnt0 + cnt1);
1680 assert_eq!(stats[0].nsecs, nsecs0 + nsecs1);
1681 assert_eq!(stats[0].misses, misses0 + misses1);
1682 }
1683
/// Regression test for the BSS-zero-tail guard
/// (`if cpu_off == 0 && cpu_index > 0 { continue }`) in
/// `walk_struct_ops_runtime_stats`.
///
/// Both `__per_cpu_offset[]` slots are 0 here. Slot 0 (the BSP) is
/// read; slot 1 must be skipped. The totals therefore equal the
/// single stats block's counters — losing the `cpu_index > 0` guard
/// would report each of them doubled.
#[test]
fn walk_runtime_stats_skips_zero_offset_tail_cpu() {
    // The one stats block in this fixture and the counters stored in it.
    const STATS_PA: u64 = 0x3000;
    let (run_cnt, run_nsecs, run_misses) = (42u64, 4200u64, 5u64);

    let offsets = synthetic_prog_offsets();
    let mut fx = single_prog_fixture(0x4000, BPF_PROG_TYPE_STRUCT_OPS, &offsets);
    fx.write_name(FIX_AUX_PA + offsets.aux_name as u64, b"bss_tail");

    // prog->stats holds the KVA of the stats block.
    let stats_kva = fx.pa_to_kva(STATS_PA);
    fx.write_u64(FIX_PROG_PA + offsets.prog_stats as u64, stats_kva);

    fx.write_u64(STATS_PA + offsets.stats_cnt as u64, run_cnt);
    fx.write_u64(STATS_PA + offsets.stats_nsecs as u64, run_nsecs);
    fx.write_u64(STATS_PA + offsets.stats_misses as u64, run_misses);

    // cpu0: cpu_off=0 and cpu_index=0, so the block is read.
    // cpu1: cpu_off=0 and cpu_index=1, so the guard skips it.
    let zero_tail_offsets = vec![0u64; 2];
    let stats = walk_struct_ops_runtime_stats(
        &fx.mem(),
        fx.walk(),
        FIX_IDR_PA + START_KERNEL_MAP,
        &offsets,
        &zero_tail_offsets,
        START_KERNEL_MAP,
        0,
    );

    assert_eq!(stats.len(), 1);
    let only = &stats[0];
    assert_eq!(only.cnt, run_cnt);
    assert_eq!(only.nsecs, run_nsecs);
    assert_eq!(only.misses, run_misses);
}
1726
1727 // ---- find_active_struct_ops_obj_no_target ----
1728
/// Physical address of the `used_maps` pointer array in the
/// active-obj fixture.
const FIX_USED_MAPS_PA: u64 = 0x3000;
/// Physical address of the fixture's first map struct, 0x1000 bytes
/// above the `used_maps` array.
const FIX_MAP0_PA: u64 = FIX_USED_MAPS_PA + 0x1000;
/// Physical address of the fixture's second map struct, 0x1000 bytes
/// above the first.
const FIX_MAP1_PA: u64 = FIX_MAP0_PA + 0x1000;
1734
/// Happy path for `find_active_struct_ops_obj_no_target`.
///
/// The STRUCT_OPS prog's aux->used_maps lists two maps: a struct_ops
/// map whose name has no global-section suffix, then a `<obj>.bss`
/// global-section map. The lookup must resolve the obj prefix from
/// the second map and return the full used_map_kvas snapshot. This
/// exercises the non-null used_maps / non-zero used_map_cnt path, the
/// entries snapshot loop, and the per-map name read with its
/// `extract_global_section_obj_prefix` match.
#[test]
fn find_active_struct_ops_obj_returns_obj_prefix_from_bss_map() {
    let prog_offsets = synthetic_prog_offsets();
    let map_offsets = BpfMapOffsets {
        map_name: 0,
        ..BpfMapOffsets::EMPTY
    };
    let mut fx = single_prog_fixture(0x6000, BPF_PROG_TYPE_STRUCT_OPS, &prog_offsets);

    let used_maps_kva = fx.pa_to_kva(FIX_USED_MAPS_PA);
    let struct_ops_map_kva = fx.pa_to_kva(FIX_MAP0_PA);
    let bss_map_kva = fx.pa_to_kva(FIX_MAP1_PA);

    // aux: used_maps -> pointer array, used_map_cnt = 2.
    fx.write_u64(
        FIX_AUX_PA + prog_offsets.aux_used_maps as u64,
        used_maps_kva,
    );
    fx.write_u32(FIX_AUX_PA + prog_offsets.aux_used_map_cnt as u64, 2);

    // Pointer array: one 8-byte slot per map, in order.
    for (slot, &map_kva) in [struct_ops_map_kva, bss_map_kva].iter().enumerate() {
        fx.write_u64(FIX_USED_MAPS_PA + 8 * slot as u64, map_kva);
    }

    // Slot 0 is a plain struct_ops name (no suffix, no match);
    // slot 1 is a `.bss` map, which yields obj "bpf_bpf".
    let name_off = map_offsets.map_name as u64;
    fx.write_name(FIX_MAP0_PA + name_off, b"ktstr_ops");
    fx.write_name(FIX_MAP1_PA + name_off, b"bpf_bpf.bss");

    let active = find_active_struct_ops_obj_no_target(
        &fx.mem(),
        fx.walk(),
        FIX_IDR_PA + START_KERNEL_MAP,
        &prog_offsets,
        &map_offsets,
        START_KERNEL_MAP,
        0,
    )
    .unwrap();

    assert_eq!(active.obj_name, "bpf_bpf");
    assert_eq!(
        active.used_map_kvas,
        vec![struct_ops_map_kva, bss_map_kva]
    );
}
1780
/// Closure-returns-None path of `find_active_struct_ops_obj_no_target`.
///
/// Both snapshotted maps are scanned, but neither name carries a
/// global-section suffix, so the closure returns None and
/// `.into_iter().next()` yields None. This is a different path from
/// the null-used_maps and zero-cnt early skips: here used_maps and
/// used_map_cnt are both populated.
#[test]
fn find_active_struct_ops_obj_none_when_no_global_section_map() {
    let prog_offsets = synthetic_prog_offsets();
    let map_offsets = BpfMapOffsets {
        map_name: 0,
        ..BpfMapOffsets::EMPTY
    };
    let mut fx = single_prog_fixture(0x6000, BPF_PROG_TYPE_STRUCT_OPS, &prog_offsets);

    // aux: used_maps -> pointer array, used_map_cnt = 2.
    let used_maps_kva = fx.pa_to_kva(FIX_USED_MAPS_PA);
    fx.write_u64(
        FIX_AUX_PA + prog_offsets.aux_used_maps as u64,
        used_maps_kva,
    );
    fx.write_u32(FIX_AUX_PA + prog_offsets.aux_used_map_cnt as u64, 2);

    // Pointer array: slot 0 -> map0, slot 1 -> map1.
    let first_map_kva = fx.pa_to_kva(FIX_MAP0_PA);
    let second_map_kva = fx.pa_to_kva(FIX_MAP1_PA);
    fx.write_u64(FIX_USED_MAPS_PA, first_map_kva);
    fx.write_u64(FIX_USED_MAPS_PA + 8, second_map_kva);

    // Neither name ends in .bss/.data/.rodata.
    let name_off = map_offsets.map_name as u64;
    fx.write_name(FIX_MAP0_PA + name_off, b"ktstr_ops");
    fx.write_name(FIX_MAP1_PA + name_off, b"bpf_runq");

    let lookup = find_active_struct_ops_obj_no_target(
        &fx.mem(),
        fx.walk(),
        FIX_IDR_PA + START_KERNEL_MAP,
        &prog_offsets,
        &map_offsets,
        START_KERNEL_MAP,
        0,
    );
    assert!(lookup.is_none());
}
1817
/// NULL `aux->used_maps` means no active obj.
///
/// The fixture is used exactly as built: used_maps is never written,
/// so it stays zero and `find_active_struct_ops_obj_no_target`
/// returns None via the `used_maps_kva == 0` skip. The all-zero
/// fixture also has used_map_cnt == 0, whose guard would return None
/// as well. The test therefore pins the NULL-used_maps -> None
/// outcome rather than that single guard in isolation.
#[test]
fn find_active_struct_ops_obj_none_when_used_maps_null() {
    let prog_offsets = synthetic_prog_offsets();
    let map_offsets = BpfMapOffsets {
        map_name: 0,
        ..BpfMapOffsets::EMPTY
    };
    // Deliberately immutable: no aux fields are populated.
    let fx = single_prog_fixture(0x6000, BPF_PROG_TYPE_STRUCT_OPS, &prog_offsets);

    let lookup = find_active_struct_ops_obj_no_target(
        &fx.mem(),
        fx.walk(),
        FIX_IDR_PA + START_KERNEL_MAP,
        &prog_offsets,
        &map_offsets,
        START_KERNEL_MAP,
        0,
    );
    assert!(lookup.is_none());
}
1845
/// Pins the `.min(MAX_USED_MAPS)` clamp on `used_map_cnt`.
///
/// The fixture advertises a corrupt `used_map_cnt` of 70
/// (> `MAX_USED_MAPS`) and backs all 70 array slots with non-zero
/// values. Slot 1 is a global-section `.bss` map, so the obj prefix
/// still resolves. Slots 2.. are never translated (the match returns
/// at index 1), so only maps 0 and 1 need a real backing struct; the
/// rest hold filler KVAs that only enter the snapshot.
///
/// The captured `used_map_kvas` must be exactly the first
/// `MAX_USED_MAPS` slots, in array order. Checking the contents as
/// well as the length catches a clamp that keeps the right number of
/// entries but the wrong ones.
#[test]
fn find_active_struct_ops_obj_caps_used_map_cnt_at_max_used_maps() {
    // Corrupt used_map_cnt, deliberately above MAX_USED_MAPS.
    const CORRUPT_CNT: u32 = 70;
    // Filler value for slots that have no backing map struct.
    let filler_kva = |slot: u64| 0xDEAD_0000 + slot;

    let prog_offsets = synthetic_prog_offsets();
    let map_offsets = BpfMapOffsets {
        map_name: 0,
        ..BpfMapOffsets::EMPTY
    };
    let mut fx = single_prog_fixture(0x6000, BPF_PROG_TYPE_STRUCT_OPS, &prog_offsets);

    let map0_kva = fx.pa_to_kva(FIX_MAP0_PA);
    let map1_kva = fx.pa_to_kva(FIX_MAP1_PA);
    fx.write_u64(
        FIX_AUX_PA + prog_offsets.aux_used_maps as u64,
        fx.pa_to_kva(FIX_USED_MAPS_PA),
    );
    fx.write_u32(
        FIX_AUX_PA + prog_offsets.aux_used_map_cnt as u64,
        CORRUPT_CNT,
    );

    // Slots 0 and 1 point at real map structs; slots 2..70 are
    // filler, so only the clamp bounds the snapshot loop.
    fx.write_u64(FIX_USED_MAPS_PA, map0_kva);
    fx.write_u64(FIX_USED_MAPS_PA + 8, map1_kva);
    for slot in 2..u64::from(CORRUPT_CNT) {
        fx.write_u64(FIX_USED_MAPS_PA + slot * 8, filler_kva(slot));
    }

    // map0: struct_ops name (no match). map1: .bss (match).
    fx.write_name(FIX_MAP0_PA + map_offsets.map_name as u64, b"ktstr_ops");
    fx.write_name(FIX_MAP1_PA + map_offsets.map_name as u64, b"bpf_bpf.bss");

    let result = find_active_struct_ops_obj_no_target(
        &fx.mem(),
        fx.walk(),
        FIX_IDR_PA + START_KERNEL_MAP,
        &prog_offsets,
        &map_offsets,
        START_KERNEL_MAP,
        0,
    );
    let m = result.unwrap();
    assert_eq!(m.obj_name, "bpf_bpf");

    // Length first, for a readable failure if the clamp regresses.
    assert_eq!(m.used_map_kvas.len(), MAX_USED_MAPS as usize);

    // Then the contents: the first MAX_USED_MAPS slots, in order.
    let mut expected_snapshot = vec![map0_kva, map1_kva];
    expected_snapshot.extend((2..u64::from(MAX_USED_MAPS)).map(filler_kva));
    assert_eq!(m.used_map_kvas, expected_snapshot);
}
1902}