ktstr/monitor/bpf_map/mod.rs
1//! Host-side BPF map discovery, read/write, and iteration via guest physical memory.
2//!
3//! Walks the kernel's `map_idr` xarray from the host, finds a BPF map
4//! by name suffix, and provides read/write access to the map's value
5//! region. No guest cooperation is needed — all reads go through the
6//! guest physical memory mapping.
7//!
8//! Address translation strategy:
9//! - `map_idr` is a kernel BSS symbol: use `text_kva_to_pa_with_base`
10//! (or [`super::guest::GuestKernel::text_kva_to_pa`]).
11//! - xa_node structs are SLAB-allocated (direct mapping): use `kva_to_pa`.
12//! - bpf_map/bpf_array may be kmalloc'd or vmalloc'd: use `translate_any_kva`.
13//! - .bss value region is vmalloc'd: use `translate_kva`.
14//! - Per-CPU values (`PERCPU_ARRAY` / `PERCPU_HASH`) live in dynamic
15//! per-CPU memory — the embedded first chunk is in the direct
16//! mapping, larger allocations are vmalloc'd. Add
17//! `__per_cpu_offset[cpu]` to the `__percpu` base and read the value
18//! page-by-page via `read_percpu_value_bytes` (`translate_any_kva`).
19
20use crate::sync::MutexExt;
21use anyhow::Context;
22
23use super::btf_offsets::BpfMapOffsets;
24use super::idr::{translate_any_kva, xa_load};
25use super::reader::GuestMem;
26use super::symbols::text_kva_to_pa_with_base;
27use super::{Cr3Pa, Kva, PageOffset};
28
29mod htab;
30mod local_storage;
31#[cfg(test)]
32mod tests;
33use htab::{iter_htab_entries, iter_percpu_htab_entries};
34use local_storage::iter_local_storage_entries;
35
/// Per-element row from a percpu-hash iteration: `(key_bytes,
/// per_cpu_values)` where `per_cpu_values[cpu]` is `Some(value_bytes)`
/// when the per-CPU slot is readable and `None` when the page is
/// unmapped or the CPU index is out of range. Returned by
/// [`BpfMapAccessor::iter_percpu_hash_map`] and the underlying walker
/// helpers in [`htab`].
///
/// Shape, outermost to innermost: one outer `Vec` element per hash
/// entry; each element pairs the raw key bytes with a `Vec` indexed
/// by CPU number whose slots hold that CPU's copy of the value.
pub(crate) type PerCpuHashEntries = Vec<(Vec<u8>, Vec<Option<Vec<u8>>>)>;
43
/// Maximum chain-element visits per map walk — the production value of
/// [`AccessorCtx::iter_max`]. A corrupted `next` pointer that forms a
/// cycle is bounded here so a hash-bucket or local-storage walk can't
/// spin the freeze hot path until the rendezvous timeout. Shared by
/// the bucket walker `htab::walk_htab` and the local-storage walker
/// `local_storage::iter_local_storage_entries`.
///
/// The unit is chain elements visited, not bytes read. Tests lower the
/// bound through [`AccessorCtx::iter_max`] instead of editing this
/// constant.
pub(crate) const MAP_WALK_ITER_MAX: usize = 1_000_000;
51
/// Maximum chain entries a walker materializes into its result `Vec`
/// before the renderer's per-map `.take`.
///
/// The walker-side stop condition is one-past
/// (`out.len() > MAP_MATERIALIZE_MAX`): a walker keeps a single entry
/// beyond the cap so the renderer's `len > MAX_HASH_ENTRIES`
/// truncation check still fires on a truncated map.
///
/// Without this cap the guest-memory walker materializes up to
/// [`MAP_WALK_ITER_MAX`] entries (×num_cpus for per-CPU values) before
/// the renderer truncates to 4096 — a freeze-hot-path memory spike.
/// `dump::render_map::MAX_HASH_ENTRIES` aliases this so the render cap
/// and the walker materialize cap are one value.
pub(crate) const MAP_MATERIALIZE_MAX: usize = 4096;
62
/// Bundle of borrow-held state every map-access routine threads
/// through the page-table walk, bounds check, and byte read/write path.
///
/// Every free function in this module previously took the same four-
/// to eight-argument fan of `mem`, `cr3_pa`, `page_offset`, `offsets`,
/// `l5` (some also took `map_idr_kva`); callers invariably forwarded
/// the same fields from their [`GuestMemMapAccessor`] because all six
/// originate on the accessor. Grouping them here drops the duplication
/// and lets additional shared context (per-CPU offset cache, BTF
/// cache, etc.) ride the same lifetime without touching every
/// signature. `cr3_pa` and `page_offset` are newtyped so the page-
/// walker can't silently swap them at a call site.
pub(crate) struct AccessorCtx<'a> {
    /// Guest physical memory handle; every scalar/bulk read, KVA
    /// translation, and write in this module goes through it.
    pub mem: &'a GuestMem,
    /// Page-table root handed to `translate_any_kva` / `translate_kva`
    /// for KVAs that need a page-table walk.
    pub cr3_pa: Cr3Pa,
    /// Direct-mapping base forwarded to `xa_load` and
    /// `translate_any_kva` (presumably the kernel's `PAGE_OFFSET` —
    /// confirm against the walker).
    pub page_offset: PageOffset,
    /// Field offsets into the kernel structs this module reads
    /// (`struct bpf_map`, `bpf_array`, the idr/xarray nodes, and the
    /// optional struct_ops layout).
    pub offsets: &'a BpfMapOffsets,
    /// Forwarded verbatim to the KVA translators. NOTE(review):
    /// presumably selects 5-level paging on x86_64 — confirm.
    pub l5: bool,
    /// Cached TCR_EL1 register; drives the aarch64 page-table walker's
    /// granule selection. Always 0 on x86_64 (the walker ignores it).
    pub tcr_el1: u64,
    /// Runtime kernel image base (`__START_KERNEL_map` on x86_64,
    /// `KIMAGE_VADDR` on aarch64). Used for translating
    /// kernel-text/data symbols (e.g. `map_idr`) to physical
    /// addresses. Mirrors [`super::guest::GuestKernel::start_kernel_map`].
    pub start_kernel_map: u64,
    /// Runtime KASLR offset (`phys_base` on x86_64; `0` on aarch64
    /// and on non-KASLR x86_64 boots). Threaded through every
    /// `text_kva_to_pa_with_base` call so KASLR'd kernels resolve
    /// text/data symbols correctly. See
    /// [`super::guest::GuestKernel::phys_base`].
    pub phys_base: u64,
    /// Maximum chain-element visits per map walk (hash buckets,
    /// local-storage chains). A corrupted `next` pointer that loops
    /// back into a chain would otherwise spin the walker on the freeze
    /// hot path; this bounds the walk. Production sets
    /// [`MAP_WALK_ITER_MAX`]; tests override it with a small value to
    /// exercise the cap cheaply.
    pub iter_max: usize,
}
103
// Map type discriminants from `enum bpf_map_type` in
// `include/uapi/linux/bpf.h`. Kept as flat `pub const u32` rather
// than a Rust enum so a kernel that adds a new map type past this
// list still surfaces as a numeric `map_type` on the
// [`BpfMapInfo`] / [`super::dump::FailureDumpMap`] wire format —
// the dump renderer falls through to a generic
// "unknown map type {n}" arm rather than failing to deserialize.
//
// Discriminant 0 (`BPF_MAP_TYPE_UNSPEC`) is intentionally not named
// here; it is also what `BpfMapInfo::default()` carries in `map_type`.

/// `BPF_MAP_TYPE_HASH` — generic hash table. Inline value bytes at
/// `htab_elem_value` (`key + round_up(key_size, 8)`).
pub const BPF_MAP_TYPE_HASH: u32 = 1;

/// `BPF_MAP_TYPE_ARRAY` — fixed-size array of values. Inline values
/// at the `bpf_array.value` flex array.
pub const BPF_MAP_TYPE_ARRAY: u32 = 2;

/// `BPF_MAP_TYPE_PROG_ARRAY` — array of `struct bpf_prog *` slots
/// used by `bpf_tail_call`. Userspace-visible value is a program fd
/// (or its kernel pointer); the underlying program is not data.
pub const BPF_MAP_TYPE_PROG_ARRAY: u32 = 3;

/// `BPF_MAP_TYPE_PERF_EVENT_ARRAY` — array of perf event fds. Same
/// shape as `PROG_ARRAY` but stores perf event references.
pub const BPF_MAP_TYPE_PERF_EVENT_ARRAY: u32 = 4;

/// `BPF_MAP_TYPE_PERCPU_HASH` — like `HASH` but value is a
/// `void __percpu *` resolved per-CPU via `__per_cpu_offset[cpu]`.
pub const BPF_MAP_TYPE_PERCPU_HASH: u32 = 5;

/// `BPF_MAP_TYPE_PERCPU_ARRAY` — like `ARRAY` but each slot is a
/// `void __percpu *` resolved per-CPU.
pub const BPF_MAP_TYPE_PERCPU_ARRAY: u32 = 6;

/// `BPF_MAP_TYPE_STACK_TRACE` — kernel-side stack trace storage
/// keyed by stackid. Values are transient (cleared after read by
/// `bpf_get_stackid`); not a persistent state surface.
pub const BPF_MAP_TYPE_STACK_TRACE: u32 = 7;

/// `BPF_MAP_TYPE_CGROUP_ARRAY` — array of cgroup fds. FD-array shape
/// like `PROG_ARRAY`.
pub const BPF_MAP_TYPE_CGROUP_ARRAY: u32 = 8;

/// `BPF_MAP_TYPE_LRU_HASH` — `HASH` plus LRU eviction. Value layout
/// identical to `HASH` (inline value bytes); `htab_elem` carries
/// `lru_node` in the same union slot as `ptr_to_pptr`.
pub const BPF_MAP_TYPE_LRU_HASH: u32 = 9;

/// `BPF_MAP_TYPE_LRU_PERCPU_HASH` — `PERCPU_HASH` plus LRU eviction.
/// Same value-position-is-percpu-pointer layout as `PERCPU_HASH`.
pub const BPF_MAP_TYPE_LRU_PERCPU_HASH: u32 = 10;

/// `BPF_MAP_TYPE_LPM_TRIE` — longest-prefix-match trie. Keyed by
/// (prefixlen, data); values are bytes. Iteration requires the
/// trie's per-node walk, not provided here.
pub const BPF_MAP_TYPE_LPM_TRIE: u32 = 11;

/// `BPF_MAP_TYPE_ARRAY_OF_MAPS` — array slots store map fds.
pub const BPF_MAP_TYPE_ARRAY_OF_MAPS: u32 = 12;

/// `BPF_MAP_TYPE_HASH_OF_MAPS` — hash slots store map fds.
pub const BPF_MAP_TYPE_HASH_OF_MAPS: u32 = 13;

/// `BPF_MAP_TYPE_DEVMAP` — array of net_device fds for XDP
/// redirection.
pub const BPF_MAP_TYPE_DEVMAP: u32 = 14;

/// `BPF_MAP_TYPE_SOCKMAP` — array of socket fds.
pub const BPF_MAP_TYPE_SOCKMAP: u32 = 15;

/// `BPF_MAP_TYPE_CPUMAP` — array of cpumap entries for XDP
/// redirection.
pub const BPF_MAP_TYPE_CPUMAP: u32 = 16;

/// `BPF_MAP_TYPE_XSKMAP` — array of AF_XDP socket fds.
pub const BPF_MAP_TYPE_XSKMAP: u32 = 17;

/// `BPF_MAP_TYPE_SOCKHASH` — hash of socket fds.
pub const BPF_MAP_TYPE_SOCKHASH: u32 = 18;

/// `BPF_MAP_TYPE_CGROUP_STORAGE` — deprecated cgroup-attached
/// storage. Replaced by `CGRP_STORAGE`. Reading requires the
/// cgroup context the program was attached to.
pub const BPF_MAP_TYPE_CGROUP_STORAGE: u32 = 19;

/// `BPF_MAP_TYPE_REUSEPORT_SOCKARRAY` — array of SO_REUSEPORT
/// socket fds.
pub const BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: u32 = 20;

/// `BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE` — deprecated per-CPU
/// cgroup-attached storage.
pub const BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: u32 = 21;

/// `BPF_MAP_TYPE_QUEUE` — FIFO queue (no key). Values are popped
/// destructively by `bpf_map_pop_elem`.
pub const BPF_MAP_TYPE_QUEUE: u32 = 22;

/// `BPF_MAP_TYPE_STACK` — LIFO stack (no key). Same destructive
/// pop semantics as `QUEUE`.
pub const BPF_MAP_TYPE_STACK: u32 = 23;

/// `BPF_MAP_TYPE_SK_STORAGE` — per-socket storage. Reading requires
/// iterating sockets, not a flat key/value walk.
pub const BPF_MAP_TYPE_SK_STORAGE: u32 = 24;

/// `BPF_MAP_TYPE_DEVMAP_HASH` — hash of net_device fds.
pub const BPF_MAP_TYPE_DEVMAP_HASH: u32 = 25;

/// `BPF_MAP_TYPE_STRUCT_OPS` — kernel struct table (e.g.
/// `tcp_congestion_ops`, `sched_ext_ops`). The map holds a single
/// `bpf_struct_ops_value` whose `data` field is the registered
/// kernel struct. `lookup_elem` returns `-EINVAL`; the live-host
/// path reads via `BPF_MAP_LOOKUP_ELEM` at key=0 anyway because the
/// kernel's syscall ABI does the read for `STRUCT_OPS` maps.
pub const BPF_MAP_TYPE_STRUCT_OPS: u32 = 26;

/// `BPF_MAP_TYPE_RINGBUF` — multi-producer/single-consumer ring
/// buffer for streaming events from BPF programs to userspace. No
/// key/value access; userspace consumers drain it through libbpf's
/// `ring_buffer__poll` / `ring_buffer__consume`.
pub const BPF_MAP_TYPE_RINGBUF: u32 = 27;

/// `BPF_MAP_TYPE_INODE_STORAGE` — per-inode storage. Reading
/// requires iterating inodes.
pub const BPF_MAP_TYPE_INODE_STORAGE: u32 = 28;

/// `BPF_MAP_TYPE_TASK_STORAGE` — per-task storage. Reading
/// requires iterating tasks.
pub const BPF_MAP_TYPE_TASK_STORAGE: u32 = 29;

/// `BPF_MAP_TYPE_BLOOM_FILTER` — probabilistic set membership.
/// No key enumeration — only `bpf_map_peek_elem` returns whether
/// a probe value is "maybe present".
pub const BPF_MAP_TYPE_BLOOM_FILTER: u32 = 30;

/// `BPF_MAP_TYPE_USER_RINGBUF` — userspace producer / BPF
/// consumer ring buffer. Same transient nature as `RINGBUF`.
pub const BPF_MAP_TYPE_USER_RINGBUF: u32 = 31;

/// `BPF_MAP_TYPE_CGRP_STORAGE` — per-cgroup storage (replaces
/// `CGROUP_STORAGE`). Reading requires iterating cgroups.
pub const BPF_MAP_TYPE_CGRP_STORAGE: u32 = 32;

/// `BPF_MAP_TYPE_ARENA` — sparse, page-granular memory region
/// shared between BPF programs and userspace. The host-side
/// walker for arena pages lives in `super::arena`.
pub const BPF_MAP_TYPE_ARENA: u32 = 33;

/// `BPF_MAP_TYPE_INSN_ARRAY` — array of bpf instructions used by
/// the verifier for indirect-jump targets. Values are kernel-side
/// program references, not application data.
pub const BPF_MAP_TYPE_INSN_ARRAY: u32 = 34;

/// `BPF_OBJ_NAME_LEN` from `include/uapi/linux/bpf.h`: the byte
/// length of the fixed-size `bpf_map.name` field, including its NUL
/// terminator.
pub const BPF_OBJ_NAME_LEN: usize = 16;
257
/// Discovered BPF map metadata and value location.
///
/// One instance describes one map found while walking `map_idr`; it is
/// built by [`find_all_bpf_maps`] and [`find_bpf_map`]. All fields are
/// plain copies of guest data taken at discovery time — nothing here
/// borrows guest memory. `Display` prints the map name.
#[derive(Debug, Clone, Default)]
// NOTE(review): no justification for this allow is visible in this
// file; presumably some fields are only consumed by particular
// backends or build configurations — confirm before removing.
#[allow(dead_code)]
pub struct BpfMapInfo {
    /// Guest physical address of the `struct bpf_map`.
    pub map_pa: u64,
    /// Guest KVA of the `struct bpf_map` (or containing struct like
    /// `bpf_array`/`bpf_htab`). Needed for hash map iteration to
    /// read `bpf_htab` fields relative to this base.
    pub map_kva: u64,
    /// Map name as raw bytes (kernel `bpf_map.name`), null-padded to
    /// `BPF_OBJ_NAME_LEN`. The active prefix length is in
    /// [`Self::name_len`]; use [`Self::name`] for a `&str` view.
    /// Holding the bytes inline avoids a per-map heap allocation on
    /// the freeze hot path.
    pub name_bytes: [u8; BPF_OBJ_NAME_LEN],
    /// Active byte length of [`Self::name_bytes`]. Offset of the
    /// first NUL byte; `BPF_OBJ_NAME_LEN` is the upper bound but
    /// every kernel-registered map name is NUL-terminated within the
    /// `BPF_OBJ_NAME_LEN`-byte field. The kernel's
    /// `bpf_obj_name_cpy` (kernel/bpf/syscall.c) memsets the
    /// destination to zero before copying and rejects names whose
    /// source bytes fill the field without a NUL — see the
    /// `if (src == end) return -EINVAL;` guard. So `name_len` is
    /// strictly less than `BPF_OBJ_NAME_LEN` in practice; the
    /// `unwrap_or(BPF_OBJ_NAME_LEN)` fallback in `find_all_bpf_maps`
    /// is defense-in-depth against a corrupted guest read, not a
    /// shape the kernel itself produces.
    pub name_len: u8,
    /// `map_type` field value. Compare against the
    /// `BPF_MAP_TYPE_*` constants in this module.
    pub map_type: u32,
    /// `map_flags` field value.
    pub map_flags: u32,
    /// `key_size` field value.
    pub key_size: u32,
    /// `value_size` field value — the size of ONE entry's value.
    /// For `BPF_MAP_TYPE_ARRAY` the kernel's per-entry stride is
    /// `array->elem_size = round_up(value_size, 8)`
    /// (kernel/bpf/arraymap.c:93) and the value region spans
    /// `max_entries * elem_size`; a multi-entry ARRAY is read one
    /// entry at a time via [`BpfMapAccessor::read_array`], not as a
    /// single `value_size`-byte buffer.
    pub value_size: u32,
    /// `max_entries` field value.
    pub max_entries: u32,
    /// Guest KVA of the map's value region (entry 0). `Some(kva)`
    /// when the renderer can read an entry starting at this address;
    /// `None` when the map type requires a different walker (hash
    /// iteration, arena page snapshot, …) or the kva resolution
    /// failed.
    ///
    /// Populated for:
    /// * `BPF_MAP_TYPE_ARRAY` — points at `bpf_array.value` (the
    ///   inline flex array, entry 0). A single-entry ARRAY
    ///   (`max_entries <= 1`, incl. `.bss`/`.data`/`.rodata`) reads
    ///   `value_size` bytes via [`BpfMapAccessor::read_value`]; a
    ///   multi-entry ARRAY reads entry `k` at
    ///   `value_kva + k * round_up(value_size, 8)` via
    ///   [`BpfMapAccessor::read_array`].
    /// * `BPF_MAP_TYPE_STRUCT_OPS` — points at `kvalue.data` (the
    ///   embedded registered struct's bytes, after the
    ///   `bpf_struct_ops_common_value` header). Renderer reads
    ///   `value_size - data_off` bytes to match the size of the
    ///   `btf_value_type_id` type, which describes the data payload
    ///   only. `None` when struct_ops BTF offsets are unresolved.
    pub value_kva: Option<u64>,
    /// Guest KVA of the map's `struct btf` (guest-memory backend),
    /// or `btf_id` cast to u64 (live-host backend reading via the
    /// bpf(2) syscall: `BPF_OBJ_GET_INFO_BY_FD` returns `btf_id`,
    /// not a kernel pointer). The dump path treats the value as
    /// opaque — only `btf_kva == 0` is meaningful (no BTF
    /// associated with this map). Backend-specific consumers cast
    /// to the shape they need.
    /// 0 if the map has no BTF.
    pub btf_kva: u64,
    /// BTF type ID for the map's value type. 0 if the map has no BTF.
    pub btf_value_type_id: u32,
    /// BTF type ID for the kernel-side `bpf_struct_ops_<name>`
    /// wrapper in vmlinux BTF, populated for `BPF_MAP_TYPE_STRUCT_OPS`
    /// maps. libbpf zeros `btf_value_type_id` for STRUCT_OPS and
    /// passes the wrapper id via the kernel-only
    /// `btf_vmlinux_value_type_id` field on `struct bpf_map`. The
    /// dump path uses it to BTF-render the data payload by walking
    /// the wrapper's `data` member to the per-ops struct (e.g.
    /// `sched_ext_ops`). Zero on every other map type.
    pub btf_vmlinux_value_type_id: u32,
    /// BTF type ID for the map's key type. 0 when the map's BTF is
    /// missing or the map type does not record a key type id (most
    /// ARRAY-family maps store a synthetic `__u32` key implicitly).
    /// HASH maps populate this so the dump path can render keys via
    /// BTF instead of falling back to hex.
    pub btf_key_type_id: u32,
}
351
352impl BpfMapInfo {
353 /// Active name bytes: `&name_bytes[..name_len]`.
354 pub fn name_bytes_active(&self) -> &[u8] {
355 &self.name_bytes[..self.name_len as usize]
356 }
357
358 /// Map name as a `&str` view over [`Self::name_bytes`]. Lossily
359 /// renders any non-UTF-8 bytes via `String::from_utf8_lossy`,
360 /// allocating only when the active region contains invalid UTF-8.
361 /// Most kernel-registered names are ASCII so the common path is
362 /// alloc-free.
363 pub fn name(&self) -> std::borrow::Cow<'_, str> {
364 String::from_utf8_lossy(self.name_bytes_active())
365 }
366}
367
368impl std::fmt::Display for BpfMapInfo {
369 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
370 f.write_str(&self.name())
371 }
372}
373
/// Maximum bytes covered by [`MapMetadata`]'s batched read of
/// `struct bpf_map`. The struct itself is ~250 bytes on 6.16+
/// kernels (verified against include/linux/bpf.h `struct bpf_map`),
/// and every field [`find_all_bpf_maps`] touches falls within this
/// span. The bound keeps the batched read to a fixed scratch size; if
/// `map_pa` straddles end-of-DRAM `read_bytes` truncates short, so
/// any field whose end exceeds the actual copy length falls through
/// to the scalar read path via `MapMetadata::u32_at` / `u64_at`.
///
/// This is also the length of the stack buffer embedded in every
/// [`MapMetadata`], so raising it grows that struct by the same
/// amount.
const MAP_METADATA_SPAN: usize = 384;
383
/// Cached scratch buffer over `struct bpf_map` bytes, batched into
/// one [`GuestMem::read_bytes`] call. Replaces ~10 individual
/// `read_u32`/`read_u64` syscall-equivalents per map per IDR walk
/// with one bulk copy + local LE parses, paying one bounds check +
/// region resolve instead of N.
///
/// All accessor methods bounds-check the requested offset against
/// the bytes actually copied; a short copy (e.g. when `map_pa` is
/// near end-of-DRAM) cleanly falls through to the scalar read path
/// rather than indexing into uninitialized scratch.
struct MapMetadata<'a> {
    /// Guest memory handle, kept so accessors can fall back to a
    /// direct scalar read when the cached copy is too short.
    mem: &'a GuestMem,
    /// Guest physical address of the `struct bpf_map` the buffer was
    /// read from; base address for the fallback reads.
    map_pa: u64,
    /// Stack-allocated scratch buffer holding the bulk-read bytes;
    /// `copied` is the number actually filled by `read_bytes`.
    buf: [u8; MAP_METADATA_SPAN],
    /// Count of leading bytes of `buf` that hold real guest data.
    copied: usize,
}
402
403impl<'a> MapMetadata<'a> {
404 /// Issue the bulk `read_bytes` covering enough of `struct bpf_map`
405 /// for every offset [`find_all_bpf_maps`] dereferences.
406 fn read(mem: &'a GuestMem, map_pa: u64, _offsets: &BpfMapOffsets) -> Self {
407 // _offsets is reserved for a future tightening of the read
408 // span: callers could compute the exact `[lo, hi)` from the
409 // BTF-resolved fields rather than the conservative
410 // `MAP_METADATA_SPAN` cap, trimming the bulk read length on
411 // kernels with smaller `struct bpf_map`. Left unused today
412 // because the cap covers every supported layout and the
413 // savings are smaller than the page-table walks the bulk
414 // read replaces.
415 let mut buf = [0u8; MAP_METADATA_SPAN];
416 let copied = mem.read_bytes(map_pa, &mut buf);
417 Self {
418 mem,
419 map_pa,
420 buf,
421 copied,
422 }
423 }
424
425 /// Read a `u32` from the cached buffer. Falls through to the
426 /// scalar `read_u32` path (one volatile read, with its own
427 /// bounds check) when the offset spans past the bulk copy — the
428 /// scalar path returns 0 for out-of-bounds reads, matching the
429 /// pre-batch behaviour bit for bit.
430 fn u32_at(&self, off: usize) -> u32 {
431 if off + 4 <= self.copied {
432 u32::from_ne_bytes(self.buf[off..off + 4].try_into().unwrap())
433 } else {
434 self.mem.read_u32(self.map_pa, off)
435 }
436 }
437
438 /// Read a `u64` from the cached buffer; same fall-through
439 /// contract as [`Self::u32_at`].
440 fn u64_at(&self, off: usize) -> u64 {
441 if off + 8 <= self.copied {
442 u64::from_ne_bytes(self.buf[off..off + 8].try_into().unwrap())
443 } else {
444 self.mem.read_u64(self.map_pa, off)
445 }
446 }
447
448 /// Borrow `BPF_OBJ_NAME_LEN` name bytes at `name_off`. When the
449 /// name straddles end-of-buffer, falls through to a fresh
450 /// `read_bytes` call on the scalar path so the caller never sees
451 /// silent truncation.
452 fn name_bytes(&self, name_off: usize) -> std::borrow::Cow<'_, [u8]> {
453 if name_off + BPF_OBJ_NAME_LEN <= self.copied {
454 std::borrow::Cow::Borrowed(&self.buf[name_off..name_off + BPF_OBJ_NAME_LEN])
455 } else {
456 let mut name_buf = vec![0u8; BPF_OBJ_NAME_LEN];
457 self.mem
458 .read_bytes(self.map_pa + name_off as u64, &mut name_buf);
459 std::borrow::Cow::Owned(name_buf)
460 }
461 }
462}
463
464/// Enumerate all BPF maps in the kernel's `map_idr` xarray.
465///
466/// Returns metadata for every map whose KVA can be translated.
467/// No filtering by type or name — callers select from the result.
468///
469/// `value_kva` is populated for `BPF_MAP_TYPE_ARRAY` (inline
470/// `bpf_array.value`) and `BPF_MAP_TYPE_STRUCT_OPS`
471/// (`kvalue.data` inside `bpf_struct_ops_map`). All other map types
472/// resolve to `None` — they require dedicated walkers
473/// ([`iter_htab_entries`] for HASH, [`super::arena::snapshot_arena`]
474/// for ARENA, …).
475pub(crate) fn find_all_bpf_maps(ctx: &AccessorCtx<'_>, map_idr_kva: u64) -> Vec<BpfMapInfo> {
476 let idr_pa = text_kva_to_pa_with_base(map_idr_kva, ctx.start_kernel_map, ctx.phys_base);
477 let offsets = ctx.offsets;
478
479 let xa_head = ctx.mem.read_u64(idr_pa, offsets.idr_xa_head);
480 if xa_head == 0 {
481 return Vec::new();
482 }
483 let idr_next = ctx.mem.read_u32(idr_pa, offsets.idr_next).min(65536);
484
485 let mut maps = Vec::new();
486
487 for id in 0..idr_next {
488 let Some(entry) = xa_load(
489 ctx.mem,
490 ctx.page_offset.0,
491 xa_head,
492 id as u64,
493 offsets.xa_node_slots,
494 offsets.xa_node_shift,
495 ) else {
496 continue;
497 };
498 if entry == 0 {
499 continue;
500 }
501
502 let Some(map_pa) = translate_any_kva(
503 ctx.mem,
504 ctx.cr3_pa.0,
505 ctx.page_offset.0,
506 entry,
507 ctx.l5,
508 ctx.tcr_el1,
509 ) else {
510 continue;
511 };
512
513 // Batch the metadata reads: one `read_bytes` covering every
514 // field we touch on `struct bpf_map` collapses ~10 individual
515 // volatile scalar reads into one bulk copy + local indexed
516 // parses, saving 9 page-lookup + bounds-check round trips per
517 // map per IDR walk.
518 let meta = MapMetadata::read(ctx.mem, map_pa, offsets);
519
520 let map_type = meta.u32_at(offsets.map_type);
521 let map_flags = meta.u32_at(offsets.map_flags);
522 let key_size = meta.u32_at(offsets.key_size);
523 let value_size = meta.u32_at(offsets.value_size);
524 let max_entries = meta.u32_at(offsets.max_entries);
525
526 let mut name_bytes = [0u8; BPF_OBJ_NAME_LEN];
527 name_bytes.copy_from_slice(&meta.name_bytes(offsets.map_name));
528 let name_len = name_bytes
529 .iter()
530 .position(|&b| b == 0)
531 .unwrap_or(BPF_OBJ_NAME_LEN) as u8;
532
533 // value_kva is the start KVA the renderer reads value bytes
534 // from. Two map types populate it:
535 //
536 // * `BPF_MAP_TYPE_ARRAY`: `bpf_array` embeds `bpf_map` at
537 // offset 0 and the value flex array is inline at
538 // `bpf_array.value`.
539 // * `BPF_MAP_TYPE_STRUCT_OPS`: `bpf_struct_ops_map` embeds
540 // `kvalue` (a `bpf_struct_ops_value`) inline; the registered
541 // kernel struct lives at `kvalue.data`. `map->btf_value_type_id`
542 // describes only the data payload, not the prefixing
543 // `bpf_struct_ops_common_value`, so value_kva points at
544 // `data` and the renderer reads `value_size - data_off` bytes
545 // to fit the typed shape.
546 //
547 // Other map types (HASH, RINGBUF, ARENA, …) have no contiguous
548 // value region the renderer can read with a single offset/len
549 // pair — they use dedicated walkers (`iter_hash_map`,
550 // `read_arena_pages`, …).
551 let value_kva = match map_type {
552 BPF_MAP_TYPE_ARRAY => Some(entry + offsets.array_value as u64),
553 BPF_MAP_TYPE_STRUCT_OPS => offsets
554 .struct_ops_offsets
555 .as_ref()
556 .map(|so| entry + so.kvalue as u64 + so.value_data as u64),
557 _ => None,
558 };
559
560 let btf_kva = meta.u64_at(offsets.map_btf);
561 let btf_value_type_id = meta.u32_at(offsets.map_btf_value_type_id);
562 // `btf_vmlinux_value_type_id` lives at offset 0 only when the
563 // resolver couldn't locate the field (kernel built without
564 // CONFIG_BPF_JIT). Treat offset 0 as "unresolved" — reading
565 // u32 at offset 0 of `struct bpf_map` would alias `map_type`,
566 // which is decidedly NOT a btf type id. The STRUCT_OPS arm
567 // checks for non-zero before using.
568 let btf_vmlinux_value_type_id = if offsets.map_btf_vmlinux_value_type_id != 0 {
569 meta.u32_at(offsets.map_btf_vmlinux_value_type_id)
570 } else {
571 0
572 };
573 let btf_key_type_id = meta.u32_at(offsets.map_btf_key_type_id);
574
575 maps.push(BpfMapInfo {
576 map_pa,
577 map_kva: entry,
578 name_bytes,
579 name_len,
580 map_type,
581 map_flags,
582 key_size,
583 value_size,
584 max_entries,
585 value_kva,
586 btf_kva,
587 btf_value_type_id,
588 btf_vmlinux_value_type_id,
589 btf_key_type_id,
590 });
591 }
592
593 maps
594}
595
596/// Find the first BPF ARRAY map whose name ends with `name_suffix`.
597///
598/// Only returns `BPF_MAP_TYPE_ARRAY` maps. Use [`find_all_bpf_maps`]
599/// to enumerate maps of all types.
600///
601/// Walks the IDR directly and short-circuits on `map_type` for each
602/// candidate before reading the rest of the struct fields: an
603/// allocator-fast path that skips ~10 scalar reads + the
604/// `BPF_OBJ_NAME_LEN` name copy on every non-ARRAY map. The kernel
605/// typically registers many non-ARRAY maps (HASH, LRU_HASH,
606/// PERCPU_HASH, RINGBUF, ARENA, …) alongside the small set of ARRAY
607/// maps the failure-dump renderer reaches through, so the savings
608/// are proportional to the reject rate.
609// Production callers go through [`GuestMemMapAccessor::find_array_map`] /
610// [`BpfMapAccessor::maps`]; this single-shot variant is preserved
611// for the `bpf_map::tests` suite that exercises the IDR walk
612// directly.
613#[allow(dead_code)]
614pub(crate) fn find_bpf_map(
615 ctx: &AccessorCtx<'_>,
616 map_idr_kva: u64,
617 name_suffix: &str,
618) -> Option<BpfMapInfo> {
619 let idr_pa = text_kva_to_pa_with_base(map_idr_kva, ctx.start_kernel_map, ctx.phys_base);
620 let offsets = ctx.offsets;
621
622 let xa_head = ctx.mem.read_u64(idr_pa, offsets.idr_xa_head);
623 if xa_head == 0 {
624 return None;
625 }
626 let idr_next = ctx.mem.read_u32(idr_pa, offsets.idr_next).min(65536);
627
628 for id in 0..idr_next {
629 let Some(entry) = xa_load(
630 ctx.mem,
631 ctx.page_offset.0,
632 xa_head,
633 id as u64,
634 offsets.xa_node_slots,
635 offsets.xa_node_shift,
636 ) else {
637 continue;
638 };
639 if entry == 0 {
640 continue;
641 }
642
643 let Some(map_pa) = translate_any_kva(
644 ctx.mem,
645 ctx.cr3_pa.0,
646 ctx.page_offset.0,
647 entry,
648 ctx.l5,
649 ctx.tcr_el1,
650 ) else {
651 continue;
652 };
653
654 // Short-circuit on map_type before reading the rest of the
655 // struct: a non-ARRAY map is rejected with one volatile u32
656 // read instead of the bulk `read_bytes` over `MAP_METADATA_SPAN`
657 // bytes plus the [`BPF_OBJ_NAME_LEN`] name copy that the full
658 // metadata batch would do.
659 //
660 // [`find_bpf_map`] is reached only by direct callers (tests
661 // today; future single-shot probes that don't want to pay the
662 // [`find_all_bpf_maps`] IDR walk). The freeze hot path in
663 // production goes through [`GuestMemMapAccessor::find_array_map`] /
664 // [`BpfMapAccessor::maps`], which build and consult the
665 // per-accessor [`maps_cache`] populated by
666 // [`find_all_bpf_maps`]; that path does the full bulk read
667 // for every map and the type filter happens during the
668 // post-walk scan, not here.
669 let map_type = ctx.mem.read_u32(map_pa, offsets.map_type);
670 if map_type != BPF_MAP_TYPE_ARRAY {
671 continue;
672 }
673
674 // Wrong-name candidates also bail before the heavy reads.
675 let mut name_buf = [0u8; BPF_OBJ_NAME_LEN];
676 ctx.mem
677 .read_bytes(map_pa + offsets.map_name as u64, &mut name_buf);
678 let name_len = name_buf
679 .iter()
680 .position(|&b| b == 0)
681 .unwrap_or(BPF_OBJ_NAME_LEN);
682 let name_str = std::str::from_utf8(&name_buf[..name_len]).unwrap_or("");
683 if !name_str.ends_with(name_suffix) {
684 continue;
685 }
686
687 // Match: now do the full batched metadata read for the
688 // remaining fields and build the BpfMapInfo.
689 let meta = MapMetadata::read(ctx.mem, map_pa, offsets);
690 // `name_buf` is already `[u8; BPF_OBJ_NAME_LEN]` and was
691 // populated by the pre-batch name read above; reuse it
692 // directly instead of round-tripping through a fresh
693 // zero-init + copy_from_slice.
694 let name_bytes = name_buf;
695 let value_kva = Some(entry + offsets.array_value as u64);
696 let btf_vmlinux_value_type_id = if offsets.map_btf_vmlinux_value_type_id != 0 {
697 meta.u32_at(offsets.map_btf_vmlinux_value_type_id)
698 } else {
699 0
700 };
701 return Some(BpfMapInfo {
702 map_pa,
703 map_kva: entry,
704 name_bytes,
705 name_len: name_len as u8,
706 map_type,
707 map_flags: meta.u32_at(offsets.map_flags),
708 key_size: meta.u32_at(offsets.key_size),
709 value_size: meta.u32_at(offsets.value_size),
710 max_entries: meta.u32_at(offsets.max_entries),
711 value_kva,
712 btf_kva: meta.u64_at(offsets.map_btf),
713 btf_value_type_id: meta.u32_at(offsets.map_btf_value_type_id),
714 btf_vmlinux_value_type_id,
715 btf_key_type_id: meta.u32_at(offsets.map_btf_key_type_id),
716 });
717 }
718 None
719}
720
/// Robustness bound, in bytes, on a single value-region allocation.
/// `value_size` is read live from kernel memory via page-table
/// translation, so a torn read mid-update, a stale/wrong offset-table
/// entry for the running kernel, or a corrupted-pointer chase could
/// yield a garbage `value_size` — up to ~4 GiB, the u32 max. Capping
/// before the `vec![0u8; len]` allocation keeps a mis-read from
/// driving a multi-gigabyte allocation on the freeze hot path. 16 MiB
/// covers every realistic scheduler-scale map (a global-section `.bss`
/// ARRAY is the largest practical value, KiB–low-MiB); kernel-legal
/// non-percpu ARRAYs can exceed it (up to INT_MAX), so the cap is a
/// robustness heuristic, not a kernel limit.
const MAX_VALUE_SIZE: usize = 16 * 1024 * 1024;
733
734/// BPF-map I/O wrapper around [`super::kva_io::chunked_kva_io`] that
735/// supplies the `(cr3_pa, l5, tcr_el1, mem)` translator the BPF map
736/// path needs.
737///
738/// The shared helper covers the page-boundary chunking; this thin
739/// wrapper plumbs the per-accessor translation context through. See
740/// the shared helper's docs for the chunking semantics and the
741/// caller-side bytes-tracking contract.
742fn chunked_kva_io<F>(ctx: &AccessorCtx<'_>, target_kva: u64, len: usize, chunk_fn: F) -> bool
743where
744 F: FnMut(u64, u64, usize),
745{
746 super::kva_io::chunked_kva_io(
747 |kva| {
748 ctx.mem
749 .translate_kva(ctx.cr3_pa.0, Kva(kva), ctx.l5, ctx.tcr_el1)
750 },
751 target_kva,
752 len,
753 chunk_fn,
754 )
755}
756
757/// Write bytes to a BPF map's value region at `offset`.
758///
759/// Translates the value KVA (vmalloc'd for .bss maps) through the
760/// page table to find the guest physical address, then writes directly.
761/// Returns `false` if the map has no value KVA (non-ARRAY map),
762/// `offset + data.len()` exceeds `value_size`, or any page in the
763/// range is unmapped. Uses [`chunked_kva_io`] to pay one translate per
764/// 4 KiB page rather than one per byte.
765pub(crate) fn write_bpf_map_value(
766 ctx: &AccessorCtx<'_>,
767 map_info: &BpfMapInfo,
768 offset: usize,
769 data: &[u8],
770) -> bool {
771 let Some(base_kva) = map_info.value_kva else {
772 return false;
773 };
774 // checked_add against pathological offset+len that would
775 // wrap usize. Without the check, a wrap would silently make
776 // `> value_size` false and the chunked write would walk
777 // arbitrary KVAs.
778 let Some(end) = offset.checked_add(data.len()) else {
779 return false;
780 };
781 if end > map_info.value_size as usize {
782 return false;
783 }
784 let target_kva = base_kva + offset as u64;
785
786 let mut bytes_written: usize = 0;
787 let walked = chunked_kva_io(ctx, target_kva, data.len(), |pa, src_off, chunk_len| {
788 // One `copy_nonoverlapping` per chunk replaces the old per-
789 // byte `write_u8` loop. The chunk has already been
790 // bounds-checked against `value_size`, so a short return
791 // here means the page crosses end-of-DRAM. Track bytes_written
792 // and surface the short-write to the caller as `false` —
793 // mirrors the [`read_bpf_map_value`] symmetry where
794 // `bytes_filled != len` returns `None`. Without this guard a
795 // half-landed write would silently report success.
796 let src_off = src_off as usize;
797 let n = ctx
798 .mem
799 .write_bytes_at(pa, 0, &data[src_off..src_off + chunk_len]);
800 bytes_written = bytes_written.saturating_add(n);
801 });
802 walked && bytes_written == data.len()
803}
804
805/// Write a u32 to a BPF map's value region at `offset`.
806pub(crate) fn write_bpf_map_value_u32(
807 ctx: &AccessorCtx<'_>,
808 map_info: &BpfMapInfo,
809 offset: usize,
810 val: u32,
811) -> bool {
812 write_bpf_map_value(ctx, map_info, offset, &val.to_ne_bytes())
813}
814
815/// Read bytes from a BPF map's value region at `offset`.
816///
817/// Translates the value KVA (vmalloc'd for .bss maps) through the
818/// page table to find the guest physical address, then reads directly.
819/// Returns `None` if the map has no value KVA (non-ARRAY map),
820/// `offset + len` exceeds `value_size`, or any page in the range
821/// is unmapped. Uses [`chunked_kva_io`] to pay one translate per 4 KiB
822/// page plus one bulk [`GuestMem::read_bytes`] call, instead of one
823/// translate and one-byte copy per byte.
824pub(crate) fn read_bpf_map_value(
825 ctx: &AccessorCtx<'_>,
826 map_info: &BpfMapInfo,
827 offset: usize,
828 len: usize,
829) -> Option<Vec<u8>> {
830 let base_kva = map_info.value_kva?;
831 // checked_add against pathological offset+len that would
832 // wrap usize. See the matching guard on `write_bpf_map_value`
833 // above for the rationale.
834 let end = offset.checked_add(len)?;
835 if end > map_info.value_size as usize {
836 return None;
837 }
838 // Live-read robustness cap before allocation (see MAX_VALUE_SIZE):
839 // a garbage `value_size` (torn / stale-offset read) or a caller
840 // passing a huge `len` would otherwise allocate up to 4 GiB inside
841 // `vec![0u8; len]`.
842 if len > MAX_VALUE_SIZE {
843 return None;
844 }
845 read_kva_bytes(ctx, base_kva + offset as u64, len)
846}
847
848/// Read the value bytes for one entry of a multi-entry
849/// `BPF_MAP_TYPE_ARRAY` map.
850///
851/// Entries are contiguous starting at `bpf_array.value` with a
852/// per-entry stride of `round_up(value_size, 8)` — the kernel's
853/// `array->elem_size` (kernel/bpf/arraymap.c:93 sets it, :167-176
854/// indexes with it). The value region spans `max_entries *
855/// elem_size`. Unlike [`read_bpf_map_value`], whose bound is one
856/// entry's `value_size`, this reads entry `key` at
857/// `value_kva + key * stride`.
858///
859/// Returns `None` when the map is not `BPF_MAP_TYPE_ARRAY`,
860/// `key >= max_entries` (the kernel's `index_mask` is a Spectre
861/// bound, not a range check — `array_map_lookup_elem` rejects
862/// `index >= max_entries` BEFORE masking, so this replicates the
863/// pre-mask test), `value_size` exceeds `MAX_VALUE_SIZE`, no value
864/// KVA was resolved, the offset would overflow, or any page in the
865/// entry is unmapped. On success the buffer is exactly `value_size`
866/// bytes — the 8-rounded stride is internal padding the kernel's
867/// `copy_map_value` does not copy.
868pub(crate) fn read_bpf_map_array_value(
869 ctx: &AccessorCtx<'_>,
870 map_info: &BpfMapInfo,
871 key: u32,
872) -> Option<Vec<u8>> {
873 if map_info.map_type != BPF_MAP_TYPE_ARRAY {
874 return None;
875 }
876 // Replicate array_map_lookup_elem's pre-mask `index >= max_entries`
877 // rejection (kernel/bpf/arraymap.c:172) — never trust index_mask
878 // to clamp; it only blocks speculation.
879 if key >= map_info.max_entries {
880 return None;
881 }
882 let value_size = map_info.value_size as usize;
883 // Live-read robustness cap before any allocation, matching
884 // `read_bpf_map_value` (see MAX_VALUE_SIZE).
885 if value_size > MAX_VALUE_SIZE {
886 return None;
887 }
888 let base_kva = map_info.value_kva?;
889 // Per-entry stride is round_up(value_size, 8) = array->elem_size
890 // (kernel/bpf/arraymap.c:93). Same `(x + 7) & !7` rounding the
891 // percpu stride math uses in bpf_syscall.rs.
892 let stride = (value_size + 7) & !7;
893 // key < max_entries (checked above) keeps key * stride within the
894 // `max_entries * stride` value region; checked arithmetic guards a
895 // corrupted max_entries from wrapping the KVA past u64.
896 let offset = (key as u64).checked_mul(stride as u64)?;
897 let target_kva = base_kva.checked_add(offset)?;
898 read_kva_bytes(ctx, target_kva, value_size)
899}
900
901/// Page-walk `len` bytes from guest kernel-virtual address
902/// `target_kva` into a fresh buffer, resolving each 4 KiB page through
903/// `translate`. The translator abstracts WHICH KVA→PA strategy
904/// applies: [`read_kva_bytes`] passes the PTE-only `translate_kva`
905/// (vmalloc'd `.bss` value region); [`read_percpu_value_bytes`]
906/// passes `translate_any_kva` (direct-mapping-first, for per-CPU
907/// values that may live in either the direct mapping or vmalloc).
908///
909/// Walking page-by-page is mandatory, not an optimization. A
910/// vmalloc-backed range (a `.bss` map's value region, or a large
911/// dynamic per-CPU allocation) occupies physically discontiguous
912/// order-0 frames, so a value crossing a page boundary lives in
913/// non-adjacent guest physical memory. A single translate of the
914/// first page followed by one bulk read of `len` bytes would copy
915/// whatever frame happens to sit after the first page — garbage — for
916/// every byte past that boundary.
917///
918/// Returns `None` if any page is unmapped (`translate` returns
919/// `None`) or the copy short-reads (`read_bytes` returns fewer bytes
920/// than the chunk at end-of-DRAM); the buffer is adopted via
921/// `set_len` only once every byte is proven written. Performs NO
922/// semantic bounds check: `target_kva` must lie in the value region.
923/// Every value-region caller caps `len`/`value_size` against
924/// `MAX_VALUE_SIZE` before allocating — the `.bss`/ARRAY paths
925/// ([`read_bpf_map_value`], [`read_bpf_map_array_value`]) and the
926/// per-CPU paths ([`read_percpu_array_value`] and the PERCPU-HASH
927/// walker via `walk_htab`) alike — because `value_size` is read live
928/// from kernel memory and a mis-read could otherwise drive a huge
929/// allocation (per CPU, in the per-CPU case).
930fn read_kva_bytes_with<T: Fn(u64) -> Option<u64>>(
931 mem: &GuestMem,
932 translate: T,
933 target_kva: u64,
934 len: usize,
935) -> Option<Vec<u8>> {
936 // `Vec::with_capacity` reserves backing storage without zeroing
937 // — the zero-fill that `vec![0u8; len]` would have emitted is
938 // wasted because every byte gets overwritten by the
939 // `read_bytes` calls below. The buffer's length stays at zero
940 // until we've proven every chunk wrote, then `set_len(len)`
941 // adopts the populated bytes.
942 let mut buf: Vec<u8> = Vec::with_capacity(len);
943
944 // Safety / correctness: `chunked_kva_io` returns false when any
945 // page in the range is unmapped; propagate that to None so callers
946 // see "unreadable" rather than a partial buffer.
947 let buf_ptr = buf.as_mut_ptr();
948 let mut bytes_filled: usize = 0;
949 let ok = super::kva_io::chunked_kva_io(translate, target_kva, len, |pa, dst_off, chunk_len| {
950 // SAFETY: dst_off + chunk_len <= len <= buf.capacity(); the
951 // slice borrows the heap-allocated Vec whose backing storage
952 // is live for the duration of this call (the Vec is pinned in
953 // `buf` above and reborrowed here only through its mutable
954 // pointer). The slice covers reserved-but-uninitialized
955 // memory; `read_bytes` writes every byte before any read of
956 // the slice, and the outer code only adopts the bytes via
957 // `set_len` once `bytes_filled == len`.
958 let slice =
959 unsafe { std::slice::from_raw_parts_mut(buf_ptr.add(dst_off as usize), chunk_len) };
960 // GuestMem::read_bytes returns the count actually copied; the
961 // page was confirmed mapped by `translate`, so a short read
962 // here means the page crosses end-of-DRAM, which the original
963 // byte loop would also have silently short-copied.
964 let n = mem.read_bytes(pa, slice);
965 // `saturating_add` so a pathological accumulation past
966 // `usize::MAX` clamps and the `bytes_filled != len` check
967 // below still surfaces the short read instead of wrapping
968 // back to a value that aliases `len`.
969 bytes_filled = bytes_filled.saturating_add(n);
970 });
971 if !ok || bytes_filled != len {
972 return None;
973 }
974 // SAFETY: every byte in `0..len` of `buf`'s backing storage was
975 // written by the `read_bytes` calls above (`bytes_filled == len`
976 // proves it), the capacity is `len`, and u8 has no validity
977 // invariants.
978 unsafe {
979 buf.set_len(len);
980 }
981 Some(buf)
982}
983
984/// Page-walk `len` bytes from a `.bss`/ARRAY value-region KVA via the
985/// PTE-only `translate_kva` (the value region is vmalloc'd, never in
986/// the direct mapping).
987///
988/// Shared by [`read_bpf_map_value`] (byte-range reads bounded by one
989/// entry's `value_size`) and [`read_bpf_map_array_value`] (per-entry
990/// reads into a multi-entry ARRAY bounded by `max_entries * stride`).
991/// Performs NO semantic bounds check — see [`read_kva_bytes_with`].
992fn read_kva_bytes(ctx: &AccessorCtx<'_>, target_kva: u64, len: usize) -> Option<Vec<u8>> {
993 read_kva_bytes_with(
994 ctx.mem,
995 |kva| {
996 ctx.mem
997 .translate_kva(ctx.cr3_pa.0, Kva(kva), ctx.l5, ctx.tcr_el1)
998 },
999 target_kva,
1000 len,
1001 )
1002}
1003
1004/// Page-walk `len` bytes of one CPU's per-CPU value via
1005/// `translate_any_kva` (direct-mapping-first, then a page-table walk
1006/// for vmalloc'd per-CPU memory).
1007///
1008/// Per-CPU values come from the kernel's dynamic per-CPU allocator:
1009/// the embedded first chunk lives in the direct mapping, while larger
1010/// allocations are vmalloc-backed (`mm/percpu-vm.c` hands out order-0
1011/// pages via `pcpu_get_vm_areas`, so the frames are physically
1012/// discontiguous). Walking page-by-page — [`read_kva_bytes_with`]'s
1013/// core behavior — is therefore required for any value that crosses a
1014/// page boundary; a single translate + bulk read would copy garbage
1015/// past the first page.
1016///
1017/// Returns `None` if any page is unmapped or the read short-reads at
1018/// end-of-DRAM. The end-of-DRAM bound the single-read path checked
1019/// explicitly (`cpu_pa + value_size <= mem.size()`) is subsumed:
1020/// [`GuestMem::read_bytes`] returns 0 for a PA past `mem.size()`, so
1021/// `bytes_filled != len` drops the slot to `None`.
1022fn read_percpu_value_bytes(ctx: &AccessorCtx<'_>, target_kva: u64, len: usize) -> Option<Vec<u8>> {
1023 read_kva_bytes_with(
1024 ctx.mem,
1025 |kva| {
1026 translate_any_kva(
1027 ctx.mem,
1028 ctx.cr3_pa.0,
1029 ctx.page_offset.0,
1030 kva,
1031 ctx.l5,
1032 ctx.tcr_el1,
1033 )
1034 },
1035 target_kva,
1036 len,
1037 )
1038}
1039
1040/// Read a u32 from a BPF map's value region at `offset`.
1041pub(crate) fn read_bpf_map_value_u32(
1042 ctx: &AccessorCtx<'_>,
1043 map_info: &BpfMapInfo,
1044 offset: usize,
1045) -> Option<u32> {
1046 let bytes = read_bpf_map_value(ctx, map_info, offset, 4)?;
1047 Some(u32::from_ne_bytes(bytes.try_into().unwrap()))
1048}
1049
1050/// Read the per-CPU values for a single key in a `BPF_MAP_TYPE_PERCPU_ARRAY` map.
1051///
1052/// `bpf_array.pptrs[key]` holds a `__percpu` pointer. Adding
1053/// `__per_cpu_offset[cpu]` yields the per-CPU KVA, which may live
1054/// either in the direct mapping (static percpu, kmalloc'd percpu)
1055/// or in vmalloc'd memory (large dynamic per-CPU allocations). The
1056/// value is read via [`read_percpu_value_bytes`], which walks it
1057/// page-by-page through [`translate_any_kva`] (direct-mapping-first,
1058/// vmalloc fallback) so a value that misses the direct mapping — or
1059/// that straddles physically discontiguous vmalloc frames — is read
1060/// correctly rather than reading as `None` or copying garbage.
1061///
1062/// Returns one entry per CPU, indexed by CPU number. `Some(bytes)`
1063/// when the per-CPU PA falls within guest memory; `None` when it
1064/// does not. Returns an empty vec if the map is not
1065/// `BPF_MAP_TYPE_PERCPU_ARRAY`, `key >= max_entries`, or the percpu
1066/// pointer is zero.
1067fn read_percpu_array_value(
1068 ctx: &AccessorCtx<'_>,
1069 map: &BpfMapInfo,
1070 key: u32,
1071 per_cpu_offsets: &[u64],
1072) -> Vec<Option<Vec<u8>>> {
1073 if map.map_type != BPF_MAP_TYPE_PERCPU_ARRAY {
1074 return Vec::new();
1075 }
1076 if key >= map.max_entries {
1077 return Vec::new();
1078 }
1079
1080 // pptrs is at the same offset as value (union in bpf_array).
1081 let pptrs_kva = map.map_kva + ctx.offsets.array_value as u64;
1082 // pptrs[key] is a void __percpu * — 8 bytes.
1083 let pptr_kva = pptrs_kva + (key as u64) * 8;
1084
1085 // bpf_array may be kmalloc'd or vmalloc'd — try direct mapping first.
1086 let Some(pptr_pa) = translate_any_kva(
1087 ctx.mem,
1088 ctx.cr3_pa.0,
1089 ctx.page_offset.0,
1090 pptr_kva,
1091 ctx.l5,
1092 ctx.tcr_el1,
1093 ) else {
1094 return Vec::new();
1095 };
1096 let percpu_base = ctx.mem.read_u64(pptr_pa, 0);
1097 if percpu_base == 0 {
1098 return Vec::new();
1099 }
1100
1101 let value_size = map.value_size as usize;
1102 // Robustness cap mirroring read_bpf_map_value / read_bpf_map_array_value
1103 // (and htab.rs's walk_htab): value_size is read live from kernel memory,
1104 // so a torn read mid-update, a stale offset-table entry for the running
1105 // kernel, or a corrupted-pointer chase could yield a garbage size — and
1106 // here it would allocate value_size bytes PER CPU across the loop below.
1107 // The kernel bounds PERCPU value_size well under 16 MiB (map create
1108 // rejects round_up(value_size, 8) > PCPU_MIN_UNIT_SIZE), so this never
1109 // rejects a legal map; it only fires on a mis-read.
1110 if value_size > MAX_VALUE_SIZE {
1111 return Vec::new();
1112 }
1113 let mut result = Vec::with_capacity(per_cpu_offsets.len());
1114
1115 for (cpu_index, &cpu_off) in per_cpu_offsets.iter().enumerate() {
1116 // Out-of-range CPU detection: kernel `setup_per_cpu_areas`
1117 // (e.g. arch/x86/kernel/setup_percpu.c) only writes
1118 // `__per_cpu_offset[cpu]` for cpus in `for_each_possible_cpu`,
1119 // leaving slots beyond `nr_cpu_ids` at the BSS-initialized
1120 // value of 0. Real SMP kernels assign each possible CPU a
1121 // strictly-positive offset (`delta + unit_offsets[cpu]`) for
1122 // cpu > 0 because `unit_offsets[cpu]` is a positive multiple
1123 // of the per-CPU unit size — only the BSP (cpu_index == 0)
1124 // can legitimately observe a zero offset on systems where
1125 // the delta term is zero. Treating `cpu_off == 0 &&
1126 // cpu_index > 0` as out-of-range prevents the prior aliasing
1127 // bug where every out-of-range slot returned CPU 0's bytes
1128 // (because `percpu_base + 0` translated successfully to
1129 // whatever the bare percpu_base pointed at).
1130 if cpu_off == 0 && cpu_index > 0 {
1131 result.push(None);
1132 continue;
1133 }
1134 let cpu_kva = percpu_base.wrapping_add(cpu_off);
1135 // The percpu base + cpu_off may land in either the direct
1136 // mapping (per-CPU __percpu allocations from the static
1137 // percpu region or kmalloc'd percpu blocks) or vmalloc'd
1138 // percpu memory (large dynamic per-CPU allocations served
1139 // from pcpu_get_vm_areas). `read_percpu_value_bytes` walks the
1140 // value page-by-page through `translate_any_kva` (direct
1141 // mapping first, page-table walk for vmalloc'd percpu), so a
1142 // value straddling physically discontiguous vmalloc frames is
1143 // read correctly; it drops the slot to `None` on any unmapped
1144 // page or end-of-DRAM short read.
1145 result.push(read_percpu_value_bytes(ctx, cpu_kva, value_size));
1146 }
1147
1148 result
1149}
1150
1151/// Chase modifiers (Volatile, Const, Typedef, TypeTag, Restrict,
1152/// DeclTag) and pointers from `type_id` to find a Struct or Union.
1153///
1154/// Returns `None` if the chain ends in a type that is neither Struct
1155/// nor Union, or exceeds depth 32. Also resolves through Ptr (for
1156/// pointer-to-struct members).
1157pub(crate) fn resolve_to_struct(btf: &btf_rs::Btf, type_id: u32) -> Option<btf_rs::Struct> {
1158 resolve_to_struct_with_id(btf, type_id).map(|(s, _)| s)
1159}
1160
1161/// Same chain walk as [`resolve_to_struct`] but returns the BTF type
1162/// id of the terminal struct/union instead of the struct value.
1163/// Callers that key data structures on type ids (e.g. the cast
1164/// analyzer's `RegState::Pointer { struct_type_id }`) need the id
1165/// post-peel; callers that need the struct shape use
1166/// [`resolve_to_struct`].
1167pub(crate) fn resolve_to_struct_id(btf: &btf_rs::Btf, type_id: u32) -> Option<u32> {
1168 resolve_to_struct_with_id(btf, type_id).map(|(_, tid)| tid)
1169}
1170
1171/// Shared chain walk for [`resolve_to_struct`] and
1172/// [`resolve_to_struct_id`]. Peels Ptr / Volatile / Const / Typedef /
1173/// TypeTag / Restrict / DeclTag up to depth 32, returning `(struct,
1174/// id)` at the first Struct or Union encountered.
1175fn resolve_to_struct_with_id(btf: &btf_rs::Btf, type_id: u32) -> Option<(btf_rs::Struct, u32)> {
1176 let mut tid = type_id;
1177 for _ in 0..32 {
1178 let t = btf.resolve_type_by_id(tid).ok()?;
1179 match t {
1180 btf_rs::Type::Struct(s) | btf_rs::Type::Union(s) => return Some((s, tid)),
1181 btf_rs::Type::Ptr(_)
1182 | btf_rs::Type::Volatile(_)
1183 | btf_rs::Type::Const(_)
1184 | btf_rs::Type::Typedef(_)
1185 | btf_rs::Type::TypeTag(_)
1186 | btf_rs::Type::Restrict(_)
1187 | btf_rs::Type::DeclTag(_) => {
1188 tid = t.as_btf_type()?.get_type_id()?;
1189 }
1190 _ => return None,
1191 }
1192 }
1193 None
1194}
1195
1196/// Read-only abstraction over BPF map enumeration and value reads
1197/// across data sources. Mutating operations (write_value etc.) are
1198/// inherent on each backend, NOT exposed here — the trait surface is
1199/// a snapshot-style read API used by the failure-dump renderer and
1200/// any future read-only consumer.
1201///
1202/// Two implementations exist today: `GuestMemMapAccessor` (this
1203/// module — reads a frozen guest VM's physical memory) and
1204/// `super::bpf_syscall::BpfSyscallAccessor` (live-host introspection
1205/// via the `bpf()` syscall). Both plug into this trait surface.
1206///
1207/// - `GuestMemMapAccessor` — reads from a frozen guest VM's physical
1208/// memory via PTE walks against the frozen `init_mm`. Used by the
1209/// freeze-coordinator path (`super::dump::dump_state`) on the
1210/// in-VM scheduler test runs. Hash map iteration walks
1211/// `bpf_htab.buckets` directly without RCU; the freeze rendezvous
1212/// IS the ordering primitive (every CPU is parked at a known KVM
1213/// exit before the host begins reading memory). Per-CPU value
1214/// reads use the cached `__per_cpu_offset[cpu]` array; out-of-range
1215/// CPUs surface as `None` rather than aliasing CPU 0 (see
1216/// `read_percpu_array_value`).
1217///
1218/// The live-host backend produces identical
1219/// [`BpfMapInfo`] / byte buffers, so the rendering pipeline
1220/// (`super::btf_render::render_value`) stays data-source-agnostic
1221/// and consumes either accessor through this trait. The
1222/// live-host backend's failure modes are different (e.g. hash reads
1223/// will rely on the kernel's RCU read-side critical section,
1224/// `bpf_map_lookup_elem` rejection for non-readable types) and
1225/// individual method docs spell those out where they matter.
1226///
1227/// `dump_state` currently takes a concrete
1228/// `GuestMemMapAccessor` because its sdt_alloc post-pass walks
1229/// the underlying `super::guest::GuestKernel` — that handle is
1230/// not part of the trait surface. Once sdt_alloc walking moves
1231/// into a backend-specific path, `dump_state` can switch to
1232/// `&dyn BpfMapAccessor`. Other call
1233/// sites that need only the trait surface can already bind on
1234/// `&dyn BpfMapAccessor` (or `<A: BpfMapAccessor>`) without paying
1235/// virtual dispatch.
1236#[allow(dead_code)]
1237pub trait BpfMapAccessor {
1238 /// Enumerate every BPF map visible to this accessor.
1239 ///
1240 /// Order is implementation-defined: the guest-memory backend walks
1241 /// `map_idr` (allocation order); the bpf-syscall backend walks the
1242 /// kernel's id space via `BPF_MAP_GET_NEXT_ID` (also allocation
1243 /// order, modulo concurrent destruction races on the live host).
1244 /// Callers that want a stable view should sort by name.
1245 fn maps(&self) -> Vec<BpfMapInfo>;
1246
1247 /// Find the first BPF map whose name ends with `name_suffix`.
1248 ///
1249 /// Default impl walks [`Self::maps`]. Backends with cheaper
1250 /// targeted lookups can override (e.g. a libbpf-handle-backed
1251 /// accessor that already holds a name index).
1252 fn find_map(&self, name_suffix: &str) -> Option<BpfMapInfo> {
1253 self.maps()
1254 .into_iter()
1255 .find(|m| m.name().ends_with(name_suffix))
1256 }
1257
1258 /// Read a contiguous byte range from a map's value region.
1259 ///
1260 /// Returns `None` for non-readable map types (e.g. ARENA — use
1261 /// [`Self::read_arena_pages`]; HASH — use [`Self::iter_hash_map`])
1262 /// or when the backing read fails. The guest-memory backend's
1263 /// failure modes are unmapped guest pages and out-of-range value
1264 /// regions; the bpf-syscall backend additionally surfaces
1265 /// `bpf_map_lookup_elem` rejection (e.g. `-EINVAL` on
1266 /// arena maps, kernel-side ACL denials).
1267 fn read_value(&self, map: &BpfMapInfo, offset: usize, len: usize) -> Option<Vec<u8>>;
1268
1269 /// Read the value bytes of one entry of a `BPF_MAP_TYPE_ARRAY` map
1270 /// by entry index.
1271 ///
1272 /// Parallels [`Self::read_percpu_array`] (also keyed by entry
1273 /// index) but for a plain ARRAY: one value per key, so the return
1274 /// is a single `Option<Vec<u8>>` rather than a per-CPU vector. On
1275 /// success the buffer is exactly `map.value_size` bytes.
1276 ///
1277 /// Returns `None` for non-ARRAY maps, `key >= map.max_entries`, or
1278 /// when the backing read fails (unmapped guest page on the
1279 /// guest-memory backend; `bpf_map_lookup_elem` rejection on the
1280 /// live-host backend). Distinct from [`Self::read_value`], which
1281 /// stays the byte-range reader for single-entry global-section
1282 /// ARRAYs and STRUCT_OPS (both key 0); multi-entry ARRAY indexing
1283 /// goes through this method so `read_value`'s key-0 contract is
1284 /// untouched.
1285 fn read_array(&self, map: &BpfMapInfo, key: u32) -> Option<Vec<u8>>;
1286
1287 /// Iterate every entry in a `BPF_MAP_TYPE_HASH` or
1288 /// `BPF_MAP_TYPE_LRU_HASH` map.
1289 ///
1290 /// Both share the inline-value `htab_elem` layout
1291 /// (`kernel/bpf/hashtab.c::htab_elem_value`); LRU adds an
1292 /// eviction policy but the value bytes still sit at
1293 /// `key + round_up(key_size, 8)`. Returns an empty vec for any
1294 /// other map type.
1295 ///
1296 /// Per-element atomicity is backend-specific: the guest-memory
1297 /// backend reads raw bytes at the freeze instant (the freeze
1298 /// rendezvous IS the synchronization — no concurrent writers
1299 /// exist while parked vCPUs stay parked); the bpf-syscall backend
1300 /// reads under the kernel's RCU read-side critical section
1301 /// (`bpf_map_lookup_elem` -> `htab_map_lookup_elem`). Both can
1302 /// produce torn views relative to a multi-element transaction
1303 /// the scheduler intended to commit atomically — that's a feature
1304 /// of reading without locking the whole table.
1305 fn iter_hash_map(&self, map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)>;
1306
1307 /// Iterate every entry in a `BPF_MAP_TYPE_PERCPU_HASH` or
1308 /// `BPF_MAP_TYPE_LRU_PERCPU_HASH` map. Returns
1309 /// `(key_bytes, per_cpu_values)` where `per_cpu_values` is one
1310 /// entry per CPU indexed by CPU number; `Some(bytes)` when the
1311 /// CPU's slot is readable, `None` otherwise (unmapped page or
1312 /// out-of-range CPU).
1313 ///
1314 /// Returns an empty vec for any other map type. Default
1315 /// implementation returns empty so backends that haven't yet
1316 /// wired the percpu-hash path don't break trait dispatch — the
1317 /// dump renderer surfaces the resulting empty list as a
1318 /// "no entries" outcome rather than a panic.
1319 fn iter_percpu_hash_map(&self, _map: &BpfMapInfo, _num_cpus: u32) -> PerCpuHashEntries {
1320 Vec::new()
1321 }
1322
1323 /// Iterate every entry in a `BPF_MAP_TYPE_TASK_STORAGE` map (and
1324 /// the shape-identical `INODE_STORAGE` / `SK_STORAGE` /
1325 /// `CGRP_STORAGE` variants — they all use
1326 /// `super::btf_offsets::TaskStorageOffsets`).
1327 ///
1328 /// Returned tuples are `(owner_kva_le_bytes, value_bytes)`:
1329 /// - `owner_kva_le_bytes` is the 8-byte little-endian encoding of
1330 /// the `bpf_local_storage.owner` pointer reached by following
1331 /// each `bpf_local_storage_elem.local_storage`. For
1332 /// `TASK_STORAGE` this is the `task_struct` KVA; for the other
1333 /// variants it is the inode/sock/cgroup KVA. The walker treats
1334 /// it as opaque so the same shape works across all four map
1335 /// types.
1336 /// - `value_bytes` is `value_size` bytes copied from
1337 /// `bpf_local_storage_elem.sdata.data[]` — the value the
1338 /// scheduler stored under this owner.
1339 ///
1340 /// Returns an empty vec for any other map type, when
1341 /// `task_storage_offsets` is unavailable, or when the map's
1342 /// `buckets` pointer cannot be translated. Returns an empty vec
1343 /// for any other map type. Default implementation returns empty
1344 /// so backends that haven't yet wired this path don't break
1345 /// trait dispatch — the dump renderer surfaces the resulting
1346 /// empty list as a "no entries" outcome rather than a panic.
1347 fn iter_task_storage(&self, _map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)> {
1348 Vec::new()
1349 }
1350
1351 /// Read every CPU's value for a key in a `BPF_MAP_TYPE_PERCPU_ARRAY` map.
1352 ///
1353 /// Returns one entry per CPU, indexed by CPU number. `Some(bytes)`
1354 /// when the per-CPU slot is readable; `None` when it isn't (e.g.
1355 /// an out-of-range CPU index — `__per_cpu_offset[cpu]` reads as
1356 /// the BSS-zero sentinel — or an unmapped page on the
1357 /// guest-memory path; the bpf-syscall backend surfaces
1358 /// out-of-range CPU on `bpf_map_lookup_elem` failure). Returns an
1359 /// empty vec for non-PERCPU_ARRAY maps or `key >= max_entries`.
1360 fn read_percpu_array(&self, map: &BpfMapInfo, key: u32, num_cpus: u32) -> Vec<Option<Vec<u8>>>;
1361
1362 /// Snapshot every mapped page of a `BPF_MAP_TYPE_ARENA` map.
1363 ///
1364 /// `arena_offsets` resolves kernel struct field offsets the
1365 /// guest-memory backend uses to walk `bpf_arena -> kern_vm ->
1366 /// vm_struct.addr`; the bpf-syscall backend mmaps the arena fd
1367 /// directly (the only data path the kernel exposes — arena's
1368 /// `lookup_elem` returns `-EINVAL`, see `kernel/bpf/arena.c`)
1369 /// and ignores `arena_offsets`. The default
1370 /// implementation returns an empty snapshot; backends override to
1371 /// produce real content.
1372 fn read_arena_pages(
1373 &self,
1374 _map: &BpfMapInfo,
1375 _arena_offsets: &super::arena::BpfArenaOffsets,
1376 ) -> super::arena::ArenaSnapshot {
1377 super::arena::ArenaSnapshot::default()
1378 }
1379
1380 /// Load the program BTF object referenced by a map.
1381 ///
1382 /// `base_btf` is the host's vmlinux BTF used as the base for
1383 /// split-BTF parsing. Returns `None` when the map carries no
1384 /// program BTF (e.g. kernel-builtin maps), when the BTF blob can't
1385 /// be loaded, or when [`btf_rs::Btf::from_bytes`] /
1386 /// [`btf_rs::Btf::from_split_bytes`] reject the bytes.
1387 ///
1388 /// The default implementation returns `None`; backends override to
1389 /// hand back a parsed [`btf_rs::Btf`].
1390 fn load_program_btf(&self, _map: &BpfMapInfo, _base_btf: &btf_rs::Btf) -> Option<btf_rs::Btf> {
1391 None
1392 }
1393}
1394
1395/// Host-side BPF map accessor backed by direct guest physical-memory
1396/// reads.
1397///
1398/// Resolves BTF offsets for BPF map structures and provides map
1399/// discovery, value read/write, hash iteration, and per-CPU reads.
1400/// Uses a [`GuestKernel`] for address translation (PTE walks against
1401/// the guest's frozen page tables).
1402///
1403/// Implements the [`BpfMapAccessor`] trait so [`super::dump::dump_state`]
1404/// can dispatch through it without committing to a backend at the call
1405/// site.
1406///
1407/// [`GuestKernel`]: super::guest::GuestKernel
pub struct GuestMemMapAccessor<'a> {
    /// Guest kernel handle. Supplies the guest memory mapping, the
    /// paging parameters copied into every `AccessorCtx`, and symbol
    /// lookup (`map_idr`, `__per_cpu_offset`).
    kernel: &'a super::guest::GuestKernel,
    /// Kernel virtual address of the `map_idr` symbol. Handed to
    /// `find_all_bpf_maps` as the starting point of the map
    /// enumeration.
    map_idr_kva: u64,
    /// Borrowed from the `GuestMemMapAccessorOwned` that produced this
    /// accessor via `as_accessor`, or provided by the caller to
    /// `from_guest_kernel`. Borrowing means building an accessor never
    /// clones the `BpfMapOffsets` table.
    offsets: &'a BpfMapOffsets,
    /// Optional borrow of the `__per_cpu_offset` cache owned by the
    /// `GuestMemMapAccessorOwned` wrapper. The cache keeps the
    /// resolved `Vec<u64>` for the most recent `num_cpus`, so repeat
    /// percpu reads (one ARRAY map + several PERCPU_HASH maps in a
    /// single dump) don't re-issue the `read_per_cpu_offsets` array
    /// read for every map.
    ///
    /// `None` for accessors built directly via
    /// [`Self::from_guest_kernel`] (which has no owner to host the
    /// cache); on that path each percpu method reads the offsets
    /// afresh. The cached vec stores RAW offsets — consumers (e.g.
    /// [`read_percpu_array_value`]) still apply the BSS-zero-tail
    /// guard `cpu_off == 0 && cpu_index > 0` to skip aliased CPU
    /// slots.
    per_cpu_offsets_cache: Option<&'a PerCpuOffsetsCache>,
    /// Per-accessor (per-dump) cache of [`find_all_bpf_maps`].
    /// Each `as_accessor()` / `from_guest_kernel` call constructs
    /// a fresh empty cache, so the cache lifetime matches one
    /// dump. Between dumps the guest kernel runs and can create /
    /// destroy maps; persisting the cache across the
    /// [`GuestMemMapAccessorOwned`] lifetime would return stale
    /// entries for freed maps. The borrowed accessor's per-dump
    /// lifetime is exactly the right scope.
    ///
    /// `Mutex<Option<...>>` rather than `RefCell` because the
    /// trait surface uses `&self` and any cross-thread caller
    /// (today a single-threaded freeze coordinator, possibly a
    /// concurrent dump pipeline later) needs the accessor to be
    /// `Sync`. Note that the lock is held while the cache is being
    /// populated — i.e. across the whole `find_all_bpf_maps` walk on
    /// the first lookup — and while the result is scanned or cloned,
    /// so concurrent callers would queue behind the first walk
    /// rather than repeat it.
    maps_cache: std::sync::Mutex<Option<std::sync::Arc<Vec<BpfMapInfo>>>>,
}
1449
1450/// Per-`(num_cpus, accessor)` cache of the resolved `__per_cpu_offset`
1451/// array. Lives on [`GuestMemMapAccessorOwned`] so a single freeze-
1452/// dump session amortizes one array read across every percpu map
1453/// access (PERCPU_ARRAY value reads, PERCPU_HASH iteration).
1454///
1455/// Storage: a single-slot cache keyed on the `num_cpus` argument
1456/// every method passes. Different `num_cpus` values overwrite the
1457/// previous slot (we re-resolve from guest memory). Production
1458/// callers pass a constant `num_cpus` for the run, so this is a
1459/// pure win on the freeze hot path; the test path never builds an
1460/// `Owned` wrapper and so doesn't see the cache.
1461///
1462/// Synchronization: `Mutex<Option<...>>` keeps the cache safe for
1463/// the trait's `&self` methods and for any future cross-thread
1464/// dump pipeline (today the freeze coordinator is single-threaded
1465/// for reads, but a `Sync` trait surface lets the cache work even
1466/// when the assumption changes). Contention is non-existent — the
1467/// lock is held only for the duration of one Vec move.
1468#[allow(dead_code)]
1469pub(crate) struct PerCpuOffsetsCache {
1470 inner: std::sync::Mutex<Option<(u32, std::sync::Arc<Vec<u64>>)>>,
1471}
1472
1473#[allow(dead_code)]
1474impl PerCpuOffsetsCache {
1475 pub(crate) fn new() -> Self {
1476 Self {
1477 inner: std::sync::Mutex::new(None),
1478 }
1479 }
1480
1481 /// Resolve `__per_cpu_offset[]` once per `(num_cpus, accessor)`
1482 /// and reuse on subsequent calls with the same `num_cpus`. The
1483 /// closure runs only on a miss (or a `num_cpus` change); its
1484 /// return value is shared via `Arc` so concurrent borrowers see
1485 /// the same vec without holding the mutex across reads.
1486 pub(crate) fn get_or_init<F>(&self, num_cpus: u32, init: F) -> std::sync::Arc<Vec<u64>>
1487 where
1488 F: FnOnce() -> Vec<u64>,
1489 {
1490 let mut guard = self.inner.lock_unpoisoned();
1491 if let Some((cached_n, cached)) = guard.as_ref()
1492 && *cached_n == num_cpus
1493 {
1494 return cached.clone();
1495 }
1496 let arc = std::sync::Arc::new(init());
1497 *guard = Some((num_cpus, arc.clone()));
1498 arc
1499 }
1500}
1501
1502#[allow(dead_code)]
1503impl<'a> GuestMemMapAccessor<'a> {
1504 /// Create from an existing [`GuestKernel`] and a caller-owned
1505 /// [`BpfMapOffsets`].
1506 ///
1507 /// The accessor borrows the offsets for its lifetime, so callers
1508 /// typically stash them in a `GuestMemMapAccessorOwned` (or another
1509 /// stable location) before calling this. Build `offsets` once via
1510 /// [`BpfMapOffsets::from_vmlinux`] and reuse — they're per-kernel,
1511 /// not per-call.
1512 ///
1513 /// [`GuestKernel`]: super::guest::GuestKernel
1514 pub fn from_guest_kernel(
1515 kernel: &'a super::guest::GuestKernel,
1516 offsets: &'a BpfMapOffsets,
1517 ) -> anyhow::Result<Self> {
1518 let map_idr_kva = kernel
1519 .symbol_kva("map_idr")
1520 .ok_or_else(|| anyhow::anyhow!("map_idr symbol not found in vmlinux"))?;
1521
1522 Ok(Self {
1523 kernel,
1524 map_idr_kva,
1525 offsets,
1526 per_cpu_offsets_cache: None,
1527 maps_cache: std::sync::Mutex::new(None),
1528 })
1529 }
1530
1531 /// Build a `GuestMemMapAccessor` for unit tests, bypassing the
1532 /// `map_idr` symbol lookup `from_guest_kernel` performs.
1533 ///
1534 /// Cross-module tests for the per-map render helpers
1535 /// (`render_ringbuf_state`, `render_stack_traces`,
1536 /// `render_fd_array_slots`) and for `iter_percpu_hash_map` need
1537 /// an accessor over a synthetic `GuestKernel`. The production
1538 /// `from_guest_kernel` requires the kernel to expose a `map_idr`
1539 /// symbol, which synthetic kernels constructed via
1540 /// `GuestKernel::new_for_test` typically do not. This
1541 /// constructor takes `map_idr_kva` directly so the caller can
1542 /// pass `0` (the per-map render helpers never read through the
1543 /// map_idr) or a known-good KVA when exercising
1544 /// `find_all_bpf_maps`.
1545 #[cfg(test)]
1546 pub(crate) fn new_for_test(
1547 kernel: &'a super::guest::GuestKernel,
1548 offsets: &'a BpfMapOffsets,
1549 map_idr_kva: u64,
1550 ) -> Self {
1551 Self {
1552 kernel,
1553 map_idr_kva,
1554 offsets,
1555 per_cpu_offsets_cache: None,
1556 maps_cache: std::sync::Mutex::new(None),
1557 }
1558 }
1559
1560 /// Build the [`AccessorCtx`] used by every map-read/write routine.
1561 fn ctx(&self) -> AccessorCtx<'_> {
1562 AccessorCtx {
1563 mem: self.kernel.mem(),
1564 cr3_pa: Cr3Pa(self.kernel.cr3_pa()),
1565 page_offset: PageOffset(self.kernel.page_offset()),
1566 offsets: self.offsets,
1567 l5: self.kernel.l5(),
1568 tcr_el1: self.kernel.tcr_el1(),
1569 start_kernel_map: self.kernel.start_kernel_map(),
1570 phys_base: self.kernel.phys_base(),
1571 iter_max: MAP_WALK_ITER_MAX,
1572 }
1573 }
1574
1575 /// Borrow the resolved BPF map field offsets. Used by callers
1576 /// that need to read kernel struct fields (e.g. `struct btf` for
1577 /// the program-BTF loader) without going through the
1578 /// map-access trait surface.
1579 pub fn offsets(&self) -> &BpfMapOffsets {
1580 self.offsets
1581 }
1582
1583 /// Borrow the underlying [`super::guest::GuestKernel`] for callers
1584 /// that need direct access to symbol resolution / page-walk
1585 /// primitives outside the map-discovery surface (e.g. arena page
1586 /// enumeration in [`super::arena`], sdt_alloc tree walks).
1587 pub fn kernel(&self) -> &'a super::guest::GuestKernel {
1588 self.kernel
1589 }
1590
1591 /// Find the first BPF ARRAY map whose name ends with `name_suffix`.
1592 ///
1593 /// Only returns `BPF_MAP_TYPE_ARRAY` maps — distinct from the
1594 /// suffix-only [`BpfMapAccessor::find_map`] trait method. The
1595 /// distinct name keeps inherent-over-trait method resolution
1596 /// honest: a concrete-receiver caller that wants the ARRAY
1597 /// filter (value-region read/write needs `value_kva`, which is
1598 /// `Some` only for ARRAY maps) names it explicitly here, and the
1599 /// compiler errors instead of silently shadowing the trait
1600 /// method when the receiver type changes. Use
1601 /// [`BpfMapAccessor::maps`] to enumerate maps of all types.
1602 /// Goes through the per-accessor maps cache so repeat
1603 /// `find_array_map` calls within one dump amortize the IDR walk.
1604 pub fn find_array_map(&self, name_suffix: &str) -> Option<BpfMapInfo> {
1605 let mut guard = self.maps_cache.lock_unpoisoned();
1606 if guard.is_none() {
1607 *guard = Some(std::sync::Arc::new(find_all_bpf_maps(
1608 &self.ctx(),
1609 self.map_idr_kva,
1610 )));
1611 }
1612 guard
1613 .as_ref()
1614 .unwrap()
1615 .iter()
1616 .find(|m| m.map_type == BPF_MAP_TYPE_ARRAY && m.name().ends_with(name_suffix))
1617 .cloned()
1618 }
1619
1620 /// Write bytes to a map's value region.
1621 ///
1622 /// Returns `false` if the map has no value KVA (non-ARRAY map)
1623 /// or any page in the range is unmapped.
1624 pub fn write_value(&self, map: &BpfMapInfo, offset: usize, data: &[u8]) -> bool {
1625 write_bpf_map_value(&self.ctx(), map, offset, data)
1626 }
1627
1628 /// Write a u32 to a map's value region.
1629 pub fn write_value_u32(&self, map: &BpfMapInfo, offset: usize, val: u32) -> bool {
1630 write_bpf_map_value_u32(&self.ctx(), map, offset, val)
1631 }
1632
1633 /// Read a u32 from a map's value region.
1634 pub fn read_value_u32(&self, map: &BpfMapInfo, offset: usize) -> Option<u32> {
1635 read_bpf_map_value_u32(&self.ctx(), map, offset)
1636 }
1637
1638 /// Resolve `__per_cpu_offset[]` for `num_cpus` CPUs, using the
1639 /// owner-side cache when present.
1640 ///
1641 /// Returns `None` only when the `__per_cpu_offset` symbol is
1642 /// missing from the vmlinux symtab — every other failure mode
1643 /// (out-of-bounds reads, BSS-zero tail entries) surfaces as
1644 /// zero offsets that the caller's BSS-zero guard rejects, so
1645 /// the cache stores the raw resolved vec without filtering.
1646 /// The returned `Arc` lets the cache hand out the same vec to
1647 /// multiple concurrent readers (the freeze hot path is
1648 /// single-threaded today, but the cache surface is `Sync` for
1649 /// future cross-thread use).
1650 pub(crate) fn resolve_per_cpu_offsets(
1651 &self,
1652 num_cpus: u32,
1653 ) -> Option<std::sync::Arc<Vec<u64>>> {
1654 let pco_kva = self.kernel.symbol_kva("__per_cpu_offset")?;
1655 let pco_pa = self.kernel.text_kva_to_pa(pco_kva);
1656 let mem = self.kernel.mem();
1657 match self.per_cpu_offsets_cache {
1658 Some(cache) => Some(cache.get_or_init(num_cpus, || {
1659 super::symbols::read_per_cpu_offsets(mem, pco_pa, num_cpus)
1660 })),
1661 None => Some(std::sync::Arc::new(super::symbols::read_per_cpu_offsets(
1662 mem, pco_pa, num_cpus,
1663 ))),
1664 }
1665 }
1666}
1667
1668impl BpfMapAccessor for GuestMemMapAccessor<'_> {
1669 /// Enumerate every BPF map. Caches the result for this
1670 /// accessor's lifetime so repeat `maps()` / `find_map(...)`
1671 /// calls within a single dump pay the IDR walk only once.
1672 /// The cache is per-accessor (per-dump), not per-owner, so it
1673 /// cannot return stale entries for maps the guest kernel
1674 /// created / destroyed between dumps.
1675 fn maps(&self) -> Vec<BpfMapInfo> {
1676 let mut guard = self.maps_cache.lock_unpoisoned();
1677 if let Some(cached) = guard.as_ref() {
1678 return (**cached).clone();
1679 }
1680 let maps = find_all_bpf_maps(&self.ctx(), self.map_idr_kva);
1681 let arc = std::sync::Arc::new(maps);
1682 let out = (*arc).clone();
1683 *guard = Some(arc);
1684 out
1685 }
1686
1687 /// Find the first BPF map whose name ends with `name_suffix`.
1688 /// Override the trait's default `self.maps().into_iter()` impl
1689 /// so the cache lookup amortizes across `find_map(...)` calls
1690 /// within one dump. Without this override, every `find_map`
1691 /// returned a clone-and-drop of the full `Vec<BpfMapInfo>`
1692 /// from the cache only to scan it linearly.
1693 fn find_map(&self, name_suffix: &str) -> Option<BpfMapInfo> {
1694 let mut guard = self.maps_cache.lock_unpoisoned();
1695 if guard.is_none() {
1696 *guard = Some(std::sync::Arc::new(find_all_bpf_maps(
1697 &self.ctx(),
1698 self.map_idr_kva,
1699 )));
1700 }
1701 guard
1702 .as_ref()
1703 .unwrap()
1704 .iter()
1705 .find(|m| m.name().ends_with(name_suffix))
1706 .cloned()
1707 }
1708
1709 fn read_value(&self, map: &BpfMapInfo, offset: usize, len: usize) -> Option<Vec<u8>> {
1710 read_bpf_map_value(&self.ctx(), map, offset, len)
1711 }
1712
1713 fn read_array(&self, map: &BpfMapInfo, key: u32) -> Option<Vec<u8>> {
1714 read_bpf_map_array_value(&self.ctx(), map, key)
1715 }
1716
1717 fn iter_hash_map(&self, map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)> {
1718 iter_htab_entries(&self.ctx(), map)
1719 }
1720
1721 /// Read per-CPU values for a key in a `BPF_MAP_TYPE_PERCPU_ARRAY` map.
1722 ///
1723 /// Resolves `__per_cpu_offset` from the guest kernel (via the
1724 /// owner-side cache when present, otherwise fresh) and reads
1725 /// each CPU's slot via [`translate_any_kva`]. Out-of-range CPUs
1726 /// (those whose `__per_cpu_offset` slot reads as zero —
1727 /// including reads past the end of guest memory and BSS-zero
1728 /// slots beyond `nr_cpu_ids`) return `None` rather than
1729 /// aliasing CPU 0's bytes; see the cpu_off==0 guard in
1730 /// [`read_percpu_array_value`].
1731 fn read_percpu_array(&self, map: &BpfMapInfo, key: u32, num_cpus: u32) -> Vec<Option<Vec<u8>>> {
1732 let Some(per_cpu_offsets) = self.resolve_per_cpu_offsets(num_cpus) else {
1733 return Vec::new();
1734 };
1735 read_percpu_array_value(&self.ctx(), map, key, per_cpu_offsets.as_slice())
1736 }
1737
1738 /// Walk a `BPF_MAP_TYPE_PERCPU_HASH` or
1739 /// `BPF_MAP_TYPE_LRU_PERCPU_HASH` map, dereferencing each
1740 /// element's per-CPU pointer for every CPU.
1741 ///
1742 /// Reuses the same `__per_cpu_offset` resolution path as
1743 /// [`Self::read_percpu_array`].
1744 fn iter_percpu_hash_map(&self, map: &BpfMapInfo, num_cpus: u32) -> PerCpuHashEntries {
1745 let Some(per_cpu_offsets) = self.resolve_per_cpu_offsets(num_cpus) else {
1746 return Vec::new();
1747 };
1748 iter_percpu_htab_entries(&self.ctx(), map, per_cpu_offsets.as_slice())
1749 }
1750
1751 fn read_arena_pages(
1752 &self,
1753 map: &BpfMapInfo,
1754 arena_offsets: &super::arena::BpfArenaOffsets,
1755 ) -> super::arena::ArenaSnapshot {
1756 super::arena::snapshot_arena(self.kernel, map, arena_offsets)
1757 }
1758
1759 /// Walk every selem of a TASK_STORAGE / INODE_STORAGE /
1760 /// SK_STORAGE / CGRP_STORAGE map. Returns
1761 /// `(owner_kva_le_bytes, value_bytes)` per entry — see
1762 /// [`iter_local_storage_entries`] for the kernel-side walk
1763 /// shape (`bpf_local_storage_map.buckets[i].list` — regular
1764 /// hlist, NULL termination — with `map_node` at offset 0 of the
1765 /// elem, so the node KVA is the elem base and no `container_of`
1766 /// subtraction is needed).
1767 fn iter_task_storage(&self, map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)> {
1768 iter_local_storage_entries(&self.ctx(), map)
1769 }
1770
1771 fn load_program_btf(&self, map: &BpfMapInfo, base_btf: &btf_rs::Btf) -> Option<btf_rs::Btf> {
1772 if map.btf_kva == 0 {
1773 return None;
1774 }
1775 super::dump::load_program_btf_kva(self, map.btf_kva, base_btf)
1776 }
1777}
1778
/// Owns a [`GuestKernel`] and provides BPF map access through the
/// [`GuestMemMapAccessor`] borrow.
///
/// Built by [`GuestMemMapAccessorOwned::new`] (which reads and parses
/// the vmlinux itself) or by the `from_elf*` constructors (which take
/// an already parsed ELF). Borrow as [`GuestMemMapAccessor`] via
/// [`as_accessor`](Self::as_accessor) for map operations.
///
/// [`GuestKernel`]: super::guest::GuestKernel
pub struct GuestMemMapAccessorOwned {
    /// Guest kernel handle lent to every accessor.
    kernel: super::guest::GuestKernel,
    /// Kernel virtual address of the `map_idr` symbol, resolved once
    /// at construction and copied into each accessor.
    map_idr_kva: u64,
    /// BPF map struct offsets lent to every accessor.
    offsets: BpfMapOffsets,
    /// Single-slot `__per_cpu_offset[]` cache keyed on the
    /// `num_cpus` argument the trait's percpu methods pass. See
    /// [`PerCpuOffsetsCache`] for the contract.
    per_cpu_offsets_cache: PerCpuOffsetsCache,
}
1796
1797#[allow(dead_code)]
1798impl GuestMemMapAccessorOwned {
1799 /// Create from GuestMem and vmlinux path.
1800 ///
1801 /// One-shot constructor: builds a [`GuestKernel`] from `vmlinux`,
1802 /// parses BTF to resolve the map-related struct offsets, and
1803 /// locates the `map_idr` symbol. The resulting handle owns both
1804 /// the `GuestKernel` and the `BpfMapOffsets`.
1805 ///
1806 /// Prefer [`GuestMemMapAccessor::from_guest_kernel`] when you already
1807 /// hold a `GuestKernel` **and** a pre-built `&BpfMapOffsets` — it
1808 /// builds a borrowed accessor without taking ownership of either,
1809 /// so callers that maintain their own offsets cache (e.g. across
1810 /// multiple map probes in the same poll cycle) don't pay repeat
1811 /// BTF parses. `new` is the convenience path when you want the
1812 /// accessor to own its offsets.
1813 ///
1814 /// [`GuestKernel`]: super::guest::GuestKernel
1815 pub fn new(
1816 mem: std::sync::Arc<GuestMem>,
1817 vmlinux: &std::path::Path,
1818 tcr_el1: u64,
1819 cr3_pa: u64,
1820 ) -> anyhow::Result<Self> {
1821 // Read the vmlinux file and parse its ELF once, then share
1822 // the parse between `GuestKernel::from_elf` (kernel symbols
1823 // + paging state) and `BpfMapOffsets::from_elf` (BTF section
1824 // extraction on sidecar cache miss). The previous structure
1825 // ran `std::fs::read` and `goblin::elf::Elf::parse` twice —
1826 // once inside `GuestKernel::new` and once again inside
1827 // `BpfMapOffsets::from_vmlinux` — and the freeze coordinator
1828 // calls this in a retry loop until the boot-time symbols
1829 // settle, multiplying that cost across every retry tick.
1830 let data = std::fs::read(vmlinux)
1831 .with_context(|| format!("read vmlinux: {}", vmlinux.display()))?;
1832 let elf = goblin::elf::Elf::parse(&data).context("parse vmlinux ELF")?;
1833 let kernel = super::guest::GuestKernel::from_elf(mem, &elf, tcr_el1, cr3_pa)?;
1834 let offsets = BpfMapOffsets::from_elf(&elf, &data, vmlinux)?;
1835
1836 let map_idr_kva = kernel
1837 .symbol_kva("map_idr")
1838 .ok_or_else(|| anyhow::anyhow!("map_idr symbol not found in vmlinux"))?;
1839
1840 Ok(Self {
1841 kernel,
1842 map_idr_kva,
1843 offsets,
1844 per_cpu_offsets_cache: PerCpuOffsetsCache::new(),
1845 })
1846 }
1847
1848 /// Create from pre-read vmlinux bytes and pre-parsed ELF.
1849 ///
1850 /// Avoids re-reading + re-parsing vmlinux on every retry in
1851 /// the freeze coordinator's BPF map write loop.
1852 pub fn from_elf(
1853 mem: std::sync::Arc<GuestMem>,
1854 elf: &goblin::elf::Elf<'_>,
1855 data: &[u8],
1856 vmlinux: &std::path::Path,
1857 tcr_el1: u64,
1858 cr3_pa: u64,
1859 ) -> anyhow::Result<Self> {
1860 Self::from_elf_inner(mem, elf, data, vmlinux, tcr_el1, cr3_pa, 0)
1861 }
1862
1863 pub fn from_elf_with_hint(
1864 mem: std::sync::Arc<GuestMem>,
1865 elf: &goblin::elf::Elf<'_>,
1866 data: &[u8],
1867 vmlinux: &std::path::Path,
1868 tcr_el1: u64,
1869 cr3_pa: u64,
1870 phys_base_hint: u64,
1871 ) -> anyhow::Result<Self> {
1872 Self::from_elf_inner(mem, elf, data, vmlinux, tcr_el1, cr3_pa, phys_base_hint)
1873 }
1874
1875 fn from_elf_inner(
1876 mem: std::sync::Arc<GuestMem>,
1877 elf: &goblin::elf::Elf<'_>,
1878 data: &[u8],
1879 vmlinux: &std::path::Path,
1880 tcr_el1: u64,
1881 cr3_pa: u64,
1882 phys_base_hint: u64,
1883 ) -> anyhow::Result<Self> {
1884 let kernel = super::guest::GuestKernel::from_elf_with_hint(
1885 mem,
1886 elf,
1887 tcr_el1,
1888 cr3_pa,
1889 phys_base_hint,
1890 )?;
1891 let offsets = BpfMapOffsets::from_elf(elf, data, vmlinux)?;
1892 let map_idr_kva = kernel
1893 .symbol_kva("map_idr")
1894 .ok_or_else(|| anyhow::anyhow!("map_idr symbol not found in vmlinux"))?;
1895 Ok(Self {
1896 kernel,
1897 map_idr_kva,
1898 offsets,
1899 per_cpu_offsets_cache: PerCpuOffsetsCache::new(),
1900 })
1901 }
1902
1903 /// Borrow as a [`GuestMemMapAccessor`] for map operations.
1904 ///
1905 /// The returned accessor borrows `self.offsets` and the
1906 /// `__per_cpu_offset` cache; no clone on the hot path. Subsequent
1907 /// borrows reuse the cached `__per_cpu_offset` array across
1908 /// percpu reads in the same dump.
1909 ///
1910 /// The map enumeration cache (`maps_cache`) is freshly
1911 /// initialised on each `as_accessor()` call so the cached
1912 /// `Vec<BpfMapInfo>` lifetime matches one dump. Persisting it
1913 /// across dumps would return stale entries for maps the guest
1914 /// kernel created or destroyed between freeze cycles.
1915 pub fn as_accessor(&self) -> GuestMemMapAccessor<'_> {
1916 GuestMemMapAccessor {
1917 kernel: &self.kernel,
1918 map_idr_kva: self.map_idr_kva,
1919 offsets: &self.offsets,
1920 per_cpu_offsets_cache: Some(&self.per_cpu_offsets_cache),
1921 maps_cache: std::sync::Mutex::new(None),
1922 }
1923 }
1924
1925 /// Access the underlying [`GuestKernel`] for low-level memory reads.
1926 ///
1927 /// [`GuestKernel`]: super::guest::GuestKernel
1928 pub fn guest_kernel(&self) -> &super::guest::GuestKernel {
1929 &self.kernel
1930 }
1931
1932 /// Build an owned accessor around a test-constructed
1933 /// [`GuestKernel`] without parsing a vmlinux. The map offsets are
1934 /// [`BpfMapOffsets::EMPTY`] and `map_idr_kva` is `0`, so this is
1935 /// only usable by callers that touch the `GuestKernel`
1936 /// (`page_offset`, symbol lookup, raw reads) and never walk the
1937 /// map IDR — e.g. [`crate::vmm::capture_scx::build`], which reads
1938 /// only `guest_kernel().page_offset()`. Production must use
1939 /// [`Self::new`] / [`Self::from_elf`].
1940 #[cfg(test)]
1941 pub(crate) fn new_for_test(kernel: super::guest::GuestKernel) -> Self {
1942 Self {
1943 kernel,
1944 map_idr_kva: 0,
1945 offsets: BpfMapOffsets::EMPTY,
1946 per_cpu_offsets_cache: PerCpuOffsetsCache::new(),
1947 }
1948 }
1949
1950 // Map operations live on [`GuestMemMapAccessor`]. Borrow via
1951 // [`as_accessor`] to call them: `owned.as_accessor().find_map(...)`.
1952 // The wrapper type exists only to own the `GuestKernel` and
1953 // `BpfMapOffsets`; it does not duplicate the accessor's surface.
1954}