ktstr/monitor/bpf_map/
mod.rs

1//! Host-side BPF map discovery, read/write, and iteration via guest physical memory.
2//!
3//! Walks the kernel's `map_idr` xarray from the host, finds a BPF map
4//! by name suffix, and provides read/write access to the map's value
5//! region. No guest cooperation is needed — all reads go through the
6//! guest physical memory mapping.
7//!
8//! Address translation strategy:
//! - `map_idr` is a kernel-image data symbol (`DEFINE_IDR` is a
//!   non-zero static initializer, so it is not placed in BSS): use
//!   `text_kva_to_pa_with_base`
//!   (or [`super::guest::GuestKernel::text_kva_to_pa`]).
11//! - xa_node structs are SLAB-allocated (direct mapping): use `kva_to_pa`.
12//! - bpf_map/bpf_array may be kmalloc'd or vmalloc'd: use `translate_any_kva`.
13//! - .bss value region is vmalloc'd: use `translate_kva`.
14//! - Per-CPU values (`PERCPU_ARRAY` / `PERCPU_HASH`) live in dynamic
15//!   per-CPU memory — the embedded first chunk is in the direct
16//!   mapping, larger allocations are vmalloc'd. Add
17//!   `__per_cpu_offset[cpu]` to the `__percpu` base and read the value
18//!   page-by-page via `read_percpu_value_bytes` (`translate_any_kva`).
19
20use crate::sync::MutexExt;
21use anyhow::Context;
22
23use super::btf_offsets::BpfMapOffsets;
24use super::idr::{translate_any_kva, xa_load};
25use super::reader::GuestMem;
26use super::symbols::text_kva_to_pa_with_base;
27use super::{Cr3Pa, Kva, PageOffset};
28
29mod htab;
30mod local_storage;
31#[cfg(test)]
32mod tests;
33use htab::{iter_htab_entries, iter_percpu_htab_entries};
34use local_storage::iter_local_storage_entries;
35
36/// Per-element row from a percpu-hash iteration: `(key_bytes,
37/// per_cpu_values)` where `per_cpu_values[cpu]` is `Some(value_bytes)`
38/// when the per-CPU slot is readable and `None` when the page is
39/// unmapped or the CPU index is out of range. Returned by
40/// [`BpfMapAccessor::iter_percpu_hash_map`] and the underlying walker
41/// helpers in [`htab`].
42pub(crate) type PerCpuHashEntries = Vec<(Vec<u8>, Vec<Option<Vec<u8>>>)>;
43
44/// Maximum chain-element visits per map walk — the production value of
45/// [`AccessorCtx::iter_max`]. A corrupted `next` pointer that forms a
46/// cycle is bounded here so a hash-bucket or local-storage walk can't
47/// spin the freeze hot path until the rendezvous timeout. Shared by
48/// the bucket walker `htab::walk_htab` and the local-storage walker
49/// `local_storage::iter_local_storage_entries`.
50pub(crate) const MAP_WALK_ITER_MAX: usize = 1_000_000;
51
52/// Maximum chain entries a walker materializes into its result `Vec`
53/// before the renderer's per-map `.take`. One-past
54/// (`out.len() > MAP_MATERIALIZE_MAX`) so the renderer's
55/// `len > MAX_HASH_ENTRIES` truncation check still fires on a truncated
56/// map. Without it the guest-memory walker materializes up to
57/// [`MAP_WALK_ITER_MAX`] entries (×num_cpus for per-CPU values) before
58/// the renderer truncates to 4096 — a freeze-hot-path memory spike.
59/// `dump::render_map::MAX_HASH_ENTRIES` aliases this so the render cap
60/// and the walker materialize cap are one value.
61pub(crate) const MAP_MATERIALIZE_MAX: usize = 4096;
62
/// Bundle of borrow-held state every map-access routine threads
/// through the page-table walk, bounds check, and byte read/write path.
///
/// Every free function in this module previously took the same fan
/// of arguments — `mem`, `cr3_pa`, `page_offset`, `offsets`, `l5`
/// (some also took `map_idr_kva`) — and callers invariably forwarded
/// the same fields from their [`GuestMemMapAccessor`] because they
/// all originate on the accessor. Grouping them here drops the
/// duplication and lets additional shared context (per-CPU offset
/// cache, BTF cache, etc.) ride the same lifetime without touching
/// every signature. `cr3_pa` and `page_offset` are newtyped so the
/// page-walker can't silently swap them at a call site.
pub(crate) struct AccessorCtx<'a> {
    /// Guest physical memory handle; every scalar and bulk read in
    /// this module goes through it.
    pub mem: &'a GuestMem,
    /// Forwarded to `translate_any_kva` for page-table translation.
    /// NOTE(review): presumably the physical address of the guest's
    /// top-level page table — confirm against [`Cr3Pa`].
    pub cr3_pa: Cr3Pa,
    /// Forwarded to `xa_load` and `translate_any_kva`.
    /// NOTE(review): presumably the guest's direct-mapping base —
    /// confirm against [`PageOffset`].
    pub page_offset: PageOffset,
    /// Field offsets used to locate `struct bpf_map`, `idr`, and
    /// `xa_node` members inside guest memory.
    pub offsets: &'a BpfMapOffsets,
    /// Forwarded to `translate_any_kva`.
    /// NOTE(review): presumably selects 5-level paging — confirm
    /// against the page-table walker.
    pub l5: bool,
    /// Cached TCR_EL1 register; drives the aarch64 page-table walker's
    /// granule selection. Always 0 on x86_64 (the walker ignores it).
    pub tcr_el1: u64,
    /// Runtime kernel image base (`__START_KERNEL_map` on x86_64,
    /// `KIMAGE_VADDR` on aarch64). Used for translating
    /// kernel-text/data symbols (e.g. `map_idr`) to physical
    /// addresses. Mirrors [`super::guest::GuestKernel::start_kernel_map`].
    pub start_kernel_map: u64,
    /// Runtime KASLR offset (`phys_base` on x86_64; `0` on aarch64
    /// and on non-KASLR x86_64 boots). Threaded through every
    /// `text_kva_to_pa_with_base` call so KASLR'd kernels resolve
    /// text/data symbols correctly. See
    /// [`super::guest::GuestKernel::phys_base`].
    pub phys_base: u64,
    /// Maximum chain-element visits per map walk (hash buckets,
    /// local-storage chains). A corrupted `next` pointer that loops
    /// back into a chain would otherwise spin the walker on the freeze
    /// hot path; this bounds the walk. Production sets
    /// [`MAP_WALK_ITER_MAX`]; tests override it with a small value to
    /// exercise the cap cheaply.
    pub iter_max: usize,
}
103
104// Map type discriminants from `enum bpf_map_type` in
105// `include/uapi/linux/bpf.h`. Kept as flat `pub const u32` rather
106// than a Rust enum so a kernel that adds a new map type past this
107// list still surfaces as a numeric `map_type` on the
108// [`BpfMapInfo`] / [`super::dump::FailureDumpMap`] wire format —
109// the dump renderer falls through to a generic
110// "unknown map type {n}" arm rather than failing to deserialize.
111
112/// `BPF_MAP_TYPE_HASH` — generic hash table. Inline value bytes at
113/// `htab_elem_value` (`key + round_up(key_size, 8)`).
114pub const BPF_MAP_TYPE_HASH: u32 = 1;
115
116/// `BPF_MAP_TYPE_ARRAY` — fixed-size array of values. Inline values
117/// at the `bpf_array.value` flex array.
118pub const BPF_MAP_TYPE_ARRAY: u32 = 2;
119
120/// `BPF_MAP_TYPE_PROG_ARRAY` — array of `struct bpf_prog *` slots
121/// used by `bpf_tail_call`. Userspace-visible value is a program fd
122/// (or its kernel pointer); the underlying program is not data.
123pub const BPF_MAP_TYPE_PROG_ARRAY: u32 = 3;
124
125/// `BPF_MAP_TYPE_PERF_EVENT_ARRAY` — array of perf event fds. Same
126/// shape as `PROG_ARRAY` but stores perf event references.
127pub const BPF_MAP_TYPE_PERF_EVENT_ARRAY: u32 = 4;
128
129/// `BPF_MAP_TYPE_PERCPU_HASH` — like `HASH` but value is a
130/// `void __percpu *` resolved per-CPU via `__per_cpu_offset[cpu]`.
131pub const BPF_MAP_TYPE_PERCPU_HASH: u32 = 5;
132
133/// `BPF_MAP_TYPE_PERCPU_ARRAY` — like `ARRAY` but each slot is a
134/// `void __percpu *` resolved per-CPU.
135pub const BPF_MAP_TYPE_PERCPU_ARRAY: u32 = 6;
136
137/// `BPF_MAP_TYPE_STACK_TRACE` — kernel-side stack trace storage
138/// keyed by stackid. Values are transient (cleared after read by
139/// `bpf_get_stackid`); not a persistent state surface.
140pub const BPF_MAP_TYPE_STACK_TRACE: u32 = 7;
141
142/// `BPF_MAP_TYPE_CGROUP_ARRAY` — array of cgroup fds. FD-array shape
143/// like `PROG_ARRAY`.
144pub const BPF_MAP_TYPE_CGROUP_ARRAY: u32 = 8;
145
146/// `BPF_MAP_TYPE_LRU_HASH` — `HASH` plus LRU eviction. Value layout
147/// identical to `HASH` (inline value bytes); `htab_elem` carries
148/// `lru_node` in the same union slot as `ptr_to_pptr`.
149pub const BPF_MAP_TYPE_LRU_HASH: u32 = 9;
150
151/// `BPF_MAP_TYPE_LRU_PERCPU_HASH` — `PERCPU_HASH` plus LRU eviction.
152/// Same value-position-is-percpu-pointer layout as `PERCPU_HASH`.
153pub const BPF_MAP_TYPE_LRU_PERCPU_HASH: u32 = 10;
154
155/// `BPF_MAP_TYPE_LPM_TRIE` — longest-prefix-match trie. Keyed by
156/// (prefixlen, data); values are bytes. Iteration requires the
157/// trie's per-node walk, not provided here.
158pub const BPF_MAP_TYPE_LPM_TRIE: u32 = 11;
159
160/// `BPF_MAP_TYPE_ARRAY_OF_MAPS` — array slots store map fds.
161pub const BPF_MAP_TYPE_ARRAY_OF_MAPS: u32 = 12;
162
163/// `BPF_MAP_TYPE_HASH_OF_MAPS` — hash slots store map fds.
164pub const BPF_MAP_TYPE_HASH_OF_MAPS: u32 = 13;
165
166/// `BPF_MAP_TYPE_DEVMAP` — array of net_device fds for XDP
167/// redirection.
168pub const BPF_MAP_TYPE_DEVMAP: u32 = 14;
169
170/// `BPF_MAP_TYPE_SOCKMAP` — array of socket fds.
171pub const BPF_MAP_TYPE_SOCKMAP: u32 = 15;
172
173/// `BPF_MAP_TYPE_CPUMAP` — array of cpumap entries for XDP
174/// redirection.
175pub const BPF_MAP_TYPE_CPUMAP: u32 = 16;
176
177/// `BPF_MAP_TYPE_XSKMAP` — array of AF_XDP socket fds.
178pub const BPF_MAP_TYPE_XSKMAP: u32 = 17;
179
180/// `BPF_MAP_TYPE_SOCKHASH` — hash of socket fds.
181pub const BPF_MAP_TYPE_SOCKHASH: u32 = 18;
182
183/// `BPF_MAP_TYPE_CGROUP_STORAGE` — deprecated cgroup-attached
184/// storage. Replaced by `CGRP_STORAGE`. Reading requires the
185/// cgroup context the program was attached to.
186pub const BPF_MAP_TYPE_CGROUP_STORAGE: u32 = 19;
187
188/// `BPF_MAP_TYPE_REUSEPORT_SOCKARRAY` — array of SO_REUSEPORT
189/// socket fds.
190pub const BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: u32 = 20;
191
192/// `BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE` — deprecated per-CPU
193/// cgroup-attached storage.
194pub const BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: u32 = 21;
195
196/// `BPF_MAP_TYPE_QUEUE` — FIFO queue (no key). Values are popped
197/// destructively by `bpf_map_pop_elem`.
198pub const BPF_MAP_TYPE_QUEUE: u32 = 22;
199
200/// `BPF_MAP_TYPE_STACK` — LIFO stack (no key). Same destructive
201/// pop semantics as `QUEUE`.
202pub const BPF_MAP_TYPE_STACK: u32 = 23;
203
204/// `BPF_MAP_TYPE_SK_STORAGE` — per-socket storage. Reading requires
205/// iterating sockets, not a flat key/value walk.
206pub const BPF_MAP_TYPE_SK_STORAGE: u32 = 24;
207
208/// `BPF_MAP_TYPE_DEVMAP_HASH` — hash of net_device fds.
209pub const BPF_MAP_TYPE_DEVMAP_HASH: u32 = 25;
210
211/// `BPF_MAP_TYPE_STRUCT_OPS` — kernel struct table (e.g.
212/// `tcp_congestion_ops`, `sched_ext_ops`). The map holds a single
213/// `bpf_struct_ops_value` whose `data` field is the registered
214/// kernel struct. `lookup_elem` returns `-EINVAL`; the live-host
215/// path reads via `BPF_MAP_LOOKUP_ELEM` at key=0 anyway because the
216/// kernel's syscall ABI does the read for `STRUCT_OPS` maps.
217pub const BPF_MAP_TYPE_STRUCT_OPS: u32 = 26;
218
/// `BPF_MAP_TYPE_RINGBUF` — multi-producer/single-consumer ring
/// buffer for streaming events. No key/value access; userspace
/// consumers poll via libbpf's `ring_buffer__poll`.
222pub const BPF_MAP_TYPE_RINGBUF: u32 = 27;
223
224/// `BPF_MAP_TYPE_INODE_STORAGE` — per-inode storage. Reading
225/// requires iterating inodes.
226pub const BPF_MAP_TYPE_INODE_STORAGE: u32 = 28;
227
228/// `BPF_MAP_TYPE_TASK_STORAGE` — per-task storage. Reading
229/// requires iterating tasks.
230pub const BPF_MAP_TYPE_TASK_STORAGE: u32 = 29;
231
232/// `BPF_MAP_TYPE_BLOOM_FILTER` — probabilistic set membership.
233/// No key enumeration — only `bpf_map_peek_elem` returns whether
234/// a probe value is "maybe present".
235pub const BPF_MAP_TYPE_BLOOM_FILTER: u32 = 30;
236
237/// `BPF_MAP_TYPE_USER_RINGBUF` — userspace producer / BPF
238/// consumer ring buffer. Same transient nature as `RINGBUF`.
239pub const BPF_MAP_TYPE_USER_RINGBUF: u32 = 31;
240
241/// `BPF_MAP_TYPE_CGRP_STORAGE` — per-cgroup storage (replaces
242/// `CGROUP_STORAGE`). Reading requires iterating cgroups.
243pub const BPF_MAP_TYPE_CGRP_STORAGE: u32 = 32;
244
245/// `BPF_MAP_TYPE_ARENA` — sparse, page-granular memory region
246/// shared between BPF programs and userspace. The host-side
247/// walker for arena pages lives in `super::arena`.
248pub const BPF_MAP_TYPE_ARENA: u32 = 33;
249
250/// `BPF_MAP_TYPE_INSN_ARRAY` — array of bpf instructions used by
251/// the verifier for indirect-jump targets. Values are kernel-side
252/// program references, not application data.
253pub const BPF_MAP_TYPE_INSN_ARRAY: u32 = 34;
254
/// Size in bytes of the kernel's `bpf_map.name` field
/// (`BPF_OBJ_NAME_LEN` in `include/uapi/linux/bpf.h`).
pub const BPF_OBJ_NAME_LEN: usize = 16;

/// Discovered BPF map metadata and value location.
#[derive(Debug, Clone, Default)]
#[allow(dead_code)]
pub struct BpfMapInfo {
    /// Guest physical address of the `struct bpf_map`.
    pub map_pa: u64,
    /// Guest KVA of the `struct bpf_map` (or containing struct like
    /// `bpf_array`/`bpf_htab`). Needed for hash map iteration to
    /// read `bpf_htab` fields relative to this base.
    pub map_kva: u64,
    /// Map name as raw bytes (kernel `bpf_map.name`), null-padded to
    /// `BPF_OBJ_NAME_LEN`. The active prefix length is in
    /// [`Self::name_len`]; use [`Self::name`] for a `&str` view.
    /// Holding the bytes inline avoids a per-map heap allocation on
    /// the freeze hot path.
    pub name_bytes: [u8; BPF_OBJ_NAME_LEN],
    /// Active byte length of [`Self::name_bytes`]. Offset of the
    /// first NUL byte; `BPF_OBJ_NAME_LEN` is the upper bound but
    /// every kernel-registered map name is NUL-terminated within the
    /// `BPF_OBJ_NAME_LEN`-byte field. The kernel's
    /// `bpf_obj_name_cpy` (kernel/bpf/syscall.c) memsets the
    /// destination to zero before copying and rejects names whose
    /// source bytes fill the field without a NUL — see the
    /// `if (src == end) return -EINVAL;` guard. So `name_len` is
    /// strictly less than `BPF_OBJ_NAME_LEN` in practice; the
    /// `unwrap_or(BPF_OBJ_NAME_LEN)` fallback in `find_all_bpf_maps`
    /// is defense-in-depth against a corrupted guest read, not a
    /// shape the kernel itself produces.
    ///
    /// The field is public, so nothing stops a producer from storing
    /// a value above `BPF_OBJ_NAME_LEN`; [`Self::name_bytes_active`]
    /// clamps rather than trusting it.
    pub name_len: u8,
    /// `map_type` field value.
    pub map_type: u32,
    /// `map_flags` field value.
    pub map_flags: u32,
    /// `key_size` field value.
    pub key_size: u32,
    /// `value_size` field value — the size of ONE entry's value.
    /// For `BPF_MAP_TYPE_ARRAY` the kernel's per-entry stride is
    /// `array->elem_size = round_up(value_size, 8)`
    /// (kernel/bpf/arraymap.c:93) and the value region spans
    /// `max_entries * elem_size`; a multi-entry ARRAY is read one
    /// entry at a time via [`BpfMapAccessor::read_array`], not as a
    /// single `value_size`-byte buffer.
    pub value_size: u32,
    /// `max_entries` field value.
    pub max_entries: u32,
    /// Guest KVA of the map's value region (entry 0). `Some(kva)`
    /// when the renderer can read an entry starting at this address;
    /// `None` when the map type requires a different walker (hash
    /// iteration, arena page snapshot, …) or the kva resolution
    /// failed.
    ///
    /// Populated for:
    /// * `BPF_MAP_TYPE_ARRAY` — points at `bpf_array.value` (the
    ///   inline flex array, entry 0). A single-entry ARRAY
    ///   (`max_entries <= 1`, incl. `.bss`/`.data`/`.rodata`) reads
    ///   `value_size` bytes via [`BpfMapAccessor::read_value`]; a
    ///   multi-entry ARRAY reads entry `k` at
    ///   `value_kva + k * round_up(value_size, 8)` via
    ///   [`BpfMapAccessor::read_array`].
    /// * `BPF_MAP_TYPE_STRUCT_OPS` — points at `kvalue.data` (the
    ///   embedded registered struct's bytes, after the
    ///   `bpf_struct_ops_common_value` header). Renderer reads
    ///   `value_size - data_off` bytes to match the size of the
    ///   `btf_value_type_id` type, which describes the data payload
    ///   only. `None` when struct_ops BTF offsets are unresolved.
    pub value_kva: Option<u64>,
    /// Guest KVA of the map's `struct btf` (guest-memory backend),
    /// or `btf_id` cast to u64 (live-host backend reading via the
    /// bpf(2) syscall: `BPF_OBJ_GET_INFO_BY_FD` returns `btf_id`,
    /// not a kernel pointer). The dump path treats the value as
    /// opaque — only `btf_kva == 0` is meaningful (no BTF
    /// associated with this map). Backend-specific consumers cast
    /// to the shape they need.
    /// 0 if the map has no BTF.
    pub btf_kva: u64,
    /// BTF type ID for the map's value type. 0 if the map has no BTF.
    pub btf_value_type_id: u32,
    /// BTF type ID for the kernel-side `bpf_struct_ops_<name>`
    /// wrapper in vmlinux BTF, populated for `BPF_MAP_TYPE_STRUCT_OPS`
    /// maps. libbpf zeros `btf_value_type_id` for STRUCT_OPS and
    /// passes the wrapper id via the kernel-only
    /// `btf_vmlinux_value_type_id` field on `struct bpf_map`. The
    /// dump path uses it to BTF-render the data payload by walking
    /// the wrapper's `data` member to the per-ops struct (e.g.
    /// `sched_ext_ops`). Zero on every other map type.
    pub btf_vmlinux_value_type_id: u32,
    /// BTF type ID for the map's key type. 0 when the map's BTF is
    /// missing or the map type does not record a key type id (most
    /// ARRAY-family maps store a synthetic `__u32` key implicitly).
    /// HASH maps populate this so the dump path can render keys via
    /// BTF instead of falling back to hex.
    pub btf_key_type_id: u32,
}

impl BpfMapInfo {
    /// Active name bytes: `&name_bytes[..name_len]`.
    ///
    /// `name_len` is clamped to the size of [`Self::name_bytes`] before
    /// slicing. The field is a public `u8`, so a hand-built or
    /// corrupted `BpfMapInfo` can carry a length past the 16-byte
    /// buffer; clamping returns the whole buffer in that case instead
    /// of panicking inside [`Self::name`] / `Display`.
    pub fn name_bytes_active(&self) -> &[u8] {
        let active_len = usize::from(self.name_len).min(self.name_bytes.len());
        &self.name_bytes[..active_len]
    }

    /// Map name as a `&str` view over [`Self::name_bytes`]. Lossily
    /// renders any non-UTF-8 bytes via `String::from_utf8_lossy`,
    /// allocating only when the active region contains invalid UTF-8.
    /// Most kernel-registered names are ASCII so the common path is
    /// alloc-free.
    pub fn name(&self) -> std::borrow::Cow<'_, str> {
        String::from_utf8_lossy(self.name_bytes_active())
    }
}

/// Renders the map as its (lossily decoded) name.
impl std::fmt::Display for BpfMapInfo {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.name())
    }
}
373
374/// Maximum bytes covered by [`MapMetadata`]'s batched read of
375/// `struct bpf_map`. The struct itself is ~250 bytes on 6.16+
376/// kernels (verified against include/linux/bpf.h `struct bpf_map`),
377/// and every field [`find_all_bpf_maps`] touches falls within this
378/// span. The bound keeps the batched read to a fixed scratch size; if
379/// `map_pa` straddles end-of-DRAM `read_bytes` truncates short, so
380/// any field whose end exceeds the actual copy length falls through
381/// to the scalar read path via `MapMetadata::u32_at` / `u64_at`.
382const MAP_METADATA_SPAN: usize = 384;
383
384/// Cached scratch buffer over `struct bpf_map` bytes, batched into
385/// one [`GuestMem::read_bytes`] call. Replaces ~10 individual
386/// `read_u32`/`read_u64` syscall-equivalents per map per IDR walk
387/// with one bulk copy + local LE parses, paying one bounds check +
388/// region resolve instead of N.
389///
390/// All accessor methods bounds-check the requested offset against
391/// the bytes actually copied; a short copy (e.g. when `map_pa` is
392/// near end-of-DRAM) cleanly falls through to the scalar read path
393/// rather than indexing into uninitialized scratch.
394struct MapMetadata<'a> {
395    mem: &'a GuestMem,
396    map_pa: u64,
397    /// Stack-allocated scratch buffer holding the bulk-read bytes;
398    /// `copied` is the number actually filled by `read_bytes`.
399    buf: [u8; MAP_METADATA_SPAN],
400    copied: usize,
401}
402
403impl<'a> MapMetadata<'a> {
404    /// Issue the bulk `read_bytes` covering enough of `struct bpf_map`
405    /// for every offset [`find_all_bpf_maps`] dereferences.
406    fn read(mem: &'a GuestMem, map_pa: u64, _offsets: &BpfMapOffsets) -> Self {
407        // _offsets is reserved for a future tightening of the read
408        // span: callers could compute the exact `[lo, hi)` from the
409        // BTF-resolved fields rather than the conservative
410        // `MAP_METADATA_SPAN` cap, trimming the bulk read length on
411        // kernels with smaller `struct bpf_map`. Left unused today
412        // because the cap covers every supported layout and the
413        // savings are smaller than the page-table walks the bulk
414        // read replaces.
415        let mut buf = [0u8; MAP_METADATA_SPAN];
416        let copied = mem.read_bytes(map_pa, &mut buf);
417        Self {
418            mem,
419            map_pa,
420            buf,
421            copied,
422        }
423    }
424
425    /// Read a `u32` from the cached buffer. Falls through to the
426    /// scalar `read_u32` path (one volatile read, with its own
427    /// bounds check) when the offset spans past the bulk copy — the
428    /// scalar path returns 0 for out-of-bounds reads, matching the
429    /// pre-batch behaviour bit for bit.
430    fn u32_at(&self, off: usize) -> u32 {
431        if off + 4 <= self.copied {
432            u32::from_ne_bytes(self.buf[off..off + 4].try_into().unwrap())
433        } else {
434            self.mem.read_u32(self.map_pa, off)
435        }
436    }
437
438    /// Read a `u64` from the cached buffer; same fall-through
439    /// contract as [`Self::u32_at`].
440    fn u64_at(&self, off: usize) -> u64 {
441        if off + 8 <= self.copied {
442            u64::from_ne_bytes(self.buf[off..off + 8].try_into().unwrap())
443        } else {
444            self.mem.read_u64(self.map_pa, off)
445        }
446    }
447
448    /// Borrow `BPF_OBJ_NAME_LEN` name bytes at `name_off`. When the
449    /// name straddles end-of-buffer, falls through to a fresh
450    /// `read_bytes` call on the scalar path so the caller never sees
451    /// silent truncation.
452    fn name_bytes(&self, name_off: usize) -> std::borrow::Cow<'_, [u8]> {
453        if name_off + BPF_OBJ_NAME_LEN <= self.copied {
454            std::borrow::Cow::Borrowed(&self.buf[name_off..name_off + BPF_OBJ_NAME_LEN])
455        } else {
456            let mut name_buf = vec![0u8; BPF_OBJ_NAME_LEN];
457            self.mem
458                .read_bytes(self.map_pa + name_off as u64, &mut name_buf);
459            std::borrow::Cow::Owned(name_buf)
460        }
461    }
462}
463
464/// Enumerate all BPF maps in the kernel's `map_idr` xarray.
465///
466/// Returns metadata for every map whose KVA can be translated.
467/// No filtering by type or name — callers select from the result.
468///
469/// `value_kva` is populated for `BPF_MAP_TYPE_ARRAY` (inline
470/// `bpf_array.value`) and `BPF_MAP_TYPE_STRUCT_OPS`
471/// (`kvalue.data` inside `bpf_struct_ops_map`). All other map types
472/// resolve to `None` — they require dedicated walkers
473/// ([`iter_htab_entries`] for HASH, [`super::arena::snapshot_arena`]
474/// for ARENA, …).
475pub(crate) fn find_all_bpf_maps(ctx: &AccessorCtx<'_>, map_idr_kva: u64) -> Vec<BpfMapInfo> {
476    let idr_pa = text_kva_to_pa_with_base(map_idr_kva, ctx.start_kernel_map, ctx.phys_base);
477    let offsets = ctx.offsets;
478
479    let xa_head = ctx.mem.read_u64(idr_pa, offsets.idr_xa_head);
480    if xa_head == 0 {
481        return Vec::new();
482    }
483    let idr_next = ctx.mem.read_u32(idr_pa, offsets.idr_next).min(65536);
484
485    let mut maps = Vec::new();
486
487    for id in 0..idr_next {
488        let Some(entry) = xa_load(
489            ctx.mem,
490            ctx.page_offset.0,
491            xa_head,
492            id as u64,
493            offsets.xa_node_slots,
494            offsets.xa_node_shift,
495        ) else {
496            continue;
497        };
498        if entry == 0 {
499            continue;
500        }
501
502        let Some(map_pa) = translate_any_kva(
503            ctx.mem,
504            ctx.cr3_pa.0,
505            ctx.page_offset.0,
506            entry,
507            ctx.l5,
508            ctx.tcr_el1,
509        ) else {
510            continue;
511        };
512
513        // Batch the metadata reads: one `read_bytes` covering every
514        // field we touch on `struct bpf_map` collapses ~10 individual
515        // volatile scalar reads into one bulk copy + local indexed
516        // parses, saving 9 page-lookup + bounds-check round trips per
517        // map per IDR walk.
518        let meta = MapMetadata::read(ctx.mem, map_pa, offsets);
519
520        let map_type = meta.u32_at(offsets.map_type);
521        let map_flags = meta.u32_at(offsets.map_flags);
522        let key_size = meta.u32_at(offsets.key_size);
523        let value_size = meta.u32_at(offsets.value_size);
524        let max_entries = meta.u32_at(offsets.max_entries);
525
526        let mut name_bytes = [0u8; BPF_OBJ_NAME_LEN];
527        name_bytes.copy_from_slice(&meta.name_bytes(offsets.map_name));
528        let name_len = name_bytes
529            .iter()
530            .position(|&b| b == 0)
531            .unwrap_or(BPF_OBJ_NAME_LEN) as u8;
532
533        // value_kva is the start KVA the renderer reads value bytes
534        // from. Two map types populate it:
535        //
536        // * `BPF_MAP_TYPE_ARRAY`: `bpf_array` embeds `bpf_map` at
537        //   offset 0 and the value flex array is inline at
538        //   `bpf_array.value`.
539        // * `BPF_MAP_TYPE_STRUCT_OPS`: `bpf_struct_ops_map` embeds
540        //   `kvalue` (a `bpf_struct_ops_value`) inline; the registered
541        //   kernel struct lives at `kvalue.data`. `map->btf_value_type_id`
542        //   describes only the data payload, not the prefixing
543        //   `bpf_struct_ops_common_value`, so value_kva points at
544        //   `data` and the renderer reads `value_size - data_off` bytes
545        //   to fit the typed shape.
546        //
547        // Other map types (HASH, RINGBUF, ARENA, …) have no contiguous
548        // value region the renderer can read with a single offset/len
549        // pair — they use dedicated walkers (`iter_hash_map`,
550        // `read_arena_pages`, …).
551        let value_kva = match map_type {
552            BPF_MAP_TYPE_ARRAY => Some(entry + offsets.array_value as u64),
553            BPF_MAP_TYPE_STRUCT_OPS => offsets
554                .struct_ops_offsets
555                .as_ref()
556                .map(|so| entry + so.kvalue as u64 + so.value_data as u64),
557            _ => None,
558        };
559
560        let btf_kva = meta.u64_at(offsets.map_btf);
561        let btf_value_type_id = meta.u32_at(offsets.map_btf_value_type_id);
562        // `btf_vmlinux_value_type_id` lives at offset 0 only when the
563        // resolver couldn't locate the field (kernel built without
564        // CONFIG_BPF_JIT). Treat offset 0 as "unresolved" — reading
565        // u32 at offset 0 of `struct bpf_map` would alias `map_type`,
566        // which is decidedly NOT a btf type id. The STRUCT_OPS arm
567        // checks for non-zero before using.
568        let btf_vmlinux_value_type_id = if offsets.map_btf_vmlinux_value_type_id != 0 {
569            meta.u32_at(offsets.map_btf_vmlinux_value_type_id)
570        } else {
571            0
572        };
573        let btf_key_type_id = meta.u32_at(offsets.map_btf_key_type_id);
574
575        maps.push(BpfMapInfo {
576            map_pa,
577            map_kva: entry,
578            name_bytes,
579            name_len,
580            map_type,
581            map_flags,
582            key_size,
583            value_size,
584            max_entries,
585            value_kva,
586            btf_kva,
587            btf_value_type_id,
588            btf_vmlinux_value_type_id,
589            btf_key_type_id,
590        });
591    }
592
593    maps
594}
595
596/// Find the first BPF ARRAY map whose name ends with `name_suffix`.
597///
598/// Only returns `BPF_MAP_TYPE_ARRAY` maps. Use [`find_all_bpf_maps`]
599/// to enumerate maps of all types.
600///
601/// Walks the IDR directly and short-circuits on `map_type` for each
602/// candidate before reading the rest of the struct fields: an
603/// allocator-fast path that skips ~10 scalar reads + the
604/// `BPF_OBJ_NAME_LEN` name copy on every non-ARRAY map. The kernel
605/// typically registers many non-ARRAY maps (HASH, LRU_HASH,
606/// PERCPU_HASH, RINGBUF, ARENA, …) alongside the small set of ARRAY
607/// maps the failure-dump renderer reaches through, so the savings
608/// are proportional to the reject rate.
609// Production callers go through [`GuestMemMapAccessor::find_array_map`] /
610// [`BpfMapAccessor::maps`]; this single-shot variant is preserved
611// for the `bpf_map::tests` suite that exercises the IDR walk
612// directly.
613#[allow(dead_code)]
614pub(crate) fn find_bpf_map(
615    ctx: &AccessorCtx<'_>,
616    map_idr_kva: u64,
617    name_suffix: &str,
618) -> Option<BpfMapInfo> {
619    let idr_pa = text_kva_to_pa_with_base(map_idr_kva, ctx.start_kernel_map, ctx.phys_base);
620    let offsets = ctx.offsets;
621
622    let xa_head = ctx.mem.read_u64(idr_pa, offsets.idr_xa_head);
623    if xa_head == 0 {
624        return None;
625    }
626    let idr_next = ctx.mem.read_u32(idr_pa, offsets.idr_next).min(65536);
627
628    for id in 0..idr_next {
629        let Some(entry) = xa_load(
630            ctx.mem,
631            ctx.page_offset.0,
632            xa_head,
633            id as u64,
634            offsets.xa_node_slots,
635            offsets.xa_node_shift,
636        ) else {
637            continue;
638        };
639        if entry == 0 {
640            continue;
641        }
642
643        let Some(map_pa) = translate_any_kva(
644            ctx.mem,
645            ctx.cr3_pa.0,
646            ctx.page_offset.0,
647            entry,
648            ctx.l5,
649            ctx.tcr_el1,
650        ) else {
651            continue;
652        };
653
654        // Short-circuit on map_type before reading the rest of the
655        // struct: a non-ARRAY map is rejected with one volatile u32
656        // read instead of the bulk `read_bytes` over `MAP_METADATA_SPAN`
657        // bytes plus the [`BPF_OBJ_NAME_LEN`] name copy that the full
658        // metadata batch would do.
659        //
660        // [`find_bpf_map`] is reached only by direct callers (tests
661        // today; future single-shot probes that don't want to pay the
662        // [`find_all_bpf_maps`] IDR walk). The freeze hot path in
663        // production goes through [`GuestMemMapAccessor::find_array_map`] /
664        // [`BpfMapAccessor::maps`], which build and consult the
665        // per-accessor [`maps_cache`] populated by
666        // [`find_all_bpf_maps`]; that path does the full bulk read
667        // for every map and the type filter happens during the
668        // post-walk scan, not here.
669        let map_type = ctx.mem.read_u32(map_pa, offsets.map_type);
670        if map_type != BPF_MAP_TYPE_ARRAY {
671            continue;
672        }
673
674        // Wrong-name candidates also bail before the heavy reads.
675        let mut name_buf = [0u8; BPF_OBJ_NAME_LEN];
676        ctx.mem
677            .read_bytes(map_pa + offsets.map_name as u64, &mut name_buf);
678        let name_len = name_buf
679            .iter()
680            .position(|&b| b == 0)
681            .unwrap_or(BPF_OBJ_NAME_LEN);
682        let name_str = std::str::from_utf8(&name_buf[..name_len]).unwrap_or("");
683        if !name_str.ends_with(name_suffix) {
684            continue;
685        }
686
687        // Match: now do the full batched metadata read for the
688        // remaining fields and build the BpfMapInfo.
689        let meta = MapMetadata::read(ctx.mem, map_pa, offsets);
690        // `name_buf` is already `[u8; BPF_OBJ_NAME_LEN]` and was
691        // populated by the pre-batch name read above; reuse it
692        // directly instead of round-tripping through a fresh
693        // zero-init + copy_from_slice.
694        let name_bytes = name_buf;
695        let value_kva = Some(entry + offsets.array_value as u64);
696        let btf_vmlinux_value_type_id = if offsets.map_btf_vmlinux_value_type_id != 0 {
697            meta.u32_at(offsets.map_btf_vmlinux_value_type_id)
698        } else {
699            0
700        };
701        return Some(BpfMapInfo {
702            map_pa,
703            map_kva: entry,
704            name_bytes,
705            name_len: name_len as u8,
706            map_type,
707            map_flags: meta.u32_at(offsets.map_flags),
708            key_size: meta.u32_at(offsets.key_size),
709            value_size: meta.u32_at(offsets.value_size),
710            max_entries: meta.u32_at(offsets.max_entries),
711            value_kva,
712            btf_kva: meta.u64_at(offsets.map_btf),
713            btf_value_type_id: meta.u32_at(offsets.map_btf_value_type_id),
714            btf_vmlinux_value_type_id,
715            btf_key_type_id: meta.u32_at(offsets.map_btf_key_type_id),
716        });
717    }
718    None
719}
720
/// Robustness bound, in bytes, on a single value-region read buffer.
///
/// `value_size` is read live from kernel memory via page-table
/// translation, so a torn read mid-update, a stale/wrong offset-table
/// entry for the running kernel, or a corrupted-pointer chase could
/// yield a garbage `value_size` — up to ~4 GiB, the u32 max. Checking
/// against this cap before the read buffer is allocated keeps a
/// mis-read from driving a multi-gigabyte allocation on the freeze
/// hot path.
///
/// 16 MiB covers every realistic scheduler-scale map (a global-section
/// `.bss` ARRAY is the largest practical value, KiB–low-MiB).
/// Kernel-legal non-percpu ARRAYs can exceed it (up to INT_MAX), so the
/// cap is a robustness heuristic, not a kernel limit: a read of a
/// legitimately larger value is rejected rather than attempted.
const MAX_VALUE_SIZE: usize = 16 * 1024 * 1024;
733
734/// BPF-map I/O wrapper around [`super::kva_io::chunked_kva_io`] that
735/// supplies the `(cr3_pa, l5, tcr_el1, mem)` translator the BPF map
736/// path needs.
737///
738/// The shared helper covers the page-boundary chunking; this thin
739/// wrapper plumbs the per-accessor translation context through. See
740/// the shared helper's docs for the chunking semantics and the
741/// caller-side bytes-tracking contract.
742fn chunked_kva_io<F>(ctx: &AccessorCtx<'_>, target_kva: u64, len: usize, chunk_fn: F) -> bool
743where
744    F: FnMut(u64, u64, usize),
745{
746    super::kva_io::chunked_kva_io(
747        |kva| {
748            ctx.mem
749                .translate_kva(ctx.cr3_pa.0, Kva(kva), ctx.l5, ctx.tcr_el1)
750        },
751        target_kva,
752        len,
753        chunk_fn,
754    )
755}
756
757/// Write bytes to a BPF map's value region at `offset`.
758///
759/// Translates the value KVA (vmalloc'd for .bss maps) through the
760/// page table to find the guest physical address, then writes directly.
761/// Returns `false` if the map has no value KVA (non-ARRAY map),
762/// `offset + data.len()` exceeds `value_size`, or any page in the
763/// range is unmapped. Uses [`chunked_kva_io`] to pay one translate per
764/// 4 KiB page rather than one per byte.
765pub(crate) fn write_bpf_map_value(
766    ctx: &AccessorCtx<'_>,
767    map_info: &BpfMapInfo,
768    offset: usize,
769    data: &[u8],
770) -> bool {
771    let Some(base_kva) = map_info.value_kva else {
772        return false;
773    };
774    // checked_add against pathological offset+len that would
775    // wrap usize. Without the check, a wrap would silently make
776    // `> value_size` false and the chunked write would walk
777    // arbitrary KVAs.
778    let Some(end) = offset.checked_add(data.len()) else {
779        return false;
780    };
781    if end > map_info.value_size as usize {
782        return false;
783    }
784    let target_kva = base_kva + offset as u64;
785
786    let mut bytes_written: usize = 0;
787    let walked = chunked_kva_io(ctx, target_kva, data.len(), |pa, src_off, chunk_len| {
788        // One `copy_nonoverlapping` per chunk replaces the old per-
789        // byte `write_u8` loop. The chunk has already been
790        // bounds-checked against `value_size`, so a short return
791        // here means the page crosses end-of-DRAM. Track bytes_written
792        // and surface the short-write to the caller as `false` —
793        // mirrors the [`read_bpf_map_value`] symmetry where
794        // `bytes_filled != len` returns `None`. Without this guard a
795        // half-landed write would silently report success.
796        let src_off = src_off as usize;
797        let n = ctx
798            .mem
799            .write_bytes_at(pa, 0, &data[src_off..src_off + chunk_len]);
800        bytes_written = bytes_written.saturating_add(n);
801    });
802    walked && bytes_written == data.len()
803}
804
805/// Write a u32 to a BPF map's value region at `offset`.
806pub(crate) fn write_bpf_map_value_u32(
807    ctx: &AccessorCtx<'_>,
808    map_info: &BpfMapInfo,
809    offset: usize,
810    val: u32,
811) -> bool {
812    write_bpf_map_value(ctx, map_info, offset, &val.to_ne_bytes())
813}
814
815/// Read bytes from a BPF map's value region at `offset`.
816///
817/// Translates the value KVA (vmalloc'd for .bss maps) through the
818/// page table to find the guest physical address, then reads directly.
819/// Returns `None` if the map has no value KVA (non-ARRAY map),
820/// `offset + len` exceeds `value_size`, or any page in the range
821/// is unmapped. Uses [`chunked_kva_io`] to pay one translate per 4 KiB
822/// page plus one bulk [`GuestMem::read_bytes`] call, instead of one
823/// translate and one-byte copy per byte.
824pub(crate) fn read_bpf_map_value(
825    ctx: &AccessorCtx<'_>,
826    map_info: &BpfMapInfo,
827    offset: usize,
828    len: usize,
829) -> Option<Vec<u8>> {
830    let base_kva = map_info.value_kva?;
831    // checked_add against pathological offset+len that would
832    // wrap usize. See the matching guard on `write_bpf_map_value`
833    // above for the rationale.
834    let end = offset.checked_add(len)?;
835    if end > map_info.value_size as usize {
836        return None;
837    }
838    // Live-read robustness cap before allocation (see MAX_VALUE_SIZE):
839    // a garbage `value_size` (torn / stale-offset read) or a caller
840    // passing a huge `len` would otherwise allocate up to 4 GiB inside
841    // `vec![0u8; len]`.
842    if len > MAX_VALUE_SIZE {
843        return None;
844    }
845    read_kva_bytes(ctx, base_kva + offset as u64, len)
846}
847
848/// Read the value bytes for one entry of a multi-entry
849/// `BPF_MAP_TYPE_ARRAY` map.
850///
851/// Entries are contiguous starting at `bpf_array.value` with a
852/// per-entry stride of `round_up(value_size, 8)` — the kernel's
853/// `array->elem_size` (kernel/bpf/arraymap.c:93 sets it, :167-176
854/// indexes with it). The value region spans `max_entries *
855/// elem_size`. Unlike [`read_bpf_map_value`], whose bound is one
856/// entry's `value_size`, this reads entry `key` at
857/// `value_kva + key * stride`.
858///
859/// Returns `None` when the map is not `BPF_MAP_TYPE_ARRAY`,
860/// `key >= max_entries` (the kernel's `index_mask` is a Spectre
861/// bound, not a range check — `array_map_lookup_elem` rejects
862/// `index >= max_entries` BEFORE masking, so this replicates the
863/// pre-mask test), `value_size` exceeds `MAX_VALUE_SIZE`, no value
864/// KVA was resolved, the offset would overflow, or any page in the
865/// entry is unmapped. On success the buffer is exactly `value_size`
866/// bytes — the 8-rounded stride is internal padding the kernel's
867/// `copy_map_value` does not copy.
868pub(crate) fn read_bpf_map_array_value(
869    ctx: &AccessorCtx<'_>,
870    map_info: &BpfMapInfo,
871    key: u32,
872) -> Option<Vec<u8>> {
873    if map_info.map_type != BPF_MAP_TYPE_ARRAY {
874        return None;
875    }
876    // Replicate array_map_lookup_elem's pre-mask `index >= max_entries`
877    // rejection (kernel/bpf/arraymap.c:172) — never trust index_mask
878    // to clamp; it only blocks speculation.
879    if key >= map_info.max_entries {
880        return None;
881    }
882    let value_size = map_info.value_size as usize;
883    // Live-read robustness cap before any allocation, matching
884    // `read_bpf_map_value` (see MAX_VALUE_SIZE).
885    if value_size > MAX_VALUE_SIZE {
886        return None;
887    }
888    let base_kva = map_info.value_kva?;
889    // Per-entry stride is round_up(value_size, 8) = array->elem_size
890    // (kernel/bpf/arraymap.c:93). Same `(x + 7) & !7` rounding the
891    // percpu stride math uses in bpf_syscall.rs.
892    let stride = (value_size + 7) & !7;
893    // key < max_entries (checked above) keeps key * stride within the
894    // `max_entries * stride` value region; checked arithmetic guards a
895    // corrupted max_entries from wrapping the KVA past u64.
896    let offset = (key as u64).checked_mul(stride as u64)?;
897    let target_kva = base_kva.checked_add(offset)?;
898    read_kva_bytes(ctx, target_kva, value_size)
899}
900
901/// Page-walk `len` bytes from guest kernel-virtual address
902/// `target_kva` into a fresh buffer, resolving each 4 KiB page through
903/// `translate`. The translator abstracts WHICH KVA→PA strategy
904/// applies: [`read_kva_bytes`] passes the PTE-only `translate_kva`
905/// (vmalloc'd `.bss` value region); [`read_percpu_value_bytes`]
906/// passes `translate_any_kva` (direct-mapping-first, for per-CPU
907/// values that may live in either the direct mapping or vmalloc).
908///
909/// Walking page-by-page is mandatory, not an optimization. A
910/// vmalloc-backed range (a `.bss` map's value region, or a large
911/// dynamic per-CPU allocation) occupies physically discontiguous
912/// order-0 frames, so a value crossing a page boundary lives in
913/// non-adjacent guest physical memory. A single translate of the
914/// first page followed by one bulk read of `len` bytes would copy
915/// whatever frame happens to sit after the first page — garbage — for
916/// every byte past that boundary.
917///
918/// Returns `None` if any page is unmapped (`translate` returns
919/// `None`) or the copy short-reads (`read_bytes` returns fewer bytes
920/// than the chunk at end-of-DRAM); the buffer is adopted via
921/// `set_len` only once every byte is proven written. Performs NO
922/// semantic bounds check: `target_kva` must lie in the value region.
923/// Every value-region caller caps `len`/`value_size` against
924/// `MAX_VALUE_SIZE` before allocating — the `.bss`/ARRAY paths
925/// ([`read_bpf_map_value`], [`read_bpf_map_array_value`]) and the
926/// per-CPU paths ([`read_percpu_array_value`] and the PERCPU-HASH
927/// walker via `walk_htab`) alike — because `value_size` is read live
928/// from kernel memory and a mis-read could otherwise drive a huge
929/// allocation (per CPU, in the per-CPU case).
930fn read_kva_bytes_with<T: Fn(u64) -> Option<u64>>(
931    mem: &GuestMem,
932    translate: T,
933    target_kva: u64,
934    len: usize,
935) -> Option<Vec<u8>> {
936    // `Vec::with_capacity` reserves backing storage without zeroing
937    // — the zero-fill that `vec![0u8; len]` would have emitted is
938    // wasted because every byte gets overwritten by the
939    // `read_bytes` calls below. The buffer's length stays at zero
940    // until we've proven every chunk wrote, then `set_len(len)`
941    // adopts the populated bytes.
942    let mut buf: Vec<u8> = Vec::with_capacity(len);
943
944    // Safety / correctness: `chunked_kva_io` returns false when any
945    // page in the range is unmapped; propagate that to None so callers
946    // see "unreadable" rather than a partial buffer.
947    let buf_ptr = buf.as_mut_ptr();
948    let mut bytes_filled: usize = 0;
949    let ok = super::kva_io::chunked_kva_io(translate, target_kva, len, |pa, dst_off, chunk_len| {
950        // SAFETY: dst_off + chunk_len <= len <= buf.capacity(); the
951        // slice borrows the heap-allocated Vec whose backing storage
952        // is live for the duration of this call (the Vec is pinned in
953        // `buf` above and reborrowed here only through its mutable
954        // pointer). The slice covers reserved-but-uninitialized
955        // memory; `read_bytes` writes every byte before any read of
956        // the slice, and the outer code only adopts the bytes via
957        // `set_len` once `bytes_filled == len`.
958        let slice =
959            unsafe { std::slice::from_raw_parts_mut(buf_ptr.add(dst_off as usize), chunk_len) };
960        // GuestMem::read_bytes returns the count actually copied; the
961        // page was confirmed mapped by `translate`, so a short read
962        // here means the page crosses end-of-DRAM, which the original
963        // byte loop would also have silently short-copied.
964        let n = mem.read_bytes(pa, slice);
965        // `saturating_add` so a pathological accumulation past
966        // `usize::MAX` clamps and the `bytes_filled != len` check
967        // below still surfaces the short read instead of wrapping
968        // back to a value that aliases `len`.
969        bytes_filled = bytes_filled.saturating_add(n);
970    });
971    if !ok || bytes_filled != len {
972        return None;
973    }
974    // SAFETY: every byte in `0..len` of `buf`'s backing storage was
975    // written by the `read_bytes` calls above (`bytes_filled == len`
976    // proves it), the capacity is `len`, and u8 has no validity
977    // invariants.
978    unsafe {
979        buf.set_len(len);
980    }
981    Some(buf)
982}
983
984/// Page-walk `len` bytes from a `.bss`/ARRAY value-region KVA via the
985/// PTE-only `translate_kva` (the value region is vmalloc'd, never in
986/// the direct mapping).
987///
988/// Shared by [`read_bpf_map_value`] (byte-range reads bounded by one
989/// entry's `value_size`) and [`read_bpf_map_array_value`] (per-entry
990/// reads into a multi-entry ARRAY bounded by `max_entries * stride`).
991/// Performs NO semantic bounds check — see [`read_kva_bytes_with`].
992fn read_kva_bytes(ctx: &AccessorCtx<'_>, target_kva: u64, len: usize) -> Option<Vec<u8>> {
993    read_kva_bytes_with(
994        ctx.mem,
995        |kva| {
996            ctx.mem
997                .translate_kva(ctx.cr3_pa.0, Kva(kva), ctx.l5, ctx.tcr_el1)
998        },
999        target_kva,
1000        len,
1001    )
1002}
1003
1004/// Page-walk `len` bytes of one CPU's per-CPU value via
1005/// `translate_any_kva` (direct-mapping-first, then a page-table walk
1006/// for vmalloc'd per-CPU memory).
1007///
1008/// Per-CPU values come from the kernel's dynamic per-CPU allocator:
1009/// the embedded first chunk lives in the direct mapping, while larger
1010/// allocations are vmalloc-backed (`mm/percpu-vm.c` hands out order-0
1011/// pages via `pcpu_get_vm_areas`, so the frames are physically
1012/// discontiguous). Walking page-by-page — [`read_kva_bytes_with`]'s
1013/// core behavior — is therefore required for any value that crosses a
1014/// page boundary; a single translate + bulk read would copy garbage
1015/// past the first page.
1016///
1017/// Returns `None` if any page is unmapped or the read short-reads at
1018/// end-of-DRAM. The end-of-DRAM bound the single-read path checked
1019/// explicitly (`cpu_pa + value_size <= mem.size()`) is subsumed:
1020/// [`GuestMem::read_bytes`] returns 0 for a PA past `mem.size()`, so
1021/// `bytes_filled != len` drops the slot to `None`.
1022fn read_percpu_value_bytes(ctx: &AccessorCtx<'_>, target_kva: u64, len: usize) -> Option<Vec<u8>> {
1023    read_kva_bytes_with(
1024        ctx.mem,
1025        |kva| {
1026            translate_any_kva(
1027                ctx.mem,
1028                ctx.cr3_pa.0,
1029                ctx.page_offset.0,
1030                kva,
1031                ctx.l5,
1032                ctx.tcr_el1,
1033            )
1034        },
1035        target_kva,
1036        len,
1037    )
1038}
1039
1040/// Read a u32 from a BPF map's value region at `offset`.
1041pub(crate) fn read_bpf_map_value_u32(
1042    ctx: &AccessorCtx<'_>,
1043    map_info: &BpfMapInfo,
1044    offset: usize,
1045) -> Option<u32> {
1046    let bytes = read_bpf_map_value(ctx, map_info, offset, 4)?;
1047    Some(u32::from_ne_bytes(bytes.try_into().unwrap()))
1048}
1049
1050/// Read the per-CPU values for a single key in a `BPF_MAP_TYPE_PERCPU_ARRAY` map.
1051///
1052/// `bpf_array.pptrs[key]` holds a `__percpu` pointer. Adding
1053/// `__per_cpu_offset[cpu]` yields the per-CPU KVA, which may live
1054/// either in the direct mapping (static percpu, kmalloc'd percpu)
1055/// or in vmalloc'd memory (large dynamic per-CPU allocations). The
1056/// value is read via [`read_percpu_value_bytes`], which walks it
1057/// page-by-page through [`translate_any_kva`] (direct-mapping-first,
1058/// vmalloc fallback) so a value that misses the direct mapping — or
1059/// that straddles physically discontiguous vmalloc frames — is read
1060/// correctly rather than reading as `None` or copying garbage.
1061///
1062/// Returns one entry per CPU, indexed by CPU number. `Some(bytes)`
1063/// when the per-CPU PA falls within guest memory; `None` when it
1064/// does not. Returns an empty vec if the map is not
1065/// `BPF_MAP_TYPE_PERCPU_ARRAY`, `key >= max_entries`, or the percpu
1066/// pointer is zero.
1067fn read_percpu_array_value(
1068    ctx: &AccessorCtx<'_>,
1069    map: &BpfMapInfo,
1070    key: u32,
1071    per_cpu_offsets: &[u64],
1072) -> Vec<Option<Vec<u8>>> {
1073    if map.map_type != BPF_MAP_TYPE_PERCPU_ARRAY {
1074        return Vec::new();
1075    }
1076    if key >= map.max_entries {
1077        return Vec::new();
1078    }
1079
1080    // pptrs is at the same offset as value (union in bpf_array).
1081    let pptrs_kva = map.map_kva + ctx.offsets.array_value as u64;
1082    // pptrs[key] is a void __percpu * — 8 bytes.
1083    let pptr_kva = pptrs_kva + (key as u64) * 8;
1084
1085    // bpf_array may be kmalloc'd or vmalloc'd — try direct mapping first.
1086    let Some(pptr_pa) = translate_any_kva(
1087        ctx.mem,
1088        ctx.cr3_pa.0,
1089        ctx.page_offset.0,
1090        pptr_kva,
1091        ctx.l5,
1092        ctx.tcr_el1,
1093    ) else {
1094        return Vec::new();
1095    };
1096    let percpu_base = ctx.mem.read_u64(pptr_pa, 0);
1097    if percpu_base == 0 {
1098        return Vec::new();
1099    }
1100
1101    let value_size = map.value_size as usize;
1102    // Robustness cap mirroring read_bpf_map_value / read_bpf_map_array_value
1103    // (and htab.rs's walk_htab): value_size is read live from kernel memory,
1104    // so a torn read mid-update, a stale offset-table entry for the running
1105    // kernel, or a corrupted-pointer chase could yield a garbage size — and
1106    // here it would allocate value_size bytes PER CPU across the loop below.
1107    // The kernel bounds PERCPU value_size well under 16 MiB (map create
1108    // rejects round_up(value_size, 8) > PCPU_MIN_UNIT_SIZE), so this never
1109    // rejects a legal map; it only fires on a mis-read.
1110    if value_size > MAX_VALUE_SIZE {
1111        return Vec::new();
1112    }
1113    let mut result = Vec::with_capacity(per_cpu_offsets.len());
1114
1115    for (cpu_index, &cpu_off) in per_cpu_offsets.iter().enumerate() {
1116        // Out-of-range CPU detection: kernel `setup_per_cpu_areas`
1117        // (e.g. arch/x86/kernel/setup_percpu.c) only writes
1118        // `__per_cpu_offset[cpu]` for cpus in `for_each_possible_cpu`,
1119        // leaving slots beyond `nr_cpu_ids` at the BSS-initialized
1120        // value of 0. Real SMP kernels assign each possible CPU a
1121        // strictly-positive offset (`delta + unit_offsets[cpu]`) for
1122        // cpu > 0 because `unit_offsets[cpu]` is a positive multiple
1123        // of the per-CPU unit size — only the BSP (cpu_index == 0)
1124        // can legitimately observe a zero offset on systems where
1125        // the delta term is zero. Treating `cpu_off == 0 &&
1126        // cpu_index > 0` as out-of-range prevents the prior aliasing
1127        // bug where every out-of-range slot returned CPU 0's bytes
1128        // (because `percpu_base + 0` translated successfully to
1129        // whatever the bare percpu_base pointed at).
1130        if cpu_off == 0 && cpu_index > 0 {
1131            result.push(None);
1132            continue;
1133        }
1134        let cpu_kva = percpu_base.wrapping_add(cpu_off);
1135        // The percpu base + cpu_off may land in either the direct
1136        // mapping (per-CPU __percpu allocations from the static
1137        // percpu region or kmalloc'd percpu blocks) or vmalloc'd
1138        // percpu memory (large dynamic per-CPU allocations served
1139        // from pcpu_get_vm_areas). `read_percpu_value_bytes` walks the
1140        // value page-by-page through `translate_any_kva` (direct
1141        // mapping first, page-table walk for vmalloc'd percpu), so a
1142        // value straddling physically discontiguous vmalloc frames is
1143        // read correctly; it drops the slot to `None` on any unmapped
1144        // page or end-of-DRAM short read.
1145        result.push(read_percpu_value_bytes(ctx, cpu_kva, value_size));
1146    }
1147
1148    result
1149}
1150
1151/// Chase modifiers (Volatile, Const, Typedef, TypeTag, Restrict,
1152/// DeclTag) and pointers from `type_id` to find a Struct or Union.
1153///
1154/// Returns `None` if the chain ends in a type that is neither Struct
1155/// nor Union, or exceeds depth 32. Also resolves through Ptr (for
1156/// pointer-to-struct members).
1157pub(crate) fn resolve_to_struct(btf: &btf_rs::Btf, type_id: u32) -> Option<btf_rs::Struct> {
1158    resolve_to_struct_with_id(btf, type_id).map(|(s, _)| s)
1159}
1160
1161/// Same chain walk as [`resolve_to_struct`] but returns the BTF type
1162/// id of the terminal struct/union instead of the struct value.
1163/// Callers that key data structures on type ids (e.g. the cast
1164/// analyzer's `RegState::Pointer { struct_type_id }`) need the id
1165/// post-peel; callers that need the struct shape use
1166/// [`resolve_to_struct`].
1167pub(crate) fn resolve_to_struct_id(btf: &btf_rs::Btf, type_id: u32) -> Option<u32> {
1168    resolve_to_struct_with_id(btf, type_id).map(|(_, tid)| tid)
1169}
1170
1171/// Shared chain walk for [`resolve_to_struct`] and
1172/// [`resolve_to_struct_id`]. Peels Ptr / Volatile / Const / Typedef /
1173/// TypeTag / Restrict / DeclTag up to depth 32, returning `(struct,
1174/// id)` at the first Struct or Union encountered.
1175fn resolve_to_struct_with_id(btf: &btf_rs::Btf, type_id: u32) -> Option<(btf_rs::Struct, u32)> {
1176    let mut tid = type_id;
1177    for _ in 0..32 {
1178        let t = btf.resolve_type_by_id(tid).ok()?;
1179        match t {
1180            btf_rs::Type::Struct(s) | btf_rs::Type::Union(s) => return Some((s, tid)),
1181            btf_rs::Type::Ptr(_)
1182            | btf_rs::Type::Volatile(_)
1183            | btf_rs::Type::Const(_)
1184            | btf_rs::Type::Typedef(_)
1185            | btf_rs::Type::TypeTag(_)
1186            | btf_rs::Type::Restrict(_)
1187            | btf_rs::Type::DeclTag(_) => {
1188                tid = t.as_btf_type()?.get_type_id()?;
1189            }
1190            _ => return None,
1191        }
1192    }
1193    None
1194}
1195
1196/// Read-only abstraction over BPF map enumeration and value reads
1197/// across data sources. Mutating operations (write_value etc.) are
1198/// inherent on each backend, NOT exposed here — the trait surface is
1199/// a snapshot-style read API used by the failure-dump renderer and
1200/// any future read-only consumer.
1201///
1202/// Two implementations exist today: `GuestMemMapAccessor` (this
1203/// module — reads a frozen guest VM's physical memory) and
1204/// `super::bpf_syscall::BpfSyscallAccessor` (live-host introspection
1205/// via the `bpf()` syscall). Both plug into this trait surface.
1206///
1207/// - `GuestMemMapAccessor` — reads from a frozen guest VM's physical
1208///   memory via PTE walks against the frozen `init_mm`. Used by the
1209///   freeze-coordinator path (`super::dump::dump_state`) on the
1210///   in-VM scheduler test runs. Hash map iteration walks
1211///   `bpf_htab.buckets` directly without RCU; the freeze rendezvous
1212///   IS the ordering primitive (every CPU is parked at a known KVM
1213///   exit before the host begins reading memory). Per-CPU value
1214///   reads use the cached `__per_cpu_offset[cpu]` array; out-of-range
1215///   CPUs surface as `None` rather than aliasing CPU 0 (see
1216///   `read_percpu_array_value`).
1217///
1218/// The live-host backend produces identical
1219/// [`BpfMapInfo`] / byte buffers, so the rendering pipeline
1220/// (`super::btf_render::render_value`) stays data-source-agnostic
1221/// and consumes either accessor through this trait. The
1222/// live-host backend's failure modes are different (e.g. hash reads
1223/// will rely on the kernel's RCU read-side critical section,
1224/// `bpf_map_lookup_elem` rejection for non-readable types) and
1225/// individual method docs spell those out where they matter.
1226///
1227/// `dump_state` currently takes a concrete
1228/// `GuestMemMapAccessor` because its sdt_alloc post-pass walks
1229/// the underlying `super::guest::GuestKernel` — that handle is
1230/// not part of the trait surface. Once sdt_alloc walking moves
1231/// into a backend-specific path, `dump_state` can switch to
1232/// `&dyn BpfMapAccessor`. Other call
1233/// sites that need only the trait surface can already bind on
1234/// `&dyn BpfMapAccessor` (or `<A: BpfMapAccessor>`) without paying
1235/// virtual dispatch.
1236#[allow(dead_code)]
1237pub trait BpfMapAccessor {
1238    /// Enumerate every BPF map visible to this accessor.
1239    ///
1240    /// Order is implementation-defined: the guest-memory backend walks
1241    /// `map_idr` (allocation order); the bpf-syscall backend walks the
1242    /// kernel's id space via `BPF_MAP_GET_NEXT_ID` (also allocation
1243    /// order, modulo concurrent destruction races on the live host).
1244    /// Callers that want a stable view should sort by name.
1245    fn maps(&self) -> Vec<BpfMapInfo>;
1246
1247    /// Find the first BPF map whose name ends with `name_suffix`.
1248    ///
1249    /// Default impl walks [`Self::maps`]. Backends with cheaper
1250    /// targeted lookups can override (e.g. a libbpf-handle-backed
1251    /// accessor that already holds a name index).
1252    fn find_map(&self, name_suffix: &str) -> Option<BpfMapInfo> {
1253        self.maps()
1254            .into_iter()
1255            .find(|m| m.name().ends_with(name_suffix))
1256    }
1257
1258    /// Read a contiguous byte range from a map's value region.
1259    ///
1260    /// Returns `None` for non-readable map types (e.g. ARENA — use
1261    /// [`Self::read_arena_pages`]; HASH — use [`Self::iter_hash_map`])
1262    /// or when the backing read fails. The guest-memory backend's
1263    /// failure modes are unmapped guest pages and out-of-range value
1264    /// regions; the bpf-syscall backend additionally surfaces
1265    /// `bpf_map_lookup_elem` rejection (e.g. `-EINVAL` on
1266    /// arena maps, kernel-side ACL denials).
1267    fn read_value(&self, map: &BpfMapInfo, offset: usize, len: usize) -> Option<Vec<u8>>;
1268
1269    /// Read the value bytes of one entry of a `BPF_MAP_TYPE_ARRAY` map
1270    /// by entry index.
1271    ///
1272    /// Parallels [`Self::read_percpu_array`] (also keyed by entry
1273    /// index) but for a plain ARRAY: one value per key, so the return
1274    /// is a single `Option<Vec<u8>>` rather than a per-CPU vector. On
1275    /// success the buffer is exactly `map.value_size` bytes.
1276    ///
1277    /// Returns `None` for non-ARRAY maps, `key >= map.max_entries`, or
1278    /// when the backing read fails (unmapped guest page on the
1279    /// guest-memory backend; `bpf_map_lookup_elem` rejection on the
1280    /// live-host backend). Distinct from [`Self::read_value`], which
1281    /// stays the byte-range reader for single-entry global-section
1282    /// ARRAYs and STRUCT_OPS (both key 0); multi-entry ARRAY indexing
1283    /// goes through this method so `read_value`'s key-0 contract is
1284    /// untouched.
1285    fn read_array(&self, map: &BpfMapInfo, key: u32) -> Option<Vec<u8>>;
1286
1287    /// Iterate every entry in a `BPF_MAP_TYPE_HASH` or
1288    /// `BPF_MAP_TYPE_LRU_HASH` map.
1289    ///
1290    /// Both share the inline-value `htab_elem` layout
1291    /// (`kernel/bpf/hashtab.c::htab_elem_value`); LRU adds an
1292    /// eviction policy but the value bytes still sit at
1293    /// `key + round_up(key_size, 8)`. Returns an empty vec for any
1294    /// other map type.
1295    ///
1296    /// Per-element atomicity is backend-specific: the guest-memory
1297    /// backend reads raw bytes at the freeze instant (the freeze
1298    /// rendezvous IS the synchronization — no concurrent writers
1299    /// exist while parked vCPUs stay parked); the bpf-syscall backend
1300    /// reads under the kernel's RCU read-side critical section
1301    /// (`bpf_map_lookup_elem` -> `htab_map_lookup_elem`). Both can
1302    /// produce torn views relative to a multi-element transaction
1303    /// the scheduler intended to commit atomically — that's a feature
1304    /// of reading without locking the whole table.
1305    fn iter_hash_map(&self, map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)>;
1306
1307    /// Iterate every entry in a `BPF_MAP_TYPE_PERCPU_HASH` or
1308    /// `BPF_MAP_TYPE_LRU_PERCPU_HASH` map. Returns
1309    /// `(key_bytes, per_cpu_values)` where `per_cpu_values` is one
1310    /// entry per CPU indexed by CPU number; `Some(bytes)` when the
1311    /// CPU's slot is readable, `None` otherwise (unmapped page or
1312    /// out-of-range CPU).
1313    ///
1314    /// Returns an empty vec for any other map type. Default
1315    /// implementation returns empty so backends that haven't yet
1316    /// wired the percpu-hash path don't break trait dispatch — the
1317    /// dump renderer surfaces the resulting empty list as a
1318    /// "no entries" outcome rather than a panic.
1319    fn iter_percpu_hash_map(&self, _map: &BpfMapInfo, _num_cpus: u32) -> PerCpuHashEntries {
1320        Vec::new()
1321    }
1322
1323    /// Iterate every entry in a `BPF_MAP_TYPE_TASK_STORAGE` map (and
1324    /// the shape-identical `INODE_STORAGE` / `SK_STORAGE` /
1325    /// `CGRP_STORAGE` variants — they all use
1326    /// `super::btf_offsets::TaskStorageOffsets`).
1327    ///
1328    /// Returned tuples are `(owner_kva_le_bytes, value_bytes)`:
1329    /// - `owner_kva_le_bytes` is the 8-byte little-endian encoding of
1330    ///   the `bpf_local_storage.owner` pointer reached by following
1331    ///   each `bpf_local_storage_elem.local_storage`. For
1332    ///   `TASK_STORAGE` this is the `task_struct` KVA; for the other
1333    ///   variants it is the inode/sock/cgroup KVA. The walker treats
1334    ///   it as opaque so the same shape works across all four map
1335    ///   types.
1336    /// - `value_bytes` is `value_size` bytes copied from
1337    ///   `bpf_local_storage_elem.sdata.data[]` — the value the
1338    ///   scheduler stored under this owner.
1339    ///
1340    /// Returns an empty vec for any other map type, when
1341    /// `task_storage_offsets` is unavailable, or when the map's
1342    /// `buckets` pointer cannot be translated. Returns an empty vec
1343    /// for any other map type. Default implementation returns empty
1344    /// so backends that haven't yet wired this path don't break
1345    /// trait dispatch — the dump renderer surfaces the resulting
1346    /// empty list as a "no entries" outcome rather than a panic.
1347    fn iter_task_storage(&self, _map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)> {
1348        Vec::new()
1349    }
1350
    /// Read every CPU's value for a key in a `BPF_MAP_TYPE_PERCPU_ARRAY` map.
    ///
    /// `key` is the array index to read. `num_cpus` is the number of
    /// per-CPU slots the caller asks for (the guest-memory backend
    /// resolves `__per_cpu_offset[]` for that many CPUs). This is a
    /// required method: every backend supplies its own implementation.
    ///
    /// Returns one entry per CPU, indexed by CPU number:
    /// - `Some(bytes)` when the per-CPU slot is readable;
    /// - `None` when it isn't. On the guest-memory path that means an
    ///   out-of-range CPU index (`__per_cpu_offset[cpu]` reads as the
    ///   BSS-zero sentinel) or an unmapped page; the bpf-syscall
    ///   backend surfaces an out-of-range CPU on
    ///   `bpf_map_lookup_elem` failure.
    ///
    /// Returns an empty vec for non-PERCPU_ARRAY maps or
    /// `key >= max_entries`.
    fn read_percpu_array(&self, map: &BpfMapInfo, key: u32, num_cpus: u32) -> Vec<Option<Vec<u8>>>;
1361
    /// Snapshot every mapped page of a `BPF_MAP_TYPE_ARENA` map.
    ///
    /// How `arena_offsets` is used depends on the backend:
    /// - the guest-memory backend uses the resolved kernel struct
    ///   field offsets to walk `bpf_arena -> kern_vm ->
    ///   vm_struct.addr`;
    /// - the bpf-syscall backend mmaps the arena fd directly (the
    ///   only data path the kernel exposes — arena's `lookup_elem`
    ///   returns `-EINVAL`, see `kernel/bpf/arena.c`) and ignores
    ///   `arena_offsets`.
    ///
    /// The default implementation ignores both arguments and returns
    /// an empty snapshot (`ArenaSnapshot::default()`); backends
    /// override it to produce real content.
    fn read_arena_pages(
        &self,
        _map: &BpfMapInfo,
        _arena_offsets: &super::arena::BpfArenaOffsets,
    ) -> super::arena::ArenaSnapshot {
        super::arena::ArenaSnapshot::default()
    }
1379
    /// Load the program BTF object referenced by a map.
    ///
    /// `base_btf` is the host's vmlinux BTF used as the base for
    /// split-BTF parsing.
    ///
    /// Returns `None` when:
    /// - the map carries no program BTF (e.g. kernel-builtin maps);
    /// - the BTF blob can't be loaded; or
    /// - [`btf_rs::Btf::from_bytes`] /
    ///   [`btf_rs::Btf::from_split_bytes`] reject the bytes.
    ///
    /// The default implementation ignores both arguments and returns
    /// `None`; backends override it to hand back a parsed
    /// [`btf_rs::Btf`].
    fn load_program_btf(&self, _map: &BpfMapInfo, _base_btf: &btf_rs::Btf) -> Option<btf_rs::Btf> {
        None
    }
1393}
1394
/// Host-side BPF map accessor backed by direct guest physical-memory
/// reads.
///
/// Resolves BTF offsets for BPF map structures and provides map
/// discovery, value read/write, hash iteration, and per-CPU reads.
/// Uses a [`GuestKernel`] for address translation (PTE walks against
/// the guest's frozen page tables).
///
/// Implements the [`BpfMapAccessor`] trait so [`super::dump::dump_state`]
/// can dispatch through it without committing to a backend at the call
/// site.
///
/// [`GuestKernel`]: super::guest::GuestKernel
pub struct GuestMemMapAccessor<'a> {
    /// Borrowed guest kernel handle: the source of symbol lookups,
    /// the guest memory reader, and the paging parameters every map
    /// routine needs.
    kernel: &'a super::guest::GuestKernel,
    /// Kernel virtual address of the `map_idr` symbol, resolved once
    /// at construction and handed to [`find_all_bpf_maps`] whenever
    /// the map list has to be enumerated.
    map_idr_kva: u64,
    /// Borrowed from the `GuestMemMapAccessorOwned` that produced this
    /// accessor via `as_accessor`, or provided by the caller to
    /// `from_guest_kernel`. Borrowing avoids the ~160-byte
    /// `BpfMapOffsets` clone that the old owned-field design paid
    /// on every `as_accessor()` call.
    offsets: &'a BpfMapOffsets,
    /// Optional borrow of a `__per_cpu_offset` cache owned by the
    /// `GuestMemMapAccessorOwned` wrapper. The cache stores the
    /// resolved `Vec<u64>` once per `(num_cpus, accessor)` pair so
    /// repeat percpu reads (one ARRAY map + several PERCPU_HASH
    /// maps in a single dump) don't re-issue the
    /// `read_per_cpu_offsets` array read for every map.
    ///
    /// `None` for accessors built directly via [`Self::from_guest_kernel`]
    /// (which has no owner to host the cache); on that path each
    /// percpu method resolves offsets fresh as before. The cached
    /// vec stores RAW offsets — consumers (e.g.
    /// [`read_percpu_array_value`]) still apply the BSS-zero-tail
    /// guard `cpu_off == 0 && cpu_index > 0` to skip aliased CPU
    /// slots.
    per_cpu_offsets_cache: Option<&'a PerCpuOffsetsCache>,
    /// Per-accessor (per-dump) cache of [`find_all_bpf_maps`].
    /// Each `as_accessor()` / `from_guest_kernel` call constructs
    /// a fresh empty cache, so the cache lifetime matches one
    /// dump. Between dumps the guest kernel runs and can create /
    /// destroy maps; persisting the cache across the
    /// [`GuestMemMapAccessorOwned`] lifetime would return stale
    /// entries for freed maps. The borrowed accessor's per-dump
    /// lifetime is exactly the right scope.
    ///
    /// `Mutex<Option<...>>` rather than `RefCell` because the
    /// trait surface uses `&self` and any cross-thread `Sync`
    /// caller (today single-threaded freeze coordinator, future
    /// concurrent dump pipeline) requires `Sync`.
    ///
    /// Lock scope: `maps`, `find_map` and `find_array_map` take the
    /// lock, run the `find_all_bpf_maps` walk under it when the slot
    /// is still empty, and keep holding it while they clone or scan
    /// the cached list. With the single-threaded coordinator that
    /// means no contention today; a concurrent caller would wait for
    /// the first walk to finish rather than repeat it.
    maps_cache: std::sync::Mutex<Option<std::sync::Arc<Vec<BpfMapInfo>>>>,
}
1449
1450/// Per-`(num_cpus, accessor)` cache of the resolved `__per_cpu_offset`
1451/// array. Lives on [`GuestMemMapAccessorOwned`] so a single freeze-
1452/// dump session amortizes one array read across every percpu map
1453/// access (PERCPU_ARRAY value reads, PERCPU_HASH iteration).
1454///
1455/// Storage: a single-slot cache keyed on the `num_cpus` argument
1456/// every method passes. Different `num_cpus` values overwrite the
1457/// previous slot (we re-resolve from guest memory). Production
1458/// callers pass a constant `num_cpus` for the run, so this is a
1459/// pure win on the freeze hot path; the test path never builds an
1460/// `Owned` wrapper and so doesn't see the cache.
1461///
1462/// Synchronization: `Mutex<Option<...>>` keeps the cache safe for
1463/// the trait's `&self` methods and for any future cross-thread
1464/// dump pipeline (today the freeze coordinator is single-threaded
1465/// for reads, but a `Sync` trait surface lets the cache work even
1466/// when the assumption changes). Contention is non-existent — the
1467/// lock is held only for the duration of one Vec move.
1468#[allow(dead_code)]
1469pub(crate) struct PerCpuOffsetsCache {
1470    inner: std::sync::Mutex<Option<(u32, std::sync::Arc<Vec<u64>>)>>,
1471}
1472
1473#[allow(dead_code)]
1474impl PerCpuOffsetsCache {
1475    pub(crate) fn new() -> Self {
1476        Self {
1477            inner: std::sync::Mutex::new(None),
1478        }
1479    }
1480
1481    /// Resolve `__per_cpu_offset[]` once per `(num_cpus, accessor)`
1482    /// and reuse on subsequent calls with the same `num_cpus`. The
1483    /// closure runs only on a miss (or a `num_cpus` change); its
1484    /// return value is shared via `Arc` so concurrent borrowers see
1485    /// the same vec without holding the mutex across reads.
1486    pub(crate) fn get_or_init<F>(&self, num_cpus: u32, init: F) -> std::sync::Arc<Vec<u64>>
1487    where
1488        F: FnOnce() -> Vec<u64>,
1489    {
1490        let mut guard = self.inner.lock_unpoisoned();
1491        if let Some((cached_n, cached)) = guard.as_ref()
1492            && *cached_n == num_cpus
1493        {
1494            return cached.clone();
1495        }
1496        let arc = std::sync::Arc::new(init());
1497        *guard = Some((num_cpus, arc.clone()));
1498        arc
1499    }
1500}
1501
1502#[allow(dead_code)]
1503impl<'a> GuestMemMapAccessor<'a> {
1504    /// Create from an existing [`GuestKernel`] and a caller-owned
1505    /// [`BpfMapOffsets`].
1506    ///
1507    /// The accessor borrows the offsets for its lifetime, so callers
1508    /// typically stash them in a `GuestMemMapAccessorOwned` (or another
1509    /// stable location) before calling this. Build `offsets` once via
1510    /// [`BpfMapOffsets::from_vmlinux`] and reuse — they're per-kernel,
1511    /// not per-call.
1512    ///
1513    /// [`GuestKernel`]: super::guest::GuestKernel
1514    pub fn from_guest_kernel(
1515        kernel: &'a super::guest::GuestKernel,
1516        offsets: &'a BpfMapOffsets,
1517    ) -> anyhow::Result<Self> {
1518        let map_idr_kva = kernel
1519            .symbol_kva("map_idr")
1520            .ok_or_else(|| anyhow::anyhow!("map_idr symbol not found in vmlinux"))?;
1521
1522        Ok(Self {
1523            kernel,
1524            map_idr_kva,
1525            offsets,
1526            per_cpu_offsets_cache: None,
1527            maps_cache: std::sync::Mutex::new(None),
1528        })
1529    }
1530
1531    /// Build a `GuestMemMapAccessor` for unit tests, bypassing the
1532    /// `map_idr` symbol lookup `from_guest_kernel` performs.
1533    ///
1534    /// Cross-module tests for the per-map render helpers
1535    /// (`render_ringbuf_state`, `render_stack_traces`,
1536    /// `render_fd_array_slots`) and for `iter_percpu_hash_map` need
1537    /// an accessor over a synthetic `GuestKernel`. The production
1538    /// `from_guest_kernel` requires the kernel to expose a `map_idr`
1539    /// symbol, which synthetic kernels constructed via
1540    /// `GuestKernel::new_for_test` typically do not. This
1541    /// constructor takes `map_idr_kva` directly so the caller can
1542    /// pass `0` (the per-map render helpers never read through the
1543    /// map_idr) or a known-good KVA when exercising
1544    /// `find_all_bpf_maps`.
1545    #[cfg(test)]
1546    pub(crate) fn new_for_test(
1547        kernel: &'a super::guest::GuestKernel,
1548        offsets: &'a BpfMapOffsets,
1549        map_idr_kva: u64,
1550    ) -> Self {
1551        Self {
1552            kernel,
1553            map_idr_kva,
1554            offsets,
1555            per_cpu_offsets_cache: None,
1556            maps_cache: std::sync::Mutex::new(None),
1557        }
1558    }
1559
1560    /// Build the [`AccessorCtx`] used by every map-read/write routine.
1561    fn ctx(&self) -> AccessorCtx<'_> {
1562        AccessorCtx {
1563            mem: self.kernel.mem(),
1564            cr3_pa: Cr3Pa(self.kernel.cr3_pa()),
1565            page_offset: PageOffset(self.kernel.page_offset()),
1566            offsets: self.offsets,
1567            l5: self.kernel.l5(),
1568            tcr_el1: self.kernel.tcr_el1(),
1569            start_kernel_map: self.kernel.start_kernel_map(),
1570            phys_base: self.kernel.phys_base(),
1571            iter_max: MAP_WALK_ITER_MAX,
1572        }
1573    }
1574
1575    /// Borrow the resolved BPF map field offsets. Used by callers
1576    /// that need to read kernel struct fields (e.g. `struct btf` for
1577    /// the program-BTF loader) without going through the
1578    /// map-access trait surface.
1579    pub fn offsets(&self) -> &BpfMapOffsets {
1580        self.offsets
1581    }
1582
1583    /// Borrow the underlying [`super::guest::GuestKernel`] for callers
1584    /// that need direct access to symbol resolution / page-walk
1585    /// primitives outside the map-discovery surface (e.g. arena page
1586    /// enumeration in [`super::arena`], sdt_alloc tree walks).
1587    pub fn kernel(&self) -> &'a super::guest::GuestKernel {
1588        self.kernel
1589    }
1590
1591    /// Find the first BPF ARRAY map whose name ends with `name_suffix`.
1592    ///
1593    /// Only returns `BPF_MAP_TYPE_ARRAY` maps — distinct from the
1594    /// suffix-only [`BpfMapAccessor::find_map`] trait method. The
1595    /// distinct name keeps inherent-over-trait method resolution
1596    /// honest: a concrete-receiver caller that wants the ARRAY
1597    /// filter (value-region read/write needs `value_kva`, which is
1598    /// `Some` only for ARRAY maps) names it explicitly here, and the
1599    /// compiler errors instead of silently shadowing the trait
1600    /// method when the receiver type changes. Use
1601    /// [`BpfMapAccessor::maps`] to enumerate maps of all types.
1602    /// Goes through the per-accessor maps cache so repeat
1603    /// `find_array_map` calls within one dump amortize the IDR walk.
1604    pub fn find_array_map(&self, name_suffix: &str) -> Option<BpfMapInfo> {
1605        let mut guard = self.maps_cache.lock_unpoisoned();
1606        if guard.is_none() {
1607            *guard = Some(std::sync::Arc::new(find_all_bpf_maps(
1608                &self.ctx(),
1609                self.map_idr_kva,
1610            )));
1611        }
1612        guard
1613            .as_ref()
1614            .unwrap()
1615            .iter()
1616            .find(|m| m.map_type == BPF_MAP_TYPE_ARRAY && m.name().ends_with(name_suffix))
1617            .cloned()
1618    }
1619
1620    /// Write bytes to a map's value region.
1621    ///
1622    /// Returns `false` if the map has no value KVA (non-ARRAY map)
1623    /// or any page in the range is unmapped.
1624    pub fn write_value(&self, map: &BpfMapInfo, offset: usize, data: &[u8]) -> bool {
1625        write_bpf_map_value(&self.ctx(), map, offset, data)
1626    }
1627
1628    /// Write a u32 to a map's value region.
1629    pub fn write_value_u32(&self, map: &BpfMapInfo, offset: usize, val: u32) -> bool {
1630        write_bpf_map_value_u32(&self.ctx(), map, offset, val)
1631    }
1632
1633    /// Read a u32 from a map's value region.
1634    pub fn read_value_u32(&self, map: &BpfMapInfo, offset: usize) -> Option<u32> {
1635        read_bpf_map_value_u32(&self.ctx(), map, offset)
1636    }
1637
1638    /// Resolve `__per_cpu_offset[]` for `num_cpus` CPUs, using the
1639    /// owner-side cache when present.
1640    ///
1641    /// Returns `None` only when the `__per_cpu_offset` symbol is
1642    /// missing from the vmlinux symtab — every other failure mode
1643    /// (out-of-bounds reads, BSS-zero tail entries) surfaces as
1644    /// zero offsets that the caller's BSS-zero guard rejects, so
1645    /// the cache stores the raw resolved vec without filtering.
1646    /// The returned `Arc` lets the cache hand out the same vec to
1647    /// multiple concurrent readers (the freeze hot path is
1648    /// single-threaded today, but the cache surface is `Sync` for
1649    /// future cross-thread use).
1650    pub(crate) fn resolve_per_cpu_offsets(
1651        &self,
1652        num_cpus: u32,
1653    ) -> Option<std::sync::Arc<Vec<u64>>> {
1654        let pco_kva = self.kernel.symbol_kva("__per_cpu_offset")?;
1655        let pco_pa = self.kernel.text_kva_to_pa(pco_kva);
1656        let mem = self.kernel.mem();
1657        match self.per_cpu_offsets_cache {
1658            Some(cache) => Some(cache.get_or_init(num_cpus, || {
1659                super::symbols::read_per_cpu_offsets(mem, pco_pa, num_cpus)
1660            })),
1661            None => Some(std::sync::Arc::new(super::symbols::read_per_cpu_offsets(
1662                mem, pco_pa, num_cpus,
1663            ))),
1664        }
1665    }
1666}
1667
1668impl BpfMapAccessor for GuestMemMapAccessor<'_> {
1669    /// Enumerate every BPF map. Caches the result for this
1670    /// accessor's lifetime so repeat `maps()` / `find_map(...)`
1671    /// calls within a single dump pay the IDR walk only once.
1672    /// The cache is per-accessor (per-dump), not per-owner, so it
1673    /// cannot return stale entries for maps the guest kernel
1674    /// created / destroyed between dumps.
1675    fn maps(&self) -> Vec<BpfMapInfo> {
1676        let mut guard = self.maps_cache.lock_unpoisoned();
1677        if let Some(cached) = guard.as_ref() {
1678            return (**cached).clone();
1679        }
1680        let maps = find_all_bpf_maps(&self.ctx(), self.map_idr_kva);
1681        let arc = std::sync::Arc::new(maps);
1682        let out = (*arc).clone();
1683        *guard = Some(arc);
1684        out
1685    }
1686
1687    /// Find the first BPF map whose name ends with `name_suffix`.
1688    /// Override the trait's default `self.maps().into_iter()` impl
1689    /// so the cache lookup amortizes across `find_map(...)` calls
1690    /// within one dump. Without this override, every `find_map`
1691    /// returned a clone-and-drop of the full `Vec<BpfMapInfo>`
1692    /// from the cache only to scan it linearly.
1693    fn find_map(&self, name_suffix: &str) -> Option<BpfMapInfo> {
1694        let mut guard = self.maps_cache.lock_unpoisoned();
1695        if guard.is_none() {
1696            *guard = Some(std::sync::Arc::new(find_all_bpf_maps(
1697                &self.ctx(),
1698                self.map_idr_kva,
1699            )));
1700        }
1701        guard
1702            .as_ref()
1703            .unwrap()
1704            .iter()
1705            .find(|m| m.name().ends_with(name_suffix))
1706            .cloned()
1707    }
1708
1709    fn read_value(&self, map: &BpfMapInfo, offset: usize, len: usize) -> Option<Vec<u8>> {
1710        read_bpf_map_value(&self.ctx(), map, offset, len)
1711    }
1712
1713    fn read_array(&self, map: &BpfMapInfo, key: u32) -> Option<Vec<u8>> {
1714        read_bpf_map_array_value(&self.ctx(), map, key)
1715    }
1716
1717    fn iter_hash_map(&self, map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)> {
1718        iter_htab_entries(&self.ctx(), map)
1719    }
1720
1721    /// Read per-CPU values for a key in a `BPF_MAP_TYPE_PERCPU_ARRAY` map.
1722    ///
1723    /// Resolves `__per_cpu_offset` from the guest kernel (via the
1724    /// owner-side cache when present, otherwise fresh) and reads
1725    /// each CPU's slot via [`translate_any_kva`]. Out-of-range CPUs
1726    /// (those whose `__per_cpu_offset` slot reads as zero —
1727    /// including reads past the end of guest memory and BSS-zero
1728    /// slots beyond `nr_cpu_ids`) return `None` rather than
1729    /// aliasing CPU 0's bytes; see the cpu_off==0 guard in
1730    /// [`read_percpu_array_value`].
1731    fn read_percpu_array(&self, map: &BpfMapInfo, key: u32, num_cpus: u32) -> Vec<Option<Vec<u8>>> {
1732        let Some(per_cpu_offsets) = self.resolve_per_cpu_offsets(num_cpus) else {
1733            return Vec::new();
1734        };
1735        read_percpu_array_value(&self.ctx(), map, key, per_cpu_offsets.as_slice())
1736    }
1737
1738    /// Walk a `BPF_MAP_TYPE_PERCPU_HASH` or
1739    /// `BPF_MAP_TYPE_LRU_PERCPU_HASH` map, dereferencing each
1740    /// element's per-CPU pointer for every CPU.
1741    ///
1742    /// Reuses the same `__per_cpu_offset` resolution path as
1743    /// [`Self::read_percpu_array`].
1744    fn iter_percpu_hash_map(&self, map: &BpfMapInfo, num_cpus: u32) -> PerCpuHashEntries {
1745        let Some(per_cpu_offsets) = self.resolve_per_cpu_offsets(num_cpus) else {
1746            return Vec::new();
1747        };
1748        iter_percpu_htab_entries(&self.ctx(), map, per_cpu_offsets.as_slice())
1749    }
1750
1751    fn read_arena_pages(
1752        &self,
1753        map: &BpfMapInfo,
1754        arena_offsets: &super::arena::BpfArenaOffsets,
1755    ) -> super::arena::ArenaSnapshot {
1756        super::arena::snapshot_arena(self.kernel, map, arena_offsets)
1757    }
1758
1759    /// Walk every selem of a TASK_STORAGE / INODE_STORAGE /
1760    /// SK_STORAGE / CGRP_STORAGE map. Returns
1761    /// `(owner_kva_le_bytes, value_bytes)` per entry — see
1762    /// [`iter_local_storage_entries`] for the kernel-side walk
1763    /// shape (`bpf_local_storage_map.buckets[i].list` — regular
1764    /// hlist, NULL termination — with `map_node` at offset 0 of the
1765    /// elem, so the node KVA is the elem base and no `container_of`
1766    /// subtraction is needed).
1767    fn iter_task_storage(&self, map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)> {
1768        iter_local_storage_entries(&self.ctx(), map)
1769    }
1770
1771    fn load_program_btf(&self, map: &BpfMapInfo, base_btf: &btf_rs::Btf) -> Option<btf_rs::Btf> {
1772        if map.btf_kva == 0 {
1773            return None;
1774        }
1775        super::dump::load_program_btf_kva(self, map.btf_kva, base_btf)
1776    }
1777}
1778
1779/// Owns a [`GuestKernel`] and provides BPF map access through the
1780/// [`GuestMemMapAccessor`] borrow.
1781///
1782/// Returned by [`GuestMemMapAccessorOwned::new`] which builds the
1783/// `GuestKernel` internally. Borrow as [`GuestMemMapAccessor`] via
1784/// [`as_accessor`](Self::as_accessor) for map operations.
1785///
1786/// [`GuestKernel`]: super::guest::GuestKernel
1787pub struct GuestMemMapAccessorOwned {
1788    kernel: super::guest::GuestKernel,
1789    map_idr_kva: u64,
1790    offsets: BpfMapOffsets,
1791    /// Single-shot `__per_cpu_offset[]` cache keyed on the
1792    /// `num_cpus` argument the trait's percpu methods pass. See
1793    /// [`PerCpuOffsetsCache`] for the contract.
1794    per_cpu_offsets_cache: PerCpuOffsetsCache,
1795}
1796
1797#[allow(dead_code)]
1798impl GuestMemMapAccessorOwned {
1799    /// Create from GuestMem and vmlinux path.
1800    ///
1801    /// One-shot constructor: builds a [`GuestKernel`] from `vmlinux`,
1802    /// parses BTF to resolve the map-related struct offsets, and
1803    /// locates the `map_idr` symbol. The resulting handle owns both
1804    /// the `GuestKernel` and the `BpfMapOffsets`.
1805    ///
1806    /// Prefer [`GuestMemMapAccessor::from_guest_kernel`] when you already
1807    /// hold a `GuestKernel` **and** a pre-built `&BpfMapOffsets` — it
1808    /// builds a borrowed accessor without taking ownership of either,
1809    /// so callers that maintain their own offsets cache (e.g. across
1810    /// multiple map probes in the same poll cycle) don't pay repeat
1811    /// BTF parses. `new` is the convenience path when you want the
1812    /// accessor to own its offsets.
1813    ///
1814    /// [`GuestKernel`]: super::guest::GuestKernel
1815    pub fn new(
1816        mem: std::sync::Arc<GuestMem>,
1817        vmlinux: &std::path::Path,
1818        tcr_el1: u64,
1819        cr3_pa: u64,
1820    ) -> anyhow::Result<Self> {
1821        // Read the vmlinux file and parse its ELF once, then share
1822        // the parse between `GuestKernel::from_elf` (kernel symbols
1823        // + paging state) and `BpfMapOffsets::from_elf` (BTF section
1824        // extraction on sidecar cache miss). The previous structure
1825        // ran `std::fs::read` and `goblin::elf::Elf::parse` twice —
1826        // once inside `GuestKernel::new` and once again inside
1827        // `BpfMapOffsets::from_vmlinux` — and the freeze coordinator
1828        // calls this in a retry loop until the boot-time symbols
1829        // settle, multiplying that cost across every retry tick.
1830        let data = std::fs::read(vmlinux)
1831            .with_context(|| format!("read vmlinux: {}", vmlinux.display()))?;
1832        let elf = goblin::elf::Elf::parse(&data).context("parse vmlinux ELF")?;
1833        let kernel = super::guest::GuestKernel::from_elf(mem, &elf, tcr_el1, cr3_pa)?;
1834        let offsets = BpfMapOffsets::from_elf(&elf, &data, vmlinux)?;
1835
1836        let map_idr_kva = kernel
1837            .symbol_kva("map_idr")
1838            .ok_or_else(|| anyhow::anyhow!("map_idr symbol not found in vmlinux"))?;
1839
1840        Ok(Self {
1841            kernel,
1842            map_idr_kva,
1843            offsets,
1844            per_cpu_offsets_cache: PerCpuOffsetsCache::new(),
1845        })
1846    }
1847
1848    /// Create from pre-read vmlinux bytes and pre-parsed ELF.
1849    ///
1850    /// Avoids re-reading + re-parsing vmlinux on every retry in
1851    /// the freeze coordinator's BPF map write loop.
1852    pub fn from_elf(
1853        mem: std::sync::Arc<GuestMem>,
1854        elf: &goblin::elf::Elf<'_>,
1855        data: &[u8],
1856        vmlinux: &std::path::Path,
1857        tcr_el1: u64,
1858        cr3_pa: u64,
1859    ) -> anyhow::Result<Self> {
1860        Self::from_elf_inner(mem, elf, data, vmlinux, tcr_el1, cr3_pa, 0)
1861    }
1862
1863    pub fn from_elf_with_hint(
1864        mem: std::sync::Arc<GuestMem>,
1865        elf: &goblin::elf::Elf<'_>,
1866        data: &[u8],
1867        vmlinux: &std::path::Path,
1868        tcr_el1: u64,
1869        cr3_pa: u64,
1870        phys_base_hint: u64,
1871    ) -> anyhow::Result<Self> {
1872        Self::from_elf_inner(mem, elf, data, vmlinux, tcr_el1, cr3_pa, phys_base_hint)
1873    }
1874
1875    fn from_elf_inner(
1876        mem: std::sync::Arc<GuestMem>,
1877        elf: &goblin::elf::Elf<'_>,
1878        data: &[u8],
1879        vmlinux: &std::path::Path,
1880        tcr_el1: u64,
1881        cr3_pa: u64,
1882        phys_base_hint: u64,
1883    ) -> anyhow::Result<Self> {
1884        let kernel = super::guest::GuestKernel::from_elf_with_hint(
1885            mem,
1886            elf,
1887            tcr_el1,
1888            cr3_pa,
1889            phys_base_hint,
1890        )?;
1891        let offsets = BpfMapOffsets::from_elf(elf, data, vmlinux)?;
1892        let map_idr_kva = kernel
1893            .symbol_kva("map_idr")
1894            .ok_or_else(|| anyhow::anyhow!("map_idr symbol not found in vmlinux"))?;
1895        Ok(Self {
1896            kernel,
1897            map_idr_kva,
1898            offsets,
1899            per_cpu_offsets_cache: PerCpuOffsetsCache::new(),
1900        })
1901    }
1902
1903    /// Borrow as a [`GuestMemMapAccessor`] for map operations.
1904    ///
1905    /// The returned accessor borrows `self.offsets` and the
1906    /// `__per_cpu_offset` cache; no clone on the hot path. Subsequent
1907    /// borrows reuse the cached `__per_cpu_offset` array across
1908    /// percpu reads in the same dump.
1909    ///
1910    /// The map enumeration cache (`maps_cache`) is freshly
1911    /// initialised on each `as_accessor()` call so the cached
1912    /// `Vec<BpfMapInfo>` lifetime matches one dump. Persisting it
1913    /// across dumps would return stale entries for maps the guest
1914    /// kernel created or destroyed between freeze cycles.
1915    pub fn as_accessor(&self) -> GuestMemMapAccessor<'_> {
1916        GuestMemMapAccessor {
1917            kernel: &self.kernel,
1918            map_idr_kva: self.map_idr_kva,
1919            offsets: &self.offsets,
1920            per_cpu_offsets_cache: Some(&self.per_cpu_offsets_cache),
1921            maps_cache: std::sync::Mutex::new(None),
1922        }
1923    }
1924
1925    /// Access the underlying [`GuestKernel`] for low-level memory reads.
1926    ///
1927    /// [`GuestKernel`]: super::guest::GuestKernel
1928    pub fn guest_kernel(&self) -> &super::guest::GuestKernel {
1929        &self.kernel
1930    }
1931
1932    /// Build an owned accessor around a test-constructed
1933    /// [`GuestKernel`] without parsing a vmlinux. The map offsets are
1934    /// [`BpfMapOffsets::EMPTY`] and `map_idr_kva` is `0`, so this is
1935    /// only usable by callers that touch the `GuestKernel`
1936    /// (`page_offset`, symbol lookup, raw reads) and never walk the
1937    /// map IDR — e.g. [`crate::vmm::capture_scx::build`], which reads
1938    /// only `guest_kernel().page_offset()`. Production must use
1939    /// [`Self::new`] / [`Self::from_elf`].
1940    #[cfg(test)]
1941    pub(crate) fn new_for_test(kernel: super::guest::GuestKernel) -> Self {
1942        Self {
1943            kernel,
1944            map_idr_kva: 0,
1945            offsets: BpfMapOffsets::EMPTY,
1946            per_cpu_offsets_cache: PerCpuOffsetsCache::new(),
1947        }
1948    }
1949
1950    // Map operations live on [`GuestMemMapAccessor`]. Borrow via
1951    // [`as_accessor`] to call them: `owned.as_accessor().find_map(...)`.
1952    // The wrapper type exists only to own the `GuestKernel` and
1953    // `BpfMapOffsets`; it does not duplicate the accessor's surface.
1954}