ktstr/monitor/
bpf_syscall.rs

1//! Live-host BPF map accessor backed by the `bpf(2)` syscall.
2//!
3//! Companion to [`super::bpf_map::GuestMemMapAccessor`]: same trait
4//! ([`super::bpf_map::BpfMapAccessor`]), different data path. Where
5//! GuestMemMapAccessor walks frozen guest physical memory via PTE
6//! resolution against `init_mm`, this backend talks directly to the
7//! running host kernel through the `bpf()` syscall — KASLR is fully
8//! abstracted, no symbol resolution required, no page-walk math.
9//!
10//! # Backend differences vs. guest-memory path
11//!
12//! | concern        | GuestMemMapAccessor                                            | BpfSyscallAccessor                                                           |
13//! |----------------|----------------------------------------------------------------|------------------------------------------------------------------------------|
14//! | discovery      | walk `map_idr` xarray in guest memory                          | `BPF_MAP_GET_NEXT_ID` + `BPF_MAP_GET_FD_BY_ID` loop                          |
15//! | array values   | follow `bpf_array.value` flex array via PTE walks              | `BPF_MAP_LOOKUP_ELEM(fd, &key=0, buf)` returns the inline value bytes        |
16//! | hash iteration | walk `bpf_htab.buckets` directly (freeze rendezvous = sync)    | `BPF_MAP_GET_NEXT_KEY` + `BPF_MAP_LOOKUP_ELEM` per key (kernel RCU read-side) |
17//! | per-CPU array  | read each CPU's slot via `__per_cpu_offset[cpu]`               | one `BPF_MAP_LOOKUP_ELEM` returns `nr_possible_cpus * value_size` bytes      |
18//! | arena          | walk `bpf_arena -> kern_vm -> vm_struct.addr` PTE-by-PTE        | `mmap(arena_fd, ...)` — `lookup_elem` returns `-EINVAL` on arena             |
19//! | program BTF    | read split-BTF blob from guest memory                          | `BPF_BTF_GET_FD_BY_ID` + `BPF_OBJ_GET_INFO_BY_FD` to extract BTF bytes       |
20//!
21//! # Map fd pinning
22//!
23//! Every map discovered at construction time has its fd held open for
24//! the lifetime of the accessor. The kernel's
25//! `bpf_map_put`/`atomic64_dec_and_test` (`kernel/bpf/syscall.c`) only
26//! frees a map when its refcount reaches zero, and userspace fds count
27//! as references. This means the scheduler can exit and tear down its
28//! struct_ops link while the accessor is still iterating maps — the
29//! underlying memory stays valid.
30//!
31//! # Required capabilities
32//!
//! `BPF_MAP_GET_NEXT_ID` and `BPF_MAP_GET_FD_BY_ID` require
//! `CAP_SYS_ADMIN`. (`CAP_BPF`, added in Linux 5.8, is sufficient for
//! some other `bpf()` commands, but the `*_GET_NEXT_ID` and
//! `*_GET_FD_BY_ID` families are not among them.) ktstr always runs
36//! as root in the test environment, so this is a non-issue for the
37//! library's primary consumer; the `from_running_kernel` constructor
38//! surfaces the kernel's `EPERM` directly so live-host CLI use cases
39//! can produce a clear error.
40//!
41//! # Lock-free reads
42//!
43//! Without a freeze rendezvous, the kernel's per-element atomicity is
44//! the only ordering primitive. Per-element u64-aligned fields are
45//! atomic on x86_64; multi-element transactions the scheduler intended
46//! to commit atomically may surface as torn views relative to the
47//! walker. This is identical to the guest-memory backend's torn-read
48//! behavior, just for a different reason. Two-snapshot in-BPF capture
49//! (bpf_timer + tp_btf) is the recommended remedy and lives outside
50//! this backend.
51
52use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
53use std::ptr;
54
55use anyhow::{Context, Result, anyhow};
56use btf_rs::Btf;
57
58use super::arena::{ArenaPage, ArenaSnapshot, BpfArenaOffsets};
59use super::bpf_map::{
60    BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_LRU_HASH,
61    BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_PERCPU_HASH,
62    BPF_MAP_TYPE_STRUCT_OPS, BpfMapAccessor, BpfMapInfo, MAP_MATERIALIZE_MAX,
63};
64
// Command numbers below are the ordinals of the corresponding entries
// in `include/uapi/linux/bpf.h::enum bpf_cmd`, counted from
// `BPF_MAP_CREATE = 0`.

/// `BPF_MAP_LOOKUP_ELEM`: copy one map value out to a userspace buffer.
const BPF_MAP_LOOKUP_ELEM: u32 = 1;
/// `BPF_MAP_GET_NEXT_KEY`: step the hash-iteration cursor to the next key.
const BPF_MAP_GET_NEXT_KEY: u32 = 4;
/// `BPF_MAP_GET_NEXT_ID`: step the kernel's walk over live map ids.
const BPF_MAP_GET_NEXT_ID: u32 = 12;
/// `BPF_MAP_GET_FD_BY_ID`: open an fd for (and thereby pin) a map by id.
const BPF_MAP_GET_FD_BY_ID: u32 = 14;
/// `BPF_OBJ_GET_INFO_BY_FD`: read map/BTF metadata through an open fd.
const BPF_OBJ_GET_INFO_BY_FD: u32 = 15;
/// `BPF_BTF_GET_FD_BY_ID`: open an fd for (and thereby pin) a BTF
/// object by id. It directly follows `BPF_BTF_LOAD = 18` in
/// `enum bpf_cmd`, hence 19 (0x13).
const BPF_BTF_GET_FD_BY_ID: u32 = 19;

/// `BPF_OBJ_NAME_LEN` from `include/uapi/linux/bpf.h`: byte length of
/// the fixed-size object name buffer (see `BpfMapInfoUapi::name`).
const BPF_OBJ_NAME_LEN: usize = 16;
83
/// Arena page size assumed when `sysconf(_SC_PAGESIZE)` reports
/// nothing usable: 4 KiB. That failure cannot happen on Linux, so this
/// is purely a fallback.
///
/// The unit that actually matters is the base `PAGE_SIZE` of the
/// kernel owning the arena: `arena_map_alloc` sets
/// `vm_range = max_entries * PAGE_SIZE` and `arena_vm_fault` faults in
/// `PAGE_SIZE` steps. Both are arch-dependent (4 KiB on x86_64; 16 KiB
/// or 64 KiB base granules exist on aarch64, which is unrelated to
/// THP/hugetlb). `read_arena_pages` therefore asks `host_page_size` for
/// the live value so its mmap length lines up with the kernel's
/// `user_vm_end` on every arch. The guest-memory backend parameterizes
/// page size the same way through `guest_page_size(tcr_el1)`
/// (`src/monitor/arena.rs`).
const ARENA_PAGE_SIZE: usize = 4 * 1024;
95
96/// Page size of the kernel that owns the arena fd, via
97/// `sysconf(_SC_PAGESIZE)`. `read_arena_pages` mmaps the arena fd in
98/// the process holding it, so that process always runs on the arena's
99/// own kernel — the guest VM kernel in the in-guest monitor path
100/// (where scx-ktstr's arena lives), or the host kernel in live-host
101/// mode. That kernel created the arena with `vm_range = max_entries *
102/// PAGE_SIZE`, so this is exactly the unit that makes the mmap length
103/// match `user_vm_end`. Falls back to `ARENA_PAGE_SIZE` (4 KiB) only
104/// if the query fails, which it cannot on Linux.
105fn host_page_size() -> usize {
106    // SAFETY: `sysconf` with a valid name has no preconditions and
107    // writes through no pointer.
108    let v = unsafe { libc::sysconf(libc::_SC_PAGESIZE) };
109    if v > 0 { v as usize } else { ARENA_PAGE_SIZE }
110}
111
112/// mmap placement for an arena read: `(addr_hint, flags, length)`
113/// where `addr_hint == 0` means NULL (let the kernel choose the VA).
114///
115/// When the arena was created with a nonzero `map_extra` (scx
116/// schedulers do, via `lib/arena_map.h`), the kernel pins
117/// `user_vm_start`/`user_vm_end`, and `arena_map_mmap`
118/// (`kernel/bpf/arena.c`) rejects any mapping whose start != map_extra
119/// OR whose end != map_extra + full arena span with `-EBUSY`. So the
120/// read must land at exactly `map_extra` with `MAP_FIXED_NOREPLACE`
121/// and span the full `declared_bytes` — not the capped read window.
122/// When `user_vm_start == 0` the kernel adopts our VA, so a NULL hint
123/// plus the capped prefix is correct and bounds host address-space use.
124fn arena_mmap_placement(
125    user_vm_start: u64,
126    declared_bytes: usize,
127    read_bytes: usize,
128) -> (usize, i32, usize) {
129    if user_vm_start != 0 {
130        (
131            user_vm_start as usize,
132            libc::MAP_SHARED | libc::MAP_FIXED_NOREPLACE,
133            declared_bytes,
134        )
135    } else {
136        (0, libc::MAP_SHARED, read_bytes)
137    }
138}
139
/// Upper bound (4 GiB) on the bytes an arena snapshot reads through
/// mmap. Mirrors the guest-memory backend's `MAX_VM_RANGE_BYTES` and
/// stops a runaway `max_entries` from triggering a multi-GiB read.
const MAX_ARENA_BYTES: u64 = 4 << 30;

/// Upper bound (16 Ki) on the number of arena pages the mmap span
/// covers. Anything past this cap is truncated and reported through
/// [`ArenaSnapshot::truncated`]; there is no stride probing here
/// because the mmap already spans the whole window. This is a
/// different constant from the guest-memory backend's sequential cap
/// (`MAX_ARENA_PAGES = 4096` in `src/monitor/arena.rs`), which is
/// paired with a stride-probe sweep for pages beyond it.
const MAX_ARENA_PAGES: u64 = 16 << 10;
153
154// `bpf_attr` is a uapi union with many command-specific shapes. Rather
155// than declare the full union we lay out per-command structs covering
156// the fields each command reads, in uapi field order; some are a prefix
157// of the full arm (e.g. `BpfAttrGetId` omits the trailing token fd). The
158// kernel does NOT match the passed size against a per-arm length:
159// `__sys_bpf` (kernel/bpf/syscall.c) calls `bpf_check_uarg_tail_zero`,
160// clamps `size = min(size, sizeof(union bpf_attr))`, zero-fills `attr`,
// then dispatches on `cmd`. Any size up to `sizeof(union bpf_attr)` is
// accepted as-is (a larger one only if every byte past the union is
// zero); we pass `size_of::<arm>()` and the kernel zero-fills the union
// tail we omit.
164
/// `bpf_attr` arm for `BPF_MAP_*_ELEM` and `BPF_MAP_GET_NEXT_KEY`.
/// Source: `include/uapi/linux/bpf.h::union bpf_attr` (the
/// MAP_ELEM_OPS arm).
///
/// The pointer-carrying fields are plain `u64`s holding userspace
/// addresses, which is how this file fills them
/// (`&raw mut key as u64`, `buf.as_mut_ptr() as u64`).
#[repr(C)]
#[derive(Default)]
struct BpfAttrMapElem {
    /// File descriptor of the map being accessed.
    map_fd: u32,
    /// Explicit padding so `key` sits on an 8-byte boundary.
    _pad0: u32,
    /// Userspace address of the key buffer.
    key: u64,
    /// Userspace address of the output buffer: the value for
    /// `LOOKUP_ELEM`, the next key for `GET_NEXT_KEY`.
    value_or_next_key: u64,
    /// Command flags; this file always passes 0.
    flags: u64,
}

// Layout guard: the kernel reads this struct by offset, so a reordered
// or resized field must fail the build instead of silently handing the
// kernel garbage.
const _: () = {
    assert!(std::mem::size_of::<BpfAttrMapElem>() == 32);
    assert!(std::mem::offset_of!(BpfAttrMapElem, map_fd) == 0);
    assert!(std::mem::offset_of!(BpfAttrMapElem, key) == 8);
    assert!(std::mem::offset_of!(BpfAttrMapElem, value_or_next_key) == 16);
    assert!(std::mem::offset_of!(BpfAttrMapElem, flags) == 24);
};
177
/// `bpf_attr` arm for `BPF_MAP_GET_NEXT_ID`, `BPF_BTF_GET_NEXT_ID`,
/// and the corresponding `*_GET_FD_BY_ID` commands.
#[repr(C)]
#[derive(Default)]
struct BpfAttrGetId {
    /// `start_id` for `*_GET_NEXT_ID`; `id` for `*_GET_FD_BY_ID`.
    id_or_start_id: u32,
    /// Output of `*_GET_NEXT_ID`: the kernel stores the next live id
    /// here. Left at 0 for `*_GET_FD_BY_ID`.
    next_id: u32,
    /// Open flags for `*_GET_FD_BY_ID`; this file always passes 0.
    open_flags: u32,
}

// Layout guard: the kernel addresses these fields by offset, so any
// accidental reorder or resize must be a compile error.
const _: () = {
    assert!(std::mem::size_of::<BpfAttrGetId>() == 12);
    assert!(std::mem::offset_of!(BpfAttrGetId, id_or_start_id) == 0);
    assert!(std::mem::offset_of!(BpfAttrGetId, next_id) == 4);
    assert!(std::mem::offset_of!(BpfAttrGetId, open_flags) == 8);
};
188
/// `bpf_attr` arm for `BPF_OBJ_GET_INFO_BY_FD`.
#[repr(C)]
#[derive(Default)]
struct BpfAttrInfoByFd {
    /// Open fd of the object (map or BTF) being queried.
    bpf_fd: u32,
    /// Byte length of the buffer `info` points at.
    info_len: u32,
    /// Userspace address of the info struct the kernel fills in.
    info: u64,
}

// Layout guard: the kernel addresses these fields by offset, so any
// accidental reorder or resize must be a compile error.
const _: () = {
    assert!(std::mem::size_of::<BpfAttrInfoByFd>() == 16);
    assert!(std::mem::offset_of!(BpfAttrInfoByFd, bpf_fd) == 0);
    assert!(std::mem::offset_of!(BpfAttrInfoByFd, info_len) == 4);
    assert!(std::mem::offset_of!(BpfAttrInfoByFd, info) == 8);
};
197
198/// `struct bpf_map_info` from `include/uapi/linux/bpf.h`. The kernel
199/// has grown this struct over time; we pass our struct size as
200/// `info_len` and the kernel zero-fills any tail it doesn't fill in.
201/// All fields are documented in the kernel header.
202#[repr(C)]
203#[derive(Default)]
204struct BpfMapInfoUapi {
205    map_type: u32,
206    id: u32,
207    key_size: u32,
208    value_size: u32,
209    max_entries: u32,
210    map_flags: u32,
211    name: [u8; BPF_OBJ_NAME_LEN],
212    ifindex: u32,
213    btf_vmlinux_value_type_id: u32,
214    netns_dev: u64,
215    netns_ino: u64,
216    btf_id: u32,
217    btf_key_type_id: u32,
218    btf_value_type_id: u32,
219    /// Kernel field `btf_vmlinux_id` per
220    /// `include/uapi/linux/bpf.h::struct bpf_map_info`. Unused by the
221    /// caller; named `_pad` here because the value is currently
222    /// discarded by the BPF accessor — rename without binding the
223    /// field to a public consumer that can rot.
224    _pad: u32,
225    map_extra: u64,
226}
227
/// `struct bpf_btf_info` from `include/uapi/linux/bpf.h`. Used to
/// extract a BTF blob's bytes given an open BTF fd.
#[repr(C)]
#[derive(Default)]
struct BpfBtfInfoUapi {
    /// Userspace address of the buffer that receives the BTF bytes.
    btf: u64,
    /// Byte size associated with the `btf` buffer.
    btf_size: u32,
    id: u32,
    /// Userspace address of the buffer that receives the BTF name.
    name: u64,
    /// Byte size associated with the `name` buffer.
    name_len: u32,
    kernel_btf: u32,
}

// Layout guard: the kernel addresses these fields by offset, so any
// accidental reorder or resize must be a compile error.
const _: () = {
    assert!(std::mem::size_of::<BpfBtfInfoUapi>() == 32);
    assert!(std::mem::offset_of!(BpfBtfInfoUapi, btf) == 0);
    assert!(std::mem::offset_of!(BpfBtfInfoUapi, btf_size) == 8);
    assert!(std::mem::offset_of!(BpfBtfInfoUapi, id) == 12);
    assert!(std::mem::offset_of!(BpfBtfInfoUapi, name) == 16);
    assert!(std::mem::offset_of!(BpfBtfInfoUapi, name_len) == 24);
    assert!(std::mem::offset_of!(BpfBtfInfoUapi, kernel_btf) == 28);
};
240
241/// Raw `bpf(2)` syscall wrapper. Returns the kernel's return value as
242/// `i64` so callers can check for `< 0` and inspect `errno`. The
243/// kernel's `__sys_bpf` (`kernel/bpf/syscall.c`) accepts any `size` up
244/// to `sizeof(union bpf_attr)`: `bpf_check_uarg_tail_zero` rejects only
245/// bytes past `size` that are nonzero, then it clamps to
246/// `sizeof(union bpf_attr)`, zero-fills the rest, and dispatches on
247/// `cmd` — there is no per-arm length match.
248///
249/// SAFETY: `attr_ptr` must point to `attr_size` valid bytes laid out as
250/// the command's `bpf_attr` arm (or a zero-tailed prefix of it). A size
251/// smaller than the command needs is accepted — the kernel zero-fills
252/// the omitted fields — so the caller, not the kernel, must supply every
253/// field the command requires. A size above `PAGE_SIZE`, or one whose
254/// bytes past the union are nonzero, returns `-E2BIG`.
255unsafe fn bpf_syscall(cmd: u32, attr_ptr: *const u8, attr_size: usize) -> i64 {
256    // SAFETY: caller must ensure attr_ptr/attr_size validity. The
257    // syscall itself is signal-safe and reentrant.
258    unsafe { libc::syscall(libc::SYS_bpf, cmd as i64, attr_ptr, attr_size) as i64 }
259}
260
261/// Wrap a `bpf()` syscall result in a `Result<RawFd>` for commands
262/// that return an fd. Negative returns are converted to errno-bearing
263/// errors; non-negative returns become the fd.
264fn bpf_call_fd(cmd: u32, attr_ptr: *const u8, attr_size: usize) -> Result<RawFd> {
265    // SAFETY: caller has built attr_ptr/attr_size correctly per the
266    // command's bpf_attr arm.
267    let ret = unsafe { bpf_syscall(cmd, attr_ptr, attr_size) };
268    if ret < 0 {
269        let err = std::io::Error::last_os_error();
270        Err(anyhow!("bpf({cmd}) failed: {err}"))
271    } else {
272        Ok(ret as RawFd)
273    }
274}
275
276/// Wrap a `bpf()` syscall result for commands that return 0 on
277/// success, `< 0` on error.
278fn bpf_call_status(cmd: u32, attr_ptr: *const u8, attr_size: usize) -> Result<()> {
279    // SAFETY: caller has built attr_ptr/attr_size correctly.
280    let ret = unsafe { bpf_syscall(cmd, attr_ptr, attr_size) };
281    if ret < 0 {
282        let err = std::io::Error::last_os_error();
283        Err(anyhow!("bpf({cmd}) failed: {err}"))
284    } else {
285        Ok(())
286    }
287}
288
/// One discovered map together with its pinned fd.
///
/// The `OwnedFd` guarantees the map's refcount stays >0 for the
/// accessor's lifetime — even if the scheduler exits and userspace
/// tear-down runs, `bpf_map_put` only frees when every fd is dropped
/// (see `kernel/bpf/syscall.c` `bpf_map_put`). Dropping a `PinnedMap`
/// closes the fd and releases that reference.
struct PinnedMap {
    /// Cross-backend metadata for the map, as built by
    /// `obj_get_info_map` from the kernel's info struct.
    info: BpfMapInfo,
    /// Open fd that keeps the map alive; every read on this backend
    /// goes through it.
    fd: OwnedFd,
    /// Raw `map_extra` from the kernel info struct. Arena maps
    /// hardcode this to a deterministic mmap target address (x86:
    /// `1<<44`, aarch64: `1<<32`) per `lib/arena_map.h`. Surfaced
    /// here so the arena mmap path can use `MAP_FIXED_NOREPLACE` at
    /// the kernel-blessed address rather than letting `mmap` pick
    /// one — which would diverge from what BPF programs see.
    map_extra: u64,
}
305
/// Live-host BPF map accessor.
///
/// Construction enumerates every map id reachable via
/// `BPF_MAP_GET_NEXT_ID`, opens an fd for each via
/// `BPF_MAP_GET_FD_BY_ID`, and caches the metadata. The fd vector is
/// held for the accessor's lifetime so the maps cannot be freed
/// underneath us — even if the scheduler exits and tears down its
/// struct_ops link mid-walk.
///
/// Selectively populating the cache is intentional: the same trait
/// surface accepts a `BpfMapInfo` argument on every method, so an
/// accessor that holds only the maps a particular failure dump cares
/// about (filtered by name suffix at construction time) is just as
/// valid as one that holds every map on the system. The
/// `from_running_kernel_filtered` constructor exposes that knob.
///
/// Lookups from a caller-supplied `BpfMapInfo` back to a pinned map
/// are by name only (see `pinned_for`), so two pinned maps sharing a
/// name are indistinguishable to the read methods.
// NOTE(review): presumably `dead_code` is allowed because not every
// build configuration constructs this backend — confirm, and narrow or
// drop the allow once it has a consumer everywhere.
#[allow(dead_code)]
pub struct BpfSyscallAccessor {
    /// Maps kept at construction time, in kernel id-walk order, each
    /// with the fd that pins it.
    maps: Vec<PinnedMap>,
}
325
326impl BpfSyscallAccessor {
327    /// Discover and pin every BPF map currently visible to the
328    /// running kernel.
329    ///
330    /// Walks the kernel's id space via `BPF_MAP_GET_NEXT_ID` (starting
331    /// from id 0), pinning each map with `BPF_MAP_GET_FD_BY_ID` and
332    /// fetching its metadata via `BPF_OBJ_GET_INFO_BY_FD`. Maps that
333    /// disappear between the `NEXT_ID` and `GET_FD_BY_ID` calls (a
334    /// concurrent scheduler unload, for instance) are silently
335    /// skipped — that race is inherent to live-host enumeration and
336    /// is not an error.
337    ///
338    /// Requires `CAP_SYS_ADMIN`. ktstr always runs as root in the
339    /// test environment so this is a non-issue for the primary
340    /// consumer; live-host CLI users that hit `EPERM` will see it
341    /// in the returned error.
342    #[allow(dead_code)]
343    pub fn from_running_kernel() -> Result<Self> {
344        Self::from_running_kernel_filtered(|_info: &BpfMapInfo| true)
345    }
346
347    /// Discover and pin every BPF map for which `predicate` returns
348    /// `true`. Maps that fail the predicate are closed (their fds
349    /// drop) so the kernel can free them as usual.
350    ///
351    /// Useful when the caller knows which maps the failure dump will
352    /// touch — typically the scheduler's named maps that match a
353    /// specific suffix — and wants to avoid pinning hundreds of
354    /// unrelated maps that happen to be alive (cilium, systemd,
355    /// other workloads).
356    #[allow(dead_code)]
357    pub fn from_running_kernel_filtered<F>(mut predicate: F) -> Result<Self>
358    where
359        F: FnMut(&BpfMapInfo) -> bool,
360    {
361        let mut maps: Vec<PinnedMap> = Vec::new();
362        let mut start_id: u32 = 0;
363
364        loop {
365            // The kernel writes `next_id` via the syscall's raw pointer
366            // path, but Rust's borrow checker doesn't see that — it
367            // sees the struct as never mutated through a Rust binding.
368            // Declare mut anyway so the compiler treats `attr.next_id`
369            // as written, then read it back through a raw read after
370            // the syscall returns.
371            let mut attr = BpfAttrGetId {
372                id_or_start_id: start_id,
373                next_id: 0,
374                open_flags: 0,
375            };
376            // SAFETY: BpfAttrGetId is repr(C) with the exact layout the
377            // kernel expects for *_GET_NEXT_ID.
378            let res = unsafe {
379                bpf_syscall(
380                    BPF_MAP_GET_NEXT_ID,
381                    &raw mut attr as *const u8,
382                    std::mem::size_of::<BpfAttrGetId>(),
383                )
384            };
385            if res < 0 {
386                let err = std::io::Error::last_os_error();
387                if err.raw_os_error() == Some(libc::ENOENT) {
388                    break;
389                }
390                return Err(anyhow!("BPF_MAP_GET_NEXT_ID failed: {err}"));
391            }
392
393            let next_id = attr.next_id;
394            // Defensive: kernel returned 0 for `next_id` somehow.
395            // Shouldn't happen on success, but guard against an
396            // infinite loop.
397            if next_id == 0 {
398                break;
399            }
400            // Advance start_id for the next iteration BEFORE the
401            // get-fd-by-id call so a transient EPERM/ENOENT on a
402            // single id doesn't wedge the walk.
403            start_id = next_id;
404
405            // Try to pin the map. ENOENT here means the map was
406            // freed between the NEXT_ID and GET_FD_BY_ID calls. The
407            // kernel doesn't write to this attr (GET_FD_BY_ID is
408            // input-only), so the binding is plain (no mut).
409            let fd_attr = BpfAttrGetId {
410                id_or_start_id: next_id,
411                next_id: 0,
412                open_flags: 0,
413            };
414            let fd_ret = unsafe {
415                bpf_syscall(
416                    BPF_MAP_GET_FD_BY_ID,
417                    &raw const fd_attr as *const u8,
418                    std::mem::size_of::<BpfAttrGetId>(),
419                )
420            };
421            if fd_ret < 0 {
422                // A failed `BPF_MAP_GET_FD_BY_ID` skips this map and
423                // keeps walking — a single bad map must not abort
424                // enumeration. The error categories matter for
425                // diagnostics, so surface non-ENOENT cases via
426                // tracing rather than silently dropping them:
427                //
428                // - `ENOENT`: the map was freed between
429                //   `GET_NEXT_ID` and `GET_FD_BY_ID`. Routine
430                //   under churn; suppressed at `debug` level so the
431                //   normal log stays quiet.
432                // - `EPERM`: missing CAP_SYS_ADMIN / CAP_BPF for
433                //   this map (e.g. a kernel-internal map a less-
434                //   privileged caller can't pin). Logged at `warn`
435                //   so an operator who expects to see the map knows
436                //   why it's missing.
437                // - `EBADF` / others: a kernel-side state error.
438                //   Logged at `warn` with the errno so the operator
439                //   can correlate against `dmesg`.
440                let err = std::io::Error::last_os_error();
441                let raw = err.raw_os_error().unwrap_or(0);
442                if raw == libc::ENOENT {
443                    tracing::debug!(
444                        map_id = next_id,
445                        "BPF_MAP_GET_FD_BY_ID: map vanished mid-walk (ENOENT); skipping"
446                    );
447                } else {
448                    tracing::warn!(
449                        map_id = next_id,
450                        errno = raw,
451                        error = %err,
452                        "BPF_MAP_GET_FD_BY_ID failed; skipping this map but continuing the walk"
453                    );
454                }
455                continue;
456            }
457            // SAFETY: fd_ret >= 0; the kernel guarantees a valid fd
458            // for non-negative returns.
459            let fd = unsafe { OwnedFd::from_raw_fd(fd_ret as RawFd) };
460
461            // Fetch info to populate BpfMapInfo + decide whether to
462            // keep the fd. A failure here means the map's metadata
463            // can't be read (kernel-side state error or fd was
464            // closed mid-walk); surface it via tracing so the
465            // operator sees the correlation rather than a silently
466            // dropped map.
467            let (info, map_extra) = match obj_get_info_map(fd.as_raw_fd()) {
468                Ok(pair) => pair,
469                Err(e) => {
470                    tracing::warn!(
471                        map_id = next_id,
472                        error = %e,
473                        "BPF_OBJ_GET_INFO_BY_FD failed for pinned map; skipping"
474                    );
475                    continue;
476                }
477            };
478
479            // Hand the predicate a BpfMapInfo for the keep/discard
480            // decision. Discarded fds drop here.
481            if !predicate(&info) {
482                continue;
483            }
484
485            maps.push(PinnedMap {
486                info,
487                fd,
488                map_extra,
489            });
490        }
491
492        Ok(Self { maps })
493    }
494
495    /// Number of pinned maps currently held. Test helper.
496    #[cfg(test)]
497    #[allow(dead_code)]
498    pub(crate) fn pinned_count(&self) -> usize {
499        self.maps.len()
500    }
501
502    /// Look up the pinned fd for a map identified by its
503    /// `BpfMapInfo`. Returns `None` when no pinned map matches.
504    ///
505    /// Match key: `name` field (via [`info_name_matches`]). Map ids
506    /// would be more precise but they're not part of `BpfMapInfo`
507    /// today (a known follow-up if the live-host backend grows other
508    /// consumers); within a single scheduler instance, names are
509    /// unique and stable for the duration of the run.
510    fn pinned_for(&self, target: &BpfMapInfo) -> Option<&PinnedMap> {
511        self.maps
512            .iter()
513            .find(|p| info_name_matches(&p.info, target))
514    }
515}
516
517/// Match key for [`BpfSyscallAccessor::pinned_for`] and the
518/// construction-time predicate filter: two `BpfMapInfo`s identify the
519/// same map iff their active name bytes
520/// ([`BpfMapInfo::name_bytes_active`]) are byte-equal. Extracted as a
521/// free fn so the keep/discard semantics are exercisable over a
522/// hand-built `&[BpfMapInfo]` fixture without the live-kernel walk.
523fn info_name_matches(a: &BpfMapInfo, b: &BpfMapInfo) -> bool {
524    a.name_bytes_active() == b.name_bytes_active()
525}
526
/// Test-only, kernel-free mirror of the keep/discard step in
/// [`BpfSyscallAccessor::from_running_kernel_filtered`].
///
/// Returns references to the entries of `infos` that `predicate`
/// accepts, in their original order. The production constructor
/// applies the same gate inline to each map it has just fetched from
/// the kernel; this helper lets a test pin that contract against a
/// deterministic fixture. The predicate is invoked once per entry, in
/// order.
#[cfg(test)]
fn select_keeping<F>(infos: &[BpfMapInfo], mut predicate: F) -> Vec<&BpfMapInfo>
where
    F: FnMut(&BpfMapInfo) -> bool,
{
    let mut kept = Vec::new();
    for candidate in infos {
        if predicate(candidate) {
            kept.push(candidate);
        }
    }
    kept
}
541
542/// Fetch `bpf_map_info` for an open map fd via
543/// `BPF_OBJ_GET_INFO_BY_FD`. Returns the populated [`BpfMapInfo`]
544/// alongside the raw `map_extra` field — the latter is needed by the
545/// arena mmap path but doesn't fit on the cross-backend
546/// [`BpfMapInfo`] surface (the guest-memory path doesn't use it).
547fn obj_get_info_map(fd: RawFd) -> Result<(BpfMapInfo, u64)> {
548    let mut info = BpfMapInfoUapi::default();
549    let attr = BpfAttrInfoByFd {
550        bpf_fd: fd as u32,
551        info_len: std::mem::size_of::<BpfMapInfoUapi>() as u32,
552        info: &raw mut info as u64,
553    };
554    bpf_call_status(
555        BPF_OBJ_GET_INFO_BY_FD,
556        &raw const attr as *const u8,
557        std::mem::size_of::<BpfAttrInfoByFd>(),
558    )
559    .context("BPF_OBJ_GET_INFO_BY_FD on map fd")?;
560
561    let nul = info
562        .name
563        .iter()
564        .position(|&b| b == 0)
565        .unwrap_or(BPF_OBJ_NAME_LEN);
566    let mut name_bytes = [0u8; BPF_OBJ_NAME_LEN];
567    name_bytes.copy_from_slice(&info.name);
568
569    Ok((
570        BpfMapInfo {
571            // map_pa / map_kva / value_kva are guest-memory concepts
572            // that don't apply on the live host. Populating with 0
573            // is fine — the live-host backend's read paths route
574            // through the pinned fd, not these fields.
575            map_pa: 0,
576            map_kva: 0,
577            name_bytes,
578            name_len: nul as u8,
579            map_type: info.map_type,
580            map_flags: info.map_flags,
581            key_size: info.key_size,
582            value_size: info.value_size,
583            max_entries: info.max_entries,
584            value_kva: None,
585            // btf_kva is similarly a guest-memory locator. Live-host
586            // BTF resolution goes through `btf_id` →
587            // `BPF_BTF_GET_FD_BY_ID` instead.
588            btf_kva: u64::from(info.btf_id),
589            btf_value_type_id: info.btf_value_type_id,
590            // bpf(2) `BPF_OBJ_GET_INFO_BY_FD` does not surface
591            // `btf_vmlinux_value_type_id` directly; the live-host
592            // backend would need a parallel resolution path
593            // (BPF_BTF_GET_INFO_BY_ID + walk the wrapper) to expose
594            // it. Until that lands, leave 0 — the dump's STRUCT_OPS
595            // arm falls through to hex on a zero id, matching the
596            // behavior on guest-memory maps without struct_ops
597            // BTF support.
598            btf_vmlinux_value_type_id: 0,
599            btf_key_type_id: info.btf_key_type_id,
600        },
601        info.map_extra,
602    ))
603}
604
605impl BpfMapAccessor for BpfSyscallAccessor {
606    fn maps(&self) -> Vec<BpfMapInfo> {
607        self.maps.iter().map(|p| p.info.clone()).collect()
608    }
609
610    fn read_value(&self, map: &BpfMapInfo, offset: usize, len: usize) -> Option<Vec<u8>> {
611        let pinned = self.pinned_for(map)?;
612
613        // The live-host backend supports single-buffer value reads on
614        // ARRAY (key=0 returns inline value bytes) and STRUCT_OPS
615        // (key=0 returns the populated `bpf_struct_ops_value`). HASH
616        // goes through `iter_hash_map`; PERCPU_ARRAY through
617        // `read_percpu_array`; ARENA through `read_arena_pages`. Any
618        // other type falls through to None so the dump renderer can
619        // surface a specific reason.
620        //
621        // STRUCT_OPS quirk: the in-kernel
622        // `bpf_struct_ops_map_lookup_elem` returns -EINVAL
623        // (`kernel/bpf/bpf_struct_ops.c:518`), but the syscall path
624        // `bpf_struct_ops_map_sys_lookup_elem`
625        // (`kernel/bpf/bpf_struct_ops.c::bpf_struct_ops_map_sys_lookup_elem`)
626        // implements its own lookup, copying the kernel's
627        // `bpf_struct_ops_value` (refcnt + state + the registered
628        // kernel struct) into the userspace buffer. The kernel-only
629        // `lookup_elem` call is the in-program path; userspace
630        // syscalls reach the sys variant.
631        if map.map_type != BPF_MAP_TYPE_ARRAY && map.map_type != BPF_MAP_TYPE_STRUCT_OPS {
632            return None;
633        }
634
635        // Build the lookup. ARRAY and STRUCT_OPS both use a u32 key;
636        // STRUCT_OPS only ever has one entry (key=0).
637        let mut key: u32 = 0;
638        let mut buf = vec![0u8; map.value_size as usize];
639        let attr = BpfAttrMapElem {
640            map_fd: pinned.fd.as_raw_fd() as u32,
641            _pad0: 0,
642            key: &raw mut key as u64,
643            value_or_next_key: buf.as_mut_ptr() as u64,
644            flags: 0,
645        };
646        bpf_call_status(
647            BPF_MAP_LOOKUP_ELEM,
648            &raw const attr as *const u8,
649            std::mem::size_of::<BpfAttrMapElem>(),
650        )
651        .ok()?;
652
653        // Slice into the requested window. Out-of-bounds offsets
654        // return None to mirror the guest-memory backend's behavior
655        // when a value-region read straddles an unmapped page.
656        let end = offset.checked_add(len)?;
657        if end > buf.len() {
658            return None;
659        }
660        Some(buf[offset..end].to_vec())
661    }
662
663    fn read_array(&self, map: &BpfMapInfo, key: u32) -> Option<Vec<u8>> {
664        let pinned = self.pinned_for(map)?;
665        // ARRAY only. STRUCT_OPS and single-entry global-section maps
666        // go through read_value (key 0); HASH/PERCPU/ARENA have their
667        // own methods. Replicate array_map_lookup_elem's pre-mask
668        // `index >= max_entries` rejection (the kernel's index_mask is
669        // a Spectre bound, not a range check).
670        if map.map_type != BPF_MAP_TYPE_ARRAY {
671            return None;
672        }
673        if key >= map.max_entries {
674            return None;
675        }
676        // BPF_MAP_LOOKUP_ELEM copies value_size bytes for a plain
677        // ARRAY (copy_map_value) — no per-entry stride padding, unlike
678        // the PERCPU_ARRAY path which returns nr_cpus * round_up_8.
679        // Pass the entry index as the key.
680        let mut k: u32 = key;
681        // No MAX_VALUE_SIZE cap here (unlike the guest-memory
682        // `read_bpf_map_array_value`): value_size is sourced from
683        // BPF_OBJ_GET_INFO_BY_FD (kernel-validated metadata), not
684        // corruptible guest DRAM, so the kernel's own value_size
685        // validation guards this allocation.
686        let mut buf = vec![0u8; map.value_size as usize];
687        let attr = BpfAttrMapElem {
688            map_fd: pinned.fd.as_raw_fd() as u32,
689            _pad0: 0,
690            key: &raw mut k as u64,
691            value_or_next_key: buf.as_mut_ptr() as u64,
692            flags: 0,
693        };
694        bpf_call_status(
695            BPF_MAP_LOOKUP_ELEM,
696            &raw const attr as *const u8,
697            std::mem::size_of::<BpfAttrMapElem>(),
698        )
699        .ok()?;
700        Some(buf)
701    }
702
703    fn iter_hash_map(&self, map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)> {
704        let Some(pinned) = self.pinned_for(map) else {
705            return Vec::new();
706        };
707        // HASH and LRU_HASH share the inline-value `htab_elem` layout
708        // (`kernel/bpf/hashtab.c::htab_elem_value`), and the kernel
709        // syscall path returns the value bytes directly for both —
710        // `bpf_map_copy_value` falls into the generic `map_lookup_elem`
711        // arm for them. Reject other map types so callers route
712        // PERCPU_HASH/LRU_PERCPU_HASH to `iter_percpu_hash_map` instead.
713        if map.map_type != BPF_MAP_TYPE_HASH && map.map_type != BPF_MAP_TYPE_LRU_HASH {
714            return Vec::new();
715        }
716
717        let key_sz = map.key_size as usize;
718        let val_sz = map.value_size as usize;
719        let mut out: Vec<(Vec<u8>, Vec<u8>)> = Vec::new();
720
721        // First key: pass NULL for the input key per `bpf(2)` man
722        // page — kernel returns the first key in the table.
723        let mut cur_key = vec![0u8; key_sz];
724        let mut next_key = vec![0u8; key_sz];
725
726        // Cap iterations at max_entries * 2 to bound a pathological
727        // walk on a torn table. RCU-protected reads on the kernel
728        // side are best-effort across concurrent updates.
729        let cap = (map.max_entries as usize).saturating_mul(2).max(1);
730        let mut got_first = false;
731        for _ in 0..cap {
732            // Get next key.
733            let attr = BpfAttrMapElem {
734                map_fd: pinned.fd.as_raw_fd() as u32,
735                _pad0: 0,
736                key: if got_first {
737                    cur_key.as_ptr() as u64
738                } else {
739                    0 // first call: NULL means "first key"
740                },
741                value_or_next_key: next_key.as_mut_ptr() as u64,
742                flags: 0,
743            };
744            let ret = unsafe {
745                bpf_syscall(
746                    BPF_MAP_GET_NEXT_KEY,
747                    &raw const attr as *const u8,
748                    std::mem::size_of::<BpfAttrMapElem>(),
749                )
750            };
751            if ret < 0 {
752                // ENOENT marks end of iteration; anything else
753                // ends the walk silently with whatever was
754                // collected so far.
755                break;
756            }
757            got_first = true;
758
759            // Look up the value for next_key.
760            let mut value = vec![0u8; val_sz];
761            let lookup_attr = BpfAttrMapElem {
762                map_fd: pinned.fd.as_raw_fd() as u32,
763                _pad0: 0,
764                key: next_key.as_ptr() as u64,
765                value_or_next_key: value.as_mut_ptr() as u64,
766                flags: 0,
767            };
768            let lret = unsafe {
769                bpf_syscall(
770                    BPF_MAP_LOOKUP_ELEM,
771                    &raw const lookup_attr as *const u8,
772                    std::mem::size_of::<BpfAttrMapElem>(),
773                )
774            };
775            if lret >= 0 {
776                out.push((next_key.clone(), value));
777            }
778            // Bound materialization at the renderer's cap (one-past so
779            // render's truncation check fires); see MAP_MATERIALIZE_MAX.
780            if out.len() > MAP_MATERIALIZE_MAX {
781                break;
782            }
783            // Advance cursor — even when lookup failed (the key
784            // disappeared between get_next_key and lookup_elem; a
785            // concurrent delete is inherent to live-host walking).
786            cur_key.copy_from_slice(&next_key);
787        }
788
789        out
790    }
791
792    fn read_percpu_array(&self, map: &BpfMapInfo, key: u32, num_cpus: u32) -> Vec<Option<Vec<u8>>> {
793        let Some(pinned) = self.pinned_for(map) else {
794            return Vec::new();
795        };
796        if map.map_type != BPF_MAP_TYPE_PERCPU_ARRAY {
797            return Vec::new();
798        }
799        if key >= map.max_entries {
800            return Vec::new();
801        }
802
803        let val_sz = map.value_size as usize;
804        let total = (num_cpus as usize).saturating_mul(val_sz);
805        let mut buf = vec![0u8; total];
806        let mut k: u32 = key;
807        let attr = BpfAttrMapElem {
808            map_fd: pinned.fd.as_raw_fd() as u32,
809            _pad0: 0,
810            key: &raw mut k as u64,
811            value_or_next_key: buf.as_mut_ptr() as u64,
812            flags: 0,
813        };
814        if bpf_call_status(
815            BPF_MAP_LOOKUP_ELEM,
816            &raw const attr as *const u8,
817            std::mem::size_of::<BpfAttrMapElem>(),
818        )
819        .is_err()
820        {
821            return vec![None; num_cpus as usize];
822        }
823
824        // Kernel rounds each CPU's slot up to 8 bytes internally
825        // (see `kernel/bpf/syscall.c` bpf_map_value_size for the
826        // PERCPU_ARRAY arm calling round_up_8). The returned buffer
827        // is `nr_cpus * round_up_8(value_size)` bytes; we slice at
828        // the rounded stride to extract each CPU's bytes and then
829        // truncate to value_size.
830        let stride = (val_sz + 7) & !7;
831        let mut out = Vec::with_capacity(num_cpus as usize);
832        for cpu in 0..num_cpus as usize {
833            let start = cpu * stride;
834            let end = start + val_sz;
835            if end > buf.len() {
836                out.push(None);
837            } else {
838                out.push(Some(buf[start..end].to_vec()));
839            }
840        }
841        out
842    }
843
844    fn iter_percpu_hash_map(
845        &self,
846        map: &BpfMapInfo,
847        num_cpus: u32,
848    ) -> super::bpf_map::PerCpuHashEntries {
849        let Some(pinned) = self.pinned_for(map) else {
850            return Vec::new();
851        };
852        if map.map_type != BPF_MAP_TYPE_PERCPU_HASH && map.map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH
853        {
854            return Vec::new();
855        }
856
857        let key_sz = map.key_size as usize;
858        let val_sz = map.value_size as usize;
859        // Kernel returns nr_cpus * round_up_8(value_size) bytes per
860        // lookup (`bpf_percpu_hash_copy` copies each CPU slot via
861        // `copy_map_value_long` at a `round_up(value_size, 8)`
862        // stride); same 8-byte stride as PERCPU_ARRAY. The buffer
863        // must be sized to the full stride or the kernel writes past
864        // it.
865        let stride = (val_sz + 7) & !7;
866        let buf_total = (num_cpus as usize).saturating_mul(stride);
867        let mut out: super::bpf_map::PerCpuHashEntries = Vec::new();
868
869        let mut cur_key = vec![0u8; key_sz];
870        let mut next_key = vec![0u8; key_sz];
871
872        let cap = (map.max_entries as usize).saturating_mul(2).max(1);
873        let mut got_first = false;
874        for _ in 0..cap {
875            let attr = BpfAttrMapElem {
876                map_fd: pinned.fd.as_raw_fd() as u32,
877                _pad0: 0,
878                key: if got_first {
879                    cur_key.as_ptr() as u64
880                } else {
881                    0
882                },
883                value_or_next_key: next_key.as_mut_ptr() as u64,
884                flags: 0,
885            };
886            let ret = unsafe {
887                bpf_syscall(
888                    BPF_MAP_GET_NEXT_KEY,
889                    &raw const attr as *const u8,
890                    std::mem::size_of::<BpfAttrMapElem>(),
891                )
892            };
893            if ret < 0 {
894                break;
895            }
896            got_first = true;
897
898            let mut value_buf = vec![0u8; buf_total];
899            let lookup_attr = BpfAttrMapElem {
900                map_fd: pinned.fd.as_raw_fd() as u32,
901                _pad0: 0,
902                key: next_key.as_ptr() as u64,
903                value_or_next_key: value_buf.as_mut_ptr() as u64,
904                flags: 0,
905            };
906            let lret = unsafe {
907                bpf_syscall(
908                    BPF_MAP_LOOKUP_ELEM,
909                    &raw const lookup_attr as *const u8,
910                    std::mem::size_of::<BpfAttrMapElem>(),
911                )
912            };
913            if lret >= 0 {
914                let mut per_cpu = Vec::with_capacity(num_cpus as usize);
915                for cpu in 0..num_cpus as usize {
916                    let start = cpu * stride;
917                    let end = start + val_sz;
918                    if end > value_buf.len() {
919                        per_cpu.push(None);
920                    } else {
921                        per_cpu.push(Some(value_buf[start..end].to_vec()));
922                    }
923                }
924                out.push((next_key.clone(), per_cpu));
925            }
926            // Bound materialization at the renderer's cap (one-past so
927            // render's truncation check fires); see MAP_MATERIALIZE_MAX.
928            if out.len() > MAP_MATERIALIZE_MAX {
929                break;
930            }
931            cur_key.copy_from_slice(&next_key);
932        }
933
934        out
935    }
936
937    fn read_arena_pages(
938        &self,
939        map: &BpfMapInfo,
940        _arena_offsets: &BpfArenaOffsets,
941    ) -> ArenaSnapshot {
942        let Some(pinned) = self.pinned_for(map) else {
943            return ArenaSnapshot::default();
944        };
945        if map.map_type != BPF_MAP_TYPE_ARENA {
946            return ArenaSnapshot::default();
947        }
948
949        // The kernel sizes the arena as `max_entries * PAGE_SIZE`
950        // (`arena_map_alloc`) at the host base page size; read it at
951        // runtime so the span — and the mmap length below — match what
952        // the kernel pinned as `user_vm_end`. A hardcoded 4 KiB would
953        // under-size the mapping on a 16 KiB-granule host and trip
954        // `arena_map_mmap`'s `user_vm_end` check (-EBUSY). Same caps as
955        // the guest-memory side for cross-backend parity.
956        let page_size = host_page_size();
957        let declared_bytes_raw = (map.max_entries as u64).saturating_mul(page_size as u64);
958        let span_capped = declared_bytes_raw > MAX_ARENA_BYTES;
959        let declared_bytes = declared_bytes_raw.min(MAX_ARENA_BYTES);
960        let declared_pages = declared_bytes / page_size as u64;
961
962        // Use map_extra as the user_vm_start anchor. BPF programs
963        // see arena addresses at this base (lib/arena_map.h hardcodes
964        // it: x86 `1<<44`, aarch64 `1<<32`). Operators correlating
965        // arena pointers want the same base in the snapshot.
966        // Lifted above the early returns so every snapshot — empty
967        // or populated — carries the anchor in `user_vm_start`; the
968        // pointer-chasing reader needs it to classify arena addresses
969        // even when the page set is empty.
970        let user_vm_start = pinned.map_extra;
971
972        if declared_pages == 0 {
973            return ArenaSnapshot {
974                pages: Vec::new(),
975                truncated: false,
976                declared_pages: 0,
977                span_capped,
978                user_vm_start,
979                ..Default::default()
980            };
981        }
982
983        // The read window is capped at MAX_ARENA_PAGES so a huge arena
984        // can't drive an unbounded mincore/read loop; `truncated`
985        // surfaces the cap. mincore() below filters to the resident set
986        // (arena_vm_fault populates pages on demand, so pages the BPF
987        // program never touched are absent) and we read only those.
988        let read_pages = declared_pages.min(MAX_ARENA_PAGES);
989        let read_bytes = (read_pages as usize) * page_size;
990        let truncated = declared_pages > read_pages;
991
992        // Placement: when map_extra was set at arena creation the
993        // kernel pinned user_vm_start/user_vm_end, so the read must map
994        // the FULL arena at exactly map_extra (MAP_FIXED_NOREPLACE) or
995        // arena_map_mmap returns -EBUSY. See `arena_mmap_placement`.
996        let (hint, mmap_flags, mmap_bytes) =
997            arena_mmap_placement(user_vm_start, declared_bytes as usize, read_bytes);
998
999        // SAFETY: mmap with PROT_READ + MAP_SHARED on an arena fd is
1000        // exactly what `arena_map_mmap` (`kernel/bpf/arena.c`) exports;
1001        // offset 0 is required (the op rejects a nonzero vm_pgoff).
1002        // MAP_FIXED_NOREPLACE places the mapping at the kernel-blessed
1003        // VA without clobbering an existing one (fails EEXIST instead).
1004        let addr = unsafe {
1005            libc::mmap(
1006                if hint == 0 {
1007                    ptr::null_mut()
1008                } else {
1009                    hint as *mut libc::c_void
1010                },
1011                mmap_bytes,
1012                libc::PROT_READ,
1013                mmap_flags,
1014                pinned.fd.as_raw_fd(),
1015                0,
1016            )
1017        };
1018        if addr == libc::MAP_FAILED {
1019            // mmap rejected (e.g. -EBUSY if the arena's user VA is
1020            // pinned elsewhere, EEXIST if map_extra's VA is already
1021            // mapped in this process). Log it: a silently empty arena
1022            // snapshot reads as "arena is empty" when it is actually
1023            // unreadable — exactly how the prior NULL-hint bug hid.
1024            let err = std::io::Error::last_os_error();
1025            tracing::warn!(
1026                user_vm_start = format_args!("{user_vm_start:#x}"),
1027                mmap_bytes,
1028                error = %err,
1029                "read_arena_pages: mmap of arena fd failed; returning empty snapshot"
1030            );
1031            return ArenaSnapshot {
1032                pages: Vec::new(),
1033                truncated,
1034                declared_pages,
1035                span_capped,
1036                user_vm_start,
1037                ..Default::default()
1038            };
1039        }
1040
1041        let mut pages: Vec<ArenaPage> = Vec::new();
1042        // Read the resident pages out of the mmap. We use mincore()
1043        // to filter out pages that aren't present, then read only the
1044        // present ones. mincore returns 0 for
1045        // resident pages, < 0 on error.
1046        let mut residency = vec![0u8; read_pages as usize];
1047        let mincore_ret = unsafe { libc::mincore(addr, read_bytes, residency.as_mut_ptr()) };
1048        if mincore_ret == 0 {
1049            for (idx, &resident) in residency.iter().enumerate() {
1050                if resident & 1 == 0 {
1051                    // Page not in core — sparse arena, never
1052                    // populated by the BPF program. Skip.
1053                    continue;
1054                }
1055                let page_addr = (addr as usize) + idx * page_size;
1056                // SAFETY: page is resident per mincore; reading
1057                // page_size bytes is in-bounds.
1058                let mut buf = vec![0u8; page_size];
1059                unsafe {
1060                    std::ptr::copy_nonoverlapping(
1061                        page_addr as *const u8,
1062                        buf.as_mut_ptr(),
1063                        page_size,
1064                    );
1065                }
1066                // user_vm_start comes from the BPF map's map_extra
1067                // field — a guest-controllable value. A hostile or
1068                // corrupt map metadata could push the page identifier
1069                // past u64::MAX. Skip the page rather than emit a
1070                // wrapped address that consumers would treat as
1071                // legitimate.
1072                let Some(idx_offset) = (idx as u64).checked_mul(page_size as u64) else {
1073                    continue;
1074                };
1075                let Some(user_addr) = user_vm_start.checked_add(idx_offset) else {
1076                    continue;
1077                };
1078                pages.push(ArenaPage {
1079                    user_addr,
1080                    bytes: buf,
1081                });
1082            }
1083        }
1084
1085        // SAFETY: we created this mapping above and aren't using it
1086        // after this point.
1087        unsafe {
1088            libc::munmap(addr, mmap_bytes);
1089        }
1090
1091        ArenaSnapshot {
1092            pages,
1093            truncated,
1094            declared_pages,
1095            span_capped,
1096            user_vm_start,
1097            ..Default::default()
1098        }
1099    }
1100
1101    fn load_program_btf(&self, map: &BpfMapInfo, base_btf: &Btf) -> Option<Btf> {
1102        // map.btf_kva on the live-host backend stores the kernel's
1103        // btf_id (u32) — see obj_get_info_map. 0 means no BTF.
1104        let btf_id = map.btf_kva as u32;
1105        if btf_id == 0 {
1106            return None;
1107        }
1108
1109        // Pin the BTF object by id.
1110        let attr = BpfAttrGetId {
1111            id_or_start_id: btf_id,
1112            next_id: 0,
1113            open_flags: 0,
1114        };
1115        let btf_fd = bpf_call_fd(
1116            BPF_BTF_GET_FD_BY_ID,
1117            &raw const attr as *const u8,
1118            std::mem::size_of::<BpfAttrGetId>(),
1119        )
1120        .ok()?;
1121        // SAFETY: btf_fd >= 0 from a successful bpf_call_fd.
1122        let btf_owned = unsafe { OwnedFd::from_raw_fd(btf_fd) };
1123
1124        // Two-pass info fetch: first call to learn btf_size, then
1125        // allocate a buffer and refetch with `btf` populated to
1126        // pull the BTF blob bytes.
1127        let mut info = BpfBtfInfoUapi::default();
1128        let info_attr = BpfAttrInfoByFd {
1129            bpf_fd: btf_owned.as_raw_fd() as u32,
1130            info_len: std::mem::size_of::<BpfBtfInfoUapi>() as u32,
1131            info: &raw mut info as u64,
1132        };
1133        bpf_call_status(
1134            BPF_OBJ_GET_INFO_BY_FD,
1135            &raw const info_attr as *const u8,
1136            std::mem::size_of::<BpfAttrInfoByFd>(),
1137        )
1138        .ok()?;
1139        if info.btf_size == 0 {
1140            return None;
1141        }
1142
1143        // Second pass with a real buffer.
1144        let mut buf = vec![0u8; info.btf_size as usize];
1145        info.btf = buf.as_mut_ptr() as u64;
1146        let info_attr2 = BpfAttrInfoByFd {
1147            bpf_fd: btf_owned.as_raw_fd() as u32,
1148            info_len: std::mem::size_of::<BpfBtfInfoUapi>() as u32,
1149            info: &raw mut info as u64,
1150        };
1151        bpf_call_status(
1152            BPF_OBJ_GET_INFO_BY_FD,
1153            &raw const info_attr2 as *const u8,
1154            std::mem::size_of::<BpfAttrInfoByFd>(),
1155        )
1156        .ok()?;
1157
1158        if info.kernel_btf != 0 {
1159            Btf::from_split_bytes(&buf, base_btf).ok()
1160        } else {
1161            Btf::from_bytes(&buf).ok()
1162        }
1163    }
1164}
1165
1166#[cfg(test)]
1167mod tests {
1168    use super::*;
1169
1170    /// Verify the bpf_attr arms have the exact UAPI layout the
1171    /// kernel expects. Wrong sizes or field offsets cause -EINVAL
1172    /// on every syscall — this test catches the layout drift before
1173    /// it produces silent failures at runtime.
1174    #[test]
1175    fn bpf_attr_map_elem_size() {
1176        // include/uapi/linux/bpf.h: the MAP_ELEM_OPS arm is exactly
1177        // 32 bytes (4 + 4 pad + 8 + 8 + 8).
1178        assert_eq!(std::mem::size_of::<BpfAttrMapElem>(), 32);
1179    }
1180
1181    #[test]
1182    fn bpf_attr_get_id_size() {
1183        // GET_NEXT_ID / GET_FD_BY_ID: we pass a 12-byte prefix
1184        // (start_id/id + next_id + open_flags) =
1185        // offsetofend(union bpf_attr, open_flags). The full kernel arm
1186        // is 16 bytes — it adds a trailing fd_by_id_token_fd, which the
1187        // kernel zero-fills since we omit it (matching how libbpf sizes
1188        // these calls). This pins our 12-byte prefix, NOT the arm.
1189        assert_eq!(std::mem::size_of::<BpfAttrGetId>(), 12);
1190    }
1191
1192    #[test]
1193    fn bpf_attr_info_by_fd_size() {
1194        // OBJ_GET_INFO_BY_FD arm: 16 bytes (4 + 4 + 8).
1195        assert_eq!(std::mem::size_of::<BpfAttrInfoByFd>(), 16);
1196    }
1197
1198    /// Pin every field offset of [`BpfMapInfoUapi`] against the kernel
1199    /// `struct bpf_map_info` (include/uapi/linux/bpf.h). The kernel
1200    /// writes this struct on `BPF_OBJ_GET_INFO_BY_FD`, so a single
1201    /// shifted offset makes `obj_get_info_map` read garbage from the
1202    /// wrong field (e.g. `value_size` out of `max_entries`) with no
1203    /// runtime error. Pinning only `map_type`@0 and `name`@24 would miss
1204    /// a field insertion between `map_flags` and the tail, so every
1205    /// offset the struct exposes is asserted explicitly.
1206    ///
1207    /// Verdict-routed so a multi-field uapi-shape regression surfaces
1208    /// every drift in one run rather than failing on the first mismatch.
1209    #[test]
1210    fn bpf_map_info_uapi_layout() {
1211        use crate::assert::Verdict;
1212        use std::mem::offset_of;
1213
1214        let mut v = Verdict::new();
1215        // Offsets per `struct bpf_map_info`: u32 fields packed, name[16]
1216        // at 24, u64 fields 8-aligned. Matches the kernel header.
1217        crate::claim!(v, offset_of!(BpfMapInfoUapi, map_type)).eq(0usize);
1218        crate::claim!(v, offset_of!(BpfMapInfoUapi, id)).eq(4usize);
1219        crate::claim!(v, offset_of!(BpfMapInfoUapi, key_size)).eq(8usize);
1220        crate::claim!(v, offset_of!(BpfMapInfoUapi, value_size)).eq(12usize);
1221        crate::claim!(v, offset_of!(BpfMapInfoUapi, max_entries)).eq(16usize);
1222        crate::claim!(v, offset_of!(BpfMapInfoUapi, map_flags)).eq(20usize);
1223        crate::claim!(v, offset_of!(BpfMapInfoUapi, name)).eq(24usize);
1224        crate::claim!(v, offset_of!(BpfMapInfoUapi, ifindex)).eq(40usize);
1225        crate::claim!(v, offset_of!(BpfMapInfoUapi, btf_vmlinux_value_type_id)).eq(44usize);
1226        crate::claim!(v, offset_of!(BpfMapInfoUapi, netns_dev)).eq(48usize);
1227        crate::claim!(v, offset_of!(BpfMapInfoUapi, netns_ino)).eq(56usize);
1228        crate::claim!(v, offset_of!(BpfMapInfoUapi, btf_id)).eq(64usize);
1229        crate::claim!(v, offset_of!(BpfMapInfoUapi, btf_key_type_id)).eq(68usize);
1230        crate::claim!(v, offset_of!(BpfMapInfoUapi, btf_value_type_id)).eq(72usize);
1231        // `_pad` mirrors the kernel's `btf_vmlinux_id` at offset 76.
1232        crate::claim!(v, offset_of!(BpfMapInfoUapi, _pad)).eq(76usize);
1233        crate::claim!(v, offset_of!(BpfMapInfoUapi, map_extra)).eq(80usize);
1234        // `map_extra` is the trailing field we read; our struct ends at
1235        // offset 88 (the kernel's hash/hash_size past it are not read).
1236        crate::claim!(v, std::mem::size_of::<BpfMapInfoUapi>()).eq(88usize);
1237        let r = v.into_result();
1238        assert!(
1239            r.is_pass(),
1240            "bpf_map_info uapi layout drift: {:?}",
1241            r.outcomes,
1242        );
1243    }
1244
1245    /// Round-up arithmetic for percpu stride matches the kernel's
1246    /// `round_up(value_size, 8)`.
1247    #[test]
1248    fn percpu_stride_round_up() {
1249        let cases = [
1250            (0usize, 0),
1251            (1, 8),
1252            (7, 8),
1253            (8, 8),
1254            (9, 16),
1255            (15, 16),
1256            (16, 16),
1257        ];
1258        for (val_sz, expected) in cases {
1259            let stride = (val_sz + 7) & !7;
1260            assert_eq!(stride, expected, "value_size {val_sz} → stride {stride}");
1261        }
1262    }
1263
1264    /// Build a `BpfMapInfo` whose only populated field is the active
1265    /// name — the key both the construction-time predicate filter
1266    /// and `pinned_for` match on.
1267    fn info_named(name: &str) -> BpfMapInfo {
1268        let bytes = name.as_bytes();
1269        assert!(bytes.len() <= BPF_OBJ_NAME_LEN, "test name too long");
1270        let mut name_bytes = [0u8; BPF_OBJ_NAME_LEN];
1271        name_bytes[..bytes.len()].copy_from_slice(bytes);
1272        BpfMapInfo {
1273            name_bytes,
1274            name_len: bytes.len() as u8,
1275            ..Default::default()
1276        }
1277    }
1278
1279    /// The construction-time keep/discard filter
1280    /// ([`select_keeping`], the pure mirror of
1281    /// `from_running_kernel_filtered`'s inline predicate gate) keeps
1282    /// exactly the maps the predicate accepts: a predicate matching
1283    /// no names yields the empty set; a name-suffix predicate yields
1284    /// exactly the matching subset, in order.
1285    #[test]
1286    fn predicate_filters_pinned_set() {
1287        let infos = vec![
1288            info_named("scx_central"),
1289            info_named("central_dsq"),
1290            info_named("cilium_lb"),
1291            info_named("central_data"),
1292        ];
1293
1294        // Match-nothing predicate ⇒ empty kept set.
1295        let none = select_keeping(&infos, |_| false);
1296        assert!(none.is_empty(), "false predicate must discard every map");
1297
1298        // Match-everything predicate ⇒ full set, order preserved.
1299        let all = select_keeping(&infos, |_| true);
1300        assert_eq!(all.len(), 4, "true predicate must keep every map");
1301        assert_eq!(all[0].name(), "scx_central");
1302        assert_eq!(all[3].name(), "central_data");
1303
1304        // Name-suffix predicate ⇒ exactly the "central"-bearing subset.
1305        let kept = select_keeping(&infos, |i| i.name().contains("central"));
1306        let kept_names: Vec<String> = kept.iter().map(|i| i.name().to_string()).collect();
1307        assert_eq!(
1308            kept_names,
1309            vec!["scx_central", "central_dsq", "central_data"],
1310            "suffix predicate must keep exactly the matching subset, in order",
1311        );
1312
1313        // The same name-match key drives pinned_for: a target sharing
1314        // active name bytes matches; a differing name does not.
1315        assert!(
1316            info_name_matches(&info_named("central_dsq"), &info_named("central_dsq")),
1317            "identical active name bytes must match",
1318        );
1319        assert!(
1320            !info_name_matches(&info_named("central_dsq"), &info_named("central_data")),
1321            "differing active name bytes must NOT match",
1322        );
1323        // name_len bounds the compared region: a longer NUL-padded
1324        // buffer with a shorter name_len compares only the active
1325        // prefix, so "scx" (len 3) does not match "scx_central".
1326        assert!(
1327            !info_name_matches(&info_named("scx"), &info_named("scx_central")),
1328            "name_len must bound the match to the active prefix",
1329        );
1330    }
1331
1332    /// A cheap real fd for a test [`PinnedMap`]. `pinned_for` only
1333    /// compares names and never dereferences `.fd`, and every accessor
1334    /// read path under test returns at an early guard before the fd
1335    /// reaches a `bpf()` syscall — so any open fd suffices to satisfy
1336    /// the `OwnedFd` field. `/dev/null`
1337    /// is always present on Linux and `File` -> `OwnedFd` is the
1338    /// std-only conversion (no extra libc unsafe).
1339    fn dummy_fd() -> OwnedFd {
1340        OwnedFd::from(
1341            std::fs::File::open("/dev/null").expect("/dev/null must open on Linux test host"),
1342        )
1343    }
1344
1345    /// Build a [`PinnedMap`] from `info` + `map_extra`, carrying the
1346    /// dummy fd. The fields are private to the parent module; the
1347    /// `tests` child module can name and construct them directly,
1348    /// which is the inject seam the blueprint requires (no live
1349    /// kernel walk, no `from_running_kernel*` syscall path).
1350    fn pinned(info: BpfMapInfo, map_extra: u64) -> PinnedMap {
1351        PinnedMap {
1352            info,
1353            fd: dummy_fd(),
1354            map_extra,
1355        }
1356    }
1357
1358    /// Build a [`BpfSyscallAccessor`] holding exactly `maps`. The
1359    /// production constructors (`from_running_kernel*`) only ever
1360    /// populate `maps` via the live bpf(2) id walk; this literal is
1361    /// the host-test inject seam that lets the early-guard branches
1362    /// run without a kernel.
1363    fn accessor(maps: Vec<PinnedMap>) -> BpfSyscallAccessor {
1364        BpfSyscallAccessor { maps }
1365    }
1366
1367    /// `read_array` returns `None` on three rejections. Two are
1368    /// structurally pre-fd through the inject seam: the no-name-match
1369    /// (`pinned_for` -> None) returns before there is any fd, and the
1370    /// wrong-map-type guard returns before building the lookup attr.
1371    /// The `key >= max_entries` guard also returns `None`, but on this
1372    /// dummy-fd accessor that is indistinguishable from letting the
1373    /// `bpf()` `BPF_MAP_LOOKUP_ELEM` run on the bad fd
1374    /// (`-EINVAL` -> `None`) — so for a scalar `Option` return the
1375    /// key-bound case pins the rejection VALUE, not guard-precedence
1376    /// over the syscall (proving that needs a live map). The
1377    /// `key < max_entries` success path issues a real lookup and is NOT
1378    /// asserted here.
1379    #[test]
1380    fn read_array_pre_lookup_guards_reject() {
1381        let arr = BpfMapInfo {
1382            map_type: BPF_MAP_TYPE_ARRAY,
1383            max_entries: 4,
1384            value_size: 8,
1385            ..info_named("arr")
1386        };
1387        let acc = accessor(vec![pinned(arr.clone(), 0)]);
1388
1389        // No pinned map matches the target name -> pinned_for None.
1390        assert_eq!(acc.read_array(&info_named("missing"), 0), None);
1391
1392        // Name matches but map_type is HASH, not ARRAY -> type-reject.
1393        let hash = BpfMapInfo {
1394            map_type: BPF_MAP_TYPE_HASH,
1395            ..info_named("arr")
1396        };
1397        assert_eq!(acc.read_array(&hash, 0), None);
1398
1399        // key == max_entries and key > max_entries both reject before
1400        // the lookup (the kernel index_mask is a Spectre bound, the
1401        // explicit `key >= max_entries` is the range check).
1402        assert_eq!(acc.read_array(&arr, 4), None);
1403        assert_eq!(acc.read_array(&arr, 99), None);
1404    }
1405
1406    /// `read_value` returns `None` on two rejections. The no-name-match
1407    /// (`pinned_for` -> None) is structurally pre-fd — there is no map,
1408    /// hence no fd to look up. The wrong-map-type rejection (neither
1409    /// ARRAY nor STRUCT_OPS) also returns `None`, but on this dummy-fd
1410    /// accessor that is indistinguishable from letting the `bpf()`
1411    /// lookup run on the bad fd (`-EINVAL` -> `None`), so it pins the
1412    /// rejection VALUE rather than guard-precedence over the syscall.
1413    /// The post-lookup window-bounds / `checked_add` guards sit past the
1414    /// live lookup and need a real map; they are NOT asserted here.
1415    #[test]
1416    fn read_value_pre_lookup_type_reject() {
1417        // PERCPU_HASH is neither ARRAY nor STRUCT_OPS.
1418        let percpu_hash = BpfMapInfo {
1419            map_type: BPF_MAP_TYPE_PERCPU_HASH,
1420            value_size: 8,
1421            ..info_named("v")
1422        };
1423        let acc = accessor(vec![pinned(percpu_hash.clone(), 0)]);
1424
1425        assert_eq!(acc.read_value(&info_named("nomatch"), 0, 4), None);
1426        assert_eq!(acc.read_value(&percpu_hash, 0, 4), None);
1427    }
1428
1429    /// `iter_hash_map` returns an empty `Vec`. The no-name-match
1430    /// let-else is structurally pre-fd (no map, no fd). The type-reject
1431    /// (only HASH and LRU_HASH proceed) also returns empty, but on this
1432    /// dummy-fd accessor that is indistinguishable from the walk loop
1433    /// issuing `BPF_MAP_GET_NEXT_KEY` on the bad fd and breaking on the
1434    /// error — so it pins the empty RESULT, not guard-precedence over
1435    /// the syscall. The populated iteration path needs a live hash map.
1436    #[test]
1437    fn iter_hash_map_pre_walk_guards_empty() {
1438        let arr = BpfMapInfo {
1439            map_type: BPF_MAP_TYPE_ARRAY,
1440            ..info_named("h")
1441        };
1442        let acc = accessor(vec![pinned(arr.clone(), 0)]);
1443
1444        // No pinned match -> let-else returns Vec::new().
1445        assert_eq!(acc.iter_hash_map(&info_named("none")).len(), 0);
1446        // Name matches but map_type is ARRAY, not HASH/LRU_HASH.
1447        assert_eq!(acc.iter_hash_map(&arr).len(), 0);
1448    }
1449
1450    /// `read_percpu_array` returns an empty `Vec` (length 0) on the
1451    /// three pre-lookup guards: the no-name-match (`pinned_for` ->
1452    /// None), the wrong-map-type guard, and the `key >= max_entries`
1453    /// guard. The length distinguishes these from the
1454    /// post-lookup-failure branch which returns `vec![None; num_cpus]`
1455    /// (length `num_cpus`), so the assertions pin LENGTH 0, not just
1456    /// emptiness-of-content.
1457    #[test]
1458    fn read_percpu_array_pre_lookup_guards_empty() {
1459        let pa = BpfMapInfo {
1460            map_type: BPF_MAP_TYPE_PERCPU_ARRAY,
1461            max_entries: 2,
1462            value_size: 8,
1463            ..info_named("pa")
1464        };
1465        let acc = accessor(vec![pinned(pa.clone(), 0)]);
1466
1467        // No pinned match.
1468        assert_eq!(acc.read_percpu_array(&info_named("x"), 0, 4).len(), 0);
1469        // Name matches but map_type is ARRAY, not PERCPU_ARRAY.
1470        let arr = BpfMapInfo {
1471            map_type: BPF_MAP_TYPE_ARRAY,
1472            ..info_named("pa")
1473        };
1474        assert_eq!(acc.read_percpu_array(&arr, 0, 4).len(), 0);
1475        // key == max_entries rejects with length 0 (distinct from the
1476        // num_cpus-length lookup-failure vector).
1477        assert_eq!(acc.read_percpu_array(&pa, 2, 4).len(), 0);
1478    }
1479
1480    /// `iter_percpu_hash_map` returns an empty `PerCpuHashEntries`. The
1481    /// no-name-match let-else is structurally pre-fd (no map, no fd).
1482    /// The type-reject (only PERCPU_HASH and LRU_PERCPU_HASH proceed)
1483    /// also returns empty, but on this dummy-fd accessor that is
1484    /// indistinguishable from the walk loop breaking on the bad fd — so
1485    /// it pins the empty RESULT, not guard-precedence over the syscall.
1486    /// The populated walk path needs a live map.
1487    #[test]
1488    fn iter_percpu_hash_map_pre_walk_guards_empty() {
1489        let hash = BpfMapInfo {
1490            map_type: BPF_MAP_TYPE_HASH,
1491            ..info_named("ph")
1492        };
1493        let acc = accessor(vec![pinned(hash.clone(), 0)]);
1494
1495        // No pinned match.
1496        assert_eq!(acc.iter_percpu_hash_map(&info_named("none"), 4).len(), 0);
1497        // Name matches but map_type is HASH, not PERCPU_HASH/LRU_PERCPU_HASH.
1498        assert_eq!(acc.iter_percpu_hash_map(&hash, 4).len(), 0);
1499    }
1500
1501    /// `read_arena_pages` has three isolable, fd-free blocks ahead of
1502    /// the `mmap`: the no-name-match (`pinned_for` -> None ->
1503    /// `ArenaSnapshot::default()`), the wrong-map-type guard (->
1504    /// default), and the declared-span math + `declared_pages == 0`
1505    /// early return. The span math is pure:
1506    /// `declared_bytes_raw = max_entries * 4096` (saturating),
1507    /// `span_capped = declared_bytes_raw > MAX_ARENA_BYTES` (4 GiB),
1508    /// and the zero-page snapshot carries `user_vm_start = map_extra`.
1509    /// The populated mmap/mincore path needs a live arena fd.
1510    #[test]
1511    fn read_arena_pages_pre_mmap_paths() {
1512        // A 3-field literal: BpfArenaOffsets derives only Debug+Clone
1513        // (no Default), and the value is unused on every path under
1514        // test (the fn parameter `_arena_offsets` is ignored), so the
1515        // concrete offsets are arbitrary.
1516        let offsets = BpfArenaOffsets {
1517            arena_kern_vm: 0,
1518            arena_user_vm_start: 0,
1519            vm_struct_addr: 0,
1520        };
1521
1522        // max_entries == 0 -> declared_pages == 0 early return,
1523        // carrying user_vm_start = map_extra.
1524        let arena0 = BpfMapInfo {
1525            map_type: BPF_MAP_TYPE_ARENA,
1526            max_entries: 0,
1527            ..info_named("a")
1528        };
1529        let acc = accessor(vec![pinned(arena0.clone(), 0x1000)]);
1530
1531        // No name match -> ArenaSnapshot::default() (all-zero).
1532        let no_match = acc.read_arena_pages(&info_named("no"), &offsets);
1533        assert!(no_match.pages.is_empty());
1534        assert_eq!(no_match.declared_pages, 0);
1535        assert_eq!(no_match.user_vm_start, 0);
1536        assert!(!no_match.span_capped);
1537        assert!(!no_match.truncated);
1538
1539        // Name matches but map_type is ARRAY, not ARENA -> default.
1540        let arr = BpfMapInfo {
1541            map_type: BPF_MAP_TYPE_ARRAY,
1542            ..info_named("a")
1543        };
1544        let type_reject = acc.read_arena_pages(&arr, &offsets);
1545        assert!(type_reject.pages.is_empty());
1546        assert_eq!(type_reject.declared_pages, 0);
1547        assert_eq!(type_reject.user_vm_start, 0);
1548        assert!(!type_reject.span_capped);
1549
1550        // declared_pages == 0 path: empty pages, span not capped,
1551        // user_vm_start carried through from map_extra.
1552        let zero = acc.read_arena_pages(&arena0, &offsets);
1553        assert_eq!(zero.pages.len(), 0);
1554        assert_eq!(zero.declared_pages, 0);
1555        assert!(!zero.span_capped);
1556        assert_eq!(zero.user_vm_start, 0x1000);
1557        assert!(!zero.truncated);
1558
1559        // max_entries == u32::MAX -> declared_bytes_raw =
1560        // 0xFFFF_FFFF * 4096 > 4 GiB, so span_capped is set. With the
1561        // span capped to MAX_ARENA_BYTES, declared_pages > 0, so the
1562        // span-math result is only observable on this sub-case
1563        // through the MAP_FAILED snapshot or a populated walk — both
1564        // need a live fd. The dummy /dev/null fd makes mmap fail
1565        // (MAP_FAILED), exercising the MAP_FAILED early return,
1566        // which carries span_capped + user_vm_start. Assert exactly
1567        // those two carry-through fields, which the blueprint marks
1568        // host-assertable.
1569        let arena_max = BpfMapInfo {
1570            map_type: BPF_MAP_TYPE_ARENA,
1571            max_entries: u32::MAX,
1572            ..info_named("a")
1573        };
1574        let acc_max = accessor(vec![pinned(arena_max.clone(), 0x2000)]);
1575        let capped = acc_max.read_arena_pages(&arena_max, &offsets);
1576        assert!(capped.span_capped, "u32::MAX max_entries must cap the span");
1577        assert_eq!(capped.user_vm_start, 0x2000);
1578    }
1579
1580    #[test]
1581    fn arena_mmap_placement_map_extra_pins_full_span_fixed_noreplace() {
1582        // map_extra set (nonzero user_vm_start): the read must land at
1583        // exactly map_extra and span the FULL declared arena (not the
1584        // capped read window) with MAP_FIXED_NOREPLACE, or
1585        // arena_map_mmap returns -EBUSY on the user_vm_start/end check.
1586        let (hint, flags, len) = arena_mmap_placement(0x1_0000_0000, 8192, 4096);
1587        assert_eq!(hint, 0x1_0000_0000, "hint must be map_extra, not NULL");
1588        assert_eq!(flags, libc::MAP_SHARED | libc::MAP_FIXED_NOREPLACE);
1589        assert_eq!(len, 8192, "full declared span, not the capped read window");
1590    }
1591
1592    #[test]
1593    fn arena_mmap_placement_no_map_extra_uses_null_capped_prefix() {
1594        // map_extra == 0: the kernel adopts our VA, so a NULL hint and
1595        // the capped read window are correct.
1596        let (hint, flags, len) = arena_mmap_placement(0, 8192, 4096);
1597        assert_eq!(hint, 0, "NULL hint — kernel chooses the VA");
1598        assert_eq!(flags, libc::MAP_SHARED);
1599        assert_eq!(len, 4096, "capped read window, not the full declared span");
1600    }
1601
1602    /// `load_program_btf` returns `None` immediately when
1603    /// `btf_id == 0` (`btf_id = map.btf_kva as u32`). `info_named`
1604    /// leaves `btf_kva` at its `Default` (0), so the guard fires before
1605    /// the `BPF_BTF_GET_FD_BY_ID` syscall and the `base_btf` argument is
1606    /// never dereferenced (it is only used on the post-fetch arms). Only
1607    /// this guard is host-assertable; the BTF-fetch path needs a live
1608    /// kernel BTF object.
1609    #[test]
1610    fn load_program_btf_btf_id_zero_returns_none() {
1611        // info_named leaves btf_kva == 0 (Default).
1612        let prog = info_named("prog");
1613        assert_eq!(prog.btf_kva, 0, "info_named must default btf_kva to 0");
1614        let acc = accessor(vec![pinned(prog.clone(), 0)]);
1615
1616        // base_btf: a minimal valid BTF blob — magic 0xEB9F, version 1,
1617        // 24-byte header, one Int type (id 1) so the type section is
1618        // non-empty, strtab leading with NUL. Mirrors the
1619        // `cast_analysis` tests' `build_btf` minimal layout. Never
1620        // dereferenced on the btf_id==0 path; built only to satisfy
1621        // the `&Btf` parameter.
1622        let base = minimal_btf();
1623        // `btf_rs::Btf` derives neither `PartialEq` nor `Debug`, so
1624        // `assert_eq!(.., None)` cannot be used on `Option<Btf>`;
1625        // `is_none()` is the exact discriminant check here (the
1626        // btf_id==0 guard returns the `None` variant outright, with
1627        // no `Btf` value to compare).
1628        assert!(
1629            acc.load_program_btf(&prog, &base).is_none(),
1630            "btf_id==0 must short-circuit to None before any bpf() call",
1631        );
1632    }
1633
1634    /// Hand-build a minimal parseable BTF blob: a single `int` type
1635    /// (id 1, named "u64", 8 bytes) plus a NUL-led string table,
1636    /// wrapped in the 24-byte BTF header. Layout verified against the
1637    /// in-tree `src/monitor/cast_analysis/tests/mod.rs::build_btf`
1638    /// minimal path (the `empty_btf_no_panic` test proves a
1639    /// single-Int blob parses via `Btf::from_bytes`).
1640    fn minimal_btf() -> Btf {
1641        // String table: leading NUL (offset 0 = anonymous) + "u64\0".
1642        let mut strings: Vec<u8> = vec![0];
1643        let n_u64 = strings.len() as u32;
1644        strings.extend_from_slice(b"u64\0");
1645
1646        // Type section: one BTF_KIND_INT (kind 1).
1647        let mut type_section: Vec<u8> = Vec::new();
1648        const BTF_KIND_INT: u32 = 1;
1649        type_section.extend_from_slice(&n_u64.to_le_bytes()); // name_off
1650        let info = (BTF_KIND_INT << 24) & 0x1f00_0000; // vlen 0
1651        type_section.extend_from_slice(&info.to_le_bytes());
1652        type_section.extend_from_slice(&8u32.to_le_bytes()); // size = 8
1653        // btf_int data word: encoding 0, offset 0, bits 64.
1654        let int_data: u32 = 64;
1655        type_section.extend_from_slice(&int_data.to_le_bytes());
1656
1657        let type_len = type_section.len() as u32;
1658        let str_len = strings.len() as u32;
1659
1660        let mut blob: Vec<u8> = Vec::new();
1661        blob.extend_from_slice(&0xEB9F_u16.to_le_bytes()); // magic
1662        blob.push(1); // version
1663        blob.push(0); // flags
1664        blob.extend_from_slice(&24u32.to_le_bytes()); // hdr_len
1665        blob.extend_from_slice(&0u32.to_le_bytes()); // type_off
1666        blob.extend_from_slice(&type_len.to_le_bytes()); // type_len
1667        blob.extend_from_slice(&type_len.to_le_bytes()); // str_off = type_len
1668        blob.extend_from_slice(&str_len.to_le_bytes()); // str_len
1669        blob.extend_from_slice(&type_section);
1670        blob.extend_from_slice(&strings);
1671
1672        Btf::from_bytes(&blob).expect("minimal synthetic BTF must parse")
1673    }
1674}
ktstr/monitor/bpf_syscall.rs

ktstr/monitor/
bpf_syscall.rs