ktstr/monitor/bpf_syscall.rs
1//! Live-host BPF map accessor backed by the `bpf(2)` syscall.
2//!
3//! Companion to [`super::bpf_map::GuestMemMapAccessor`]: same trait
4//! ([`super::bpf_map::BpfMapAccessor`]), different data path. Where
5//! GuestMemMapAccessor walks frozen guest physical memory via PTE
6//! resolution against `init_mm`, this backend talks directly to the
7//! running host kernel through the `bpf()` syscall — KASLR is fully
8//! abstracted, no symbol resolution required, no page-walk math.
9//!
10//! # Backend differences vs. guest-memory path
11//!
12//! | concern | GuestMemMapAccessor | BpfSyscallAccessor |
13//! |----------------|----------------------------------------------------------------|------------------------------------------------------------------------------|
14//! | discovery | walk `map_idr` xarray in guest memory | `BPF_MAP_GET_NEXT_ID` + `BPF_MAP_GET_FD_BY_ID` loop |
15//! | array values | follow `bpf_array.value` flex array via PTE walks | `BPF_MAP_LOOKUP_ELEM(fd, &key=0, buf)` returns the inline value bytes |
16//! | hash iteration | walk `bpf_htab.buckets` directly (freeze rendezvous = sync) | `BPF_MAP_GET_NEXT_KEY` + `BPF_MAP_LOOKUP_ELEM` per key (kernel RCU read-side) |
17//! | per-CPU array | read each CPU's slot via `__per_cpu_offset[cpu]` | one `BPF_MAP_LOOKUP_ELEM` returns `nr_possible_cpus * value_size` bytes |
18//! | arena | walk `bpf_arena -> kern_vm -> vm_struct.addr` PTE-by-PTE | `mmap(arena_fd, ...)` — `lookup_elem` returns `-EINVAL` on arena |
19//! | program BTF | read split-BTF blob from guest memory | `BPF_BTF_GET_FD_BY_ID` + `BPF_OBJ_GET_INFO_BY_FD` to extract BTF bytes |
20//!
21//! # Map fd pinning
22//!
23//! Every map discovered at construction time has its fd held open for
24//! the lifetime of the accessor. The kernel's
25//! `bpf_map_put`/`atomic64_dec_and_test` (`kernel/bpf/syscall.c`) only
26//! frees a map when its refcount reaches zero, and userspace fds count
27//! as references. This means the scheduler can exit and tear down its
28//! struct_ops link while the accessor is still iterating maps — the
29//! underlying memory stays valid.
30//!
31//! # Required capabilities
32//!
//! `BPF_MAP_GET_NEXT_ID` and `BPF_MAP_GET_FD_BY_ID` both require
//! `CAP_SYS_ADMIN`. `CAP_BPF` (split out of `CAP_SYS_ADMIN` in Linux
//! 5.8) is enough for some other `bpf()` commands, but not for the
//! id-walk commands used here. ktstr always runs
36//! as root in the test environment, so this is a non-issue for the
37//! library's primary consumer; the `from_running_kernel` constructor
38//! surfaces the kernel's `EPERM` directly so live-host CLI use cases
39//! can produce a clear error.
40//!
41//! # Lock-free reads
42//!
43//! Without a freeze rendezvous, the kernel's per-element atomicity is
44//! the only ordering primitive. Per-element u64-aligned fields are
45//! atomic on x86_64; multi-element transactions the scheduler intended
46//! to commit atomically may surface as torn views relative to the
47//! walker. This is identical to the guest-memory backend's torn-read
48//! behavior, just for a different reason. Two-snapshot in-BPF capture
49//! (bpf_timer + tp_btf) is the recommended remedy and lives outside
50//! this backend.
51
52use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
53use std::ptr;
54
55use anyhow::{Context, Result, anyhow};
56use btf_rs::Btf;
57
58use super::arena::{ArenaPage, ArenaSnapshot, BpfArenaOffsets};
59use super::bpf_map::{
60 BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_LRU_HASH,
61 BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_PERCPU_HASH,
62 BPF_MAP_TYPE_STRUCT_OPS, BpfMapAccessor, BpfMapInfo, MAP_MATERIALIZE_MAX,
63};
64
// `bpf(2)` command numbers. Each value is the command's position in
// `include/uapi/linux/bpf.h::enum bpf_cmd`, which starts at
// `BPF_MAP_CREATE = 0`.

/// `BPF_MAP_LOOKUP_ELEM` (enum position 1): copy one map value into a
/// userspace buffer.
const BPF_MAP_LOOKUP_ELEM: u32 = 1;
/// `BPF_MAP_GET_NEXT_KEY` (enum position 4): step the hash iteration
/// cursor to the following key.
const BPF_MAP_GET_NEXT_KEY: u32 = 4;
/// `BPF_MAP_GET_NEXT_ID` (enum position 12): step the kernel's map id
/// walk.
const BPF_MAP_GET_NEXT_ID: u32 = 12;
/// `BPF_MAP_GET_FD_BY_ID` (enum position 14): open an fd on a map
/// given its id, pinning it.
const BPF_MAP_GET_FD_BY_ID: u32 = 14;
/// `BPF_OBJ_GET_INFO_BY_FD` (enum position 15): read map/BTF metadata
/// through an open fd.
const BPF_OBJ_GET_INFO_BY_FD: u32 = 15;
/// `BPF_BTF_GET_FD_BY_ID` (enum position 19): open an fd on a BTF
/// object given its id. `BPF_BTF_LOAD` is entry 18, so this is the
/// entry right after it.
const BPF_BTF_GET_FD_BY_ID: u32 = 19;

/// `BPF_OBJ_NAME_LEN` from `include/uapi/linux/bpf.h`: byte length of
/// the fixed-size object name field.
const BPF_OBJ_NAME_LEN: usize = 16;
83
/// Arena page size assumed when `sysconf(_SC_PAGESIZE)` cannot be
/// queried (4 KiB). On Linux that query does not fail, so this is a
/// fallback only.
///
/// The authoritative unit is the base `PAGE_SIZE` of the kernel that
/// owns the arena: `arena_map_alloc` sizes the arena as
/// `vm_range = max_entries * PAGE_SIZE` and `arena_vm_fault` faults in
/// `PAGE_SIZE` steps. That size is architecture-dependent (4 KiB on
/// x86_64; 16 KiB or 64 KiB base granules exist on aarch64) and is
/// unrelated to THP/hugetlb sizes. `read_arena_pages` obtains the live
/// value through `host_page_size` so the mmap length lines up with the
/// kernel's `user_vm_end` on every architecture; the guest-memory
/// backend does the equivalent with `guest_page_size(tcr_el1)`
/// (`src/monitor/arena.rs`).
const ARENA_PAGE_SIZE: usize = 4 * 1024;
95
96/// Page size of the kernel that owns the arena fd, via
97/// `sysconf(_SC_PAGESIZE)`. `read_arena_pages` mmaps the arena fd in
98/// the process holding it, so that process always runs on the arena's
99/// own kernel — the guest VM kernel in the in-guest monitor path
100/// (where scx-ktstr's arena lives), or the host kernel in live-host
101/// mode. That kernel created the arena with `vm_range = max_entries *
102/// PAGE_SIZE`, so this is exactly the unit that makes the mmap length
103/// match `user_vm_end`. Falls back to `ARENA_PAGE_SIZE` (4 KiB) only
104/// if the query fails, which it cannot on Linux.
105fn host_page_size() -> usize {
106 // SAFETY: `sysconf` with a valid name has no preconditions and
107 // writes through no pointer.
108 let v = unsafe { libc::sysconf(libc::_SC_PAGESIZE) };
109 if v > 0 { v as usize } else { ARENA_PAGE_SIZE }
110}
111
112/// mmap placement for an arena read: `(addr_hint, flags, length)`
113/// where `addr_hint == 0` means NULL (let the kernel choose the VA).
114///
115/// When the arena was created with a nonzero `map_extra` (scx
116/// schedulers do, via `lib/arena_map.h`), the kernel pins
117/// `user_vm_start`/`user_vm_end`, and `arena_map_mmap`
118/// (`kernel/bpf/arena.c`) rejects any mapping whose start != map_extra
119/// OR whose end != map_extra + full arena span with `-EBUSY`. So the
120/// read must land at exactly `map_extra` with `MAP_FIXED_NOREPLACE`
121/// and span the full `declared_bytes` — not the capped read window.
122/// When `user_vm_start == 0` the kernel adopts our VA, so a NULL hint
123/// plus the capped prefix is correct and bounds host address-space use.
124fn arena_mmap_placement(
125 user_vm_start: u64,
126 declared_bytes: usize,
127 read_bytes: usize,
128) -> (usize, i32, usize) {
129 if user_vm_start != 0 {
130 (
131 user_vm_start as usize,
132 libc::MAP_SHARED | libc::MAP_FIXED_NOREPLACE,
133 declared_bytes,
134 )
135 } else {
136 (0, libc::MAP_SHARED, read_bytes)
137 }
138}
139
/// Upper bound (4 GiB) on the bytes an arena snapshot will read via
/// mmap. Mirrors the guest-memory backend's `MAX_VM_RANGE_BYTES` and
/// stops a runaway `max_entries` from triggering an unbounded
/// multi-GiB read.
const MAX_ARENA_BYTES: u64 = 4 << 30;
144
/// Upper bound (16 Ki pages) on how many arena pages the mmap span
/// covers. Anything past the cap is dropped and reported through
/// [`ArenaSnapshot::truncated`]; it is not stride-probed, because mmap
/// already exposes the whole window and this backend has no stride
/// sweep.
///
/// This intentionally differs from the guest-memory backend, which
/// pairs a sequential cap (`MAX_ARENA_PAGES = 4096` in
/// `src/monitor/arena.rs`) with a stride-probe sweep for pages beyond
/// it.
const MAX_ARENA_PAGES: u64 = 16 << 10;
153
154// `bpf_attr` is a uapi union with many command-specific shapes. Rather
155// than declare the full union we lay out per-command structs covering
156// the fields each command reads, in uapi field order; some are a prefix
157// of the full arm (e.g. `BpfAttrGetId` omits the trailing token fd). The
158// kernel does NOT match the passed size against a per-arm length:
159// `__sys_bpf` (kernel/bpf/syscall.c) calls `bpf_check_uarg_tail_zero`,
160// clamps `size = min(size, sizeof(union bpf_attr))`, zero-fills `attr`,
161// then dispatches on `cmd`. Any size up to `sizeof(union bpf_attr)` is
162// accepted provided bytes past `size` are zero; we pass
163// `size_of::<arm>()` and the kernel zero-fills the union tail we omit.
164
/// `bpf_attr` arm for `BPF_MAP_*_ELEM` and `BPF_MAP_GET_NEXT_KEY`.
/// Source: `include/uapi/linux/bpf.h::union bpf_attr` (the
/// MAP_ELEM_OPS arm).
///
/// `#[repr(C)]` fixes the field order so the struct can be handed to
/// the kernel as raw bytes. Userspace addresses travel as `u64`
/// integers rather than Rust pointers.
#[repr(C)]
#[derive(Default)]
struct BpfAttrMapElem {
    /// File descriptor of the map the command operates on.
    map_fd: u32,
    /// Explicit padding between the 32-bit fd and the 64-bit `key`
    /// field; always written as 0.
    _pad0: u32,
    /// Userspace address of the key bytes. The hash walk in this file
    /// passes 0 (NULL) to `BPF_MAP_GET_NEXT_KEY` to request the first
    /// key.
    key: u64,
    /// Userspace address of the output buffer: the value for
    /// `BPF_MAP_LOOKUP_ELEM`, the next key for `BPF_MAP_GET_NEXT_KEY`.
    value_or_next_key: u64,
    /// Command flags; the call sites visible in this file pass 0.
    flags: u64,
}
177
/// `bpf_attr` arm for `BPF_MAP_GET_NEXT_ID`, `BPF_BTF_GET_NEXT_ID`,
/// and the corresponding `*_GET_FD_BY_ID` commands.
///
/// `#[repr(C)]` fixes the field order so the struct can be handed to
/// the kernel as raw bytes.
#[repr(C)]
#[derive(Default)]
struct BpfAttrGetId {
    /// `start_id` for `*_GET_NEXT_ID`; `id` for `*_GET_FD_BY_ID`.
    id_or_start_id: u32,
    /// Output of `*_GET_NEXT_ID`: the kernel stores the next live id
    /// here. Left at 0 for `*_GET_FD_BY_ID`.
    next_id: u32,
    /// Open flags for `*_GET_FD_BY_ID`; the map walk in this file
    /// passes 0.
    open_flags: u32,
}
188
/// `bpf_attr` arm for `BPF_OBJ_GET_INFO_BY_FD`.
///
/// `#[repr(C)]` fixes the field order so the struct can be handed to
/// the kernel as raw bytes.
#[repr(C)]
#[derive(Default)]
struct BpfAttrInfoByFd {
    /// File descriptor of the object (map or BTF) being described.
    bpf_fd: u32,
    /// Size in bytes of the buffer `info` points at.
    info_len: u32,
    /// Userspace address of the info struct the kernel fills in,
    /// carried as an integer.
    info: u64,
}
197
/// `struct bpf_map_info` from `include/uapi/linux/bpf.h`. The kernel
/// has grown this struct over time; we pass our struct size as
/// `info_len` and the kernel zero-fills any tail it doesn't fill in.
/// All fields are documented in the kernel header.
///
/// `#[repr(C)]` keeps the fields in uapi order; `Default` provides the
/// all-zero buffer that is handed to `BPF_OBJ_GET_INFO_BY_FD`.
#[repr(C)]
#[derive(Default)]
struct BpfMapInfoUapi {
    map_type: u32,
    id: u32,
    key_size: u32,
    value_size: u32,
    max_entries: u32,
    map_flags: u32,
    /// Object name bytes; `obj_get_info_map` treats the first NUL (or
    /// the full `BPF_OBJ_NAME_LEN` when none is present) as the end.
    name: [u8; BPF_OBJ_NAME_LEN],
    ifindex: u32,
    btf_vmlinux_value_type_id: u32,
    netns_dev: u64,
    netns_ino: u64,
    btf_id: u32,
    btf_key_type_id: u32,
    btf_value_type_id: u32,
    /// Occupies the slot of the kernel field `btf_vmlinux_id` per
    /// `include/uapi/linux/bpf.h::struct bpf_map_info`. The accessor
    /// discards the value, so the field is named `_pad` here instead
    /// of carrying the kernel name; give it a real name only when a
    /// consumer actually reads it.
    _pad: u32,
    /// Raw `map_extra`; `obj_get_info_map` returns it alongside the
    /// parsed info for the arena mmap path.
    map_extra: u64,
}
227
/// `struct bpf_btf_info` from `include/uapi/linux/bpf.h`. Used to
/// extract a BTF blob's bytes given an open BTF fd.
///
/// `#[repr(C)]` keeps the fields in uapi order. The two `u64` fields
/// hold userspace addresses as integers.
// NOTE(review): no user of this struct is visible in this part of the
// file; the per-field notes below follow the uapi header naming —
// confirm against the BTF read path.
#[repr(C)]
#[derive(Default)]
struct BpfBtfInfoUapi {
    /// Address of the buffer that receives the BTF bytes.
    btf: u64,
    /// Size of the `btf` buffer in bytes.
    btf_size: u32,
    /// BTF object id.
    id: u32,
    /// Address of the buffer that receives the BTF object's name.
    name: u64,
    /// Size of the `name` buffer in bytes.
    name_len: u32,
    /// Presumably nonzero for kernel-provided BTF — verify against the
    /// uapi header before relying on it.
    kernel_btf: u32,
}
240
241/// Raw `bpf(2)` syscall wrapper. Returns the kernel's return value as
242/// `i64` so callers can check for `< 0` and inspect `errno`. The
243/// kernel's `__sys_bpf` (`kernel/bpf/syscall.c`) accepts any `size` up
244/// to `sizeof(union bpf_attr)`: `bpf_check_uarg_tail_zero` rejects only
245/// bytes past `size` that are nonzero, then it clamps to
246/// `sizeof(union bpf_attr)`, zero-fills the rest, and dispatches on
247/// `cmd` — there is no per-arm length match.
248///
249/// SAFETY: `attr_ptr` must point to `attr_size` valid bytes laid out as
250/// the command's `bpf_attr` arm (or a zero-tailed prefix of it). A size
251/// smaller than the command needs is accepted — the kernel zero-fills
252/// the omitted fields — so the caller, not the kernel, must supply every
253/// field the command requires. A size above `PAGE_SIZE`, or one whose
254/// bytes past the union are nonzero, returns `-E2BIG`.
255unsafe fn bpf_syscall(cmd: u32, attr_ptr: *const u8, attr_size: usize) -> i64 {
256 // SAFETY: caller must ensure attr_ptr/attr_size validity. The
257 // syscall itself is signal-safe and reentrant.
258 unsafe { libc::syscall(libc::SYS_bpf, cmd as i64, attr_ptr, attr_size) as i64 }
259}
260
261/// Wrap a `bpf()` syscall result in a `Result<RawFd>` for commands
262/// that return an fd. Negative returns are converted to errno-bearing
263/// errors; non-negative returns become the fd.
264fn bpf_call_fd(cmd: u32, attr_ptr: *const u8, attr_size: usize) -> Result<RawFd> {
265 // SAFETY: caller has built attr_ptr/attr_size correctly per the
266 // command's bpf_attr arm.
267 let ret = unsafe { bpf_syscall(cmd, attr_ptr, attr_size) };
268 if ret < 0 {
269 let err = std::io::Error::last_os_error();
270 Err(anyhow!("bpf({cmd}) failed: {err}"))
271 } else {
272 Ok(ret as RawFd)
273 }
274}
275
276/// Wrap a `bpf()` syscall result for commands that return 0 on
277/// success, `< 0` on error.
278fn bpf_call_status(cmd: u32, attr_ptr: *const u8, attr_size: usize) -> Result<()> {
279 // SAFETY: caller has built attr_ptr/attr_size correctly.
280 let ret = unsafe { bpf_syscall(cmd, attr_ptr, attr_size) };
281 if ret < 0 {
282 let err = std::io::Error::last_os_error();
283 Err(anyhow!("bpf({cmd}) failed: {err}"))
284 } else {
285 Ok(())
286 }
287}
288
/// One discovered map together with its pinned fd. The `OwnedFd`
/// guarantees the map's refcount stays >0 for the accessor's
/// lifetime — even if the scheduler exits and userspace tear-down
/// runs, `bpf_map_put` only frees when every fd is dropped (see
/// `kernel/bpf/syscall.c` `bpf_map_put`).
struct PinnedMap {
    /// Metadata parsed from `BPF_OBJ_GET_INFO_BY_FD` at construction
    /// time; this is what `maps()` hands out and what `pinned_for`
    /// matches against.
    info: BpfMapInfo,
    /// Owning handle for the map fd; closed when this entry drops.
    fd: OwnedFd,
    /// Raw `map_extra` from the kernel info struct. Arena maps
    /// hardcode this to a deterministic mmap target address (x86:
    /// `1<<44`, aarch64: `1<<32`) per `lib/arena_map.h`. Surfaced
    /// here so the arena mmap path can use `MAP_FIXED_NOREPLACE` at
    /// the kernel-blessed address rather than letting `mmap` pick
    /// one — which would diverge from what BPF programs see.
    map_extra: u64,
}
305
/// Live-host BPF map accessor.
///
/// Construction enumerates every map id reachable via
/// `BPF_MAP_GET_NEXT_ID`, opens an fd for each via
/// `BPF_MAP_GET_FD_BY_ID`, and caches the metadata. The fd vector is
/// held for the accessor's lifetime so the maps cannot be freed
/// underneath us — even if the scheduler exits and tears down its
/// struct_ops link mid-walk.
///
/// Selectively populating the cache is intentional: the same trait
/// surface accepts a `BpfMapInfo` argument on every method, so an
/// accessor that holds only the maps a particular failure dump cares
/// about (filtered by name suffix at construction time) is just as
/// valid as one that holds every map on the system. The
/// `from_running_kernel_filtered` constructor exposes that knob.
// NOTE(review): `dead_code` is allowed presumably because not every
// build configuration constructs this backend — confirm and record
// the reason here.
#[allow(dead_code)]
pub struct BpfSyscallAccessor {
    /// Maps kept at construction time, in the order the id walk
    /// returned them. Lookups by `BpfMapInfo` scan this list and take
    /// the first name match.
    maps: Vec<PinnedMap>,
}
325
326impl BpfSyscallAccessor {
327 /// Discover and pin every BPF map currently visible to the
328 /// running kernel.
329 ///
330 /// Walks the kernel's id space via `BPF_MAP_GET_NEXT_ID` (starting
331 /// from id 0), pinning each map with `BPF_MAP_GET_FD_BY_ID` and
332 /// fetching its metadata via `BPF_OBJ_GET_INFO_BY_FD`. Maps that
333 /// disappear between the `NEXT_ID` and `GET_FD_BY_ID` calls (a
334 /// concurrent scheduler unload, for instance) are silently
335 /// skipped — that race is inherent to live-host enumeration and
336 /// is not an error.
337 ///
338 /// Requires `CAP_SYS_ADMIN`. ktstr always runs as root in the
339 /// test environment so this is a non-issue for the primary
340 /// consumer; live-host CLI users that hit `EPERM` will see it
341 /// in the returned error.
342 #[allow(dead_code)]
343 pub fn from_running_kernel() -> Result<Self> {
344 Self::from_running_kernel_filtered(|_info: &BpfMapInfo| true)
345 }
346
347 /// Discover and pin every BPF map for which `predicate` returns
348 /// `true`. Maps that fail the predicate are closed (their fds
349 /// drop) so the kernel can free them as usual.
350 ///
351 /// Useful when the caller knows which maps the failure dump will
352 /// touch — typically the scheduler's named maps that match a
353 /// specific suffix — and wants to avoid pinning hundreds of
354 /// unrelated maps that happen to be alive (cilium, systemd,
355 /// other workloads).
356 #[allow(dead_code)]
357 pub fn from_running_kernel_filtered<F>(mut predicate: F) -> Result<Self>
358 where
359 F: FnMut(&BpfMapInfo) -> bool,
360 {
361 let mut maps: Vec<PinnedMap> = Vec::new();
362 let mut start_id: u32 = 0;
363
364 loop {
365 // The kernel writes `next_id` via the syscall's raw pointer
366 // path, but Rust's borrow checker doesn't see that — it
367 // sees the struct as never mutated through a Rust binding.
368 // Declare mut anyway so the compiler treats `attr.next_id`
369 // as written, then read it back through a raw read after
370 // the syscall returns.
371 let mut attr = BpfAttrGetId {
372 id_or_start_id: start_id,
373 next_id: 0,
374 open_flags: 0,
375 };
376 // SAFETY: BpfAttrGetId is repr(C) with the exact layout the
377 // kernel expects for *_GET_NEXT_ID.
378 let res = unsafe {
379 bpf_syscall(
380 BPF_MAP_GET_NEXT_ID,
381 &raw mut attr as *const u8,
382 std::mem::size_of::<BpfAttrGetId>(),
383 )
384 };
385 if res < 0 {
386 let err = std::io::Error::last_os_error();
387 if err.raw_os_error() == Some(libc::ENOENT) {
388 break;
389 }
390 return Err(anyhow!("BPF_MAP_GET_NEXT_ID failed: {err}"));
391 }
392
393 let next_id = attr.next_id;
394 // Defensive: kernel returned 0 for `next_id` somehow.
395 // Shouldn't happen on success, but guard against an
396 // infinite loop.
397 if next_id == 0 {
398 break;
399 }
400 // Advance start_id for the next iteration BEFORE the
401 // get-fd-by-id call so a transient EPERM/ENOENT on a
402 // single id doesn't wedge the walk.
403 start_id = next_id;
404
405 // Try to pin the map. ENOENT here means the map was
406 // freed between the NEXT_ID and GET_FD_BY_ID calls. The
407 // kernel doesn't write to this attr (GET_FD_BY_ID is
408 // input-only), so the binding is plain (no mut).
409 let fd_attr = BpfAttrGetId {
410 id_or_start_id: next_id,
411 next_id: 0,
412 open_flags: 0,
413 };
414 let fd_ret = unsafe {
415 bpf_syscall(
416 BPF_MAP_GET_FD_BY_ID,
417 &raw const fd_attr as *const u8,
418 std::mem::size_of::<BpfAttrGetId>(),
419 )
420 };
421 if fd_ret < 0 {
422 // A failed `BPF_MAP_GET_FD_BY_ID` skips this map and
423 // keeps walking — a single bad map must not abort
424 // enumeration. The error categories matter for
425 // diagnostics, so surface non-ENOENT cases via
426 // tracing rather than silently dropping them:
427 //
428 // - `ENOENT`: the map was freed between
429 // `GET_NEXT_ID` and `GET_FD_BY_ID`. Routine
430 // under churn; suppressed at `debug` level so the
431 // normal log stays quiet.
432 // - `EPERM`: missing CAP_SYS_ADMIN / CAP_BPF for
433 // this map (e.g. a kernel-internal map a less-
434 // privileged caller can't pin). Logged at `warn`
435 // so an operator who expects to see the map knows
436 // why it's missing.
437 // - `EBADF` / others: a kernel-side state error.
438 // Logged at `warn` with the errno so the operator
439 // can correlate against `dmesg`.
440 let err = std::io::Error::last_os_error();
441 let raw = err.raw_os_error().unwrap_or(0);
442 if raw == libc::ENOENT {
443 tracing::debug!(
444 map_id = next_id,
445 "BPF_MAP_GET_FD_BY_ID: map vanished mid-walk (ENOENT); skipping"
446 );
447 } else {
448 tracing::warn!(
449 map_id = next_id,
450 errno = raw,
451 error = %err,
452 "BPF_MAP_GET_FD_BY_ID failed; skipping this map but continuing the walk"
453 );
454 }
455 continue;
456 }
457 // SAFETY: fd_ret >= 0; the kernel guarantees a valid fd
458 // for non-negative returns.
459 let fd = unsafe { OwnedFd::from_raw_fd(fd_ret as RawFd) };
460
461 // Fetch info to populate BpfMapInfo + decide whether to
462 // keep the fd. A failure here means the map's metadata
463 // can't be read (kernel-side state error or fd was
464 // closed mid-walk); surface it via tracing so the
465 // operator sees the correlation rather than a silently
466 // dropped map.
467 let (info, map_extra) = match obj_get_info_map(fd.as_raw_fd()) {
468 Ok(pair) => pair,
469 Err(e) => {
470 tracing::warn!(
471 map_id = next_id,
472 error = %e,
473 "BPF_OBJ_GET_INFO_BY_FD failed for pinned map; skipping"
474 );
475 continue;
476 }
477 };
478
479 // Hand the predicate a BpfMapInfo for the keep/discard
480 // decision. Discarded fds drop here.
481 if !predicate(&info) {
482 continue;
483 }
484
485 maps.push(PinnedMap {
486 info,
487 fd,
488 map_extra,
489 });
490 }
491
492 Ok(Self { maps })
493 }
494
495 /// Number of pinned maps currently held. Test helper.
496 #[cfg(test)]
497 #[allow(dead_code)]
498 pub(crate) fn pinned_count(&self) -> usize {
499 self.maps.len()
500 }
501
502 /// Look up the pinned fd for a map identified by its
503 /// `BpfMapInfo`. Returns `None` when no pinned map matches.
504 ///
505 /// Match key: `name` field (via [`info_name_matches`]). Map ids
506 /// would be more precise but they're not part of `BpfMapInfo`
507 /// today (a known follow-up if the live-host backend grows other
508 /// consumers); within a single scheduler instance, names are
509 /// unique and stable for the duration of the run.
510 fn pinned_for(&self, target: &BpfMapInfo) -> Option<&PinnedMap> {
511 self.maps
512 .iter()
513 .find(|p| info_name_matches(&p.info, target))
514 }
515}
516
517/// Match key for [`BpfSyscallAccessor::pinned_for`] and the
518/// construction-time predicate filter: two `BpfMapInfo`s identify the
519/// same map iff their active name bytes
520/// ([`BpfMapInfo::name_bytes_active`]) are byte-equal. Extracted as a
521/// free fn so the keep/discard semantics are exercisable over a
522/// hand-built `&[BpfMapInfo]` fixture without the live-kernel walk.
523fn info_name_matches(a: &BpfMapInfo, b: &BpfMapInfo) -> bool {
524 a.name_bytes_active() == b.name_bytes_active()
525}
526
/// Test-only mirror of the keep/discard step in
/// [`BpfSyscallAccessor::from_running_kernel_filtered`].
///
/// Returns references to the entries of `infos` that `predicate`
/// accepts, in their original order. The production constructor
/// applies the same gate inline to each map it has just fetched from
/// the kernel; this function lets a test pin that contract against a
/// deterministic fixture. `predicate` is invoked once per entry, in
/// order.
#[cfg(test)]
fn select_keeping<F>(infos: &[BpfMapInfo], mut predicate: F) -> Vec<&BpfMapInfo>
where
    F: FnMut(&BpfMapInfo) -> bool,
{
    let mut kept = Vec::new();
    for info in infos {
        if predicate(info) {
            kept.push(info);
        }
    }
    kept
}
541
542/// Fetch `bpf_map_info` for an open map fd via
543/// `BPF_OBJ_GET_INFO_BY_FD`. Returns the populated [`BpfMapInfo`]
544/// alongside the raw `map_extra` field — the latter is needed by the
545/// arena mmap path but doesn't fit on the cross-backend
546/// [`BpfMapInfo`] surface (the guest-memory path doesn't use it).
547fn obj_get_info_map(fd: RawFd) -> Result<(BpfMapInfo, u64)> {
548 let mut info = BpfMapInfoUapi::default();
549 let attr = BpfAttrInfoByFd {
550 bpf_fd: fd as u32,
551 info_len: std::mem::size_of::<BpfMapInfoUapi>() as u32,
552 info: &raw mut info as u64,
553 };
554 bpf_call_status(
555 BPF_OBJ_GET_INFO_BY_FD,
556 &raw const attr as *const u8,
557 std::mem::size_of::<BpfAttrInfoByFd>(),
558 )
559 .context("BPF_OBJ_GET_INFO_BY_FD on map fd")?;
560
561 let nul = info
562 .name
563 .iter()
564 .position(|&b| b == 0)
565 .unwrap_or(BPF_OBJ_NAME_LEN);
566 let mut name_bytes = [0u8; BPF_OBJ_NAME_LEN];
567 name_bytes.copy_from_slice(&info.name);
568
569 Ok((
570 BpfMapInfo {
571 // map_pa / map_kva / value_kva are guest-memory concepts
572 // that don't apply on the live host. Populating with 0
573 // is fine — the live-host backend's read paths route
574 // through the pinned fd, not these fields.
575 map_pa: 0,
576 map_kva: 0,
577 name_bytes,
578 name_len: nul as u8,
579 map_type: info.map_type,
580 map_flags: info.map_flags,
581 key_size: info.key_size,
582 value_size: info.value_size,
583 max_entries: info.max_entries,
584 value_kva: None,
585 // btf_kva is similarly a guest-memory locator. Live-host
586 // BTF resolution goes through `btf_id` →
587 // `BPF_BTF_GET_FD_BY_ID` instead.
588 btf_kva: u64::from(info.btf_id),
589 btf_value_type_id: info.btf_value_type_id,
590 // bpf(2) `BPF_OBJ_GET_INFO_BY_FD` does not surface
591 // `btf_vmlinux_value_type_id` directly; the live-host
592 // backend would need a parallel resolution path
593 // (BPF_BTF_GET_INFO_BY_ID + walk the wrapper) to expose
594 // it. Until that lands, leave 0 — the dump's STRUCT_OPS
595 // arm falls through to hex on a zero id, matching the
596 // behavior on guest-memory maps without struct_ops
597 // BTF support.
598 btf_vmlinux_value_type_id: 0,
599 btf_key_type_id: info.btf_key_type_id,
600 },
601 info.map_extra,
602 ))
603}
604
605impl BpfMapAccessor for BpfSyscallAccessor {
606 fn maps(&self) -> Vec<BpfMapInfo> {
607 self.maps.iter().map(|p| p.info.clone()).collect()
608 }
609
610 fn read_value(&self, map: &BpfMapInfo, offset: usize, len: usize) -> Option<Vec<u8>> {
611 let pinned = self.pinned_for(map)?;
612
613 // The live-host backend supports single-buffer value reads on
614 // ARRAY (key=0 returns inline value bytes) and STRUCT_OPS
615 // (key=0 returns the populated `bpf_struct_ops_value`). HASH
616 // goes through `iter_hash_map`; PERCPU_ARRAY through
617 // `read_percpu_array`; ARENA through `read_arena_pages`. Any
618 // other type falls through to None so the dump renderer can
619 // surface a specific reason.
620 //
621 // STRUCT_OPS quirk: the in-kernel
622 // `bpf_struct_ops_map_lookup_elem` returns -EINVAL
623 // (`kernel/bpf/bpf_struct_ops.c:518`), but the syscall path
624 // `bpf_struct_ops_map_sys_lookup_elem`
625 // (`kernel/bpf/bpf_struct_ops.c::bpf_struct_ops_map_sys_lookup_elem`)
626 // implements its own lookup, copying the kernel's
627 // `bpf_struct_ops_value` (refcnt + state + the registered
628 // kernel struct) into the userspace buffer. The kernel-only
629 // `lookup_elem` call is the in-program path; userspace
630 // syscalls reach the sys variant.
631 if map.map_type != BPF_MAP_TYPE_ARRAY && map.map_type != BPF_MAP_TYPE_STRUCT_OPS {
632 return None;
633 }
634
635 // Build the lookup. ARRAY and STRUCT_OPS both use a u32 key;
636 // STRUCT_OPS only ever has one entry (key=0).
637 let mut key: u32 = 0;
638 let mut buf = vec![0u8; map.value_size as usize];
639 let attr = BpfAttrMapElem {
640 map_fd: pinned.fd.as_raw_fd() as u32,
641 _pad0: 0,
642 key: &raw mut key as u64,
643 value_or_next_key: buf.as_mut_ptr() as u64,
644 flags: 0,
645 };
646 bpf_call_status(
647 BPF_MAP_LOOKUP_ELEM,
648 &raw const attr as *const u8,
649 std::mem::size_of::<BpfAttrMapElem>(),
650 )
651 .ok()?;
652
653 // Slice into the requested window. Out-of-bounds offsets
654 // return None to mirror the guest-memory backend's behavior
655 // when a value-region read straddles an unmapped page.
656 let end = offset.checked_add(len)?;
657 if end > buf.len() {
658 return None;
659 }
660 Some(buf[offset..end].to_vec())
661 }
662
663 fn read_array(&self, map: &BpfMapInfo, key: u32) -> Option<Vec<u8>> {
664 let pinned = self.pinned_for(map)?;
665 // ARRAY only. STRUCT_OPS and single-entry global-section maps
666 // go through read_value (key 0); HASH/PERCPU/ARENA have their
667 // own methods. Replicate array_map_lookup_elem's pre-mask
668 // `index >= max_entries` rejection (the kernel's index_mask is
669 // a Spectre bound, not a range check).
670 if map.map_type != BPF_MAP_TYPE_ARRAY {
671 return None;
672 }
673 if key >= map.max_entries {
674 return None;
675 }
676 // BPF_MAP_LOOKUP_ELEM copies value_size bytes for a plain
677 // ARRAY (copy_map_value) — no per-entry stride padding, unlike
678 // the PERCPU_ARRAY path which returns nr_cpus * round_up_8.
679 // Pass the entry index as the key.
680 let mut k: u32 = key;
681 // No MAX_VALUE_SIZE cap here (unlike the guest-memory
682 // `read_bpf_map_array_value`): value_size is sourced from
683 // BPF_OBJ_GET_INFO_BY_FD (kernel-validated metadata), not
684 // corruptible guest DRAM, so the kernel's own value_size
685 // validation guards this allocation.
686 let mut buf = vec![0u8; map.value_size as usize];
687 let attr = BpfAttrMapElem {
688 map_fd: pinned.fd.as_raw_fd() as u32,
689 _pad0: 0,
690 key: &raw mut k as u64,
691 value_or_next_key: buf.as_mut_ptr() as u64,
692 flags: 0,
693 };
694 bpf_call_status(
695 BPF_MAP_LOOKUP_ELEM,
696 &raw const attr as *const u8,
697 std::mem::size_of::<BpfAttrMapElem>(),
698 )
699 .ok()?;
700 Some(buf)
701 }
702
703 fn iter_hash_map(&self, map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)> {
704 let Some(pinned) = self.pinned_for(map) else {
705 return Vec::new();
706 };
707 // HASH and LRU_HASH share the inline-value `htab_elem` layout
708 // (`kernel/bpf/hashtab.c::htab_elem_value`), and the kernel
709 // syscall path returns the value bytes directly for both —
710 // `bpf_map_copy_value` falls into the generic `map_lookup_elem`
711 // arm for them. Reject other map types so callers route
712 // PERCPU_HASH/LRU_PERCPU_HASH to `iter_percpu_hash_map` instead.
713 if map.map_type != BPF_MAP_TYPE_HASH && map.map_type != BPF_MAP_TYPE_LRU_HASH {
714 return Vec::new();
715 }
716
717 let key_sz = map.key_size as usize;
718 let val_sz = map.value_size as usize;
719 let mut out: Vec<(Vec<u8>, Vec<u8>)> = Vec::new();
720
721 // First key: pass NULL for the input key per `bpf(2)` man
722 // page — kernel returns the first key in the table.
723 let mut cur_key = vec![0u8; key_sz];
724 let mut next_key = vec![0u8; key_sz];
725
726 // Cap iterations at max_entries * 2 to bound a pathological
727 // walk on a torn table. RCU-protected reads on the kernel
728 // side are best-effort across concurrent updates.
729 let cap = (map.max_entries as usize).saturating_mul(2).max(1);
730 let mut got_first = false;
731 for _ in 0..cap {
732 // Get next key.
733 let attr = BpfAttrMapElem {
734 map_fd: pinned.fd.as_raw_fd() as u32,
735 _pad0: 0,
736 key: if got_first {
737 cur_key.as_ptr() as u64
738 } else {
739 0 // first call: NULL means "first key"
740 },
741 value_or_next_key: next_key.as_mut_ptr() as u64,
742 flags: 0,
743 };
744 let ret = unsafe {
745 bpf_syscall(
746 BPF_MAP_GET_NEXT_KEY,
747 &raw const attr as *const u8,
748 std::mem::size_of::<BpfAttrMapElem>(),
749 )
750 };
751 if ret < 0 {
752 // ENOENT marks end of iteration; anything else
753 // ends the walk silently with whatever was
754 // collected so far.
755 break;
756 }
757 got_first = true;
758
759 // Look up the value for next_key.
760 let mut value = vec![0u8; val_sz];
761 let lookup_attr = BpfAttrMapElem {
762 map_fd: pinned.fd.as_raw_fd() as u32,
763 _pad0: 0,
764 key: next_key.as_ptr() as u64,
765 value_or_next_key: value.as_mut_ptr() as u64,
766 flags: 0,
767 };
768 let lret = unsafe {
769 bpf_syscall(
770 BPF_MAP_LOOKUP_ELEM,
771 &raw const lookup_attr as *const u8,
772 std::mem::size_of::<BpfAttrMapElem>(),
773 )
774 };
775 if lret >= 0 {
776 out.push((next_key.clone(), value));
777 }
778 // Bound materialization at the renderer's cap (one-past so
779 // render's truncation check fires); see MAP_MATERIALIZE_MAX.
780 if out.len() > MAP_MATERIALIZE_MAX {
781 break;
782 }
783 // Advance cursor — even when lookup failed (the key
784 // disappeared between get_next_key and lookup_elem; a
785 // concurrent delete is inherent to live-host walking).
786 cur_key.copy_from_slice(&next_key);
787 }
788
789 out
790 }
791
792 fn read_percpu_array(&self, map: &BpfMapInfo, key: u32, num_cpus: u32) -> Vec<Option<Vec<u8>>> {
793 let Some(pinned) = self.pinned_for(map) else {
794 return Vec::new();
795 };
796 if map.map_type != BPF_MAP_TYPE_PERCPU_ARRAY {
797 return Vec::new();
798 }
799 if key >= map.max_entries {
800 return Vec::new();
801 }
802
803 let val_sz = map.value_size as usize;
804 let total = (num_cpus as usize).saturating_mul(val_sz);
805 let mut buf = vec![0u8; total];
806 let mut k: u32 = key;
807 let attr = BpfAttrMapElem {
808 map_fd: pinned.fd.as_raw_fd() as u32,
809 _pad0: 0,
810 key: &raw mut k as u64,
811 value_or_next_key: buf.as_mut_ptr() as u64,
812 flags: 0,
813 };
814 if bpf_call_status(
815 BPF_MAP_LOOKUP_ELEM,
816 &raw const attr as *const u8,
817 std::mem::size_of::<BpfAttrMapElem>(),
818 )
819 .is_err()
820 {
821 return vec![None; num_cpus as usize];
822 }
823
824 // Kernel rounds each CPU's slot up to 8 bytes internally
825 // (see `kernel/bpf/syscall.c` bpf_map_value_size for the
826 // PERCPU_ARRAY arm calling round_up_8). The returned buffer
827 // is `nr_cpus * round_up_8(value_size)` bytes; we slice at
828 // the rounded stride to extract each CPU's bytes and then
829 // truncate to value_size.
830 let stride = (val_sz + 7) & !7;
831 let mut out = Vec::with_capacity(num_cpus as usize);
832 for cpu in 0..num_cpus as usize {
833 let start = cpu * stride;
834 let end = start + val_sz;
835 if end > buf.len() {
836 out.push(None);
837 } else {
838 out.push(Some(buf[start..end].to_vec()));
839 }
840 }
841 out
842 }
843
844 fn iter_percpu_hash_map(
845 &self,
846 map: &BpfMapInfo,
847 num_cpus: u32,
848 ) -> super::bpf_map::PerCpuHashEntries {
849 let Some(pinned) = self.pinned_for(map) else {
850 return Vec::new();
851 };
852 if map.map_type != BPF_MAP_TYPE_PERCPU_HASH && map.map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH
853 {
854 return Vec::new();
855 }
856
857 let key_sz = map.key_size as usize;
858 let val_sz = map.value_size as usize;
859 // Kernel returns nr_cpus * round_up_8(value_size) bytes per
860 // lookup (`bpf_percpu_hash_copy` copies each CPU slot via
861 // `copy_map_value_long` at a `round_up(value_size, 8)`
862 // stride); same 8-byte stride as PERCPU_ARRAY. The buffer
863 // must be sized to the full stride or the kernel writes past
864 // it.
865 let stride = (val_sz + 7) & !7;
866 let buf_total = (num_cpus as usize).saturating_mul(stride);
867 let mut out: super::bpf_map::PerCpuHashEntries = Vec::new();
868
869 let mut cur_key = vec![0u8; key_sz];
870 let mut next_key = vec![0u8; key_sz];
871
872 let cap = (map.max_entries as usize).saturating_mul(2).max(1);
873 let mut got_first = false;
874 for _ in 0..cap {
875 let attr = BpfAttrMapElem {
876 map_fd: pinned.fd.as_raw_fd() as u32,
877 _pad0: 0,
878 key: if got_first {
879 cur_key.as_ptr() as u64
880 } else {
881 0
882 },
883 value_or_next_key: next_key.as_mut_ptr() as u64,
884 flags: 0,
885 };
886 let ret = unsafe {
887 bpf_syscall(
888 BPF_MAP_GET_NEXT_KEY,
889 &raw const attr as *const u8,
890 std::mem::size_of::<BpfAttrMapElem>(),
891 )
892 };
893 if ret < 0 {
894 break;
895 }
896 got_first = true;
897
898 let mut value_buf = vec![0u8; buf_total];
899 let lookup_attr = BpfAttrMapElem {
900 map_fd: pinned.fd.as_raw_fd() as u32,
901 _pad0: 0,
902 key: next_key.as_ptr() as u64,
903 value_or_next_key: value_buf.as_mut_ptr() as u64,
904 flags: 0,
905 };
906 let lret = unsafe {
907 bpf_syscall(
908 BPF_MAP_LOOKUP_ELEM,
909 &raw const lookup_attr as *const u8,
910 std::mem::size_of::<BpfAttrMapElem>(),
911 )
912 };
913 if lret >= 0 {
914 let mut per_cpu = Vec::with_capacity(num_cpus as usize);
915 for cpu in 0..num_cpus as usize {
916 let start = cpu * stride;
917 let end = start + val_sz;
918 if end > value_buf.len() {
919 per_cpu.push(None);
920 } else {
921 per_cpu.push(Some(value_buf[start..end].to_vec()));
922 }
923 }
924 out.push((next_key.clone(), per_cpu));
925 }
926 // Bound materialization at the renderer's cap (one-past so
927 // render's truncation check fires); see MAP_MATERIALIZE_MAX.
928 if out.len() > MAP_MATERIALIZE_MAX {
929 break;
930 }
931 cur_key.copy_from_slice(&next_key);
932 }
933
934 out
935 }
936
937 fn read_arena_pages(
938 &self,
939 map: &BpfMapInfo,
940 _arena_offsets: &BpfArenaOffsets,
941 ) -> ArenaSnapshot {
942 let Some(pinned) = self.pinned_for(map) else {
943 return ArenaSnapshot::default();
944 };
945 if map.map_type != BPF_MAP_TYPE_ARENA {
946 return ArenaSnapshot::default();
947 }
948
949 // The kernel sizes the arena as `max_entries * PAGE_SIZE`
950 // (`arena_map_alloc`) at the host base page size; read it at
951 // runtime so the span — and the mmap length below — match what
952 // the kernel pinned as `user_vm_end`. A hardcoded 4 KiB would
953 // under-size the mapping on a 16 KiB-granule host and trip
954 // `arena_map_mmap`'s `user_vm_end` check (-EBUSY). Same caps as
955 // the guest-memory side for cross-backend parity.
956 let page_size = host_page_size();
957 let declared_bytes_raw = (map.max_entries as u64).saturating_mul(page_size as u64);
958 let span_capped = declared_bytes_raw > MAX_ARENA_BYTES;
959 let declared_bytes = declared_bytes_raw.min(MAX_ARENA_BYTES);
960 let declared_pages = declared_bytes / page_size as u64;
961
962 // Use map_extra as the user_vm_start anchor. BPF programs
963 // see arena addresses at this base (lib/arena_map.h hardcodes
964 // it: x86 `1<<44`, aarch64 `1<<32`). Operators correlating
965 // arena pointers want the same base in the snapshot.
966 // Lifted above the early returns so every snapshot — empty
967 // or populated — carries the anchor in `user_vm_start`; the
968 // pointer-chasing reader needs it to classify arena addresses
969 // even when the page set is empty.
970 let user_vm_start = pinned.map_extra;
971
972 if declared_pages == 0 {
973 return ArenaSnapshot {
974 pages: Vec::new(),
975 truncated: false,
976 declared_pages: 0,
977 span_capped,
978 user_vm_start,
979 ..Default::default()
980 };
981 }
982
983 // The read window is capped at MAX_ARENA_PAGES so a huge arena
984 // can't drive an unbounded mincore/read loop; `truncated`
985 // surfaces the cap. mincore() below filters to the resident set
986 // (arena_vm_fault populates pages on demand, so pages the BPF
987 // program never touched are absent) and we read only those.
988 let read_pages = declared_pages.min(MAX_ARENA_PAGES);
989 let read_bytes = (read_pages as usize) * page_size;
990 let truncated = declared_pages > read_pages;
991
992 // Placement: when map_extra was set at arena creation the
993 // kernel pinned user_vm_start/user_vm_end, so the read must map
994 // the FULL arena at exactly map_extra (MAP_FIXED_NOREPLACE) or
995 // arena_map_mmap returns -EBUSY. See `arena_mmap_placement`.
996 let (hint, mmap_flags, mmap_bytes) =
997 arena_mmap_placement(user_vm_start, declared_bytes as usize, read_bytes);
998
999 // SAFETY: mmap with PROT_READ + MAP_SHARED on an arena fd is
1000 // exactly what `arena_map_mmap` (`kernel/bpf/arena.c`) exports;
1001 // offset 0 is required (the op rejects a nonzero vm_pgoff).
1002 // MAP_FIXED_NOREPLACE places the mapping at the kernel-blessed
1003 // VA without clobbering an existing one (fails EEXIST instead).
1004 let addr = unsafe {
1005 libc::mmap(
1006 if hint == 0 {
1007 ptr::null_mut()
1008 } else {
1009 hint as *mut libc::c_void
1010 },
1011 mmap_bytes,
1012 libc::PROT_READ,
1013 mmap_flags,
1014 pinned.fd.as_raw_fd(),
1015 0,
1016 )
1017 };
1018 if addr == libc::MAP_FAILED {
1019 // mmap rejected (e.g. -EBUSY if the arena's user VA is
1020 // pinned elsewhere, EEXIST if map_extra's VA is already
1021 // mapped in this process). Log it: a silently empty arena
1022 // snapshot reads as "arena is empty" when it is actually
1023 // unreadable — exactly how the prior NULL-hint bug hid.
1024 let err = std::io::Error::last_os_error();
1025 tracing::warn!(
1026 user_vm_start = format_args!("{user_vm_start:#x}"),
1027 mmap_bytes,
1028 error = %err,
1029 "read_arena_pages: mmap of arena fd failed; returning empty snapshot"
1030 );
1031 return ArenaSnapshot {
1032 pages: Vec::new(),
1033 truncated,
1034 declared_pages,
1035 span_capped,
1036 user_vm_start,
1037 ..Default::default()
1038 };
1039 }
1040
1041 let mut pages: Vec<ArenaPage> = Vec::new();
1042 // Read the resident pages out of the mmap. We use mincore()
1043 // to filter out pages that aren't present, then read only the
1044 // present ones. mincore returns 0 for
1045 // resident pages, < 0 on error.
1046 let mut residency = vec![0u8; read_pages as usize];
1047 let mincore_ret = unsafe { libc::mincore(addr, read_bytes, residency.as_mut_ptr()) };
1048 if mincore_ret == 0 {
1049 for (idx, &resident) in residency.iter().enumerate() {
1050 if resident & 1 == 0 {
1051 // Page not in core — sparse arena, never
1052 // populated by the BPF program. Skip.
1053 continue;
1054 }
1055 let page_addr = (addr as usize) + idx * page_size;
1056 // SAFETY: page is resident per mincore; reading
1057 // page_size bytes is in-bounds.
1058 let mut buf = vec![0u8; page_size];
1059 unsafe {
1060 std::ptr::copy_nonoverlapping(
1061 page_addr as *const u8,
1062 buf.as_mut_ptr(),
1063 page_size,
1064 );
1065 }
1066 // user_vm_start comes from the BPF map's map_extra
1067 // field — a guest-controllable value. A hostile or
1068 // corrupt map metadata could push the page identifier
1069 // past u64::MAX. Skip the page rather than emit a
1070 // wrapped address that consumers would treat as
1071 // legitimate.
1072 let Some(idx_offset) = (idx as u64).checked_mul(page_size as u64) else {
1073 continue;
1074 };
1075 let Some(user_addr) = user_vm_start.checked_add(idx_offset) else {
1076 continue;
1077 };
1078 pages.push(ArenaPage {
1079 user_addr,
1080 bytes: buf,
1081 });
1082 }
1083 }
1084
1085 // SAFETY: we created this mapping above and aren't using it
1086 // after this point.
1087 unsafe {
1088 libc::munmap(addr, mmap_bytes);
1089 }
1090
1091 ArenaSnapshot {
1092 pages,
1093 truncated,
1094 declared_pages,
1095 span_capped,
1096 user_vm_start,
1097 ..Default::default()
1098 }
1099 }
1100
1101 fn load_program_btf(&self, map: &BpfMapInfo, base_btf: &Btf) -> Option<Btf> {
1102 // map.btf_kva on the live-host backend stores the kernel's
1103 // btf_id (u32) — see obj_get_info_map. 0 means no BTF.
1104 let btf_id = map.btf_kva as u32;
1105 if btf_id == 0 {
1106 return None;
1107 }
1108
1109 // Pin the BTF object by id.
1110 let attr = BpfAttrGetId {
1111 id_or_start_id: btf_id,
1112 next_id: 0,
1113 open_flags: 0,
1114 };
1115 let btf_fd = bpf_call_fd(
1116 BPF_BTF_GET_FD_BY_ID,
1117 &raw const attr as *const u8,
1118 std::mem::size_of::<BpfAttrGetId>(),
1119 )
1120 .ok()?;
1121 // SAFETY: btf_fd >= 0 from a successful bpf_call_fd.
1122 let btf_owned = unsafe { OwnedFd::from_raw_fd(btf_fd) };
1123
1124 // Two-pass info fetch: first call to learn btf_size, then
1125 // allocate a buffer and refetch with `btf` populated to
1126 // pull the BTF blob bytes.
1127 let mut info = BpfBtfInfoUapi::default();
1128 let info_attr = BpfAttrInfoByFd {
1129 bpf_fd: btf_owned.as_raw_fd() as u32,
1130 info_len: std::mem::size_of::<BpfBtfInfoUapi>() as u32,
1131 info: &raw mut info as u64,
1132 };
1133 bpf_call_status(
1134 BPF_OBJ_GET_INFO_BY_FD,
1135 &raw const info_attr as *const u8,
1136 std::mem::size_of::<BpfAttrInfoByFd>(),
1137 )
1138 .ok()?;
1139 if info.btf_size == 0 {
1140 return None;
1141 }
1142
1143 // Second pass with a real buffer.
1144 let mut buf = vec![0u8; info.btf_size as usize];
1145 info.btf = buf.as_mut_ptr() as u64;
1146 let info_attr2 = BpfAttrInfoByFd {
1147 bpf_fd: btf_owned.as_raw_fd() as u32,
1148 info_len: std::mem::size_of::<BpfBtfInfoUapi>() as u32,
1149 info: &raw mut info as u64,
1150 };
1151 bpf_call_status(
1152 BPF_OBJ_GET_INFO_BY_FD,
1153 &raw const info_attr2 as *const u8,
1154 std::mem::size_of::<BpfAttrInfoByFd>(),
1155 )
1156 .ok()?;
1157
1158 if info.kernel_btf != 0 {
1159 Btf::from_split_bytes(&buf, base_btf).ok()
1160 } else {
1161 Btf::from_bytes(&buf).ok()
1162 }
1163 }
1164}
1165
1166#[cfg(test)]
1167mod tests {
1168 use super::*;
1169
1170 /// Verify the bpf_attr arms have the exact UAPI layout the
1171 /// kernel expects. Wrong sizes or field offsets cause -EINVAL
1172 /// on every syscall — this test catches the layout drift before
1173 /// it produces silent failures at runtime.
1174 #[test]
1175 fn bpf_attr_map_elem_size() {
1176 // include/uapi/linux/bpf.h: the MAP_ELEM_OPS arm is exactly
1177 // 32 bytes (4 + 4 pad + 8 + 8 + 8).
1178 assert_eq!(std::mem::size_of::<BpfAttrMapElem>(), 32);
1179 }
1180
1181 #[test]
1182 fn bpf_attr_get_id_size() {
1183 // GET_NEXT_ID / GET_FD_BY_ID: we pass a 12-byte prefix
1184 // (start_id/id + next_id + open_flags) =
1185 // offsetofend(union bpf_attr, open_flags). The full kernel arm
1186 // is 16 bytes — it adds a trailing fd_by_id_token_fd, which the
1187 // kernel zero-fills since we omit it (matching how libbpf sizes
1188 // these calls). This pins our 12-byte prefix, NOT the arm.
1189 assert_eq!(std::mem::size_of::<BpfAttrGetId>(), 12);
1190 }
1191
1192 #[test]
1193 fn bpf_attr_info_by_fd_size() {
1194 // OBJ_GET_INFO_BY_FD arm: 16 bytes (4 + 4 + 8).
1195 assert_eq!(std::mem::size_of::<BpfAttrInfoByFd>(), 16);
1196 }
1197
    /// Pin every field offset of [`BpfMapInfoUapi`] against the kernel
    /// `struct bpf_map_info` (include/uapi/linux/bpf.h). The kernel
    /// writes this struct on `BPF_OBJ_GET_INFO_BY_FD`, so a single
    /// shifted offset makes `obj_get_info_map` read garbage from the
    /// wrong field (e.g. `value_size` out of `max_entries`) with no
    /// runtime error. Pinning only `map_type`@0 and `name`@24 would miss
    /// a field insertion between `map_flags` and the tail, so every
    /// offset the struct exposes is asserted explicitly.
    ///
    /// Verdict-routed so a multi-field uapi-shape regression surfaces
    /// every drift in one run rather than failing on the first mismatch.
    #[test]
    fn bpf_map_info_uapi_layout() {
        use crate::assert::Verdict;
        use std::mem::offset_of;

        // Each `claim!` below records its result in `v`; nothing panics
        // until the single `assert!` at the end of the test.
        let mut v = Verdict::new();
        // Offsets per `struct bpf_map_info`: u32 fields packed, name[16]
        // at 24, u64 fields 8-aligned. Matches the kernel header.
        crate::claim!(v, offset_of!(BpfMapInfoUapi, map_type)).eq(0usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, id)).eq(4usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, key_size)).eq(8usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, value_size)).eq(12usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, max_entries)).eq(16usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, map_flags)).eq(20usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, name)).eq(24usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, ifindex)).eq(40usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, btf_vmlinux_value_type_id)).eq(44usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, netns_dev)).eq(48usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, netns_ino)).eq(56usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, btf_id)).eq(64usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, btf_key_type_id)).eq(68usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, btf_value_type_id)).eq(72usize);
        // `_pad` mirrors the kernel's `btf_vmlinux_id` at offset 76.
        crate::claim!(v, offset_of!(BpfMapInfoUapi, _pad)).eq(76usize);
        crate::claim!(v, offset_of!(BpfMapInfoUapi, map_extra)).eq(80usize);
        // `map_extra` is the trailing field we read; our struct ends at
        // offset 88 (the kernel's hash/hash_size past it are not read).
        crate::claim!(v, std::mem::size_of::<BpfMapInfoUapi>()).eq(88usize);
        // Collapse the accumulated claims into one result; on failure the
        // message prints every recorded outcome.
        let r = v.into_result();
        assert!(
            r.is_pass(),
            "bpf_map_info uapi layout drift: {:?}",
            r.outcomes,
        );
    }
1244
1245 /// Round-up arithmetic for percpu stride matches the kernel's
1246 /// `round_up(value_size, 8)`.
1247 #[test]
1248 fn percpu_stride_round_up() {
1249 let cases = [
1250 (0usize, 0),
1251 (1, 8),
1252 (7, 8),
1253 (8, 8),
1254 (9, 16),
1255 (15, 16),
1256 (16, 16),
1257 ];
1258 for (val_sz, expected) in cases {
1259 let stride = (val_sz + 7) & !7;
1260 assert_eq!(stride, expected, "value_size {val_sz} → stride {stride}");
1261 }
1262 }
1263
1264 /// Build a `BpfMapInfo` whose only populated field is the active
1265 /// name — the key both the construction-time predicate filter
1266 /// and `pinned_for` match on.
1267 fn info_named(name: &str) -> BpfMapInfo {
1268 let bytes = name.as_bytes();
1269 assert!(bytes.len() <= BPF_OBJ_NAME_LEN, "test name too long");
1270 let mut name_bytes = [0u8; BPF_OBJ_NAME_LEN];
1271 name_bytes[..bytes.len()].copy_from_slice(bytes);
1272 BpfMapInfo {
1273 name_bytes,
1274 name_len: bytes.len() as u8,
1275 ..Default::default()
1276 }
1277 }
1278
1279 /// The construction-time keep/discard filter
1280 /// ([`select_keeping`], the pure mirror of
1281 /// `from_running_kernel_filtered`'s inline predicate gate) keeps
1282 /// exactly the maps the predicate accepts: a predicate matching
1283 /// no names yields the empty set; a name-suffix predicate yields
1284 /// exactly the matching subset, in order.
1285 #[test]
1286 fn predicate_filters_pinned_set() {
1287 let infos = vec![
1288 info_named("scx_central"),
1289 info_named("central_dsq"),
1290 info_named("cilium_lb"),
1291 info_named("central_data"),
1292 ];
1293
1294 // Match-nothing predicate ⇒ empty kept set.
1295 let none = select_keeping(&infos, |_| false);
1296 assert!(none.is_empty(), "false predicate must discard every map");
1297
1298 // Match-everything predicate ⇒ full set, order preserved.
1299 let all = select_keeping(&infos, |_| true);
1300 assert_eq!(all.len(), 4, "true predicate must keep every map");
1301 assert_eq!(all[0].name(), "scx_central");
1302 assert_eq!(all[3].name(), "central_data");
1303
1304 // Name-suffix predicate ⇒ exactly the "central"-bearing subset.
1305 let kept = select_keeping(&infos, |i| i.name().contains("central"));
1306 let kept_names: Vec<String> = kept.iter().map(|i| i.name().to_string()).collect();
1307 assert_eq!(
1308 kept_names,
1309 vec!["scx_central", "central_dsq", "central_data"],
1310 "suffix predicate must keep exactly the matching subset, in order",
1311 );
1312
1313 // The same name-match key drives pinned_for: a target sharing
1314 // active name bytes matches; a differing name does not.
1315 assert!(
1316 info_name_matches(&info_named("central_dsq"), &info_named("central_dsq")),
1317 "identical active name bytes must match",
1318 );
1319 assert!(
1320 !info_name_matches(&info_named("central_dsq"), &info_named("central_data")),
1321 "differing active name bytes must NOT match",
1322 );
1323 // name_len bounds the compared region: a longer NUL-padded
1324 // buffer with a shorter name_len compares only the active
1325 // prefix, so "scx" (len 3) does not match "scx_central".
1326 assert!(
1327 !info_name_matches(&info_named("scx"), &info_named("scx_central")),
1328 "name_len must bound the match to the active prefix",
1329 );
1330 }
1331
1332 /// A cheap real fd for a test [`PinnedMap`]. `pinned_for` only
1333 /// compares names and never dereferences `.fd`, and every accessor
1334 /// read path under test returns at an early guard before the fd
1335 /// reaches a `bpf()` syscall — so any open fd suffices to satisfy
1336 /// the `OwnedFd` field. `/dev/null`
1337 /// is always present on Linux and `File` -> `OwnedFd` is the
1338 /// std-only conversion (no extra libc unsafe).
1339 fn dummy_fd() -> OwnedFd {
1340 OwnedFd::from(
1341 std::fs::File::open("/dev/null").expect("/dev/null must open on Linux test host"),
1342 )
1343 }
1344
1345 /// Build a [`PinnedMap`] from `info` + `map_extra`, carrying the
1346 /// dummy fd. The fields are private to the parent module; the
1347 /// `tests` child module can name and construct them directly,
1348 /// which is the inject seam the blueprint requires (no live
1349 /// kernel walk, no `from_running_kernel*` syscall path).
1350 fn pinned(info: BpfMapInfo, map_extra: u64) -> PinnedMap {
1351 PinnedMap {
1352 info,
1353 fd: dummy_fd(),
1354 map_extra,
1355 }
1356 }
1357
    /// Build a [`BpfSyscallAccessor`] holding exactly `maps`. The
    /// production constructors (`from_running_kernel*`) only ever
    /// populate `maps` via the live bpf(2) id walk; this literal is
    /// the host-test inject seam that lets the early-guard branches
    /// run without a kernel.
    fn accessor(maps: Vec<PinnedMap>) -> BpfSyscallAccessor {
        // Plain struct literal: `maps` is stored as given, with no
        // filtering or reordering.
        BpfSyscallAccessor { maps }
    }
1366
1367 /// `read_array` returns `None` on three rejections. Two are
1368 /// structurally pre-fd through the inject seam: the no-name-match
1369 /// (`pinned_for` -> None) returns before there is any fd, and the
1370 /// wrong-map-type guard returns before building the lookup attr.
1371 /// The `key >= max_entries` guard also returns `None`, but on this
1372 /// dummy-fd accessor that is indistinguishable from letting the
1373 /// `bpf()` `BPF_MAP_LOOKUP_ELEM` run on the bad fd
1374 /// (`-EINVAL` -> `None`) — so for a scalar `Option` return the
1375 /// key-bound case pins the rejection VALUE, not guard-precedence
1376 /// over the syscall (proving that needs a live map). The
1377 /// `key < max_entries` success path issues a real lookup and is NOT
1378 /// asserted here.
1379 #[test]
1380 fn read_array_pre_lookup_guards_reject() {
1381 let arr = BpfMapInfo {
1382 map_type: BPF_MAP_TYPE_ARRAY,
1383 max_entries: 4,
1384 value_size: 8,
1385 ..info_named("arr")
1386 };
1387 let acc = accessor(vec![pinned(arr.clone(), 0)]);
1388
1389 // No pinned map matches the target name -> pinned_for None.
1390 assert_eq!(acc.read_array(&info_named("missing"), 0), None);
1391
1392 // Name matches but map_type is HASH, not ARRAY -> type-reject.
1393 let hash = BpfMapInfo {
1394 map_type: BPF_MAP_TYPE_HASH,
1395 ..info_named("arr")
1396 };
1397 assert_eq!(acc.read_array(&hash, 0), None);
1398
1399 // key == max_entries and key > max_entries both reject before
1400 // the lookup (the kernel index_mask is a Spectre bound, the
1401 // explicit `key >= max_entries` is the range check).
1402 assert_eq!(acc.read_array(&arr, 4), None);
1403 assert_eq!(acc.read_array(&arr, 99), None);
1404 }
1405
1406 /// `read_value` returns `None` on two rejections. The no-name-match
1407 /// (`pinned_for` -> None) is structurally pre-fd — there is no map,
1408 /// hence no fd to look up. The wrong-map-type rejection (neither
1409 /// ARRAY nor STRUCT_OPS) also returns `None`, but on this dummy-fd
1410 /// accessor that is indistinguishable from letting the `bpf()`
1411 /// lookup run on the bad fd (`-EINVAL` -> `None`), so it pins the
1412 /// rejection VALUE rather than guard-precedence over the syscall.
1413 /// The post-lookup window-bounds / `checked_add` guards sit past the
1414 /// live lookup and need a real map; they are NOT asserted here.
1415 #[test]
1416 fn read_value_pre_lookup_type_reject() {
1417 // PERCPU_HASH is neither ARRAY nor STRUCT_OPS.
1418 let percpu_hash = BpfMapInfo {
1419 map_type: BPF_MAP_TYPE_PERCPU_HASH,
1420 value_size: 8,
1421 ..info_named("v")
1422 };
1423 let acc = accessor(vec![pinned(percpu_hash.clone(), 0)]);
1424
1425 assert_eq!(acc.read_value(&info_named("nomatch"), 0, 4), None);
1426 assert_eq!(acc.read_value(&percpu_hash, 0, 4), None);
1427 }
1428
1429 /// `iter_hash_map` returns an empty `Vec`. The no-name-match
1430 /// let-else is structurally pre-fd (no map, no fd). The type-reject
1431 /// (only HASH and LRU_HASH proceed) also returns empty, but on this
1432 /// dummy-fd accessor that is indistinguishable from the walk loop
1433 /// issuing `BPF_MAP_GET_NEXT_KEY` on the bad fd and breaking on the
1434 /// error — so it pins the empty RESULT, not guard-precedence over
1435 /// the syscall. The populated iteration path needs a live hash map.
1436 #[test]
1437 fn iter_hash_map_pre_walk_guards_empty() {
1438 let arr = BpfMapInfo {
1439 map_type: BPF_MAP_TYPE_ARRAY,
1440 ..info_named("h")
1441 };
1442 let acc = accessor(vec![pinned(arr.clone(), 0)]);
1443
1444 // No pinned match -> let-else returns Vec::new().
1445 assert_eq!(acc.iter_hash_map(&info_named("none")).len(), 0);
1446 // Name matches but map_type is ARRAY, not HASH/LRU_HASH.
1447 assert_eq!(acc.iter_hash_map(&arr).len(), 0);
1448 }
1449
1450 /// `read_percpu_array` returns an empty `Vec` (length 0) on the
1451 /// three pre-lookup guards: the no-name-match (`pinned_for` ->
1452 /// None), the wrong-map-type guard, and the `key >= max_entries`
1453 /// guard. The length distinguishes these from the
1454 /// post-lookup-failure branch which returns `vec![None; num_cpus]`
1455 /// (length `num_cpus`), so the assertions pin LENGTH 0, not just
1456 /// emptiness-of-content.
1457 #[test]
1458 fn read_percpu_array_pre_lookup_guards_empty() {
1459 let pa = BpfMapInfo {
1460 map_type: BPF_MAP_TYPE_PERCPU_ARRAY,
1461 max_entries: 2,
1462 value_size: 8,
1463 ..info_named("pa")
1464 };
1465 let acc = accessor(vec![pinned(pa.clone(), 0)]);
1466
1467 // No pinned match.
1468 assert_eq!(acc.read_percpu_array(&info_named("x"), 0, 4).len(), 0);
1469 // Name matches but map_type is ARRAY, not PERCPU_ARRAY.
1470 let arr = BpfMapInfo {
1471 map_type: BPF_MAP_TYPE_ARRAY,
1472 ..info_named("pa")
1473 };
1474 assert_eq!(acc.read_percpu_array(&arr, 0, 4).len(), 0);
1475 // key == max_entries rejects with length 0 (distinct from the
1476 // num_cpus-length lookup-failure vector).
1477 assert_eq!(acc.read_percpu_array(&pa, 2, 4).len(), 0);
1478 }
1479
1480 /// `iter_percpu_hash_map` returns an empty `PerCpuHashEntries`. The
1481 /// no-name-match let-else is structurally pre-fd (no map, no fd).
1482 /// The type-reject (only PERCPU_HASH and LRU_PERCPU_HASH proceed)
1483 /// also returns empty, but on this dummy-fd accessor that is
1484 /// indistinguishable from the walk loop breaking on the bad fd — so
1485 /// it pins the empty RESULT, not guard-precedence over the syscall.
1486 /// The populated walk path needs a live map.
1487 #[test]
1488 fn iter_percpu_hash_map_pre_walk_guards_empty() {
1489 let hash = BpfMapInfo {
1490 map_type: BPF_MAP_TYPE_HASH,
1491 ..info_named("ph")
1492 };
1493 let acc = accessor(vec![pinned(hash.clone(), 0)]);
1494
1495 // No pinned match.
1496 assert_eq!(acc.iter_percpu_hash_map(&info_named("none"), 4).len(), 0);
1497 // Name matches but map_type is HASH, not PERCPU_HASH/LRU_PERCPU_HASH.
1498 assert_eq!(acc.iter_percpu_hash_map(&hash, 4).len(), 0);
1499 }
1500
1501 /// `read_arena_pages` has three isolable, fd-free blocks ahead of
1502 /// the `mmap`: the no-name-match (`pinned_for` -> None ->
1503 /// `ArenaSnapshot::default()`), the wrong-map-type guard (->
1504 /// default), and the declared-span math + `declared_pages == 0`
1505 /// early return. The span math is pure:
1506 /// `declared_bytes_raw = max_entries * 4096` (saturating),
1507 /// `span_capped = declared_bytes_raw > MAX_ARENA_BYTES` (4 GiB),
1508 /// and the zero-page snapshot carries `user_vm_start = map_extra`.
1509 /// The populated mmap/mincore path needs a live arena fd.
1510 #[test]
1511 fn read_arena_pages_pre_mmap_paths() {
1512 // A 3-field literal: BpfArenaOffsets derives only Debug+Clone
1513 // (no Default), and the value is unused on every path under
1514 // test (the fn parameter `_arena_offsets` is ignored), so the
1515 // concrete offsets are arbitrary.
1516 let offsets = BpfArenaOffsets {
1517 arena_kern_vm: 0,
1518 arena_user_vm_start: 0,
1519 vm_struct_addr: 0,
1520 };
1521
1522 // max_entries == 0 -> declared_pages == 0 early return,
1523 // carrying user_vm_start = map_extra.
1524 let arena0 = BpfMapInfo {
1525 map_type: BPF_MAP_TYPE_ARENA,
1526 max_entries: 0,
1527 ..info_named("a")
1528 };
1529 let acc = accessor(vec![pinned(arena0.clone(), 0x1000)]);
1530
1531 // No name match -> ArenaSnapshot::default() (all-zero).
1532 let no_match = acc.read_arena_pages(&info_named("no"), &offsets);
1533 assert!(no_match.pages.is_empty());
1534 assert_eq!(no_match.declared_pages, 0);
1535 assert_eq!(no_match.user_vm_start, 0);
1536 assert!(!no_match.span_capped);
1537 assert!(!no_match.truncated);
1538
1539 // Name matches but map_type is ARRAY, not ARENA -> default.
1540 let arr = BpfMapInfo {
1541 map_type: BPF_MAP_TYPE_ARRAY,
1542 ..info_named("a")
1543 };
1544 let type_reject = acc.read_arena_pages(&arr, &offsets);
1545 assert!(type_reject.pages.is_empty());
1546 assert_eq!(type_reject.declared_pages, 0);
1547 assert_eq!(type_reject.user_vm_start, 0);
1548 assert!(!type_reject.span_capped);
1549
1550 // declared_pages == 0 path: empty pages, span not capped,
1551 // user_vm_start carried through from map_extra.
1552 let zero = acc.read_arena_pages(&arena0, &offsets);
1553 assert_eq!(zero.pages.len(), 0);
1554 assert_eq!(zero.declared_pages, 0);
1555 assert!(!zero.span_capped);
1556 assert_eq!(zero.user_vm_start, 0x1000);
1557 assert!(!zero.truncated);
1558
1559 // max_entries == u32::MAX -> declared_bytes_raw =
1560 // 0xFFFF_FFFF * 4096 > 4 GiB, so span_capped is set. With the
1561 // span capped to MAX_ARENA_BYTES, declared_pages > 0, so the
1562 // span-math result is only observable on this sub-case
1563 // through the MAP_FAILED snapshot or a populated walk — both
1564 // need a live fd. The dummy /dev/null fd makes mmap fail
1565 // (MAP_FAILED), exercising the MAP_FAILED early return,
1566 // which carries span_capped + user_vm_start. Assert exactly
1567 // those two carry-through fields, which the blueprint marks
1568 // host-assertable.
1569 let arena_max = BpfMapInfo {
1570 map_type: BPF_MAP_TYPE_ARENA,
1571 max_entries: u32::MAX,
1572 ..info_named("a")
1573 };
1574 let acc_max = accessor(vec![pinned(arena_max.clone(), 0x2000)]);
1575 let capped = acc_max.read_arena_pages(&arena_max, &offsets);
1576 assert!(capped.span_capped, "u32::MAX max_entries must cap the span");
1577 assert_eq!(capped.user_vm_start, 0x2000);
1578 }
1579
1580 #[test]
1581 fn arena_mmap_placement_map_extra_pins_full_span_fixed_noreplace() {
1582 // map_extra set (nonzero user_vm_start): the read must land at
1583 // exactly map_extra and span the FULL declared arena (not the
1584 // capped read window) with MAP_FIXED_NOREPLACE, or
1585 // arena_map_mmap returns -EBUSY on the user_vm_start/end check.
1586 let (hint, flags, len) = arena_mmap_placement(0x1_0000_0000, 8192, 4096);
1587 assert_eq!(hint, 0x1_0000_0000, "hint must be map_extra, not NULL");
1588 assert_eq!(flags, libc::MAP_SHARED | libc::MAP_FIXED_NOREPLACE);
1589 assert_eq!(len, 8192, "full declared span, not the capped read window");
1590 }
1591
1592 #[test]
1593 fn arena_mmap_placement_no_map_extra_uses_null_capped_prefix() {
1594 // map_extra == 0: the kernel adopts our VA, so a NULL hint and
1595 // the capped read window are correct.
1596 let (hint, flags, len) = arena_mmap_placement(0, 8192, 4096);
1597 assert_eq!(hint, 0, "NULL hint — kernel chooses the VA");
1598 assert_eq!(flags, libc::MAP_SHARED);
1599 assert_eq!(len, 4096, "capped read window, not the full declared span");
1600 }
1601
/// Pins the `btf_id == 0` early-out of `load_program_btf`
/// (`btf_id = map.btf_kva as u32`).
///
/// `info_named` leaves `btf_kva` at its `Default` value of 0, so the
/// guard fires before the `BPF_BTF_GET_FD_BY_ID` syscall and the
/// `base_btf` argument is never dereferenced (it is only used on the
/// post-fetch arms). This guard is the only part of the function that
/// can be asserted on the host; exercising the BTF-fetch path needs a
/// live kernel BTF object.
#[test]
fn load_program_btf_btf_id_zero_returns_none() {
    // Precondition: the fixture really does carry btf_kva == 0.
    let map_info = info_named("prog");
    assert_eq!(map_info.btf_kva, 0, "info_named must default btf_kva to 0");

    let pinned_maps = vec![pinned(map_info.clone(), 0)];
    let acc = accessor(pinned_maps);

    // A minimal valid BTF blob (magic 0xEB9F, version 1, 24-byte
    // header, a single Int type with id 1 so the type section is
    // non-empty, NUL-led strtab), mirroring the `cast_analysis`
    // tests' `build_btf` minimal layout. It exists purely to satisfy
    // the `&Btf` parameter; the btf_id==0 path never reads it.
    let base_btf = minimal_btf();

    // `btf_rs::Btf` derives neither `PartialEq` nor `Debug`, which
    // rules out `assert_eq!(.., None)` on an `Option<Btf>`. Checking
    // `is_none()` is the exact discriminant test: the guard returns
    // the `None` variant outright, so there is no `Btf` to compare.
    let loaded = acc.load_program_btf(&map_info, &base_btf);
    assert!(
        loaded.is_none(),
        "btf_id==0 must short-circuit to None before any bpf() call",
    );
}
1633
1634 /// Hand-build a minimal parseable BTF blob: a single `int` type
1635 /// (id 1, named "u64", 8 bytes) plus a NUL-led string table,
1636 /// wrapped in the 24-byte BTF header. Layout verified against the
1637 /// in-tree `src/monitor/cast_analysis/tests/mod.rs::build_btf`
1638 /// minimal path (the `empty_btf_no_panic` test proves a
1639 /// single-Int blob parses via `Btf::from_bytes`).
1640 fn minimal_btf() -> Btf {
1641 // String table: leading NUL (offset 0 = anonymous) + "u64\0".
1642 let mut strings: Vec<u8> = vec![0];
1643 let n_u64 = strings.len() as u32;
1644 strings.extend_from_slice(b"u64\0");
1645
1646 // Type section: one BTF_KIND_INT (kind 1).
1647 let mut type_section: Vec<u8> = Vec::new();
1648 const BTF_KIND_INT: u32 = 1;
1649 type_section.extend_from_slice(&n_u64.to_le_bytes()); // name_off
1650 let info = (BTF_KIND_INT << 24) & 0x1f00_0000; // vlen 0
1651 type_section.extend_from_slice(&info.to_le_bytes());
1652 type_section.extend_from_slice(&8u32.to_le_bytes()); // size = 8
1653 // btf_int data word: encoding 0, offset 0, bits 64.
1654 let int_data: u32 = 64;
1655 type_section.extend_from_slice(&int_data.to_le_bytes());
1656
1657 let type_len = type_section.len() as u32;
1658 let str_len = strings.len() as u32;
1659
1660 let mut blob: Vec<u8> = Vec::new();
1661 blob.extend_from_slice(&0xEB9F_u16.to_le_bytes()); // magic
1662 blob.push(1); // version
1663 blob.push(0); // flags
1664 blob.extend_from_slice(&24u32.to_le_bytes()); // hdr_len
1665 blob.extend_from_slice(&0u32.to_le_bytes()); // type_off
1666 blob.extend_from_slice(&type_len.to_le_bytes()); // type_len
1667 blob.extend_from_slice(&type_len.to_le_bytes()); // str_off = type_len
1668 blob.extend_from_slice(&str_len.to_le_bytes()); // str_len
1669 blob.extend_from_slice(&type_section);
1670 blob.extend_from_slice(&strings);
1671
1672 Btf::from_bytes(&blob).expect("minimal synthetic BTF must parse")
1673 }
1674}