ktstr/scenario/
bpf_pin.rs

1//! Guest-side BPF map fd pinning. See
2//! [`crate::scenario::ops::Op::PinBpfMap`] for the full
3//! motivation; in short, the same-binary `Op::ReplaceScheduler`
4//! swap window's multi-bss case (two `<obj>.bss` copies coexisting
5//! while the dying scheduler's BPF object is being torn down) only
6//! fires when both copies are still alive at freeze time, and the
7//! kernel frees the dying instance's maps as soon as libbpf drops
8//! their fds. Holding an extra refcount via this helper keeps the
9//! dying scheduler's map alive long enough for at least one
10//! post-swap freeze to observe both copies, which is what the
11//! framework's [`crate::scenario::snapshot::Snapshot::active`]
12//! plus walker disambiguation chain exists to handle.
13
14use anyhow::{Result, bail};
15use libbpf_rs::libbpf_sys;
16use libbpf_rs::query::MapInfoIter;
17use std::io;
18use std::os::fd::{FromRawFd, OwnedFd};
19
20/// Walk the kernel's BPF map ID space, find the first map whose
21/// `bpf_map_info.name` matches `name`, return its [`OwnedFd`]. The
22/// caller holds the returned fd to keep the map alive (the kernel
23/// refcount only drops to zero once every fd holder releases).
24///
25/// `name` is matched against the kernel-side map name by full-string
26/// equality. BPF map names are NUL-terminated and capped at
27/// `BPF_OBJ_NAME_LEN = 16` bytes (including the trailing NUL — so 15
28/// usable chars max) per `kernel/bpf/syscall.c`'s `bpf_obj_name_cpy`.
29/// Pass the kernel-visible name (typically `<obj>.bss` / `<obj>.data`
30/// / `<obj>.rodata`); libbpf truncates long object prefixes to fit
31/// the 15-char cap, so for a scheduler whose libbpf-source obj name
32/// exceeds the limit, the kernel-visible name is the FIRST-15-chars
33/// form. Reading a previous [`crate::monitor::dump::FailureDumpReport`]'s
34/// `maps[].name` or running `bpftool map list` outside the test is
35/// the safe way to discover the exact string the kernel sees.
36///
37/// **Order matters at the test layer**: this helper must run AFTER
38/// the target scheduler's BPF object is loaded. The companion
39/// [`crate::scenario::ops::Op::PinBpfMap`] doc documents the "place
40/// after a hold long enough for the scheduler to be ready" pattern;
41/// this helper itself does not block or retry.
42///
43/// **ID-order tiebreaker**: the underlying
44/// [`libbpf_rs::query::MapInfoIter`] walks in monotonically-
45/// increasing map-id order, so when multiple maps share the same
46/// name (the same-binary swap window's multi-bss case), the lowest-
47/// id (oldest) map is returned. For the swap-window scenario this
48/// means: call BEFORE `Op::ReplaceScheduler` so the captured fd is
49/// on the OUTGOING scheduler's map; the new scheduler's load will
50/// then create a SECOND copy that's also kept alive because the
51/// old refcount blocks the kernel from freeing the id.
52///
53/// **Error on miss**: returns Err naming every map name the walk
54/// observed, so the caller can sanity-check what's actually loaded
55/// (vs typo'd name vs scheduler-not-attached-yet vs map-already-freed).
56///
57/// **Privilege**: requires `CAP_SYS_ADMIN`. The kernel gates
58/// `BPF_*_GET_NEXT_ID` and `BPF_MAP_GET_FD_BY_ID` on CAP_SYS_ADMIN
59/// unconditionally (`kernel/bpf/syscall.c:4761` and `:4869`),
60/// independent of `CAP_BPF` (which only governs prog/map creation).
61/// ktstr always runs as root inside the guest VM so this is satisfied.
62pub fn open_bpf_map_fd_by_name(name: &str) -> Result<OwnedFd> {
63    let mut observed_names: Vec<String> = Vec::new();
64    for info in MapInfoIter::default() {
65        let map_name = info.name.to_string_lossy().into_owned();
66        if map_name == name {
67            // MapInfoIter consumes its per-iteration enumeration fd
68            // at the end of `next()` (the OwnedFd it built drops),
69            // so we re-open via id to obtain a caller-owned fd.
70            // TOCTOU window: the map may have been freed between the
71            // enumeration step and this call (e.g. the dying-side
72            // BPF object's last fd just dropped); surface that as a
73            // usable error rather than a silent test misfire.
74            //
75            // SAFETY: `bpf_map_get_fd_by_id` is a syscall wrapper
76            // with no preconditions beyond a valid `u32` id; on
77            // success it returns a kernel-owned file descriptor
78            // that we take ownership of, on failure it returns -1
79            // and sets errno.
80            let fd = unsafe { libbpf_sys::bpf_map_get_fd_by_id(info.id) };
81            if fd < 0 {
82                bail!(
83                    "BPF map '{name}' (id={}) disappeared between enumeration and \
84                     fd-open: {}",
85                    info.id,
86                    io::Error::last_os_error(),
87                );
88            }
89            // SAFETY: `fd` came from a successful kernel syscall on
90            // the line above and has not been exposed to any other
91            // code path, so we are the sole owner; transferring it
92            // into `OwnedFd` makes the Drop close it at the right
93            // time (when the caller drops the returned value).
94            return Ok(unsafe { OwnedFd::from_raw_fd(fd) });
95        }
96        observed_names.push(map_name);
97    }
98    bail!(
99        "BPF map '{name}' not found in any currently-attached BPF object — \
100         scanned {} maps; observed names: {observed_names:?}. \
101         Common causes: (a) the target scheduler's BPF object hasn't \
102         finished loading yet (place this op AFTER a hold long enough for \
103         the scheduler to be ready); (b) the requested name exceeds the \
104         15-char usable cap of `BPF_OBJ_NAME_LEN` and was truncated by \
105         libbpf when loaded — compare against the observed names above; \
106         (c) the map has already been freed (no fd holders left).",
107        observed_names.len(),
108    );
109}