ktstr/monitor/
live_host_kernel.rs

1//! Auto-discovery of the running host kernel's vmlinux, BTF, and
2//! symbol table for the live-host introspection path.
3//!
4//! Companion to [`super::bpf_syscall::BpfSyscallAccessor`]. Where the
5//! frozen-VM path resolves vmlinux/symbols from a kernel build tree
6//! the freeze coordinator already controls, the live-host path has
7//! to find them on whatever distro happens to be running. This
8//! module centralizes the search.
9//!
10//! # What gets discovered
11//!
12//! | resource           | source                                                       |
13//! |--------------------|--------------------------------------------------------------|
14//! | kernel release     | `uname(2)` (libc::uname)                                     |
15//! | vmlinux ELF        | `/lib/modules/$(uname -r)/build/vmlinux`, distro debug paths |
16//! | BTF                | `/sys/kernel/btf/vmlinux` (always present with sched_ext)    |
//! | kernel symbols     | `/proc/kallsyms` (addresses zeroed for unprivileged readers) |
18//!
19//! # Why a separate module
20//!
21//! The frozen-VM pipeline (`vmm/mod.rs::find_vmlinux`,
22//! `kernel_path::resolve_btf`) already searches similar paths but
23//! with different priorities — it expects the freeze coordinator to
24//! have built or downloaded the kernel and knows where the build
25//! tree lives. The live-host pipeline starts from "no idea where
26//! anything is, just whatever the running kernel exposes" and works
27//! outward. Reusing the existing search would conflate the two
28//! semantics (e.g. the frozen-VM code prefers a kernel-source-tree
29//! vmlinux over `/sys/kernel/btf/vmlinux`; the live-host code
30//! prefers BTF first for parse-cost reasons since it skips the goblin
31//! ELF section walk).
32//!
33//! # BTF preference order
34//!
35//! 1. `/sys/kernel/btf/vmlinux` — always present when sched_ext is
36//!    enabled (CONFIG_DEBUG_INFO_BTF). Raw BTF blob; no ELF parse
37//!    needed.
38//! 2. `/lib/modules/$(uname -r)/build/vmlinux` — kernel build tree,
39//!    typically only present when `linux-headers-*` or a vendor
40//!    kbuild package is installed.
41//! 3. `/usr/lib/debug/boot/vmlinux-$(uname -r)` — debian/ubuntu
42//!    `linux-image-*-dbg` package.
43//! 4. `/usr/lib/debug/lib/modules/$(uname -r)/vmlinux` — fedora /
44//!    rhel kernel-debuginfo layout.
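//! 5. ktstr's kernel cache — for ktstr-built kernels installed via
//!    `cargo ktstr kernel build`, the cache root holds vmlinux next
//!    to the boot image. Not probed automatically yet: a cache
//!    entry cannot be addressed without its key, so a caller that
//!    knows a matching entry has to supply it explicitly.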
48//!
49//! Order chosen so the cheapest parse path (raw BTF) wins by
50//! default, and the more expensive ELF-extraction paths only run
51//! when the live-host caller specifically asks for the full ELF
52//! (e.g. for symbol resolution that goes beyond /proc/kallsyms).
53
54use std::ffi::CStr;
55use std::path::{Path, PathBuf};
56
57use anyhow::{Context, Result, anyhow};
58
/// Where the running host kernel's introspection resources live.
///
/// Intended to be resolved a single time when a live-host
/// introspection run begins and then kept next to the
/// [`super::bpf_syscall::BpfSyscallAccessor`] until the dump is
/// finished.
// Reachable through crate::live_host, but nothing in the library's
// own compilation graph constructs it (only the
// tests/live_host_inside_vm.rs integration test does), hence the
// dead_code allowance below.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct LiveHostKernelEnv {
    /// Release string of the running kernel, i.e. what `uname -r`
    /// prints (for example "6.16.0-1234-generic"). Interpolated
    /// into per-release paths such as
    /// `/lib/modules/<release>/build/vmlinux`.
    pub release: String,
    /// File the BTF parser should load: either a raw BTF blob or a
    /// vmlinux ELF. Always populated; the module-level doc lists
    /// the order in which candidates are tried.
    pub btf_path: PathBuf,
    /// A vmlinux ELF for this kernel, if one could be found.
    /// `None` means only the raw `/sys/kernel/btf/vmlinux` blob is
    /// available — the usual situation on stripped distro kernels
    /// that lack `linux-headers-*` / `linux-image-*-dbg`.
    pub vmlinux_elf_path: Option<PathBuf>,
    /// Location of the kallsyms text to parse. Normally
    /// `/proc/kallsyms`; exposed as a field so a unit-test fixture
    /// or a saved snapshot can be substituted in one place.
    pub kallsyms_path: PathBuf,
}
88
89impl LiveHostKernelEnv {
90    /// Auto-discover every resource needed by the live-host
91    /// introspection pipeline.
92    ///
93    /// Returns an error only when none of the BTF candidate paths
94    /// resolve — without BTF the failure-dump renderer can't decode
95    /// any field. Missing vmlinux ELF or unreadable `/proc/kallsyms`
96    /// are NOT errors at this layer: callers that need ELF or
97    /// symbols for their specific dump pass surface their own error
98    /// when they reach for an unavailable resource.
99    #[allow(dead_code)]
100    pub fn discover() -> Result<Self> {
101        let release = uname_release().context("uname(2) failed")?;
102        let btf_path = locate_btf(&release).ok_or_else(|| {
103            anyhow!("no BTF found (looked in /sys/kernel/btf/vmlinux and ELF paths)")
104        })?;
105        let vmlinux_elf_path = locate_vmlinux_elf(&release);
106        Ok(Self {
107            release,
108            btf_path,
109            vmlinux_elf_path,
110            kallsyms_path: PathBuf::from("/proc/kallsyms"),
111        })
112    }
113}
114
115/// `uname(2)` syscall wrapper. Returns the running kernel's release
116/// string (the field that `uname -r` prints).
117///
118/// SAFETY: libc::uname populates a `utsname` struct on the stack;
119/// the release field is a NUL-terminated `c_char[65]` in glibc /
120/// musl's definition.
121#[allow(dead_code)]
122pub fn uname_release() -> Result<String> {
123    // SAFETY: libc::utsname is a POD; zero-init is valid input to
124    // libc::uname which fills it.
125    let mut uts: libc::utsname = unsafe { std::mem::zeroed() };
126    // SAFETY: libc::uname is a thin wrapper over the syscall;
127    // returns 0 on success, -1 on failure (very rare — only fails
128    // on a kernel-side fault).
129    let ret = unsafe { libc::uname(&mut uts as *mut libc::utsname) };
130    if ret != 0 {
131        return Err(anyhow!(
132            "uname(2) failed: {}",
133            std::io::Error::last_os_error()
134        ));
135    }
136    // SAFETY: libc fills .release with a NUL-terminated string of
137    // at most 65 bytes. CStr::from_ptr requires a valid NUL-
138    // terminated pointer; libc guarantees this on success.
139    let release = unsafe { CStr::from_ptr(uts.release.as_ptr()) }
140        .to_str()
141        .context("uname.release was not valid UTF-8")?
142        .to_string();
143    Ok(release)
144}
145
146/// Locate a BTF source for the running kernel. See module-level doc
147/// for the search order.
148///
149/// `/sys/kernel/btf/vmlinux` is preferred — when present (kernel
150/// built with `CONFIG_DEBUG_INFO_BTF`, mandatory for sched_ext on
151/// modern distros), it provides a raw BTF blob ready for
152/// `Btf::from_bytes`. ELF candidates are returned only when the
153/// raw BTF is absent.
154fn locate_btf(release: &str) -> Option<PathBuf> {
155    // 1. /sys/kernel/btf/vmlinux — fastest path, always present
156    //    on a sched_ext-capable kernel.
157    let sysfs = Path::new("/sys/kernel/btf/vmlinux");
158    if sysfs.is_file() {
159        return Some(sysfs.to_path_buf());
160    }
161    // 2-5. Fall back to ELF candidates that ALSO carry BTF in their
162    //      .BTF section — the BTF loader handles both formats.
163    locate_vmlinux_elf(release)
164}
165
/// Find a vmlinux ELF belonging to kernel `release`.
///
/// The first candidate that is an existing regular file is
/// returned; candidates are tried in this order:
/// 1. `/lib/modules/<release>/build/vmlinux`
/// 2. `/usr/lib/debug/boot/vmlinux-<release>` (debian/ubuntu dbg)
/// 3. `/usr/lib/debug/lib/modules/<release>/vmlinux` (fedora/rhel)
///
/// Returns `None` when none of them exists. The ktstr kernel cache
/// is not probed here (see the note in the body).
fn locate_vmlinux_elf(release: &str) -> Option<PathBuf> {
    let search_order = [
        format!("/lib/modules/{release}/build/vmlinux"),
        format!("/usr/lib/debug/boot/vmlinux-{release}"),
        format!("/usr/lib/debug/lib/modules/{release}/vmlinux"),
    ];
    // ktstr kernel cache: deliberately not consulted. Entries there
    // have the shape <cache root> / <key> / vmlinux (cache root per
    // [`crate::vmm::disk_template::cache_root`]), and with no cache
    // key available a specific build cannot be addressed. A
    // live-host caller that knows a matching cache entry is expected
    // to name it explicitly, so exhausting the list yields `None`.
    search_order
        .iter()
        .map(Path::new)
        .find(|candidate| candidate.is_file())
        .map(Path::to_path_buf)
}
194
/// In-memory symbol table built from kallsyms-format text.
///
/// The live-host pipeline resolves dozens of symbols (sched_class
/// addresses, lock slowpath entry points, scx_root, etc.) during a
/// single dump, so scanning the file per lookup would be too slow.
/// The text is parsed once and lookups are served from a hash map
/// keyed by symbol name.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct KallsymsTable {
    /// Symbol name → kernel virtual address.
    by_name: std::collections::HashMap<String, u64>,
}
206
207impl KallsymsTable {
208    /// Read and parse `/proc/kallsyms` from the configured path on
209    /// `env`. Returns an error when the file is unreadable —
210    /// `/proc/kallsyms` is root-readable only, and unprivileged
211    /// callers see a 0-filled file. The parser detects the all-
212    /// zeros case and returns an empty map without erroring (so
213    /// non-privileged unit tests still get a usable
214    /// `KallsymsTable` even though it can't resolve anything).
215    #[allow(dead_code)]
216    pub fn load_from(env: &LiveHostKernelEnv) -> Result<Self> {
217        Self::load_from_path(&env.kallsyms_path)
218    }
219
220    /// Read and parse a kallsyms file from an explicit path. Useful
221    /// for unit tests and for the rare live-host caller that wants
222    /// to point at a saved snapshot rather than the live
223    /// `/proc/kallsyms`.
224    #[allow(dead_code)]
225    pub fn load_from_path(path: &Path) -> Result<Self> {
226        let raw =
227            std::fs::read_to_string(path).with_context(|| format!("read {}", path.display()))?;
228        Ok(Self::parse(&raw))
229    }
230
231    /// Parse kallsyms-format text (one `HEX TYPE NAME ...` line per
232    /// symbol) into a name→address map.
233    ///
234    /// Skipped lines (silently, without affecting other symbols):
235    /// - lines with fewer than 3 whitespace-separated tokens
236    /// - lines whose first token is not a hex-parseable u64
237    /// - lines whose address is 0 (the kallsyms-redacted view that
238    ///   unprivileged readers see — addresses are zero-filled by
239    ///   the kernel for non-CAP_SYSLOG callers)
240    ///
241    /// A returned table with `len() == 0` is valid: the caller can
242    /// detect "kallsyms unreadable" via `is_empty()` and surface a
243    /// permission diagnostic without this layer producing an error.
244    pub fn parse(raw: &str) -> Self {
245        let mut by_name = std::collections::HashMap::new();
246        for line in raw.lines() {
247            let mut parts = line.split_whitespace();
248            let Some(addr) = parts.next() else { continue };
249            let _ty = parts.next();
250            let Some(sym) = parts.next() else { continue };
251            let Ok(addr) = u64::from_str_radix(addr, 16) else {
252                continue;
253            };
254            // Skip the redacted-view all-zeros entries. A genuine
255            // 0-valued symbol address would be a kernel bug; the
256            // expected case is "unprivileged kallsyms reader sees
257            // every line zeroed out".
258            if addr == 0 {
259                continue;
260            }
261            by_name.insert(sym.to_string(), addr);
262        }
263        Self { by_name }
264    }
265
266    /// Look up a symbol by exact name. Returns the kernel virtual
267    /// address (u64) or `None` when the name is not in the table.
268    #[allow(dead_code)]
269    pub fn resolve(&self, name: &str) -> Option<u64> {
270        self.by_name.get(name).copied()
271    }
272
273    /// Total number of resolved symbols. Zero when /proc/kallsyms
274    /// was readable but every line was redacted (unprivileged
275    /// caller case).
276    #[allow(dead_code)]
277    pub fn len(&self) -> usize {
278        self.by_name.len()
279    }
280
281    /// True when the table holds no usable symbols. Live-host
282    /// callers that hit this should surface a "run as root"
283    /// diagnostic.
284    #[allow(dead_code)]
285    pub fn is_empty(&self) -> bool {
286        self.by_name.is_empty()
287    }
288}
289
#[cfg(test)]
mod tests {
    use super::*;

    /// The release string reported by `uname_release` is non-empty
    /// and dotted. `uname(2)` is expected to succeed on Linux, so a
    /// failure here points at the test environment.
    #[test]
    fn uname_release_returns_nonempty() {
        let rel = uname_release().expect("uname succeeds on Linux");
        assert!(!rel.is_empty());
        // Kernel releases look like major.minor or
        // major.minor.patch, so at least one dot must be present.
        let dotted = rel.contains('.');
        assert!(dotted, "release {rel:?} should look like X.Y or X.Y.Z");
    }

    /// Every well-formed line of a representative snippet (in the
    /// kernel's HEX TYPE NAME [MODULE] format) ends up in the table.
    #[test]
    fn kallsyms_parse_basic() {
        let table = KallsymsTable::parse(concat!(
            "ffffffff80100000 T _stext\n",
            "ffffffff80101234 T scx_disable_workfn\n",
            "ffffffff80105678 t local_static_function\n",
            "ffffffff8000abcd D ext_sched_class\n",
        ));
        let expected = [
            ("_stext", 0xffffffff80100000_u64),
            ("scx_disable_workfn", 0xffffffff80101234),
            ("local_static_function", 0xffffffff80105678),
            ("ext_sched_class", 0xffffffff8000abcd),
        ];
        for (name, addr) in expected {
            assert_eq!(table.resolve(name), Some(addr));
        }
        assert_eq!(table.len(), 4);
        assert!(!table.is_empty());
    }

    /// The redacted view (all addresses zero, as shown to an
    /// unprivileged reader) parses to an empty table instead of
    /// failing; telling "unreadable" (load error) apart from
    /// "redacted" (parsed but empty) is left to callers.
    #[test]
    fn kallsyms_parse_skips_zero_addresses() {
        let table = KallsymsTable::parse(concat!(
            "0000000000000000 T _stext\n",
            "0000000000000000 T scx_disable_workfn\n",
        ));
        assert!(table.is_empty());
        assert_eq!(table.resolve("_stext"), None);
    }

    /// Lines with too few fields or a non-hex address are dropped
    /// while the good lines around them are still parsed.
    #[test]
    fn kallsyms_parse_skips_malformed_lines() {
        let table = KallsymsTable::parse(concat!(
            "ffffffff80100000 T _stext\n",
            "not-a-hex-address T garbage\n",
            "short_line\n",
            "ffffffff80105678 T good_symbol\n",
        ));
        assert_eq!(table.resolve("_stext"), Some(0xffffffff80100000));
        assert_eq!(table.resolve("good_symbol"), Some(0xffffffff80105678));
        assert_eq!(table.resolve("garbage"), None);
        assert_eq!(table.len(), 2);
    }

    /// `KallsymsTable::load_from_path` parses an arbitrary file —
    /// the pluggable-path constructor tests rely on — rather than
    /// the live `/proc/kallsyms`.
    #[test]
    fn kallsyms_load_from_path() {
        use std::io::Write;
        let tmp = tempfile::NamedTempFile::new().unwrap();
        {
            // Scoped so the writing handle is closed before the
            // file is read back.
            let mut writer = tmp.reopen().unwrap();
            for line in [
                "ffffffff80100000 T _stext",
                "ffffffff80101234 T target_symbol",
            ] {
                writeln!(writer, "{line}").unwrap();
            }
        }

        let table = KallsymsTable::load_from_path(tmp.path()).unwrap();
        assert_eq!(table.resolve("target_symbol"), Some(0xffffffff80101234));
    }

    /// Smoke test for `LiveHostKernelEnv::discover`. It only runs
    /// when `/sys/kernel/btf/vmlinux` exists; hosts without it
    /// (e.g. a build container without sched_ext debug info) have
    /// nothing to verify against, so the test returns early.
    #[test]
    fn live_host_kernel_env_discover_smoke() {
        let have_sysfs_btf = Path::new("/sys/kernel/btf/vmlinux").is_file();
        if !have_sysfs_btf {
            return;
        }
        let env = LiveHostKernelEnv::discover().expect("BTF present, discover should succeed");
        assert!(!env.release.is_empty());
        assert!(env.btf_path.exists());
        // discover() hard-codes the kallsyms path; readability of
        // the file plays no part.
        assert_eq!(env.kallsyms_path, Path::new("/proc/kallsyms"));
    }

    /// With a release that matches no real path, `locate_btf` can
    /// only return the sysfs blob (when the host has one) or
    /// nothing at all. The sysfs branch itself cannot easily be
    /// forced without a syscall mock, so only that shape is
    /// checked.
    #[test]
    fn locate_btf_no_real_release_returns_none_or_sysfs() {
        match locate_btf("definitely-not-a-kernel-release-9.99") {
            // No sysfs blob and no ELF fallback: acceptable.
            None => {}
            // Anything found must be the sysfs blob.
            Some(found) => assert_eq!(found, Path::new("/sys/kernel/btf/vmlinux")),
        }
    }
}
417}