ktstr/monitor/live_host_kernel.rs
1//! Auto-discovery of the running host kernel's vmlinux, BTF, and
2//! symbol table for the live-host introspection path.
3//!
4//! Companion to [`super::bpf_syscall::BpfSyscallAccessor`]. Where the
5//! frozen-VM path resolves vmlinux/symbols from a kernel build tree
6//! the freeze coordinator already controls, the live-host path has
7//! to find them on whatever distro happens to be running. This
8//! module centralizes the search.
9//!
10//! # What gets discovered
11//!
12//! | resource | source |
13//! |--------------------|--------------------------------------------------------------|
14//! | kernel release | `uname(2)` (libc::uname) |
15//! | vmlinux ELF | `/lib/modules/$(uname -r)/build/vmlinux`, distro debug paths |
16//! | BTF | `/sys/kernel/btf/vmlinux` (always present with sched_ext) |
17//! | kernel symbols | `/proc/kallsyms` (root-readable, falls back per-line) |
18//!
19//! # Why a separate module
20//!
21//! The frozen-VM pipeline (`vmm/mod.rs::find_vmlinux`,
22//! `kernel_path::resolve_btf`) already searches similar paths but
23//! with different priorities — it expects the freeze coordinator to
24//! have built or downloaded the kernel and knows where the build
25//! tree lives. The live-host pipeline starts from "no idea where
26//! anything is, just whatever the running kernel exposes" and works
27//! outward. Reusing the existing search would conflate the two
28//! semantics (e.g. the frozen-VM code prefers a kernel-source-tree
29//! vmlinux over `/sys/kernel/btf/vmlinux`; the live-host code
30//! prefers BTF first for parse-cost reasons since it skips the goblin
31//! ELF section walk).
32//!
33//! # BTF preference order
34//!
35//! 1. `/sys/kernel/btf/vmlinux` — always present when sched_ext is
36//! enabled (CONFIG_DEBUG_INFO_BTF). Raw BTF blob; no ELF parse
37//! needed.
38//! 2. `/lib/modules/$(uname -r)/build/vmlinux` — kernel build tree,
39//! typically only present when `linux-headers-*` or a vendor
40//! kbuild package is installed.
41//! 3. `/usr/lib/debug/boot/vmlinux-$(uname -r)` — debian/ubuntu
42//! `linux-image-*-dbg` package.
43//! 4. `/usr/lib/debug/lib/modules/$(uname -r)/vmlinux` — fedora /
44//! rhel kernel-debuginfo layout.
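//! 5. ktstr's kernel cache — for ktstr-built kernels installed via
//!    `cargo ktstr kernel build`, the cache root holds vmlinux next
//!    to the boot image. This entry is not searched automatically:
//!    without a cache key the resolver cannot address a specific
//!    build, so a caller that knows a matching cache entry has to
//!    supply its path explicitly.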
48//!
49//! Order chosen so the cheapest parse path (raw BTF) wins by
50//! default, and the more expensive ELF-extraction paths only run
51//! when the live-host caller specifically asks for the full ELF
52//! (e.g. for symbol resolution that goes beyond /proc/kallsyms).
53
54use std::ffi::CStr;
55use std::path::{Path, PathBuf};
56
57use anyhow::{Context, Result, anyhow};
58
/// Locations of the running host kernel's introspection inputs.
///
/// Meant to be resolved a single time when a live-host introspection
/// run begins, then kept next to the
/// [`super::bpf_syscall::BpfSyscallAccessor`] by the dump pipeline
/// until the dump finishes.
#[derive(Debug, Clone)]
#[allow(dead_code)] // public via crate::live_host; the lib's own
// compilation graph never constructs it (only the
// tests/live_host_inside_vm.rs integration test does).
pub struct LiveHostKernelEnv {
    /// Kernel release string, i.e. what `uname -r` prints
    /// (e.g. "6.16.0-1234-generic"). Substituted into
    /// release-specific paths such as
    /// `/lib/modules/<release>/build/vmlinux`.
    pub release: String,
    /// File handed to the BTF parser: either a raw BTF blob or a
    /// vmlinux ELF. Never absent — discovery fails instead of
    /// leaving it unset. The module-level doc lists the order in
    /// which candidates are tried.
    pub btf_path: PathBuf,
    /// A vmlinux ELF for the running kernel, if one was found.
    /// `None` means no ELF candidate exists on this host, e.g. a
    /// stripped distro kernel that only exposes raw BTF through
    /// `/sys/kernel/btf/vmlinux` because neither `linux-headers-*`
    /// nor `linux-image-*-dbg` is installed.
    pub vmlinux_elf_path: Option<PathBuf>,
    /// Where to read the kernel symbol table from. Normally
    /// `/proc/kallsyms`; carried as a field so a test fixture or a
    /// saved snapshot can be swapped in at one place.
    pub kallsyms_path: PathBuf,
}
88
89impl LiveHostKernelEnv {
90 /// Auto-discover every resource needed by the live-host
91 /// introspection pipeline.
92 ///
93 /// Returns an error only when none of the BTF candidate paths
94 /// resolve — without BTF the failure-dump renderer can't decode
95 /// any field. Missing vmlinux ELF or unreadable `/proc/kallsyms`
96 /// are NOT errors at this layer: callers that need ELF or
97 /// symbols for their specific dump pass surface their own error
98 /// when they reach for an unavailable resource.
99 #[allow(dead_code)]
100 pub fn discover() -> Result<Self> {
101 let release = uname_release().context("uname(2) failed")?;
102 let btf_path = locate_btf(&release).ok_or_else(|| {
103 anyhow!("no BTF found (looked in /sys/kernel/btf/vmlinux and ELF paths)")
104 })?;
105 let vmlinux_elf_path = locate_vmlinux_elf(&release);
106 Ok(Self {
107 release,
108 btf_path,
109 vmlinux_elf_path,
110 kallsyms_path: PathBuf::from("/proc/kallsyms"),
111 })
112 }
113}
114
115/// `uname(2)` syscall wrapper. Returns the running kernel's release
116/// string (the field that `uname -r` prints).
117///
118/// SAFETY: libc::uname populates a `utsname` struct on the stack;
119/// the release field is a NUL-terminated `c_char[65]` in glibc /
120/// musl's definition.
121#[allow(dead_code)]
122pub fn uname_release() -> Result<String> {
123 // SAFETY: libc::utsname is a POD; zero-init is valid input to
124 // libc::uname which fills it.
125 let mut uts: libc::utsname = unsafe { std::mem::zeroed() };
126 // SAFETY: libc::uname is a thin wrapper over the syscall;
127 // returns 0 on success, -1 on failure (very rare — only fails
128 // on a kernel-side fault).
129 let ret = unsafe { libc::uname(&mut uts as *mut libc::utsname) };
130 if ret != 0 {
131 return Err(anyhow!(
132 "uname(2) failed: {}",
133 std::io::Error::last_os_error()
134 ));
135 }
136 // SAFETY: libc fills .release with a NUL-terminated string of
137 // at most 65 bytes. CStr::from_ptr requires a valid NUL-
138 // terminated pointer; libc guarantees this on success.
139 let release = unsafe { CStr::from_ptr(uts.release.as_ptr()) }
140 .to_str()
141 .context("uname.release was not valid UTF-8")?
142 .to_string();
143 Ok(release)
144}
145
146/// Locate a BTF source for the running kernel. See module-level doc
147/// for the search order.
148///
149/// `/sys/kernel/btf/vmlinux` is preferred — when present (kernel
150/// built with `CONFIG_DEBUG_INFO_BTF`, mandatory for sched_ext on
151/// modern distros), it provides a raw BTF blob ready for
152/// `Btf::from_bytes`. ELF candidates are returned only when the
153/// raw BTF is absent.
154fn locate_btf(release: &str) -> Option<PathBuf> {
155 // 1. /sys/kernel/btf/vmlinux — fastest path, always present
156 // on a sched_ext-capable kernel.
157 let sysfs = Path::new("/sys/kernel/btf/vmlinux");
158 if sysfs.is_file() {
159 return Some(sysfs.to_path_buf());
160 }
161 // 2-5. Fall back to ELF candidates that ALSO carry BTF in their
162 // .BTF section — the BTF loader handles both formats.
163 locate_vmlinux_elf(release)
164}
165
/// Find a vmlinux ELF that belongs to kernel release `release`.
///
/// Candidates, highest priority first; the first one that exists as
/// a regular file is returned:
/// - `/lib/modules/<release>/build/vmlinux` (kernel build tree)
/// - `/usr/lib/debug/boot/vmlinux-<release>` (debian/ubuntu dbg)
/// - `/usr/lib/debug/lib/modules/<release>/vmlinux` (fedora/rhel)
///
/// The ktstr kernel cache is deliberately not consulted: its entries
/// are laid out as `<cache root>/<key>/vmlinux`
/// (see [`crate::vmm::disk_template::cache_root`]) and a release
/// string alone does not identify a key. A caller that knows which
/// cache entry matches must pass that path itself.
fn locate_vmlinux_elf(release: &str) -> Option<PathBuf> {
    let search_order = [
        format!("/lib/modules/{release}/build/vmlinux"),
        format!("/usr/lib/debug/boot/vmlinux-{release}"),
        format!("/usr/lib/debug/lib/modules/{release}/vmlinux"),
    ];
    search_order
        .iter()
        .map(PathBuf::from)
        .find(|candidate| candidate.is_file())
}
194
195/// Parsed kernel symbol table from `/proc/kallsyms`.
196///
197/// Per-line lazy lookup is too slow for the live-host pipeline,
198/// which resolves dozens of symbols (sched_class addresses, lock
199/// slowpath entry points, scx_root, etc.) at a single dump time.
200/// `KallsymsTable` parses once and holds an O(1) name→addr map.
201#[derive(Debug, Clone)]
202#[allow(dead_code)]
203pub struct KallsymsTable {
204 by_name: std::collections::HashMap<String, u64>,
205}
206
207impl KallsymsTable {
208 /// Read and parse `/proc/kallsyms` from the configured path on
209 /// `env`. Returns an error when the file is unreadable —
210 /// `/proc/kallsyms` is root-readable only, and unprivileged
211 /// callers see a 0-filled file. The parser detects the all-
212 /// zeros case and returns an empty map without erroring (so
213 /// non-privileged unit tests still get a usable
214 /// `KallsymsTable` even though it can't resolve anything).
215 #[allow(dead_code)]
216 pub fn load_from(env: &LiveHostKernelEnv) -> Result<Self> {
217 Self::load_from_path(&env.kallsyms_path)
218 }
219
220 /// Read and parse a kallsyms file from an explicit path. Useful
221 /// for unit tests and for the rare live-host caller that wants
222 /// to point at a saved snapshot rather than the live
223 /// `/proc/kallsyms`.
224 #[allow(dead_code)]
225 pub fn load_from_path(path: &Path) -> Result<Self> {
226 let raw =
227 std::fs::read_to_string(path).with_context(|| format!("read {}", path.display()))?;
228 Ok(Self::parse(&raw))
229 }
230
231 /// Parse kallsyms-format text (one `HEX TYPE NAME ...` line per
232 /// symbol) into a name→address map.
233 ///
234 /// Skipped lines (silently, without affecting other symbols):
235 /// - lines with fewer than 3 whitespace-separated tokens
236 /// - lines whose first token is not a hex-parseable u64
237 /// - lines whose address is 0 (the kallsyms-redacted view that
238 /// unprivileged readers see — addresses are zero-filled by
239 /// the kernel for non-CAP_SYSLOG callers)
240 ///
241 /// A returned table with `len() == 0` is valid: the caller can
242 /// detect "kallsyms unreadable" via `is_empty()` and surface a
243 /// permission diagnostic without this layer producing an error.
244 pub fn parse(raw: &str) -> Self {
245 let mut by_name = std::collections::HashMap::new();
246 for line in raw.lines() {
247 let mut parts = line.split_whitespace();
248 let Some(addr) = parts.next() else { continue };
249 let _ty = parts.next();
250 let Some(sym) = parts.next() else { continue };
251 let Ok(addr) = u64::from_str_radix(addr, 16) else {
252 continue;
253 };
254 // Skip the redacted-view all-zeros entries. A genuine
255 // 0-valued symbol address would be a kernel bug; the
256 // expected case is "unprivileged kallsyms reader sees
257 // every line zeroed out".
258 if addr == 0 {
259 continue;
260 }
261 by_name.insert(sym.to_string(), addr);
262 }
263 Self { by_name }
264 }
265
266 /// Look up a symbol by exact name. Returns the kernel virtual
267 /// address (u64) or `None` when the name is not in the table.
268 #[allow(dead_code)]
269 pub fn resolve(&self, name: &str) -> Option<u64> {
270 self.by_name.get(name).copied()
271 }
272
273 /// Total number of resolved symbols. Zero when /proc/kallsyms
274 /// was readable but every line was redacted (unprivileged
275 /// caller case).
276 #[allow(dead_code)]
277 pub fn len(&self) -> usize {
278 self.by_name.len()
279 }
280
281 /// True when the table holds no usable symbols. Live-host
282 /// callers that hit this should surface a "run as root"
283 /// diagnostic.
284 #[allow(dead_code)]
285 pub fn is_empty(&self) -> bool {
286 self.by_name.is_empty()
287 }
288}
289
#[cfg(test)]
mod tests {
    use super::*;

    /// The running kernel reports a non-empty release string that
    /// contains at least one dot (`X.Y` or `X.Y.Z` shape). A failure
    /// of `uname` itself would point at a broken test environment.
    #[test]
    fn uname_release_returns_nonempty() {
        let release = uname_release().expect("uname succeeds on Linux");
        assert!(!release.is_empty());
        assert!(
            release.contains('.'),
            "release {release:?} should look like X.Y or X.Y.Z"
        );
    }

    /// Every well-formed `HEX TYPE NAME` line of a representative
    /// snippet ends up in the table with its address intact.
    #[test]
    fn kallsyms_parse_basic() {
        let raw = "ffffffff80100000 T _stext\n\
                   ffffffff80101234 T scx_disable_workfn\n\
                   ffffffff80105678 t local_static_function\n\
                   ffffffff8000abcd D ext_sched_class\n";
        let expected: [(&str, u64); 4] = [
            ("_stext", 0xffffffff80100000),
            ("scx_disable_workfn", 0xffffffff80101234),
            ("local_static_function", 0xffffffff80105678),
            ("ext_sched_class", 0xffffffff8000abcd),
        ];

        let table = KallsymsTable::parse(raw);
        for (name, addr) in expected.iter() {
            assert_eq!(table.resolve(name), Some(*addr));
        }
        assert_eq!(table.len(), 4);
        assert!(!table.is_empty());
    }

    /// Input in which every address is zero (the redacted view)
    /// parses to an empty table instead of failing, leaving it to
    /// the caller to tell "unreadable" (load error) apart from
    /// "redacted" (parsed but empty).
    #[test]
    fn kallsyms_parse_skips_zero_addresses() {
        let raw = "0000000000000000 T _stext\n\
                   0000000000000000 T scx_disable_workfn\n";
        let table = KallsymsTable::parse(raw);
        assert!(table.is_empty());
        assert_eq!(table.resolve("_stext"), None);
    }

    /// A non-hex address and a too-short line are dropped without
    /// disturbing the valid lines around them.
    #[test]
    fn kallsyms_parse_skips_malformed_lines() {
        let raw = "ffffffff80100000 T _stext\n\
                   not-a-hex-address T garbage\n\
                   short_line\n\
                   ffffffff80105678 T good_symbol\n";
        let table = KallsymsTable::parse(raw);
        assert_eq!(table.resolve("_stext"), Some(0xffffffff80100000));
        assert_eq!(table.resolve("good_symbol"), Some(0xffffffff80105678));
        assert_eq!(table.resolve("garbage"), None);
        assert_eq!(table.len(), 2);
    }

    /// `KallsymsTable::load_from_path` reads an arbitrary file, not
    /// just the live `/proc/kallsyms`; exercised with a temp file.
    #[test]
    fn kallsyms_load_from_path() {
        let tmp = tempfile::NamedTempFile::new().unwrap();
        std::fs::write(
            tmp.path(),
            "ffffffff80100000 T _stext\nffffffff80101234 T target_symbol\n",
        )
        .unwrap();

        let table = KallsymsTable::load_from_path(tmp.path()).unwrap();
        assert_eq!(table.resolve("target_symbol"), Some(0xffffffff80101234));
    }

    /// Smoke test for `LiveHostKernelEnv::discover`. It only runs
    /// the checks when `/sys/kernel/btf/vmlinux` exists; on hosts
    /// without it (e.g. a build container) the test is a no-op.
    #[test]
    fn live_host_kernel_env_discover_smoke() {
        let sysfs_btf = Path::new("/sys/kernel/btf/vmlinux");
        if sysfs_btf.is_file() {
            let env =
                LiveHostKernelEnv::discover().expect("BTF present, discover should succeed");
            assert!(!env.release.is_empty());
            assert!(env.btf_path.exists());
            // The kallsyms path is fixed whether or not the file is
            // readable.
            assert_eq!(env.kallsyms_path, Path::new("/proc/kallsyms"));
        }
    }

    /// With a release string that matches no real path, `locate_btf`
    /// can only return the sysfs blob (when the host has one) or
    /// `None`; the ELF fallback must not invent a path.
    #[test]
    fn locate_btf_no_real_release_returns_none_or_sysfs() {
        match locate_btf("definitely-not-a-kernel-release-9.99") {
            Some(found) => assert_eq!(found, Path::new("/sys/kernel/btf/vmlinux")),
            None => {}
        }
    }
}
417}