ktstr/host_context.rs
1//! Host runtime state captured at sidecar-write time.
2//!
3//! [`HostContext`] is a snapshot of the host running the tool:
4//! kernel release, CPU identity, memory size, hugepages config,
5//! transparent-hugepage policy, kernel scheduler tunables, NUMA
//! node count, and kernel cmdline. Static fields (CPU identity,
//! total memory, hugepage size, online CPU count, NUMA count,
//! uname triple, per-CPU cpufreq governor) are memoized in
//! [`OnceLock`] across the process; dynamic fields (sched tunables,
//! hugepages totals, THP policy, cmdline) are re-read on every call
//! so run-time sysctl changes or hugepage reservations between
//! tests are not hidden by the cache.
13//!
14//! ## Static-cache staleness under hotplug
15//!
16//! The static-field cache pins the first snapshot it observes for
17//! the life of the process. This is OUR invariant, not the
18//! kernel's: `/proc/meminfo`'s `MemTotal`,
19//! `/sys/devices/system/node/*`, and the `uname()` return all
20//! update live when memory or NUMA hotplug fires, and a freshly-
21//! started process would pick up the new values on its next
22//! collect call. It is `STATIC_HOST_INFO`'s `OnceLock` that
23//! binds a single read for the process lifetime — not any
24//! kernel-side caching.
25//!
26//! So on a host where CPU / NUMA / memory hotplug fires between
27//! two collect calls in the same process, `HostContext` continues
28//! to report the pre-hotplug values — `total_memory_kib` stays at
29//! the original snapshot, `numa_nodes` does not reflect an
30//! added/removed node. `arch` is the only field genuinely immune
31//! (a reboot is required to change architecture).
32//!
33//! `cpufreq_governor` is similarly pinned: the per-CPU
34//! `scaling_governor` map is read once on first
35//! [`collect_host_context`] call and reused thereafter. A test
36//! that writes to `/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor`
37//! mid-process will not see the post-write value reflected in
38//! later snapshots. Governor changes are rare (they typically
39//! happen at boot via `cpupower`, systemd unit, or kernel default)
40//! and the cache trades that rare-mutation visibility for
41//! eliminating up to N × M sysfs reads per process (N = online
42//! CPUs, M = `collect_host_context` invocations).
43//!
44//! Tests that need live-updated values must either (a) avoid
45//! reading HostContext after the hotplug event, or (b) restart
46//! the process to force a fresh `OnceLock` population. No
47//! `reset` hook is exposed in production; the `#[cfg(test)]`-only
48//! reset machinery is for unit tests, not runtime recapture.
49
50use std::collections::BTreeMap;
51use std::sync::OnceLock;
52
53/// Host-level runtime state snapshot attached to each
54/// [`SidecarResult`](crate::test_support::SidecarResult). Every
55/// field is optional so a partial read (missing /proc entry,
56/// permission denied, parse failure) still records the fields that
57/// did succeed instead of dropping the whole snapshot.
58///
59/// # Constructing instances in tests
60///
61/// `HostContext` is `#[non_exhaustive]` — see
62/// [`crate::non_exhaustive`] for the cross-crate construction and
63/// pattern-match rules shared by every such type in the crate. The
64/// concrete pattern for `HostContext` is to start from a [`Default`]
65/// instance and mutate fields:
66///
67/// ```
68/// use ktstr::prelude::HostContext;
69/// let mut ctx = HostContext::default();
70/// ctx.cpu_model = Some("Test CPU".to_string());
71/// ctx.numa_nodes = Some(2);
72/// ```
73///
74/// For tests that want a populated baseline (non-trivial defaults
75/// for every field) instead of `Default`'s all-`None` minimum, start
76/// from [`HostContext::test_fixture`] and mutate from there.
77///
78/// # Partial-read round-trip
79///
80/// Fields representing producer-time partial-read outcomes use
81/// `serde(default, skip_serializing_if = ...)` so the absent
82/// state round-trips through the sidecar JSON — `None` for the
83/// `Option<T>` fields paired with `Option::is_none`, empty for
84/// the `cpufreq_governor` `BTreeMap<usize, String>` paired with
85/// `BTreeMap::is_empty`. A producer-time partial read (missing
86/// `/proc` entry, permission denied, parse failure) lands at the
87/// absent state, gets omitted on serialize, and deserializes back
88/// to the same absent state. The pattern exists for that
89/// producer-side partial-population path, not for cross-binary-
90/// version compatibility. Per the pre-1.0 sidecar-disposable
91/// rule, a sidecar written by a different binary version may
92/// fail to deserialize when the schema has diverged — re-run the
93/// test to regenerate it with the current schema.
94#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
95#[non_exhaustive]
96pub struct HostContext {
97 /// CPU model string — the `model name` line of `/proc/cpuinfo`.
98 /// Single value (first processor entry) since heterogeneous
99 /// CPU models on a single host are rare enough that the
100 /// extra complexity is not worth carrying.
101 #[serde(default, skip_serializing_if = "Option::is_none")]
102 pub cpu_model: Option<String>,
103 /// CPU vendor ID — the `vendor_id` line of `/proc/cpuinfo`
104 /// (e.g. `GenuineIntel`, `AuthenticAMD`). On ARM64,
105 /// `/proc/cpuinfo` uses `CPU implementer` instead of
106 /// `vendor_id`, so this field is `None`.
107 #[serde(default, skip_serializing_if = "Option::is_none")]
108 pub cpu_vendor: Option<String>,
109 /// Total physical memory in KiB — `MemTotal:` from
110 /// `/proc/meminfo`. The kernel labels the value `kB` but the
111 /// scale is 1024 bytes (KiB); the field name uses the
112 /// unambiguous IEC binary unit so the sidecar reader does not
113 /// need to guess the scale.
114 #[serde(default, skip_serializing_if = "Option::is_none")]
115 pub total_memory_kib: Option<u64>,
116 /// Configured huge pages — `HugePages_Total` from `/proc/meminfo`.
117 #[serde(default, skip_serializing_if = "Option::is_none")]
118 pub hugepages_total: Option<u64>,
119 /// Free huge pages — `HugePages_Free` from `/proc/meminfo`.
120 #[serde(default, skip_serializing_if = "Option::is_none")]
121 pub hugepages_free: Option<u64>,
122 /// Hugepage size in KiB — `Hugepagesize:` from `/proc/meminfo`
123 /// (labeled `kB` in the file; the scale is 1024 bytes / KiB).
124 #[serde(default, skip_serializing_if = "Option::is_none")]
125 pub hugepages_size_kib: Option<u64>,
126 /// Active THP policy — content of
127 /// `/sys/kernel/mm/transparent_hugepage/enabled` with the
128 /// bracketed selection preserved verbatim (e.g.
129 /// `"always [madvise] never"`). Trimmed of leading and
130 /// trailing whitespace by `read_trimmed_sysfs`, so the trailing
131 /// newline that sysfs appends does not appear in the captured
132 /// value. Stored as-read rather than parsed because the bracket
133 /// is the meaningful part and downstream tooling may want the
134 /// full menu too.
135 #[serde(default, skip_serializing_if = "Option::is_none")]
136 pub thp_enabled: Option<String>,
137 /// Active THP defrag policy — content of
138 /// `/sys/kernel/mm/transparent_hugepage/defrag`, bracket
139 /// preserved. Trimmed of leading and trailing whitespace by
140 /// `read_trimmed_sysfs`.
141 #[serde(default, skip_serializing_if = "Option::is_none")]
142 pub thp_defrag: Option<String>,
143 /// `/proc/sys/kernel/sched_*` tunables. Keys are the leaf
144 /// basename (e.g. `sched_migration_cost_ns`); values are the
145 /// file content trimmed of leading and trailing whitespace
146 /// (internal whitespace preserved — `read_trimmed_sysfs` uses
147 /// `str::trim`, which only strips edges). Every current
148 /// `sched_*` tunable is a scalar, but a future kernel that
149 /// exposes a multi-line tunable would land here as a
150 /// multi-line `String`. `None` when the `read_dir` of
151 /// `/proc/sys/kernel` fails; empty map when the directory is
152 /// readable but contains no entries starting with `sched_`
153 /// (or all such entries fail the per-file read or trim to
154 /// empty).
155 #[serde(default, skip_serializing_if = "Option::is_none")]
156 pub sched_tunables: Option<BTreeMap<String, String>>,
157 /// Number of online host CPUs — `HostTopology::online_cpus.len()`
158 /// from the same `from_sysfs` probe that drives `numa_nodes`.
159 /// `None` when the topology probe fails. Captured as a discrete
160 /// field so downstream consumers (sidecar readers, scheduler
161 /// regression dashboards) don't need to reconstruct a
162 /// HostTopology just to learn the CPU count.
163 #[serde(default, skip_serializing_if = "Option::is_none")]
164 pub online_cpus: Option<usize>,
165 /// Count of NUMA nodes — derived from
166 /// `HostTopology::from_sysfs` (the `cpu_to_node` map's distinct
167 /// value count). `None` when the topology probe itself fails so
168 /// "unknown" is distinguishable from a populated result. A probe
169 /// that succeeds but reports no CPU→node entries defaults to
170 /// `Some(1)` because every Linux system has at least one NUMA
171 /// node — see `count_numa_nodes_in_topology` for the full
172 /// rationale (in production, empty `cpu_to_node` from a
173 /// successful probe cannot happen because `TestTopology::from_system`
174 /// bails on zero online CPUs; the `.max(1)` floor is a guard
175 /// for synthetic/test topologies).
176 #[serde(default, skip_serializing_if = "Option::is_none")]
177 pub numa_nodes: Option<usize>,
178 /// Per-CPU scaling_governor string, keyed by CPU id. Read
179 /// from `/sys/devices/system/cpu/cpu{N}/cpufreq/scaling_governor`
180 /// for every online CPU. Value is the trimmed governor name
181 /// as written by the kernel (e.g. `"performance"`,
182 /// `"powersave"`, `"schedutil"`, `"ondemand"`).
183 ///
184 /// Per-CPU granularity matters: heterogeneous hosts (big.LITTLE,
185 /// P/E cores) can carry different governors on different CPUs,
186 /// and a scheduler micro-benchmark landing on a `powersave`
187 /// CPU sees 2× the latency of one landing on a `performance`
188 /// CPU. A run-level single-governor field would average this
189 /// out and hide the variance.
190 ///
191 /// Empty map when `/sys/devices/system/cpu/online` is
192 /// unreadable (sysfs absent, container without it mounted)
193 /// or when every per-CPU read fails. `skip_serializing_if`
194 /// keeps the sidecar compact on hosts without the data.
195 ///
196 /// Cached: the first [`collect_host_context`] call populates a
197 /// process-wide [`OnceLock`] with one read per online CPU;
198 /// subsequent calls clone the cached map. Governor changes
199 /// after first capture are not reflected — see the
200 /// "Static-cache staleness under hotplug" section in the
201 /// module-level docs for the full contract.
202 #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
203 pub cpufreq_governor: BTreeMap<usize, String>,
204 /// Kernel name — `uname.sysname` (typically `"Linux"`).
205 /// The nodename field is intentionally dropped; it's a local
206 /// hostname and has no place in a published sidecar.
207 #[serde(default, skip_serializing_if = "Option::is_none")]
208 pub kernel_name: Option<String>,
209 /// Kernel release — `uname.release` (e.g. `"6.11.0-rc3"`).
210 /// The full `/proc/version` banner is NOT captured because it
211 /// embeds the build host + gcc version string, which is
212 /// environment leakage.
213 #[serde(default, skip_serializing_if = "Option::is_none")]
214 pub kernel_release: Option<String>,
215 /// Machine architecture — `uname.machine` (e.g. `"x86_64"`,
216 /// `"aarch64"`).
217 #[serde(default, skip_serializing_if = "Option::is_none")]
218 pub arch: Option<String>,
219 /// `/proc/cmdline` verbatim (trimmed of leading and trailing
220 /// whitespace). Captures boot-time parameters that materially
221 /// affect scheduler behavior — `preempt=`, `isolcpus=`,
222 /// `nohz_full=`, `mitigations=`, hugepage reservations,
223 /// `transparent_hugepage=`, and others. Stored as a single
224 /// string because any split-into-pairs parser loses the
225 /// quoted-value and flag-only variants the kernel accepts.
226 ///
227 /// Named `kernel_cmdline` rather than `cmdline` to disambiguate
228 /// from [`SidecarResult::kargs`](crate::test_support::SidecarResult):
229 /// that field carries the extra kargs the ktstr VMM appended
230 /// when booting the guest, NOT the running host's boot line.
231 /// Both are cmdline-shaped strings but describe different
232 /// systems.
233 #[serde(default, skip_serializing_if = "Option::is_none")]
234 pub kernel_cmdline: Option<String>,
235 /// Kernel delay-accounting state probed from
236 /// `/proc/sys/kernel/task_delayacct`. `None` only in synthetic
237 /// contexts that did not probe; [`collect_host_context`] always
238 /// populates it. Gates which taskstats delay-family fields are
239 /// genuinely measured — see [`DelayacctState`].
240 #[serde(default, skip_serializing_if = "Option::is_none")]
241 pub task_delayacct: Option<DelayacctState>,
242 /// CONFIG_TASK_XACCT build state probed from `/proc/config.gz`.
243 /// Gates the taskstats memory-watermark fields — see
244 /// [`XacctState`]. `None` only in synthetic contexts.
245 #[serde(default, skip_serializing_if = "Option::is_none")]
246 pub config_task_xacct: Option<XacctState>,
247 /// Running process's jemalloc heap state — active / allocated /
248 /// resident / mapped bytes and arena count. Populated on
249 /// jemalloc-linked builds (every ktstr binary), `None` on
250 /// downstream consumers that use the library without
251 /// installing `tikv_jemallocator` as `#[global_allocator]`. See
252 /// [`HostHeapState`](crate::host_heap::HostHeapState) for the
253 /// field-level documentation.
254 #[serde(default, skip_serializing_if = "Option::is_none")]
255 pub heap_state: Option<crate::host_heap::HostHeapState>,
256}
257
/// Pull the active policy out of a kernel mm menu string — for
/// instance `"always [madvise] never"` (THP enabled) or
/// `"always defer defer+madvise [madvise] never"` (THP defrag).
///
/// The result is the text between the first `[` and the first `]`
/// that follows it. `None` is returned when there is no `[`, or no
/// `]` after it.
///
/// **First-bracket-wins**: when the input holds several `[..]`
/// pairs (a hand-written test fixture, a malformed sysfs read), only
/// the FIRST pair is decoded and the rest are ignored. The kernel
/// emits exactly one bracketed token in practice; this scanner
/// decodes that canonical shape and does not validate arbitrary
/// input.
///
/// Kept as a pure, public helper so downstream tooling interested in
/// the active policy (rather than the whole menu) need not
/// re-implement the scan. The raw menu string remains available on
/// [`HostContext`]; [`HostContext::thp_enabled_active`] and
/// [`HostContext::thp_defrag_active`] both delegate here.
pub fn parse_bracketed_active_policy(s: &str) -> Option<&str> {
    // Everything after the first '[' ...
    let (_, after_open) = s.split_once('[')?;
    // ... up to (not including) the first ']' that follows it.
    after_open.split_once(']').map(|(active, _)| active)
}
283
/// Render an optional value for diff output: the value's `Display`
/// form when present, the literal placeholder `(unknown)` when
/// absent.
fn fmt_opt<T: std::fmt::Display>(v: Option<&T>) -> String {
    v.map_or_else(|| "(unknown)".to_string(), ToString::to_string)
}
290
291fn diff_row<T: std::fmt::Display + PartialEq>(
292 out: &mut String,
293 key: &str,
294 a: Option<&T>,
295 b: Option<&T>,
296) {
297 use std::fmt::Write;
298 if a == b {
299 return;
300 }
301 let _ = writeln!(out, " {key}: {} → {}", fmt_opt(a), fmt_opt(b));
302}
303
/// One-token cardinality summary of an optional tunables map:
/// `(unknown)` for `None`, `(empty)` for an empty map, `(1 entry)`
/// for a single entry, and `(N entries)` otherwise.
fn summarize_tunables(m: Option<&BTreeMap<String, String>>) -> String {
    match m.map(BTreeMap::len) {
        None => "(unknown)".to_string(),
        Some(0) => "(empty)".to_string(),
        Some(1) => "(1 entry)".to_string(),
        Some(count) => format!("({} entries)", count),
    }
}
312
/// Write the per-CPU `cpufreq_governor` part of a
/// [`HostContext::diff`] into `out`.
///
/// The CPU ids of both maps are merged into one `BTreeSet`, so CPUs
/// are visited in ascending id order. For every CPU whose governor
/// differs between the two sides a line of the form
/// `cpufreq_governor.cpuN: before → after` is appended; a CPU that
/// exists on one side only is shown as `(absent)` on the other.
/// CPUs with identical governors produce no output.
fn diff_cpufreq_governor(
    out: &mut String,
    a_cpufreq_governor: &BTreeMap<usize, String>,
    b_cpufreq_governor: &BTreeMap<usize, String>,
) {
    use std::collections::BTreeSet;
    use std::fmt::Write;

    /// Governor name, or the `(absent)` placeholder for a CPU that
    /// is missing from one side.
    fn shown(governor: Option<&String>) -> &str {
        governor.map_or("(absent)", String::as_str)
    }

    let cpu_ids: BTreeSet<usize> = a_cpufreq_governor
        .keys()
        .chain(b_cpufreq_governor.keys())
        .copied()
        .collect();

    for cpu in cpu_ids {
        let before = a_cpufreq_governor.get(&cpu);
        let after = b_cpufreq_governor.get(&cpu);
        if before == after {
            continue;
        }
        let _ = writeln!(
            out,
            " cpufreq_governor.cpu{cpu}: {} → {}",
            shown(before),
            shown(after),
        );
    }
}
341
342/// Append the `sched_tunables` section of a [`HostContext::diff`].
343/// When both sides carry a map, emits one `sched_tunables.KEY:
344/// before → after` line per differing key in `BTreeSet`
345/// (ascending key) order, rendering `(absent)` for a key present on
346/// only one side. When the maps' `Option` presence or cardinality
347/// differs but a per-key diff is not applicable (one side `None`),
348/// emits a single summarized `sched_tunables: before → after` line
349/// via [`summarize_tunables`].
350fn diff_sched_tunables(
351 out: &mut String,
352 a_sched_tunables: Option<&BTreeMap<String, String>>,
353 b_sched_tunables: Option<&BTreeMap<String, String>>,
354) {
355 use std::fmt::Write;
356 match (a_sched_tunables, b_sched_tunables) {
357 (Some(am), Some(bm)) => {
358 let mut keys: std::collections::BTreeSet<&str> = std::collections::BTreeSet::new();
359 keys.extend(am.keys().map(String::as_str));
360 keys.extend(bm.keys().map(String::as_str));
361 for k in keys {
362 let av = am.get(k);
363 let bv = bm.get(k);
364 if av != bv {
365 let _ = writeln!(
366 out,
367 " sched_tunables.{k}: {} → {}",
368 av.map(String::as_str).unwrap_or("(absent)"),
369 bv.map(String::as_str).unwrap_or("(absent)"),
370 );
371 }
372 }
373 }
374 (am, bm) if am != bm => {
375 let _ = writeln!(
376 out,
377 " sched_tunables: {} → {}",
378 summarize_tunables(am),
379 summarize_tunables(bm),
380 );
381 }
382 _ => {}
383 }
384}
385
386/// Append the `heap_state` section of a [`HostContext::diff`]. When
387/// both sides carry a [`HostHeapState`](crate::host_heap::HostHeapState),
388/// delegates to its own `diff` and, if non-empty, emits a
389/// `heap_state:` header followed by the nested diff indented two
390/// extra spaces. When only one side is present, emits a single
391/// `heap_state: (present) → (unknown)` (or reverse) line.
392fn diff_heap_state(
393 out: &mut String,
394 a_heap_state: Option<&crate::host_heap::HostHeapState>,
395 b_heap_state: Option<&crate::host_heap::HostHeapState>,
396) {
397 use std::fmt::Write;
398 match (a_heap_state, b_heap_state) {
399 (Some(ah), Some(bh)) => {
400 let inner = ah.diff(bh);
401 if !inner.is_empty() {
402 out.push_str(" heap_state:\n");
403 for line in inner.lines() {
404 let _ = writeln!(out, " {line}");
405 }
406 }
407 }
408 (a, b) if a != b => {
409 let _ = writeln!(
410 out,
411 " heap_state: {} → {}",
412 if a.is_some() {
413 "(present)"
414 } else {
415 "(unknown)"
416 },
417 if b.is_some() {
418 "(present)"
419 } else {
420 "(unknown)"
421 },
422 );
423 }
424 _ => {}
425 }
426}
427
428impl HostContext {
429 /// Populated [`HostContext`] for unit tests. Every field carries
430 /// a reasonable non-trivial value so call sites only spell out
431 /// what they want to vary via post-hoc field assignment
432 /// (`#[non_exhaustive]` rejects all StructExpression forms
433 /// cross-crate, including functional update):
434 ///
435 /// ```
436 /// use ktstr::prelude::HostContext;
437 /// let mut ctx = HostContext::test_fixture();
438 /// ctx.numa_nodes = Some(4);
439 /// ```
440 ///
441 /// Defaults model a plausible 2-node x86_64 Linux host: Intel
442 /// CPU identity, 64 GiB memory, 2 NUMA nodes, default THP
443 /// policies, a minimal `sched_*` tunable map, and a populated
444 /// uname triple. Parity with
445 /// `SidecarResult::test_fixture`
446 /// — both fixtures exist so tests don't re-derive an
447 /// "everything populated" baseline in every call site.
448 ///
449 /// # Usage guidance
450 ///
451 /// Prefer this fixture over local "populated default" helpers
452 /// — a local closure duplicates the default set and drifts the
453 /// moment [`HostContext`] grows a field. This is the single
454 /// place those defaults live. Hash-stability and
455 /// serialization-pin tests are the one exception: they must
456 /// NOT rely on these defaults, because any future change to
457 /// the fixture would silently shift the pinned value. Spell
458 /// every participating field out explicitly in such tests so
459 /// the pin is robust against fixture evolution.
460 pub fn test_fixture() -> HostContext {
461 let mut sched_tunables = BTreeMap::new();
462 sched_tunables.insert("sched_migration_cost_ns".to_string(), "500000".to_string());
463 sched_tunables.insert("sched_latency_ns".to_string(), "24000000".to_string());
464 HostContext {
465 cpu_model: Some("Intel(R) Xeon(R) Test CPU".to_string()),
466 cpu_vendor: Some("GenuineIntel".to_string()),
467 total_memory_kib: Some(64 * 1024 * 1024),
468 hugepages_total: Some(0),
469 hugepages_free: Some(0),
470 hugepages_size_kib: Some(2048),
471 thp_enabled: Some("always [madvise] never".to_string()),
472 thp_defrag: Some("always defer defer+madvise [madvise] never".to_string()),
473 sched_tunables: Some(sched_tunables),
474 online_cpus: Some(16),
475 numa_nodes: Some(2),
476 cpufreq_governor: {
477 let mut m = BTreeMap::new();
478 for cpu in 0..16 {
479 m.insert(cpu, "performance".to_string());
480 }
481 m
482 },
483 kernel_name: Some("Linux".to_string()),
484 kernel_release: Some("6.16.0-test".to_string()),
485 arch: Some("x86_64".to_string()),
486 kernel_cmdline: Some("BOOT_IMAGE=/boot/vmlinuz-test root=/dev/sda1".to_string()),
487 task_delayacct: Some(DelayacctState::On),
488 config_task_xacct: Some(XacctState::On),
489 heap_state: Some(crate::host_heap::HostHeapState::test_fixture()),
490 }
491 }
492
493 /// Render as a human-readable multi-line report. Each field
494 /// occupies one line as `key: value`. Absent fields render as
495 /// `(unknown)` rather than being dropped, so operators see
496 /// which fields failed to populate. The `sched_tunables` map
497 /// is expanded one entry per line under the parent key; an
498 /// empty map renders as `(empty)` and a `None` map as
499 /// `(unknown)`. The output ends with a newline.
500 ///
501 /// This output is for human inspection only. For programmatic
502 /// access, parse the sidecar JSON directly or drive `serde_json`
503 /// against the [`HostContext`] struct — the text format here is
504 /// not a stable serialization contract and may be retuned for
505 /// readability without notice.
506 ///
507 /// Naming: the name pair (`format_human` with no
508 /// `format_machine`) is intentional rather than accidental
509 /// asymmetry. The "machine" surface is serde JSON — callers
510 /// that want a machine-readable rendering use
511 /// `serde_json::to_string(ctx)` directly. A dedicated
512 /// `format_machine` wrapper around that one line would add no
513 /// value. `format_human` stays named as it is (not as
514 /// `impl Display`) because it prints a multi-line block with
515 /// its own newline, which clashes with `Display`'s implicit
516 /// one-value-per-formatter convention; embedding this in
517 /// `format!("{ctx}")` would surprise callers used to single-
518 /// line Display output.
519 pub fn format_human(&self) -> String {
520 use std::fmt::Write;
521 // Destructuring bind forces every field of HostContext to
522 // appear by name here. Adding a new field to the struct
523 // will fail compilation until this function handles it —
524 // that is the intent, it prevents `show-host` from
525 // silently dropping a freshly-captured dimension.
526 let HostContext {
527 cpu_model,
528 cpu_vendor,
529 total_memory_kib,
530 hugepages_total,
531 hugepages_free,
532 hugepages_size_kib,
533 thp_enabled,
534 thp_defrag,
535 sched_tunables,
536 online_cpus,
537 numa_nodes,
538 cpufreq_governor,
539 kernel_name,
540 kernel_release,
541 arch,
542 kernel_cmdline,
543 task_delayacct,
544 config_task_xacct,
545 heap_state,
546 } = self;
547 fn row<T: std::fmt::Display>(out: &mut String, key: &str, value: Option<&T>) {
548 match value {
549 Some(v) => {
550 let _ = writeln!(out, "{key}: {v}");
551 }
552 None => {
553 let _ = writeln!(out, "{key}: (unknown)");
554 }
555 }
556 }
557 let mut out = String::new();
558 row(&mut out, "kernel_name", kernel_name.as_ref());
559 row(&mut out, "kernel_release", kernel_release.as_ref());
560 row(&mut out, "arch", arch.as_ref());
561 row(&mut out, "cpu_model", cpu_model.as_ref());
562 row(&mut out, "cpu_vendor", cpu_vendor.as_ref());
563 row(&mut out, "total_memory_kib", total_memory_kib.as_ref());
564 row(&mut out, "hugepages_total", hugepages_total.as_ref());
565 row(&mut out, "hugepages_free", hugepages_free.as_ref());
566 row(&mut out, "hugepages_size_kib", hugepages_size_kib.as_ref());
567 row(&mut out, "online_cpus", online_cpus.as_ref());
568 row(&mut out, "numa_nodes", numa_nodes.as_ref());
569 row(&mut out, "thp_enabled", thp_enabled.as_ref());
570 row(&mut out, "thp_defrag", thp_defrag.as_ref());
571 row(&mut out, "kernel_cmdline", kernel_cmdline.as_ref());
572 row(&mut out, "task_delayacct", task_delayacct.as_ref());
573 row(&mut out, "config_task_xacct", config_task_xacct.as_ref());
574 if cpufreq_governor.is_empty() {
575 out.push_str("cpufreq_governor: (empty)\n");
576 } else {
577 out.push_str("cpufreq_governor:\n");
578 for (cpu, gov) in cpufreq_governor {
579 let _ = writeln!(&mut out, " cpu{cpu} = {gov}");
580 }
581 }
582 match sched_tunables {
583 Some(map) if !map.is_empty() => {
584 out.push_str("sched_tunables:\n");
585 for (k, v) in map {
586 let _ = writeln!(&mut out, " {k} = {v}");
587 }
588 }
589 Some(_) => out.push_str("sched_tunables: (empty)\n"),
590 None => out.push_str("sched_tunables: (unknown)\n"),
591 }
592 match heap_state {
593 Some(h) => {
594 out.push_str("heap_state:\n");
595 for line in h.format_human().lines() {
596 let _ = writeln!(&mut out, " {line}");
597 }
598 }
599 None => out.push_str("heap_state: (unknown)\n"),
600 }
601 out
602 }
603
604 /// Active THP-enabled policy, extracted from the bracketed
605 /// `[...]` token inside [`Self::thp_enabled`]. Returns the
606 /// content between the first `[` and subsequent `]` (e.g.
607 /// `"madvise"` from `"always [madvise] never"`). `None` when
608 /// `thp_enabled` is `None`, empty, or carries no bracketed
609 /// token (kernels that reshape the menu format).
610 ///
611 /// Provided so downstream tooling (`cargo ktstr stats`, CI
612 /// regression gates, custom dashboards) can consume the active
613 /// policy as a bare token without re-implementing the bracket
614 /// scan in every caller.
615 pub fn thp_enabled_active(&self) -> Option<&str> {
616 self.thp_enabled
617 .as_deref()
618 .and_then(parse_bracketed_active_policy)
619 }
620
621 /// Active THP-defrag policy, extracted the same way as
622 /// [`Self::thp_enabled_active`]. Returns e.g. `"madvise"` from
623 /// `"always defer defer+madvise [madvise] never"`.
624 pub fn thp_defrag_active(&self) -> Option<&str> {
625 self.thp_defrag
626 .as_deref()
627 .and_then(parse_bracketed_active_policy)
628 }
629
630 /// Render the differences between two host contexts as
631 /// indented `key: before → after` lines. Fields that compare
632 /// equal are omitted; an empty return value means the two
633 /// contexts are field-for-field identical (including
634 /// `sched_tunables`). `None` values render as `(unknown)` and
635 /// map entries present in one side only render as `(absent)`
636 /// so a `None → Some(..)` transition does not silently look
637 /// the same as an unchanged absent field. When only one side
638 /// has a `sched_tunables` map, the other side renders
639 /// `(unknown)`; the Some side renders as `(empty)` for an
640 /// empty map or `(N entries)` for a populated one so the
641 /// cardinality of the new data is visible at a glance.
642 pub fn diff(&self, other: &HostContext) -> String {
643 // Symmetric destructuring bind of both sides: forces every
644 // field to appear by name here, same reason as
645 // `format_human` — a new HostContext field must be
646 // explicitly classified as hash-participating, scalar, or
647 // structured before diff will compile.
648 let HostContext {
649 cpu_model: a_cpu_model,
650 cpu_vendor: a_cpu_vendor,
651 total_memory_kib: a_total_memory_kib,
652 hugepages_total: a_hugepages_total,
653 hugepages_free: a_hugepages_free,
654 hugepages_size_kib: a_hugepages_size_kib,
655 thp_enabled: a_thp_enabled,
656 thp_defrag: a_thp_defrag,
657 sched_tunables: a_sched_tunables,
658 online_cpus: a_online_cpus,
659 numa_nodes: a_numa_nodes,
660 cpufreq_governor: a_cpufreq_governor,
661 kernel_name: a_kernel_name,
662 kernel_release: a_kernel_release,
663 arch: a_arch,
664 kernel_cmdline: a_kernel_cmdline,
665 task_delayacct: a_task_delayacct,
666 config_task_xacct: a_config_task_xacct,
667 heap_state: a_heap_state,
668 } = self;
669 let HostContext {
670 cpu_model: b_cpu_model,
671 cpu_vendor: b_cpu_vendor,
672 total_memory_kib: b_total_memory_kib,
673 hugepages_total: b_hugepages_total,
674 hugepages_free: b_hugepages_free,
675 hugepages_size_kib: b_hugepages_size_kib,
676 thp_enabled: b_thp_enabled,
677 thp_defrag: b_thp_defrag,
678 sched_tunables: b_sched_tunables,
679 online_cpus: b_online_cpus,
680 numa_nodes: b_numa_nodes,
681 cpufreq_governor: b_cpufreq_governor,
682 kernel_name: b_kernel_name,
683 kernel_release: b_kernel_release,
684 arch: b_arch,
685 kernel_cmdline: b_kernel_cmdline,
686 task_delayacct: b_task_delayacct,
687 config_task_xacct: b_config_task_xacct,
688 heap_state: b_heap_state,
689 } = other;
690 let mut out = String::new();
691 diff_row(
692 &mut out,
693 "kernel_name",
694 a_kernel_name.as_ref(),
695 b_kernel_name.as_ref(),
696 );
697 diff_row(
698 &mut out,
699 "kernel_release",
700 a_kernel_release.as_ref(),
701 b_kernel_release.as_ref(),
702 );
703 diff_row(&mut out, "arch", a_arch.as_ref(), b_arch.as_ref());
704 diff_row(
705 &mut out,
706 "cpu_model",
707 a_cpu_model.as_ref(),
708 b_cpu_model.as_ref(),
709 );
710 diff_row(
711 &mut out,
712 "cpu_vendor",
713 a_cpu_vendor.as_ref(),
714 b_cpu_vendor.as_ref(),
715 );
716 diff_row(
717 &mut out,
718 "total_memory_kib",
719 a_total_memory_kib.as_ref(),
720 b_total_memory_kib.as_ref(),
721 );
722 diff_row(
723 &mut out,
724 "hugepages_total",
725 a_hugepages_total.as_ref(),
726 b_hugepages_total.as_ref(),
727 );
728 diff_row(
729 &mut out,
730 "hugepages_free",
731 a_hugepages_free.as_ref(),
732 b_hugepages_free.as_ref(),
733 );
734 diff_row(
735 &mut out,
736 "hugepages_size_kib",
737 a_hugepages_size_kib.as_ref(),
738 b_hugepages_size_kib.as_ref(),
739 );
740 diff_row(
741 &mut out,
742 "online_cpus",
743 a_online_cpus.as_ref(),
744 b_online_cpus.as_ref(),
745 );
746 diff_row(
747 &mut out,
748 "numa_nodes",
749 a_numa_nodes.as_ref(),
750 b_numa_nodes.as_ref(),
751 );
752 diff_row(
753 &mut out,
754 "thp_enabled",
755 a_thp_enabled.as_ref(),
756 b_thp_enabled.as_ref(),
757 );
758 diff_row(
759 &mut out,
760 "thp_defrag",
761 a_thp_defrag.as_ref(),
762 b_thp_defrag.as_ref(),
763 );
764 diff_row(
765 &mut out,
766 "kernel_cmdline",
767 a_kernel_cmdline.as_ref(),
768 b_kernel_cmdline.as_ref(),
769 );
770 diff_row(
771 &mut out,
772 "task_delayacct",
773 a_task_delayacct.as_ref(),
774 b_task_delayacct.as_ref(),
775 );
776 diff_row(
777 &mut out,
778 "config_task_xacct",
779 a_config_task_xacct.as_ref(),
780 b_config_task_xacct.as_ref(),
781 );
782 diff_cpufreq_governor(&mut out, a_cpufreq_governor, b_cpufreq_governor);
783 diff_sched_tunables(
784 &mut out,
785 a_sched_tunables.as_ref(),
786 b_sched_tunables.as_ref(),
787 );
788 diff_heap_state(&mut out, a_heap_state.as_ref(), b_heap_state.as_ref());
789 out
790 }
791}
792
/// Static-fields cache. These values are read once and then pinned
/// for the lifetime of the process (CPU identity, total installed
/// memory, hugepage size, online-CPU count, NUMA count, uname
/// triple), so walking `/proc` and `/sys` for them once and reusing
/// the result avoids repeated syscalls on every sidecar write. The
/// pinning is this module's caching policy, not a kernel guarantee:
/// see the module-level "Static-cache staleness under hotplug"
/// section for what happens when the underlying values shift
/// mid-process.
///
/// Dynamic fields (sched_tunables, hugepages_total, hugepages_free,
/// thp_enabled, thp_defrag, kernel_cmdline) are NOT cached — they
/// are re-read on every call so a sysctl write, hugepage
/// reservation, or THP policy flip between tests is not hidden by a
/// cached snapshot.
///
/// Per-CPU `cpufreq_governor` is cached separately in
/// [`CPUFREQ_GOVERNORS`] rather than embedded here so the cache
/// hit on the per-call path does not clone a `BTreeMap<usize, String>`
/// of up to `online_cpus` entries through the `StaticHostInfo`
/// clone — `StaticHostInfo` carries only `Option` scalars and
/// short strings and stays cheap to clone, while
/// `CPUFREQ_GOVERNORS` owns the heavyweight collection and is
/// cloned on its own hit-path.
#[derive(Clone)]
struct StaticHostInfo {
    // `model name` of the first processor block in `/proc/cpuinfo`.
    cpu_model: Option<String>,
    // `vendor_id` of the first processor block in `/proc/cpuinfo`.
    cpu_vendor: Option<String>,
    // `MemTotal` from `/proc/meminfo`.
    total_memory_kib: Option<u64>,
    // `Hugepagesize` from `/proc/meminfo`.
    hugepages_size_kib: Option<u64>,
    // Online-CPU count from the host topology probe.
    online_cpus: Option<usize>,
    // NUMA-node count derived from the same topology probe.
    numa_nodes: Option<usize>,
    // `uname()` sysname.
    kernel_name: Option<String>,
    // `uname()` release.
    kernel_release: Option<String>,
    // `uname()` machine.
    arch: Option<String>,
}
823
/// Process-wide cache for [`StaticHostInfo`]. The first
/// [`collect_host_context`] call fills it through
/// [`compute_static_host_info`]; every later call clones the value
/// stored here instead of re-probing the host.
static STATIC_HOST_INFO: OnceLock<StaticHostInfo> = OnceLock::new();
825
/// Process-wide cache for the per-CPU `scaling_governor` map. The
/// first [`collect_host_context`] call populates this lock by
/// invoking [`read_cpufreq_governors`] (through
/// [`cached_cpufreq_governors`]); every later call clones the
/// cached `BTreeMap` instead of re-reading
/// `/sys/devices/system/cpu/cpu{N}/cpufreq/scaling_governor` for
/// every online CPU. With N online CPUs and M sidecar writes per
/// process, this collapses up to N × M sysfs reads (a 256-CPU
/// host running a 1000-test session = 256 000 reads) to N.
///
/// Whatever the first read returns is what stays cached — including
/// an empty map. See the module-level "Static-cache staleness under
/// hotplug" section for the consequences of pinning the first
/// observed snapshot — runtime governor changes after first capture
/// are not reflected.
static CPUFREQ_GOVERNORS: OnceLock<BTreeMap<usize, String>> = OnceLock::new();
839
/// Test-only call counter for [`compute_static_host_info`], bumped
/// once at the top of every invocation. Pinned by `call_counts_*`
/// tests to prove the OnceLock initializer runs at most once per
/// process, independent of how many `collect_host_context` calls
/// happen. Production builds do not carry the counter.
#[cfg(test)]
static STATIC_INIT_CALLS: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0);

/// Test-only call counter for [`read_meminfo`], bumped once at the
/// top of every invocation. Pinned by `call_counts_*` tests to prove
/// the `/proc/meminfo` dedup holds — exactly one read per
/// `collect_host_context` call, not the pre-dedup two reads on the
/// cold path. Production builds do not carry the counter.
#[cfg(test)]
static MEMINFO_READ_CALLS: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0);

/// Test-only call counter for [`read_cpufreq_governors`], bumped
/// once at the top of every invocation. Pinned by `call_counts_*`
/// tests to prove the [`CPUFREQ_GOVERNORS`] cache exercises the
/// underlying sysfs walk at most once per process. Production
/// builds do not carry the counter.
#[cfg(test)]
static CPUFREQ_GOVERNORS_READ_CALLS: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(0);
863
864/// Capture the host context. Static fields are collected once
865/// and cached; dynamic fields are re-read on every call so
866/// intra-run sysctl / hugepage / THP changes are reflected.
867///
868/// Every sub-read is fallible; individual failures leave the
869/// corresponding field `None` and the rest of the context
870/// proceeds. Even on a host where every `/proc` and `/sys` read
871/// fails, the three uname-derived fields (`kernel_name`,
872/// `kernel_release`, `arch`) still populate because they come from
873/// the `uname()` syscall — filesystem-independent. An
874/// otherwise-empty `HostContext` serializes to a near-empty JSON
875/// object and distinguishes "collection attempted, nothing known"
876/// from "collection not attempted" (represented at the enclosing
877/// `Option<HostContext>` layer on
878/// [`SidecarResult`](crate::test_support::SidecarResult)).
879///
880/// # Timing: post-run snapshot
881///
882/// Production call sites invoke this at sidecar-write time (see
883/// `test_support::sidecar::write_sidecar` and `write_skip_sidecar`),
884/// which runs AFTER the VM finishes. The returned snapshot
885/// therefore reflects post-run host state, not the pre-run
886/// environment the scheduler booted into.
887///
888/// Fields fall into two groups by how they are read:
889///
890/// Static subset (memoised in `STATIC_HOST_INFO` —
891/// or, for `cpufreq_governor`, the parallel
892/// `CPUFREQ_GOVERNORS` cache — identical across every call in
893/// the process, shift only under CPU / memory / NUMA hotplug or
894/// runtime governor change): the uname triple, CPU identity
895/// (`cpu_model` + `cpu_vendor`), `total_memory_kib`,
896/// `hugepages_size_kib`, `online_cpus`, `numa_nodes`, and
897/// `cpufreq_governor`.
898///
899/// Dynamic subset (re-read on every call): `kernel_cmdline`,
900/// `hugepages_total`, `hugepages_free`, `thp_enabled`,
901/// `thp_defrag`, `sched_tunables`. `kernel_cmdline` is
902/// mechanically dynamic (re-read each call) but effectively
903/// static for the process (changes only across reboot). The
904/// others can genuinely drift between pre-run and post-run:
905///
906/// - `sched_tunables`: a test that writes to `/proc/sys/kernel/sched_*`
907/// and does not restore the previous value will be observed
908/// with the test-mutated value.
909/// - `hugepages_total` / `hugepages_free`: a test that reserves
910/// or releases hugepages shifts the counts.
911/// - `thp_enabled` / `thp_defrag`: a test that flips THP policy
912/// is captured with the flipped policy.
913///
914/// Dashboards and regression tooling that need the environment
915/// the scheduler actually saw (not the post-run state) should
916/// treat the three drift-prone fields as "post-run snapshot" and
917/// either (a) disable them in the comparison, or (b) capture a
918/// pre-run snapshot via [`collect_host_context_pre_run`] and
919/// travel the pair via [`HostContextSnapshots`].
920pub fn collect_host_context() -> HostContext {
921 // Read `/proc/meminfo` exactly once per call and share the
922 // parsed fields with `compute_static_host_info` (for `mem_total_kib`
923 // / `hugepages_size_kib` on cold init) and with the per-call
924 // hugepage counters. The prior formulation read `/proc/meminfo`
925 // twice on the cold path — once here for the dynamic counters
926 // and once inside the `OnceLock` init for the static fields —
927 // which is wasted syscall + parse work.
928 let meminfo = read_meminfo();
929 let static_info = STATIC_HOST_INFO
930 .get_or_init(|| compute_static_host_info(&meminfo))
931 .clone();
932 HostContext {
933 cpu_model: static_info.cpu_model,
934 cpu_vendor: static_info.cpu_vendor,
935 total_memory_kib: static_info.total_memory_kib,
936 hugepages_total: meminfo.hugepages_total,
937 hugepages_free: meminfo.hugepages_free,
938 hugepages_size_kib: static_info.hugepages_size_kib,
939 thp_enabled: read_trimmed_sysfs("/sys/kernel/mm/transparent_hugepage/enabled"),
940 thp_defrag: read_trimmed_sysfs("/sys/kernel/mm/transparent_hugepage/defrag"),
941 sched_tunables: read_sched_tunables(),
942 online_cpus: static_info.online_cpus,
943 numa_nodes: static_info.numa_nodes,
944 cpufreq_governor: cached_cpufreq_governors(),
945 kernel_name: static_info.kernel_name,
946 kernel_release: static_info.kernel_release,
947 arch: static_info.arch,
948 kernel_cmdline: read_trimmed_sysfs("/proc/cmdline"),
949 task_delayacct: Some(read_task_delayacct()),
950 config_task_xacct: Some(read_config_task_xacct()),
951 // `heap_state` is a post-run snapshot of the running ktstr
952 // process's jemalloc footprint. Captured here alongside the
953 // other dynamic fields so sidecar consumers can correlate
954 // test outcomes with runner memory pressure. libjemalloc is
955 // linked into every binary in this workspace (hard dep of
956 // `tikv-jemalloc-ctl`), so `collect()` always returns a
957 // populated struct when `#[global_allocator]` is jemalloc.
958 // Downstream consumers using ktstr without jemallocator
959 // installed see `allocated_bytes == Some(0)` and
960 // `active_bytes == Some(0)` because libjemalloc is linked
961 // but unused — collapse that shape to `None` so the sidecar
962 // does not carry a misleading empty row. `arenas.narenas` is
963 // still populated in the collapsed shape but alone carries
964 // no runner-pressure information, so it travels with the
965 // stats that give it meaning.
966 heap_state: {
967 let h = crate::host_heap::collect();
968 if h.allocated_bytes == Some(0) && h.active_bytes == Some(0) {
969 None
970 } else {
971 Some(h)
972 }
973 },
974 }
975}
976
/// Capture the host context at the start of a run, before the VM
/// boots or the test body mutates any sysctl / hugepage / THP
/// setting. Semantic alias for [`collect_host_context`] — the
/// collection mechanism is identical (same static-cache + dynamic
/// re-read policy) and callers remain free to call either function
/// on either side of the run, but the name pins intent:
/// `collect_host_context_pre_run` documents that the returned
/// snapshot is the authoritative view of the drift-prone dynamic
/// fields (`sched_tunables`, `hugepages_total` / `hugepages_free`,
/// `thp_enabled` / `thp_defrag`) as the scheduler saw them.
///
/// Pair the pre-run snapshot with the post-run snapshot produced by
/// [`collect_host_context`] via [`HostContextSnapshots`] so
/// downstream consumers can diff the two and surface environment
/// mutations attributable to the test body (e.g. "the test bumped
/// `/proc/sys/kernel/sched_migration_cost_ns` mid-run") rather than
/// silently folding them into a single ambiguous "post-run" record.
///
/// Static fields (uname triple, CPU identity, total memory,
/// hugepage size, online CPU count, NUMA node count) are
/// memoised across every call in the process via
/// `STATIC_HOST_INFO`, so a pre-run and a post-run snapshot taken
/// in the same process always carry identical static fields — even
/// when CPU/memory/NUMA hotplug fires between the two calls. A
/// difference in a static field therefore cannot be used to detect
/// hotplug within one process; see the module-level "Static-cache
/// staleness under hotplug" section for the hotplug contract.
pub fn collect_host_context_pre_run() -> HostContext {
    // Intentional delegation rather than code duplication: the
    // pre/post distinction is purely about WHEN the caller fires
    // the snapshot, not HOW the fields are read. Forking the
    // implementation would open the door to the two paths drifting
    // apart (a fix to dynamic-field parsing landing in one but not
    // the other), which is exactly the kind of bug the pair is
    // meant to expose.
    collect_host_context()
}
1014
/// Paired pre-run / post-run [`HostContext`] snapshots captured
/// from a single test run, intended for sidecar persistence so
/// downstream analysis can diff the drift-prone dynamic fields
/// (`sched_tunables`, `hugepages_*`, `thp_*`) between the two
/// endpoints.
///
/// The struct deliberately carries both snapshots in full —
/// including the static fields (uname triple, CPU identity, total
/// memory, and the other `OnceLock`-cached values) that are equal
/// whenever both snapshots were collected in the same process.
/// Duplicating them on the wire (a few hundred bytes of JSON per
/// sidecar) keeps each snapshot self-describing so a consumer that
/// only cares about the post-run state can read
/// [`HostContextSnapshots::post`] in isolation without reassembling
/// fields from [`HostContextSnapshots::pre`], and a consumer that
/// diffs the pair does not have to special-case "which field is
/// cached and which is dynamic".
///
/// Serde shape: both fields serialize as a full `HostContext`
/// object under their own keys (`pre` and `post`). The per-field
/// `#[serde(default, skip_serializing_if = ...)]` policy on
/// `HostContext` carries through, so populated snapshots stay
/// compact. The whole struct is `#[non_exhaustive]` — see
/// [`crate::non_exhaustive`] for construction and pattern-match
/// rules.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct HostContextSnapshots {
    /// Captured before the test body runs — typically via
    /// [`collect_host_context_pre_run`] at the start of sidecar
    /// setup.
    pub pre: HostContext,
    /// Captured after the test body finishes — typically via
    /// [`collect_host_context`] at sidecar-write time.
    pub post: HostContext,
}
1051
1052impl HostContextSnapshots {
1053 /// Construct a pair from explicit pre/post snapshots. Prefer
1054 /// this constructor over a (forbidden cross-crate) struct
1055 /// literal so future fields can land on
1056 /// [`HostContextSnapshots`] without breaking callers.
1057 pub fn new(pre: HostContext, post: HostContext) -> Self {
1058 Self { pre, post }
1059 }
1060
1061 /// Capture both endpoints in a single call. Useful for tests
1062 /// and callers that don't observe a test body between the two
1063 /// snapshots and only want to stamp the pair structurally (both
1064 /// endpoints will reflect the same dynamic state because no
1065 /// mutation happened in between).
1066 ///
1067 /// `#[cfg(test)]`-gated so production sidecar writers cannot
1068 /// reach it by accident — they need
1069 /// [`collect_host_context_pre_run`] before the run and
1070 /// [`collect_host_context`] after, which
1071 /// [`HostContextSnapshots::new`] then pairs. The compile-time
1072 /// gate replaces the earlier doc-only warning.
1073 #[cfg(test)]
1074 pub fn capture_same_instant() -> Self {
1075 let snap = collect_host_context();
1076 Self {
1077 pre: snap.clone(),
1078 post: snap,
1079 }
1080 }
1081}
1082
1083/// Return the per-CPU `scaling_governor` map, populating the
1084/// process-wide [`CPUFREQ_GOVERNORS`] cache on first call and
1085/// cloning the cached value on every subsequent call. A clone of a
1086/// `BTreeMap<usize, String>` of even a few hundred entries is
1087/// orders of magnitude cheaper than the up to 256 sysfs `read`
1088/// syscalls the underlying [`read_cpufreq_governors`] performs on
1089/// a 256-CPU host.
1090fn cached_cpufreq_governors() -> BTreeMap<usize, String> {
1091 CPUFREQ_GOVERNORS
1092 .get_or_init(read_cpufreq_governors)
1093 .clone()
1094}
1095
1096/// Read `scaling_governor` for every online CPU, keyed by CPU
1097/// id. Reads `/sys/devices/system/cpu/cpu{N}/cpufreq/scaling_governor`
1098/// for each entry in `/sys/devices/system/cpu/online`. Returns an
1099/// empty map when `/sys/devices/system/cpu/online` is unreadable
1100/// (sysfs absent, constrained container) or when every per-CPU
1101/// read fails. A CPU with no `cpufreq/` directory (non-CPUFREQ
1102/// kernel, VM without passthrough) contributes no entry — the
1103/// missing-key shape is the "no governor reported" signal for
1104/// consumers.
1105///
1106/// Production callers reach this through
1107/// [`cached_cpufreq_governors`] which memoises the result in
1108/// [`CPUFREQ_GOVERNORS`]; a transient sysfs failure on the very
1109/// first call therefore pins an empty map for the remainder of
1110/// the process — see the module-level "Static-cache staleness"
1111/// section for the contract.
1112fn read_cpufreq_governors() -> BTreeMap<usize, String> {
1113 #[cfg(test)]
1114 CPUFREQ_GOVERNORS_READ_CALLS.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1115 let Ok(online_raw) = std::fs::read_to_string("/sys/devices/system/cpu/online") else {
1116 return BTreeMap::new();
1117 };
1118 let Ok(cpus) = crate::topology::parse_cpu_list(&online_raw) else {
1119 return BTreeMap::new();
1120 };
1121 let mut out = BTreeMap::new();
1122 for cpu in cpus {
1123 let path = format!("/sys/devices/system/cpu/cpu{cpu}/cpufreq/scaling_governor");
1124 if let Some(gov) = read_trimmed_sysfs(&path) {
1125 out.insert(cpu, gov);
1126 }
1127 }
1128 out
1129}
1130
1131/// Populate the static-fields cache on first access. Takes the
1132/// already-parsed `/proc/meminfo` from the caller so the cold path
1133/// does not re-read the file. Reads `/proc/cpuinfo` (CPU identity),
1134/// the host NUMA topology, and a single `uname()` call.
1135fn compute_static_host_info(meminfo: &MeminfoFields) -> StaticHostInfo {
1136 #[cfg(test)]
1137 STATIC_INIT_CALLS.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1138 let (cpu_model, cpu_vendor) = read_cpuinfo_identity();
1139 // `uname(2)` is unit-tested only through
1140 // `collect_host_context_returns_populated_struct_on_linux`
1141 // (integration-style — runs the real syscall and asserts the
1142 // sysname field populates). No injection seam exists by design:
1143 // the only post-syscall logic here is `.to_str().ok().map(...)`,
1144 // which is three method calls on `rustix::system::UtsName`'s
1145 // already-null-terminated-`CStr` accessors. Extracting that into
1146 // a pure parser would test `CStr::to_str` — std's invariant, not
1147 // ours — and the real fragility (syscall return, encoding on
1148 // non-Linux hosts) is untestable without a kernel mock, which
1149 // is outside ktstr's scope. Marking this not-unit-tested by
1150 // design.
1151 let u = rustix::system::uname();
1152 let (online_cpus, numa_nodes) = probe_host_topology_counts();
1153 StaticHostInfo {
1154 cpu_model,
1155 cpu_vendor,
1156 total_memory_kib: meminfo.mem_total_kib,
1157 hugepages_size_kib: meminfo.hugepages_size_kib,
1158 online_cpus,
1159 numa_nodes,
1160 kernel_name: u.sysname().to_str().ok().map(|s| s.to_string()),
1161 kernel_release: u.release().to_str().ok().map(|s| s.to_string()),
1162 arch: u.machine().to_str().ok().map(|s| s.to_string()),
1163 }
1164}
1165
1166/// One `HostTopology::from_sysfs` probe → both the online-CPU
1167/// count and the NUMA-node count. Returning a tuple keeps the
1168/// two derived values bound to the same probe, so a hotplug
1169/// event between reads cannot make them disagree. Both values
1170/// are `None` when the probe errors.
1171fn probe_host_topology_counts() -> (Option<usize>, Option<usize>) {
1172 match crate::vmm::host_topology::HostTopology::from_sysfs() {
1173 Ok(topo) => (
1174 Some(topo.online_cpus.len()),
1175 Some(count_numa_nodes_in_topology(&topo)),
1176 ),
1177 Err(_) => (None, None),
1178 }
1179}
1180
1181/// Read `/proc/cpuinfo` and extract the first processor's
1182/// `vendor_id` and `model name` lines. Thin I/O wrapper; the
1183/// parsing logic lives in [`parse_cpuinfo_identity`] so it can
1184/// be unit-tested with synthetic fixtures.
1185fn read_cpuinfo_identity() -> (Option<String>, Option<String>) {
1186 let Ok(text) = std::fs::read_to_string("/proc/cpuinfo") else {
1187 return (None, None);
1188 };
1189 parse_cpuinfo_identity(&text)
1190}
1191
/// Pure parser behind `read_cpuinfo_identity`, kept separate for
/// unit testability. Scans `/proc/cpuinfo` content and returns
/// `(model name, vendor_id)` — in that order — for the first
/// processor block.
///
/// Only lines before the first empty line are considered: the empty
/// line is the processor boundary, and stopping there keeps the
/// scan O(one processor) on big machines where `/proc/cpuinfo` can
/// span many MiB. Within that block, the first non-empty value seen
/// for each key wins; lines without a `:` and lines whose value is
/// empty after trimming are ignored.
fn parse_cpuinfo_identity(text: &str) -> (Option<String>, Option<String>) {
    // (model, vendor)
    let mut identity: (Option<String>, Option<String>) = (None, None);

    let first_block = text.lines().take_while(|line| !line.is_empty());
    for (key, value) in first_block.filter_map(|line| line.split_once(':')) {
        let value = value.trim();
        if value.is_empty() {
            continue;
        }
        let slot = match key.trim() {
            "model name" => &mut identity.0,
            "vendor_id" => &mut identity.1,
            _ => continue,
        };
        // Keep the first occurrence; later duplicates are ignored.
        if slot.is_none() {
            *slot = Some(value.to_owned());
        }
    }
    identity
}
1223
/// The `/proc/meminfo` fields the host-context snapshot consumes. A
/// purpose-built struct avoids the BTreeMap lookup/clone dance and
/// makes the set of captured fields explicit at the type level.
/// `Default` yields every field `None`, which is what
/// [`read_meminfo`] returns when the file cannot be read.
#[derive(Default)]
struct MeminfoFields {
    // Numeric value of the `MemTotal` line.
    mem_total_kib: Option<u64>,
    // Numeric value of the `HugePages_Total` line.
    hugepages_total: Option<u64>,
    // Numeric value of the `HugePages_Free` line.
    hugepages_free: Option<u64>,
    // Numeric value of the `Hugepagesize` line.
    hugepages_size_kib: Option<u64>,
}
1234
1235/// Read `/proc/meminfo` and extract the four fields the host
1236/// context needs. Thin I/O wrapper; parsing lives in
1237/// [`parse_meminfo`] so it can be unit-tested with synthetic
1238/// fixtures.
1239fn read_meminfo() -> MeminfoFields {
1240 #[cfg(test)]
1241 MEMINFO_READ_CALLS.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1242 let Ok(text) = std::fs::read_to_string("/proc/meminfo") else {
1243 return MeminfoFields::default();
1244 };
1245 parse_meminfo(&text)
1246}
1247
1248/// Pure parser split from `read_meminfo` for unit testability.
1249/// Parses the four `/proc/meminfo` fields the host context needs
1250/// from already-read content. Lines without a numeric first token
1251/// are silently skipped so a kernel that introduces a new
1252/// non-numeric line (e.g. a future flags field) does not poison
1253/// the struct.
1254fn parse_meminfo(text: &str) -> MeminfoFields {
1255 let mut out = MeminfoFields::default();
1256 for line in text.lines() {
1257 let Some((key, rest)) = line.split_once(':') else {
1258 continue;
1259 };
1260 let key = key.trim();
1261 let token = rest.split_whitespace().next().unwrap_or("");
1262 let Ok(n) = token.parse::<u64>() else {
1263 continue;
1264 };
1265 match key {
1266 "MemTotal" => out.mem_total_kib = Some(n),
1267 "HugePages_Total" => out.hugepages_total = Some(n),
1268 "HugePages_Free" => out.hugepages_free = Some(n),
1269 "Hugepagesize" => out.hugepages_size_kib = Some(n),
1270 _ => {}
1271 }
1272 }
1273 out
1274}
1275
1276/// Read a sysfs leaf (or `/proc` pseudofile) and return its
1277/// trimmed content. Thin I/O wrapper; parsing lives in
1278/// [`parse_trimmed`] so it can be unit-tested with synthetic
1279/// fixtures. Returns `None` on any read error (ENOENT, EACCES,
1280/// EIO) so the caller records the field as absent without
1281/// treating it as a fatal context-collection failure.
1282fn read_trimmed_sysfs(path: impl AsRef<std::path::Path>) -> Option<String> {
1283 std::fs::read_to_string(path.as_ref())
1284 .ok()
1285 .and_then(|s| parse_trimmed(&s))
1286}
1287
/// Pure parser behind `read_trimmed_sysfs`, kept separate for unit
/// testability. Strips leading and trailing whitespace and returns
/// the remainder as an owned string, or `None` when nothing is
/// left — an empty cmdline or thp file is not useful to record.
/// Bracketed content inside the value (e.g.
/// `"always [madvise] never"` from THP) is preserved verbatim
/// because `str::trim` only affects the edges.
fn parse_trimmed(text: &str) -> Option<String> {
    match text.trim() {
        "" => None,
        body => Some(body.to_owned()),
    }
}
1302
/// Walk `/proc/sys/kernel` for entries whose name starts with
/// `sched_` and record each as `basename → content`. Delegates to
/// [`read_sched_tunables_from`] with the production directory.
///
/// Skips any entry that is not a regular file — directories,
/// symlinks, sockets, fifos, and block/char devices all fall
/// through the `file_type.is_file()` guard. The kernel exposes no
/// non-file `sched_*` entries today but guarding keeps behavior
/// defined if that changes. Also skips entries whose name is not
/// valid UTF-8 and entries whose contents cannot be read or trim to
/// empty.
///
/// Returns `None` only when the directory listing itself fails
/// (unreadable `/proc/sys/kernel`); an empty map is a valid result
/// — it means the directory was readable but had no entries
/// starting with `sched_`, or every such entry failed the
/// per-file read or trimmed to empty.
fn read_sched_tunables() -> Option<BTreeMap<String, String>> {
    read_sched_tunables_from(std::path::Path::new("/proc/sys/kernel"))
}
1320
1321/// Path-parameterized walk used by [`read_sched_tunables`]. Seam for
1322/// unit tests that drive the walk with a tempdir full of `sched_*`
1323/// fixture files — everything the production caller does is mirrored
1324/// here except the hardcoded sysfs path, so a future test can
1325/// exercise the real walk + filter + read pipeline against a
1326/// controlled directory rather than against `/proc`.
1327fn read_sched_tunables_from(dir: &std::path::Path) -> Option<BTreeMap<String, String>> {
1328 let entries = std::fs::read_dir(dir).ok()?;
1329 let mut out = BTreeMap::new();
1330 for entry in entries.flatten() {
1331 let name = entry.file_name();
1332 let Some(name) = name.to_str() else { continue };
1333 if !name.starts_with("sched_") {
1334 continue;
1335 }
1336 let path = entry.path();
1337 let Ok(file_type) = entry.file_type() else {
1338 continue;
1339 };
1340 if !file_type.is_file() {
1341 continue;
1342 }
1343 if let Some(content) = read_trimmed_sysfs(&path) {
1344 out.insert(name.to_string(), content);
1345 }
1346 }
1347 Some(out)
1348}
1349
/// Build + runtime state of kernel delay accounting, probed from
/// `/proc/sys/kernel/task_delayacct`. Determines which taskstats
/// delay-family fields are actually being populated:
/// - `cpu_delay_*` come from `tsk->sched_info` and are filled
///   UNCONDITIONALLY by `delayacct_add_tsk` (kernel/delayacct.c)
///   whenever CONFIG_TASK_DELAY_ACCT is built in — they survive the
///   runtime toggle, so they read real values in both [`Self::On`]
///   and [`Self::RuntimeOff`].
/// - the per-resource lock categories (`blkio`, `swapin`,
///   `freepages`, `thrashing`, `compact`, `wpcopy`, `irq`) are gated
///   by `tsk->delays`, allocated at fork only when `delayacct_on`, so
///   they read genuine values only in [`Self::On`].
///
/// Serialized with `rename_all = "snake_case"` (`config_off`,
/// `runtime_off`, `on`). The `Display` impl uses hyphenated labels
/// instead, so the two spellings are not interchangeable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum DelayacctState {
    /// `/proc/sys/kernel/task_delayacct` absent. The sysctl is
    /// registered only under CONFIG_TASK_DELAY_ACCT
    /// (kernel/delayacct.c), so an absent file means the option is not
    /// built in and NO delay-family field is populated. (Under ktstr's
    /// root execution a present file is always readable, so absent is
    /// the only not-`On`/not-`RuntimeOff` outcome.)
    ConfigOff,
    /// File present, reads `0` — built in but the runtime toggle is
    /// off. `cpu_delay_*` still populate; the lock categories read zero
    /// for tasks forked while off.
    RuntimeOff,
    /// File present, reads `1` — delay accounting fully active.
    On,
}
1380
1381impl std::fmt::Display for DelayacctState {
1382 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1383 let s = match self {
1384 DelayacctState::ConfigOff => "config-off",
1385 DelayacctState::RuntimeOff => "runtime-off",
1386 DelayacctState::On => "on",
1387 };
1388 f.write_str(s)
1389 }
1390}
1391
/// Build state of extended task accounting (CONFIG_TASK_XACCT),
/// probed from `/proc/config.gz`. Gates the taskstats memory
/// watermark fields (`hiwater_rss_bytes`, `hiwater_vm_bytes`), which
/// `xacct_add_tsk` (kernel/tsacct.c) fills whenever CONFIG_TASK_XACCT
/// is built in — there is NO runtime toggle, unlike delay accounting.
///
/// Serialized with `rename_all = "snake_case"` (`on`, `off`,
/// `unknown`); the `Display` impl emits the same three labels.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum XacctState {
    /// `/proc/config.gz` shows `CONFIG_TASK_XACCT=y` — watermarks
    /// populate (for tasks with an address space; a kernel thread
    /// reads zero).
    On,
    /// `/proc/config.gz` shows `# CONFIG_TASK_XACCT is not set` — not
    /// built in, watermarks read zero.
    Off,
    /// `/proc/config.gz` is unreadable (no CONFIG_IKCONFIG_PROC, or the
    /// gz read/parse failed) or the symbol line is absent. Treated as
    /// measured (like [`Self::On`]) for gating so a host that does not
    /// expose its config never produces a false "absent" verdict.
    Unknown,
}
1414
1415impl std::fmt::Display for XacctState {
1416 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1417 let s = match self {
1418 XacctState::On => "on",
1419 XacctState::Off => "off",
1420 XacctState::Unknown => "unknown",
1421 };
1422 f.write_str(s)
1423 }
1424}
1425
/// Probe the delay-accounting state from `/proc/sys/kernel/task_delayacct`.
/// Delegates to [`read_task_delayacct_from`] with the production
/// path; see that function for how file content maps to a
/// [`DelayacctState`].
fn read_task_delayacct() -> DelayacctState {
    read_task_delayacct_from(std::path::Path::new("/proc/sys/kernel/task_delayacct"))
}
1430
1431/// Path-parameterized seam for [`read_task_delayacct`] — unit tests
1432/// drive it with a tempdir holding a `task_delayacct` fixture (or no
1433/// such file, for the [`DelayacctState::ConfigOff`] case). A
1434/// present-but-empty or unreadable file also maps to `ConfigOff`
1435/// (`read_trimmed_sysfs` returns `None`); the real `proc_dointvec_minmax`
1436/// sysctl always emits "0"/"1", so that conflation is unreachable under
1437/// ktstr's root execution where a present file is readable.
1438fn read_task_delayacct_from(path: &std::path::Path) -> DelayacctState {
1439 match read_trimmed_sysfs(path) {
1440 None => DelayacctState::ConfigOff,
1441 Some(s) if s == "1" => DelayacctState::On,
1442 // Any present-but-not-"1" value (canonically "0") is runtime-off.
1443 Some(_) => DelayacctState::RuntimeOff,
1444 }
1445}
1446
/// Probe CONFIG_TASK_XACCT from `/proc/config.gz`. Delegates to
/// [`read_config_task_xacct_from`] with the production path; an
/// unreadable or undecodable file yields [`XacctState::Unknown`].
fn read_config_task_xacct() -> XacctState {
    read_config_task_xacct_from(std::path::Path::new("/proc/config.gz"))
}
1451
1452/// Gz-decode `path` and classify CONFIG_TASK_XACCT. Unreadable file or
1453/// decode failure → [`XacctState::Unknown`]. The decompressed-text
1454/// classification lives in [`parse_kconfig_xacct`] so it is unit-testable
1455/// without writing a gz fixture.
1456fn read_config_task_xacct_from(path: &std::path::Path) -> XacctState {
1457 let Ok(bytes) = std::fs::read(path) else {
1458 return XacctState::Unknown;
1459 };
1460 use std::io::Read as _;
1461 let mut gz = flate2::read::GzDecoder::new(&bytes[..]);
1462 let mut text = String::new();
1463 if gz.read_to_string(&mut text).is_err() {
1464 return XacctState::Unknown;
1465 }
1466 parse_kconfig_xacct(&text)
1467}
1468
1469/// Classify CONFIG_TASK_XACCT from decompressed kconfig text. `=y` →
1470/// [`XacctState::On`]; `# CONFIG_TASK_XACCT is not set` →
1471/// [`XacctState::Off`]; the symbol absent entirely →
1472/// [`XacctState::Unknown`] (a partial/foreign config, treated as
1473/// measured downstream). Matches the whole trimmed line so a substring
1474/// in an unrelated symbol cannot false-match.
1475fn parse_kconfig_xacct(text: &str) -> XacctState {
1476 if text.lines().any(|l| l.trim() == "CONFIG_TASK_XACCT=y") {
1477 XacctState::On
1478 } else if text
1479 .lines()
1480 .any(|l| l.trim() == "# CONFIG_TASK_XACCT is not set")
1481 {
1482 XacctState::Off
1483 } else {
1484 XacctState::Unknown
1485 }
1486}
1487
/// Host-wide "is this taskstats sub-family genuinely populated" flags,
/// one per sub-family, derived from the two kernel-accounting probes.
///
/// Computed once per ctprof snapshot and baked into each captured thread
/// (AND-ed with the per-thread query-Ok). This lets the group aggregation
/// gate every sub-family on its own instead of treating the whole
/// taskstats payload as a single flag.
#[derive(Clone, Copy, Debug)]
pub(crate) struct TaskstatsActive {
    /// `cpu_delay_*` fields: sched_info-sourced and filled
    /// unconditionally, so they survive the runtime toggle — active
    /// unless CONFIG_TASK_DELAY_ACCT is off entirely.
    pub cpu_delay: bool,
    /// Delayacct resource-wait categories: require delayacct to be
    /// runtime-on (`tsk->delays` is allocated at fork only when
    /// `delayacct_on`).
    pub delay_block: bool,
    /// Xacct watermarks: require CONFIG_TASK_XACCT.
    /// [`XacctState::Unknown`] (config not exposed) counts as active to
    /// avoid a false absent.
    pub xacct: bool,
}
1505
1506/// Probe both kernel-accounting states and reduce to the per-sub-family active
1507/// flags. Two small `/proc` reads; called once per ctprof snapshot.
1508pub(crate) fn probe_taskstats_active() -> TaskstatsActive {
1509 taskstats_active_from(read_task_delayacct(), read_config_task_xacct())
1510}
1511
1512/// Pure reduction of the two probe enums to the per-sub-family active flags —
1513/// the counter-semantics crux, split out so it is unit-testable without `/proc`.
1514/// `cpu_delay` survives `RuntimeOff` (gate is `!= ConfigOff`, NOT `== On`,
1515/// because `delayacct_add_tsk` fills the sched_info CPU block BEFORE the
1516/// `if (!tsk->delays)` gate); the resource-wait categories need `== On`
1517/// (`tsk->delays` is allocated at fork only when delayacct is on); xacct keys off
1518/// CONFIG only, with `Unknown` treated active to avoid a false absent.
1519fn taskstats_active_from(delayacct: DelayacctState, xacct: XacctState) -> TaskstatsActive {
1520 TaskstatsActive {
1521 cpu_delay: delayacct != DelayacctState::ConfigOff,
1522 delay_block: delayacct == DelayacctState::On,
1523 xacct: xacct != XacctState::Off,
1524 }
1525}
1526
1527/// Pure-function seam used by [`probe_host_topology_counts`]
1528/// (which itself wraps
1529/// [`HostTopology::from_sysfs`](crate::vmm::host_topology::HostTopology::from_sysfs),
1530/// which in turn wraps
1531/// [`TestTopology::from_system`](crate::topology::TestTopology::from_system)):
1532/// given a [`HostTopology`](crate::vmm::host_topology::HostTopology),
1533/// return the number of distinct NUMA nodes it claims. An empty
1534/// `cpu_to_node` map maps to `1` because every Linux system has
1535/// at least one NUMA node — returning zero would misrepresent the
1536/// topology. Sparse / non-contiguous node IDs are counted
1537/// correctly because `BTreeSet::from_iter` deduplicates on
1538/// insert.
1539///
1540/// # Empty `cpu_to_node`: UMA or broken probe?
1541///
1542/// In production the answer is: empty cannot occur from a
1543/// successful probe.
1544/// [`TestTopology::from_system`](crate::topology::TestTopology::from_system)
1545/// bails on `online_cpus.is_empty()`, and every online CPU
1546/// whose `/sys/devices/system/cpu/cpuN/` directory exists falls
1547/// through to at least `llc_id=0, node_id=0` when the per-CPU
1548/// reads inside that directory fail. CPUs listed in
1549/// `/sys/devices/system/cpu/online` whose sysfs directory is
1550/// absent are dropped with a `tracing::warn!` rather than
1551/// fallen-through — so on a host where every listed CPU lacks
1552/// its sysfs dir, `llc_groups` would be empty and
1553/// `cpu_to_node` would be empty too. That failure mode is
1554/// degenerate (a listed-but-absent CPU is itself a kernel/sysfs
1555/// bug) and not the common case. The `.max(1)` floor is
1556/// therefore a guard for synthetic topologies (unit-test
1557/// callers of this pure function) and for the degenerate
1558/// "all-dropped" probe — treating "no entries, but probe said
1559/// OK" as UMA is the conservative interpretation.
1560///
1561/// Keeping the I/O (sysfs probe) separate from the pure counting
1562/// logic lets unit tests exercise the fallback branch and the
1563/// dedup path without standing up a real /sys layout.
1564pub(crate) fn count_numa_nodes_in_topology(
1565 topo: &crate::vmm::host_topology::HostTopology,
1566) -> usize {
1567 topo.cpu_to_node
1568 .values()
1569 .copied()
1570 .collect::<std::collections::BTreeSet<usize>>()
1571 .len()
1572 .max(1)
1573}
1574
1575// Most tests in this module are pure parsers / formatters / diff
1576// helpers that compile and pass on any target. The handful that
1577// actually read `/proc`, `/sys`, or assert `kernel_name == "Linux"`
1578// are individually gated with `#[cfg(target_os = "linux")]` at the
1579// test-fn level so non-Linux contributors still get coverage of the
1580// portable surface.
1581
1582#[cfg(test)]
1583#[path = "host_context_tests.rs"]
1584mod tests;