ktstr/ctprof/parse.rs
1//! /proc parsers and tallying readers extracted from
2//! `super::mod.rs`. Holds:
3//! - `parse_psi` / `parse_centi_percent` and the `read_*psi_at`
4//! helpers that wrap them
5//! - `read_sched_ext_sysfs_at` + `read_sysfs_u64`
6//! - `parse_stat` / `parse_schedstat` / `parse_io` / `parse_status`
7//! plus their `read_*_at_with_tally` wrappers
8//! - `parse_cgroup_v2` / `read_cgroup_at*`
9//! - `parse_sched` and the `parsed_ns_from_dotted` half-millisecond
10//! recovery (with the `ParseDottedNs` discriminator)
11//! - `parse_smaps_rollup` / `read_smaps_rollup_at_with_tally`
12//! - `parse_cpu_stat` / `parse_kv_counters` / `parse_max_or_u64*` /
13//! `parse_floor_value` / `parse_cpu_max`
14//! - `read_cgroup_stats_at` — opens the cgroup v2 files actually
15//! read by capture: `cpu.stat`, `cpu.max`, `cpu.weight`,
16//! `cpu.weight.nice`, `memory.current`, `memory.max`,
17//! `memory.high`, `memory.low`, `memory.min`, `memory.stat`,
18//! `memory.events`, `pids.current`, `pids.max`, plus the
19//! `<cgroup>.pressure` PSI files via `read_cgroup_psi_at`.
20//! `memory.swap.current`, `memory.peak`, `memory.zswap.current`,
//!   `pids.peak`, and `io.stat` are NOT read — the cgroup capture
//!   surface is intentionally limited to the v2 fields that the
//!   comparator schemas and the rest of the pipeline consume.
24//!
25//! ## Visibility
26//!
27//! The parsers and `read_*` helpers are `pub(super)` so the capture
28//! pipeline in `mod.rs` can call them without re-exporting through
29//! the public API. The serialization surface
30//! ([`CtprofSnapshot::load`](super::CtprofSnapshot::load) /
31//! [`write`](super::CtprofSnapshot::write)) and the snapshot
32//! constants stay `pub` on the parent because they ARE part of the
33//! ktstr public API, consumed by `cargo ktstr ctprof compare` and
34//! by the snapshot-loader crate consumers.
35
36use super::*;
37use std::collections::BTreeMap;
38use std::path::Path;
39
40/// Parse one PSI file's contents. The kernel emits one or two
41/// lines (`some` then `full`), each formatted by `seq_printf` in
42/// `psi_show()` (`kernel/sched/psi.c`). Lines are tokenized by whitespace;
43/// each token is `key=value`. Unknown keys are ignored so a
44/// future kernel that adds a 4th avg or new field doesn't break
45/// the parser. Missing fields default to 0 (matching the
46/// absent-counter contract used elsewhere in this module).
47pub(super) fn parse_psi(raw: &str) -> PsiResource {
48 let mut out = PsiResource::default();
49 for line in raw.lines() {
50 let mut tokens = line.split_whitespace();
51 let Some(prefix) = tokens.next() else {
52 continue;
53 };
54 let half = match prefix {
55 "some" => &mut out.some,
56 "full" => &mut out.full,
57 _ => continue,
58 };
59 for tok in tokens {
60 let Some((key, value)) = tok.split_once('=') else {
61 continue;
62 };
63 match key {
64 "avg10" => half.avg10 = parse_centi_percent(value),
65 "avg60" => half.avg60 = parse_centi_percent(value),
66 "avg300" => half.avg300 = parse_centi_percent(value),
67 "total" => half.total_usec = value.parse::<u64>().unwrap_or(0),
68 _ => {}
69 }
70 }
71 }
72 out
73}
74
75/// Convert `"N.NN"` (kernel `%lu.%02lu` format from `psi_show()`)
76/// to `N * 100 + NN` (centi-percent integer). On malformed input
77/// returns 0, matching the absent-counter default contract.
78/// Saturates at u16::MAX to guard against pathological input.
79///
80/// The kernel always emits a 2-digit zero-padded fraction
81/// (`%02lu`), but a robust parser zero-pads its own input to
82/// exactly 2 digits before combining: a stray `"1.5"` (one
83/// fractional digit) must read as `150` (1.50%), not `105`
84/// (1.05%); a stray `"1.501"` (three fractional digits) is
85/// truncated to `1.50` rather than producing
86/// `1*100 + 501 = 601`. Mirrors the
87/// [`parsed_ns_from_dotted`] helper's zero-pad-to-six discipline.
88pub(super) fn parse_centi_percent(s: &str) -> u16 {
89 let (int_part, frac_part) = s.split_once('.').unwrap_or((s, ""));
90 let Ok(int) = int_part.parse::<u32>() else {
91 return 0;
92 };
93 let frac = if frac_part.is_empty() {
94 0
95 } else {
96 // Zero-pad-to-2 then truncate-to-2: "5" → "50", "501" →
97 // "50". Matches the kernel's `%02lu` format width
98 // exactly so a parser-side roundtrip can never under- or
99 // over-count the fractional weight.
100 let padded: String = frac_part
101 .chars()
102 .chain(std::iter::repeat('0'))
103 .take(2)
104 .collect();
105 padded.parse::<u32>().unwrap_or(0)
106 };
107 let combined = int.saturating_mul(100).saturating_add(frac);
108 combined.try_into().unwrap_or(u16::MAX)
109}
110
111/// Read host-level PSI files (`<proc_root>/pressure/{cpu,memory,io,irq}`)
112/// and populate a [`Psi`] bundle. Each file is read independently;
113/// absent files (older kernels missing irq.pressure, or hosts
114/// with CONFIG_PSI off) collapse to the all-zero default per the
115/// absent-counter contract.
116///
117/// PSI readers (this fn, `read_cgroup_psi_at`) and the
118/// `read_sched_ext_sysfs_at` reader deliberately omit the
119/// `ParseTally` argument that the per-thread procfs readers
120/// thread through. Their build-gate signal is presence of the
121/// containing directory (`/proc/pressure/`,
122/// `/sys/kernel/sched_ext/`): an absent directory means the
123/// kernel feature is off, which is a host-property fact rather
124/// than a per-tid parse failure, and the snapshot's all-zero
125/// default already encodes the absence. Threading these
126/// readers into the tally would multiply the failure tally by
127/// the worker count without adding any operator-actionable
128/// signal beyond what the absent fields already convey.
129pub(super) fn read_host_psi_at(proc_root: &Path) -> Psi {
130 let pressure_dir = proc_root.join("pressure");
131 Psi {
132 cpu: read_psi_file_at(&pressure_dir.join("cpu")),
133 memory: read_psi_file_at(&pressure_dir.join("memory")),
134 io: read_psi_file_at(&pressure_dir.join("io")),
135 irq: read_psi_file_at(&pressure_dir.join("irq")),
136 }
137}
138
139/// Read global sched_ext sysfs state from
140/// `<sys_root>/kernel/sched_ext/`. Returns `None` when the
141/// directory itself is absent (CONFIG_SCHED_CLASS_EXT=n
142/// kernels never expose it). Per-file misses default the
143/// affected field to zero / empty string per the
144/// absent-counter contract — a future kernel that adds new
145/// global attrs (and that we haven't surfaced as fields yet)
146/// won't break the parser; old kernels missing one or more of
147/// the existing five collapse cleanly.
148pub(super) fn read_sched_ext_sysfs_at(sys_root: &Path) -> Option<SchedExtSysfs> {
149 let dir = sys_root.join("kernel").join("sched_ext");
150 // No `tally` arg: directory presence (Option<SchedExtSysfs>)
151 // is THE not-built signal; per-attr misses collapse silently
152 // per the absent-counter contract.
153 if !dir.exists() {
154 return None;
155 }
156 Some(SchedExtSysfs {
157 state: fs::read_to_string(dir.join("state"))
158 .map(|s| s.trim().to_string())
159 .unwrap_or_default(),
160 switch_all: read_sysfs_u64(&dir.join("switch_all")),
161 nr_rejected: read_sysfs_u64(&dir.join("nr_rejected")),
162 hotplug_seq: read_sysfs_u64(&dir.join("hotplug_seq")),
163 enable_seq: read_sysfs_u64(&dir.join("enable_seq")),
164 })
165}
166
167/// Read a single-line u64 sysfs file. Trims trailing newline,
168/// parses, defaults to 0 on read or parse failure (matches the
169/// absent-counter contract).
170pub(super) fn read_sysfs_u64(path: &Path) -> u64 {
171 fs::read_to_string(path)
172 .ok()
173 .and_then(|s| s.trim().parse::<u64>().ok())
174 .unwrap_or(0)
175}
176
177/// Read per-cgroup PSI files (`<cgroup>/{cpu,memory,io,irq}.pressure`)
178/// and populate a [`Psi`] bundle. The four files are exposed by
179/// `cgroup_psi_files[]` (`kernel/cgroup/cgroup.c`); the per-cgroup interface
180/// uses the `<resource>.pressure` filename pattern rather than
181/// the host-level `pressure/<resource>` directory layout.
182pub(super) fn read_cgroup_psi_at(cgroup_root: &Path, path: &str) -> Psi {
183 let relative = path.strip_prefix('/').unwrap_or(path);
184 let dir = if relative.is_empty() {
185 cgroup_root.to_path_buf()
186 } else {
187 cgroup_root.join(relative)
188 };
189 Psi {
190 cpu: read_psi_file_at(&dir.join("cpu.pressure")),
191 memory: read_psi_file_at(&dir.join("memory.pressure")),
192 io: read_psi_file_at(&dir.join("io.pressure")),
193 irq: read_psi_file_at(&dir.join("irq.pressure")),
194 }
195}
196
197/// Read one PSI file by path. Absent files or read errors
198/// collapse to a default-zero [`PsiResource`].
199pub(super) fn read_psi_file_at(path: &Path) -> PsiResource {
200 fs::read_to_string(path)
201 .ok()
202 .as_deref()
203 .map(parse_psi)
204 .unwrap_or_default()
205}
206
207impl CtprofSnapshot {
208 /// Load a snapshot from a zstd-compressed JSON file.
209 ///
210 /// Errors propagate via [`anyhow`] with the source path in the
211 /// context chain so a malformed file surfaces an actionable
212 /// message rather than a generic deserialize error. The loader
213 /// does not validate that `threads` is non-empty — an empty
214 /// snapshot is a legitimate edge case (host idle, capture
215 /// filter excluded every thread) and the comparison engine
216 /// handles it by emitting an empty diff.
217 ///
218 /// The decompression step is bounded by
219 /// `MAX_DECOMPRESSED_SNAPSHOT_BYTES` — a payload that
220 /// decompresses past that ceiling surfaces an error rather
221 /// than allocating unbounded memory, guarding against a
222 /// hostile zstd payload (zstd compresses pathologically well
223 /// on repeated bytes).
224 pub fn load(path: &std::path::Path) -> anyhow::Result<Self> {
225 use anyhow::Context;
226 let bytes = std::fs::read(path)
227 .with_context(|| format!("read ctprof snapshot from {}", path.display()))?;
228 let json = decompress_capped(&bytes, MAX_DECOMPRESSED_SNAPSHOT_BYTES)
229 .with_context(|| format!("zstd decompress ctprof snapshot {}", path.display()))?;
230 let snap: CtprofSnapshot = serde_json::from_slice(&json).with_context(|| {
231 format!(
232 "parse ctprof snapshot JSON from {} (did the capture format change?)",
233 path.display(),
234 )
235 })?;
236 Ok(snap)
237 }
238
239 /// Write a snapshot as zstd-compressed JSON.
240 ///
241 /// Used by the capture layer; exposed from this module so that
242 /// both compare-side tests and the capture binary share one
243 /// on-disk shape. Compression level `3` mirrors the ktstr
244 /// remote-cache convention — adequate ratio at fast speed —
245 /// and is not tunable because ctprof captures are small
246 /// enough that further compression produces diminishing
247 /// returns on I/O.
248 pub fn write(&self, path: &std::path::Path) -> anyhow::Result<()> {
249 use anyhow::Context;
250 let json = serde_json::to_vec(self).context("serialize ctprof snapshot to JSON")?;
251 let compressed =
252 zstd::encode_all(json.as_slice(), 3).context("zstd compress ctprof snapshot")?;
253 std::fs::write(path, compressed)
254 .with_context(|| format!("write ctprof snapshot to {}", path.display()))?;
255 Ok(())
256 }
257}
258
259/// Decompress a zstd payload into a `Vec<u8>` capped at
260/// `max_decompressed` bytes — bombing out with an error if the
261/// payload would expand past the ceiling. Reads through
262/// `Read::take(cap + 1)` so a payload that decompresses to
263/// exactly `cap` bytes is accepted while one that produces
264/// `cap + 1` bytes (or more) is rejected — the +1 sentinel
265/// distinguishes "EOF coincided with the cap" from "more data
266/// behind the cap".
267pub(super) fn decompress_capped(bytes: &[u8], max_decompressed: u64) -> anyhow::Result<Vec<u8>> {
268 use std::io::Read;
269 let decoder = zstd::stream::read::Decoder::new(bytes)?;
270 let mut out = Vec::new();
271 decoder
272 .take(max_decompressed.saturating_add(1))
273 .read_to_end(&mut out)?;
274 if out.len() as u64 > max_decompressed {
275 anyhow::bail!(
276 "zstd-decompressed payload exceeds the {}-byte cap (decompression-bomb guard)",
277 max_decompressed,
278 );
279 }
280 Ok(out)
281}
282
283// ---------------------------------------------------------------
284// Capture layer: procfs readers + host walk.
285// ---------------------------------------------------------------
286
/// Canonical file extension for a serialized snapshot.
///
/// The value is the two-part suffix without a leading dot.
///
/// `dead_code` allow: no production code references this
/// constant — the only reference is a roundtrip test.
/// [`write`](CtprofSnapshot::write) and
/// [`CtprofSnapshot::load`] take a caller-supplied path and
/// neither constructs the extension (the CLI accepts any path
/// the operator supplies and the renderer reads via
/// [`CtprofSnapshot::load`]). Kept as a named constant so a future
/// caller that needs to construct paths from scratch has the
/// canonical token available without re-typing the literal.
#[allow(dead_code)]
pub const SNAPSHOT_EXTENSION: &str = "ctprof.zst";

/// Decompressed-size ceiling for [`CtprofSnapshot::load`].
/// Bounds the allocation a malicious or corrupted zstd payload
/// can force, since zstd compresses pathologically well on
/// repeated bytes (a few-KiB compressed blob can decompress to
/// gigabytes). 256 MiB covers any realistic production snapshot
/// (typical hosts run 1K-100K live threads) while bounding
/// worst-case allocation against hostile zstd payloads.
/// Public so a downstream consumer can size buffers against the
/// same ceiling without hardcoding the value.
///
/// A payload of exactly this many bytes is still accepted; the
/// loader rejects only output that exceeds it.
pub const MAX_DECOMPRESSED_SNAPSHOT_BYTES: u64 = 256 * 1024 * 1024;

/// Default procfs root on Linux. The `_at` readers accept any
/// `&Path` so tests stage a synthetic tree under a tempdir; the
/// public readers delegate to those with this default.
pub const DEFAULT_PROC_ROOT: &str = "/proc";

/// Default cgroup v2 mount point. The cgroup readers in this
/// file (e.g. `read_cgroup_psi_at`) take the root as a `&Path`
/// argument rather than reading this constant themselves.
pub const DEFAULT_CGROUP_ROOT: &str = "/sys/fs/cgroup";

/// Default sysfs root. Tests pass a tempdir so they don't read
/// the live `/sys` tree (which would produce nondeterministic
/// `sched_ext` state depending on the host kernel config). The
/// public capture entry points pass this constant to read the
/// real sysfs tree at runtime.
pub const DEFAULT_SYS_ROOT: &str = "/sys";
326
327pub(super) fn task_file(proc_root: &Path, tgid: i32, tid: i32, leaf: &str) -> PathBuf {
328 proc_root
329 .join(tgid.to_string())
330 .join("task")
331 .join(tid.to_string())
332 .join(leaf)
333}
334
335pub(super) fn proc_file(proc_root: &Path, tgid: i32, leaf: &str) -> PathBuf {
336 proc_root.join(tgid.to_string()).join(leaf)
337}
338
339/// Map a numeric scheduling policy (as it appears in
340/// `/proc/<tgid>/task/<tid>/stat` field 41) to the canonical
341/// kernel identifier string. Unknown integers render as
342/// `"SCHED_UNKNOWN(<n>)"` rather than dropping the value so
343/// diff output still surfaces a novel policy from a future
344/// kernel.
345pub(super) fn policy_name(policy: i32) -> String {
346 match policy {
347 libc::SCHED_OTHER => "SCHED_OTHER".to_string(),
348 libc::SCHED_FIFO => "SCHED_FIFO".to_string(),
349 libc::SCHED_RR => "SCHED_RR".to_string(),
350 libc::SCHED_BATCH => "SCHED_BATCH".to_string(),
351 libc::SCHED_IDLE => "SCHED_IDLE".to_string(),
352 // `SCHED_DEADLINE` = 6, `SCHED_EXT` = 7 — neither is
353 // exposed by the libc crate as of this writing; use the
354 // kernel-canonical numeric codes.
355 6 => "SCHED_DEADLINE".to_string(),
356 7 => "SCHED_EXT".to_string(),
357 other => format!("SCHED_UNKNOWN({other})"),
358 }
359}
360
361/// Enumerate every numeric directory under the procfs root
362/// (live tgids). Returns sorted ids so snapshot ordering is
363/// deterministic. Empty vec on read failure.
364pub(super) fn iter_tgids_at(proc_root: &Path) -> Vec<i32> {
365 let Ok(entries) = fs::read_dir(proc_root) else {
366 return Vec::new();
367 };
368 let mut tgids: Vec<i32> = entries
369 .filter_map(|e| e.ok())
370 .filter_map(|e| e.file_name().to_str().and_then(|s| s.parse::<i32>().ok()))
371 .filter(|&p| p > 0)
372 .collect();
373 tgids.sort_unstable();
374 tgids
375}
376
377/// Enumerate tids under `<proc_root>/<tgid>/task`. Empty vec on
378/// read failure (process exited between enumeration and this
379/// call).
380pub(super) fn iter_task_ids_at(proc_root: &Path, tgid: i32) -> Vec<i32> {
381 let path = proc_root.join(tgid.to_string()).join("task");
382 let Ok(entries) = fs::read_dir(&path) else {
383 return Vec::new();
384 };
385 let mut tids: Vec<i32> = entries
386 .filter_map(|e| e.ok())
387 .filter_map(|e| e.file_name().to_str().and_then(|s| s.parse::<i32>().ok()))
388 .filter(|&t| t > 0)
389 .collect();
390 tids.sort_unstable();
391 tids
392}
393
394/// Read `<proc_root>/<tgid>/comm` trimmed. `None` on read
395/// failure or empty content.
396pub(super) fn read_process_comm_at(proc_root: &Path, tgid: i32) -> Option<String> {
397 let raw = fs::read_to_string(proc_file(proc_root, tgid, "comm")).ok()?;
398 let trimmed = raw.trim();
399 if trimmed.is_empty() {
400 None
401 } else {
402 Some(trimmed.to_string())
403 }
404}
405
406/// Read `<proc_root>/<tgid>/task/<tid>/comm` trimmed. `None`
407/// on read failure or empty content.
408pub(super) fn read_thread_comm_at(proc_root: &Path, tgid: i32, tid: i32) -> Option<String> {
409 let raw = fs::read_to_string(task_file(proc_root, tgid, tid, "comm")).ok()?;
410 let trimmed = raw.trim();
411 if trimmed.is_empty() {
412 None
413 } else {
414 Some(trimmed.to_string())
415 }
416}
417
418/// Selected fields parsed out of `/proc/<tgid>/task/<tid>/stat`.
419#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
420pub(super) struct StatFields {
421 pub(super) minflt: Option<u64>,
422 pub(super) majflt: Option<u64>,
423 pub(super) utime_clock_ticks: Option<u64>,
424 pub(super) stime_clock_ticks: Option<u64>,
425 /// Field 18: kernel-internal priority (signed, distinct
426 /// from `nice`). `seq_put_decimal_ll(m, " ", priority)` in
427 /// `do_task_stat()` (`fs/proc/array.c`); the value is the post-bias
428 /// scheduler priority (`task_prio(task)`).
429 pub(super) priority: Option<i32>,
430 pub(super) nice: Option<i32>,
431 pub(super) start_time_clock_ticks: Option<u64>,
432 pub(super) processor: Option<i32>,
433 /// Field 40: real-time priority. `seq_put_decimal_ull(m,
434 /// " ", task->rt_priority)` in `do_task_stat()` (`fs/proc/array.c`).
435 /// Stored as `u32` to match `unsigned int
436 /// task_struct::rt_priority` from `include/linux/sched.h`;
437 /// non-zero only when the task runs SCHED_FIFO / SCHED_RR.
438 pub(super) rt_priority: Option<u32>,
439 pub(super) policy: Option<i32>,
440}
441
442/// Pure parser for `/proc/<tgid>/task/<tid>/stat`. Per `proc(5)`,
443/// field 2 (`comm`) is wrapped in parens and may contain
444/// whitespace or `)`; every later field is indexed relative to
445/// the LAST `)` in the line. Tail offsets (0-indexed from the
446/// token past the final `)`):
447///
448/// | field | name | tail index |
449/// |-------|-----------------------|------------|
450/// | 10 | minflt | 7 |
451/// | 12 | majflt | 9 |
452/// | 14 | utime | 11 |
453/// | 15 | stime | 12 |
454/// | 18 | priority | 15 |
455/// | 19 | nice | 16 |
456/// | 22 | starttime | 19 |
457/// | 39 | processor | 36 |
458/// | 40 | rt_priority | 37 |
459/// | 41 | policy | 38 |
460///
461/// Field 42 (`delayacct_blkio_ticks`) is intentionally NOT
462/// parsed — `blkio_delay_total_ns` from the taskstats genetlink
463/// path supersedes it (ns precision vs USER_HZ ticks; both gated
464/// by `CONFIG_TASK_DELAY_ACCT`, but the netlink path delivers
465/// the same data without the procfs USER_HZ truncation).
466///
467/// Missing fields return `None` individually so a short line
468/// (tid exited mid-read, stat truncated) degrades gracefully.
469pub(super) fn parse_stat(raw: &str) -> StatFields {
470 let Some(line) = raw.lines().next() else {
471 return StatFields::default();
472 };
473 let Some(last_close) = line.rfind(')') else {
474 return StatFields::default();
475 };
476 let Some(tail) = line.get(last_close + 1..) else {
477 return StatFields::default();
478 };
479 let parts: Vec<&str> = tail.split_ascii_whitespace().collect();
480 let get_u64 = |idx: usize| parts.get(idx).and_then(|s| s.parse::<u64>().ok());
481 let get_u32 = |idx: usize| parts.get(idx).and_then(|s| s.parse::<u32>().ok());
482 let get_i32 = |idx: usize| parts.get(idx).and_then(|s| s.parse::<i32>().ok());
483 StatFields {
484 minflt: get_u64(7),
485 majflt: get_u64(9),
486 utime_clock_ticks: get_u64(11),
487 stime_clock_ticks: get_u64(12),
488 priority: get_i32(15),
489 nice: get_i32(16),
490 start_time_clock_ticks: get_u64(19),
491 processor: get_i32(36),
492 rt_priority: get_u32(37),
493 policy: get_i32(38),
494 }
495}
496
497/// Read `<proc_root>/<tgid>/task/<tid>/stat` and parse fields.
498/// Records a `"stat"` failure into `tally` on read error so the
499/// per-snapshot [`CtprofParseSummary`] surfaces the dominant
500/// procfs read-failure category. `tally: &mut None` skips the
501/// recording (the synthetic-tree test pattern).
502pub(super) fn read_stat_at_with_tally(
503 proc_root: &Path,
504 tgid: i32,
505 tid: i32,
506 tally: &mut Option<&mut ParseTally>,
507) -> StatFields {
508 match fs::read_to_string(task_file(proc_root, tgid, tid, "stat")) {
509 Ok(raw) => parse_stat(&raw),
510 Err(_) => {
511 if let Some(t) = tally.as_mut() {
512 t.record_failure("stat");
513 }
514 StatFields::default()
515 }
516 }
517}
518
519/// Parse the three leading u64 fields from a single-line
520/// `/proc/<tgid>/task/<tid>/schedstat` — `(run_time_ns,
521/// wait_time_ns, timeslices)`. Missing fields drop individually.
522pub(super) fn parse_schedstat(raw: &str) -> (Option<u64>, Option<u64>, Option<u64>) {
523 let Some(line) = raw.lines().next() else {
524 return (None, None, None);
525 };
526 let mut parts = line.split_ascii_whitespace();
527 let run = parts.next().and_then(|s| s.parse::<u64>().ok());
528 let wait = parts.next().and_then(|s| s.parse::<u64>().ok());
529 let slices = parts.next().and_then(|s| s.parse::<u64>().ok());
530 (run, wait, slices)
531}
532
533/// Read `<proc_root>/<tgid>/task/<tid>/schedstat`. Three-tuple
534/// of `Option<u64>` — when `CONFIG_SCHED_INFO` is off the proc
535/// entry is absent (the registration is `#ifdef CONFIG_SCHED_INFO`
536/// in `fs/proc/base.c`), so the read fails with ENOENT and yields
537/// all-`None`; the kernel's "0 0 0" zero-fill branch is dead code
538/// for this file (it compiles only when `CONFIG_SCHED_INFO` is on).
539/// `CONFIG_SCHEDSTATS` selects `CONFIG_SCHED_INFO`, so a SCHEDSTATS
540/// kernel always has the file; SCHED_INFO is the minimal gate.
541/// Records a `"schedstat"` failure on read error when a tally is
542/// supplied.
543pub(super) fn read_schedstat_at_with_tally(
544 proc_root: &Path,
545 tgid: i32,
546 tid: i32,
547 tally: &mut Option<&mut ParseTally>,
548) -> (Option<u64>, Option<u64>, Option<u64>) {
549 match fs::read_to_string(task_file(proc_root, tgid, tid, "schedstat")) {
550 Ok(raw) => parse_schedstat(&raw),
551 Err(_) => {
552 if let Some(t) = tally.as_mut() {
553 t.record_failure("schedstat");
554 }
555 (None, None, None)
556 }
557 }
558}
559
560#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
561pub(super) struct IoFields {
562 pub(super) rchar: Option<u64>,
563 pub(super) wchar: Option<u64>,
564 pub(super) syscr: Option<u64>,
565 pub(super) syscw: Option<u64>,
566 pub(super) read_bytes: Option<u64>,
567 pub(super) write_bytes: Option<u64>,
568 pub(super) cancelled_write_bytes: Option<u64>,
569}
570
571/// Parse `/proc/<tgid>/task/<tid>/io` (line-oriented
572/// `key: value` format).
573pub(super) fn parse_io(raw: &str) -> IoFields {
574 let mut out = IoFields::default();
575 for line in raw.lines() {
576 let Some((key, value)) = line.split_once(':') else {
577 continue;
578 };
579 let parsed = value.trim().parse::<u64>().ok();
580 match key.trim() {
581 "rchar" => out.rchar = parsed,
582 "wchar" => out.wchar = parsed,
583 "syscr" => out.syscr = parsed,
584 "syscw" => out.syscw = parsed,
585 "read_bytes" => out.read_bytes = parsed,
586 "write_bytes" => out.write_bytes = parsed,
587 "cancelled_write_bytes" => out.cancelled_write_bytes = parsed,
588 _ => {}
589 }
590 }
591 out
592}
593
594/// Read `<proc_root>/<tgid>/task/<tid>/io` and parse fields.
595/// Records an `"io"` failure into `tally` on read error (kernel
596/// without `CONFIG_TASK_IO_ACCOUNTING` or per-tid race).
597pub(super) fn read_io_at_with_tally(
598 proc_root: &Path,
599 tgid: i32,
600 tid: i32,
601 tally: &mut Option<&mut ParseTally>,
602) -> IoFields {
603 match fs::read_to_string(task_file(proc_root, tgid, tid, "io")) {
604 Ok(raw) => parse_io(&raw),
605 Err(_) => {
606 if let Some(t) = tally.as_mut() {
607 t.record_failure("io");
608 }
609 IoFields::default()
610 }
611 }
612}
613
/// Selected fields parsed out of
/// `/proc/<tgid>/task/<tid>/status` by `parse_status`. Every
/// member defaults to `None` and stays that way when its line is
/// absent from the file.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub(super) struct StatusFields {
    /// `voluntary_ctxt_switches:` value. `None` when the line is
    /// absent or its value does not parse as `u64`.
    pub(super) voluntary_csw: Option<u64>,
    /// `nonvoluntary_ctxt_switches:` value. `None` when the line
    /// is absent or its value does not parse as `u64`.
    pub(super) nonvoluntary_csw: Option<u64>,
    /// First non-whitespace character of the `State:` line value.
    /// Real kernel chars are `R` / `S` / `D` / `T` / `t` / `X` /
    /// `Z` / `P` / `I` (see `fs/proc/array.c::task_state_array`).
    /// `None` when the line is absent or blank — the capture site
    /// collapses to `'~'` (via `default_state_char`) which sorts
    /// strictly after every real kernel char in lex order, so
    /// the [`crate::ctprof_compare::AggRule::ModeChar`]
    /// lex-smallest-wins tiebreak picks a real letter when one
    /// is present.
    pub(super) state: Option<char>,
    /// `Cpus_allowed_list:` as a parsed sorted vec. Kept separate
    /// from the `sched_getaffinity` reader because status-file
    /// reads attribute to the target task without a syscall
    /// round-trip — useful when the caller cannot hold a pid
    /// long enough for the syscall without a race.
    pub(super) cpus_allowed: Option<Vec<u32>>,
    /// `Threads:` value — `signal_struct->nr_threads` snapshot
    /// per `task_sig()` (`fs/proc/array.c`). Identical across every thread
    /// of the same tgid. The capture site dedups by populating
    /// [`ThreadState::nr_threads`] only on tid == tgid threads
    /// (see `capture_thread_at_with_tally`).
    pub(super) nr_threads: Option<u64>,
}
641
642pub(super) fn parse_status(raw: &str) -> StatusFields {
643 let mut out = StatusFields::default();
644 for line in raw.lines() {
645 let Some((key, value)) = line.split_once(':') else {
646 continue;
647 };
648 let value = value.trim();
649 match key.trim() {
650 // Kernel emits `State:\t<C> (<long>)` where <C> is the
651 // single-letter code from `task_state_array`
652 // (R/S/D/T/t/X/Z/P/I — nine codes, including the
653 // off-by-default `P` parked state). First non-whitespace
654 // char of the trimmed value is the letter;
655 // `value.chars().next()` produces `None` only on a truly
656 // empty line (which the split_once guards against
657 // already).
658 "State" => {
659 out.state = value.chars().next();
660 }
661 "voluntary_ctxt_switches" => {
662 out.voluntary_csw = value.parse::<u64>().ok();
663 }
664 "nonvoluntary_ctxt_switches" => {
665 out.nonvoluntary_csw = value.parse::<u64>().ok();
666 }
667 "Cpus_allowed_list" => {
668 out.cpus_allowed = crate::cpu_util::parse_cpu_list(value);
669 }
670 // `Threads:\t<num_threads>\n` per
671 // `task_sig()` (`fs/proc/array.c`). Same value across every
672 // thread of the same tgid; capture-side dedup picks
673 // only the leader thread to avoid double-counting.
674 "Threads" => {
675 out.nr_threads = value.parse::<u64>().ok();
676 }
677 _ => {}
678 }
679 }
680 out
681}
682
683/// Read `<proc_root>/<tgid>/task/<tid>/status` and parse fields.
684/// Records a `"status"` failure into `tally` on read error.
685pub(super) fn read_status_at_with_tally(
686 proc_root: &Path,
687 tgid: i32,
688 tid: i32,
689 tally: &mut Option<&mut ParseTally>,
690) -> StatusFields {
691 match fs::read_to_string(task_file(proc_root, tgid, tid, "status")) {
692 Ok(raw) => parse_status(&raw),
693 Err(_) => {
694 if let Some(t) = tally.as_mut() {
695 t.record_failure("status");
696 }
697 StatusFields::default()
698 }
699 }
700}
701
/// Test-only shortcut for reading a thread's cgroup v2 path from
/// `<proc_root>/<tgid>/task/<tid>/cgroup` without a tally.
///
/// Per `cgroup(7)` the file has one line per hierarchy, shaped
/// `hid:controllers:path`. The unified (v2) hierarchy is keyed
/// `0::<path>`; legacy v1 hierarchies that mixed-mode hosts list
/// alongside it are skipped. Returns `None` on read failure or
/// when no v2 line is present.
///
/// Production callers go through [`read_cgroup_at_with_tally`] so
/// per-tid failures surface in `parse_summary`.
#[cfg(test)]
pub(super) fn read_cgroup_at(proc_root: &Path, tgid: i32, tid: i32) -> Option<String> {
    let mut no_tally = None;
    read_cgroup_at_with_tally(proc_root, tgid, tid, &mut no_tally)
}
715
716/// Records a `"cgroup"`
717/// failure on read error (file absent — typical when the tid
718/// exited mid-capture).
719pub(super) fn read_cgroup_at_with_tally(
720 proc_root: &Path,
721 tgid: i32,
722 tid: i32,
723 tally: &mut Option<&mut ParseTally>,
724) -> Option<String> {
725 match fs::read_to_string(task_file(proc_root, tgid, tid, "cgroup")) {
726 Ok(raw) => parse_cgroup_v2(&raw),
727 Err(_) => {
728 if let Some(t) = tally.as_mut() {
729 t.record_failure("cgroup");
730 }
731 None
732 }
733 }
734}
735
736pub(super) fn parse_cgroup_v2(raw: &str) -> Option<String> {
737 for line in raw.lines() {
738 if let Some(rest) = line.strip_prefix("0::") {
739 let trimmed = rest.trim();
740 if !trimmed.is_empty() {
741 return Some(trimmed.to_string());
742 }
743 }
744 }
745 None
746}
747
/// Per-thread scheduler counters taken from `/proc/<tid>/sched`
/// (the documented members below name that file as their source).
/// Every member is optional and defaults to `None`.
///
/// NOTE(review): the code that fills this struct is outside this
/// view — presumably `parse_sched`; confirm there which keys feed
/// the undocumented members and in which units.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub(super) struct SchedFields {
    pub(super) nr_wakeups: Option<u64>,
    pub(super) nr_wakeups_local: Option<u64>,
    pub(super) nr_wakeups_remote: Option<u64>,
    pub(super) nr_wakeups_sync: Option<u64>,
    pub(super) nr_wakeups_migrate: Option<u64>,
    pub(super) nr_wakeups_affine: Option<u64>,
    pub(super) nr_wakeups_affine_attempts: Option<u64>,
    pub(super) nr_migrations: Option<u64>,
    pub(super) nr_forced_migrations: Option<u64>,
    pub(super) nr_failed_migrations_affine: Option<u64>,
    pub(super) nr_failed_migrations_running: Option<u64>,
    pub(super) nr_failed_migrations_hot: Option<u64>,
    pub(super) wait_sum: Option<u64>,
    pub(super) wait_count: Option<u64>,
    pub(super) wait_max: Option<u64>,
    pub(super) sleep_sum: Option<u64>,
    pub(super) sleep_max: Option<u64>,
    pub(super) block_sum: Option<u64>,
    pub(super) block_max: Option<u64>,
    pub(super) iowait_sum: Option<u64>,
    pub(super) iowait_count: Option<u64>,
    pub(super) exec_max: Option<u64>,
    pub(super) slice_max: Option<u64>,
    /// `core_forceidle_sum` from `/proc/<tid>/sched`, emitted via
    /// `PN_SCHEDSTAT(core_forceidle_sum)` in
    /// `proc_sched_show_task()` (`kernel/sched/debug.c`), build-gated on
    /// `CONFIG_SCHED_CORE`. Emission additionally lives inside
    /// the `if (schedstat_enabled())` block of the same function,
    /// so on a host with schedstat off at runtime the line is
    /// absent and the parser arm never fires — leaving the field
    /// at `None`.
    /// Dotted ms.ns format like the other PN_SCHEDSTAT fields —
    /// reconstructed to full ns via [`parsed_ns_from_dotted`]. Counts
    /// time the task forced its SMT sibling idle for core-scheduling.
    /// `None` on kernels without `CONFIG_SCHED_CORE`, on hosts
    /// with schedstat disabled at runtime, or for tasks whose
    /// SMT cohort never accumulated forceidle.
    pub(super) core_forceidle_sum: Option<u64>,
    /// `se.slice` from `/proc/<tid>/sched`, emitted via
    /// `P(se.slice)` in `proc_sched_show_task()` (`kernel/sched/debug.c`). Plain
    /// `%lld` integer (NOT dotted ns; the `P` macro uses
    /// `%lld`, not `PN`'s `%lld.%06ld`). Per-thread
    /// `p->se.slice` in nanoseconds. For fair-class tasks
    /// (SCHED_NORMAL / SCHED_BATCH) it is the instantaneous
    /// slice CFS is currently running the task with; for
    /// SCHED_EXT tasks it reflects stale `p->se.slice` state
    /// because ext-class schedulers maintain slice in
    /// `p->scx.slice` and do not refresh `p->se.slice`. The
    /// kernel emits this line ONLY when `fair_policy(p->policy)`
    /// holds, which (per `normal_policy()`/`fair_policy()` (`kernel/sched/sched.h`)) is
    /// true for SCHED_NORMAL, SCHED_BATCH, AND — under
    /// `CONFIG_SCHED_CLASS_EXT` — SCHED_EXT. `None` for
    /// SCHED_DEADLINE / SCHED_RR / SCHED_FIFO / SCHED_IDLE.
    pub(super) fair_slice_ns: Option<u64>,
    // NOTE(review): no source line in view establishes which
    // `/proc/<tid>/sched` key populates `ext_enabled` — confirm
    // against the parser before documenting its semantics.
    pub(super) ext_enabled: Option<bool>,
}
806
/// Failure classification returned by [`parsed_ns_from_dotted`].
/// The two variants separate the failure modes a caller may want
/// to treat differently:
///
/// - [`Self::Negative`] — the kernel emitted a value with a
///   leading `-` (observable on clock-skew / suspend-resume
///   hosts). It is counted into
///   [`CtprofParseSummary::negative_dotted_values`] so an operator
///   can see that the snapshot's schedstat values are routinely
///   negative-and-zeroed.
/// - [`Self::Malformed`] — every other failure (non-numeric,
///   empty, overflow). It stays silent: the data source is
///   ill-formed in a way the operator can't act on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum ParseDottedNs {
    /// Trimmed input started with `-` — the kernel's PN_SCHEDSTAT
    /// `%Ld.%06ld` format emitted a negative integer part. The
    /// `parse::<u64>()` rejection is by design (u64 cannot
    /// represent the sign) but the SIGNAL is meaningful: a
    /// negative schedstat field is rare and worth surfacing
    /// rather than silently zeroing.
    ///
    /// Note: `-0.000000` would also route here, but is
    /// unreachable from real kernel output — `SPLIT_NS(0)`
    /// yields `(0, 0)` which `%Ld.%06ld` formats as
    /// `0.000000` with no leading sign. The parser still
    /// classifies the unreachable shape as `Negative` rather
    /// than special-casing it; a fixture that synthesizes
    /// `-0.000000` directly will land in this variant.
    Negative,
    /// Otherwise unparseable: non-numeric integer or fractional
    /// part, empty input, or u64 overflow on the
    /// `ms * 1_000_000 + ns_remainder` reconstruction.
    Malformed,
}
839
840/// Parse a `PN_SCHEDSTAT`-emitted dotted nanosecond value into
841/// full ns. The kernel formats schedstat fractional fields via
842/// `__PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS(F))`
843/// where `SPLIT_NS(x) = (x / 1_000_000, x % 1_000_000)` — the
844/// integer part is MILLISECONDS, the 6-digit fractional part is
845/// the NANOSECOND remainder within a millisecond. Reconstructing
846/// the original ns value is `ms * 1_000_000 + ns_remainder`.
847///
848/// Tolerates fractional widths other than 6 (some test fixtures
849/// emit `5000.25` or `7.999`) by zero-padding the right side
850/// before parsing — `.25` becomes `.250000` (=250_000 ns), `.999`
851/// becomes `.999000` (=999_000 ns). Truncates fractional widths
852/// >6 to the first 6 digits.
853///
854/// Returns `Err(ParseDottedNs::Negative)` when EITHER:
855/// - the trimmed integer part starts with `-` (kernel emitted
856/// `-5.000000` for a magnitude ≥ 1ms negative SPLIT_NS: for
857/// x < 0, `nsec_high` returns `-nsec` — a negative long long
858/// — so `%14Ld` on the integer side carries the sign), OR
859/// - the trimmed fractional part starts with `-`. The kernel
860/// never emits this shape: `nsec_low` (kernel/sched/debug.c)
861/// negates via `nsec = -nsec` before `do_div` and returns an
862/// unsigned `long`, so `%06ld` on the fractional side is
863/// always positive and a sub-millisecond negative prints as
864/// `0.000500` — indistinguishable from a genuine positive.
865/// This fractional-side check therefore catches no real
866/// kernel output; it defends only against synthetic/fixture
867/// input that injects a `0.-NNNNNN` shape.
868///
869/// The caller records the bump in the per-snapshot
870/// [`CtprofParseSummary::negative_dotted_values`] before
871/// folding to zero. Returns `Err(ParseDottedNs::Malformed)`
872/// for any other parse failure (non-numeric, empty, overflow);
873/// the caller folds to zero silently per the best-effort capture
874/// contract.
875///
876/// The bare-integer (no dot) branch parses the value as raw ns
877/// — used for test fixtures and graceful degradation; the
878/// kernel's PN_SCHEDSTAT format always emits the dotted form.
879/// Same negative-vs-malformed split applies to the bare-integer
880/// branch so a stray bare-integer negative is also tallied.
881pub(super) fn parsed_ns_from_dotted(value: &str) -> Result<u64, ParseDottedNs> {
882 if let Some((ms_str, ns_str)) = value.split_once('.') {
883 let ms_trimmed = ms_str.trim();
884 if ms_trimmed.starts_with('-') {
885 return Err(ParseDottedNs::Negative);
886 }
887 // Fractional-side negative: the kernel never emits this
888 // shape. `nsec_low` (kernel/sched/debug.c) does
889 // `nsec = -nsec` before `do_div` and returns an unsigned
890 // `long`, so `%06ld` on the fractional side is always
891 // positive — a sub-millisecond negative prints as
892 // `0.000500`, not `0.-000500`. This check defends only
893 // against synthetic/fixture input that injects a
894 // sign-only fractional. Check it BEFORE the
895 // chars().take(6) truncation would otherwise swallow a
896 // sign-only fractional like `-`.
897 if ns_str.trim_start().starts_with('-') {
898 return Err(ParseDottedNs::Negative);
899 }
900 let ms = ms_trimmed
901 .parse::<u64>()
902 .map_err(|_| ParseDottedNs::Malformed)?;
903 let ns_part: String = ns_str.chars().take(6).collect();
904 let padded = format!("{:0<6}", ns_part);
905 let ns = padded
906 .parse::<u64>()
907 .map_err(|_| ParseDottedNs::Malformed)?;
908 ms.checked_mul(1_000_000)
909 .and_then(|x| x.checked_add(ns))
910 .ok_or(ParseDottedNs::Malformed)
911 } else {
912 let trimmed = value.trim();
913 if trimmed.starts_with('-') {
914 return Err(ParseDottedNs::Negative);
915 }
916 trimmed.parse::<u64>().map_err(|_| ParseDottedNs::Malformed)
917 }
918}
919
920/// Parse `/proc/<tgid>/task/<tid>/sched`. The file is registered
921/// unconditionally (always present); the schedstat-prefixed fields
922/// this reads are emitted only under `CONFIG_SCHEDSTATS` (the
923/// kernel's `if (schedstat_enabled())` block) and are absent
924/// otherwise. Format is many lines of `key : value`
925/// where the key is dot-delimited (`se.statistics.nr_wakeups`);
926/// different kernel versions use `se.statistics.`, `stats.`,
927/// or bare names. The reader matches on the LAST dot-delimited
928/// segment to absorb that variation.
929///
930/// PN_SCHEDSTAT fields (`wait_sum`, `sum_sleep_runtime`,
931/// `sum_block_runtime`, `iowait_sum`) emit a `ms.ns_remainder`
932/// dotted format — reconstructed to full ns via
933/// [`parsed_ns_from_dotted`]. P_SCHEDSTAT fields
934/// (`wait_count`, `iowait_count`, `nr_wakeups*`,
935/// `nr_migrations`) emit plain integers — parsed as `u64`.
936///
937/// `tally`, when supplied, records each negative dotted-ns parse
938/// outcome via [`ParseTally::record_negative_dotted`] so the
939/// per-snapshot summary surfaces the rate at which schedstat
940/// fields were silently zeroed. `&mut None` skips the recording —
941/// the synthetic-tree test path that doesn't carry a tally.
942pub(super) fn parse_sched(raw: &str, tally: &mut Option<&mut ParseTally>) -> SchedFields {
943 let mut out = SchedFields::default();
944 let mut parse_dotted = |value: &str| -> Option<u64> {
945 match parsed_ns_from_dotted(value) {
946 Ok(v) => Some(v),
947 Err(ParseDottedNs::Negative) => {
948 if let Some(t) = tally.as_mut() {
949 t.record_negative_dotted();
950 }
951 None
952 }
953 Err(ParseDottedNs::Malformed) => None,
954 }
955 };
956 for line in raw.lines() {
957 let Some((key, value)) = line.split_once(':') else {
958 continue;
959 };
960 let key = key.trim();
961 let value = value.trim();
962 let parsed_u64 = || value.parse::<u64>().ok();
963 // `ext.enabled` is the only key the kernel emits with a
964 // literal dot in the variable name (every other dot is a
965 // namespace prefix like `se.statistics.`). Match on the full
966 // key BEFORE the rsplit-on-dot fallback so a future kernel
967 // line ending in `.enabled` cannot collide.
968 if key == "ext.enabled" {
969 out.ext_enabled = value.parse::<u64>().ok().map(|n| n != 0);
970 continue;
971 }
972 let short = key.rsplit('.').next().unwrap_or(key);
973 match short {
974 "nr_wakeups" => out.nr_wakeups = parsed_u64(),
975 "nr_wakeups_local" => out.nr_wakeups_local = parsed_u64(),
976 "nr_wakeups_remote" => out.nr_wakeups_remote = parsed_u64(),
977 "nr_wakeups_sync" => out.nr_wakeups_sync = parsed_u64(),
978 "nr_wakeups_migrate" => out.nr_wakeups_migrate = parsed_u64(),
979 "nr_wakeups_affine" => out.nr_wakeups_affine = parsed_u64(),
980 "nr_wakeups_affine_attempts" => out.nr_wakeups_affine_attempts = parsed_u64(),
981 "nr_migrations" => out.nr_migrations = parsed_u64(),
982 "nr_forced_migrations" => out.nr_forced_migrations = parsed_u64(),
983 "nr_failed_migrations_affine" => out.nr_failed_migrations_affine = parsed_u64(),
984 "nr_failed_migrations_running" => out.nr_failed_migrations_running = parsed_u64(),
985 "nr_failed_migrations_hot" => out.nr_failed_migrations_hot = parsed_u64(),
986 "wait_sum" => out.wait_sum = parse_dotted(value),
987 "wait_count" => out.wait_count = parsed_u64(),
988 "wait_max" => out.wait_max = parse_dotted(value),
989 // Kernel emits `sum_sleep_runtime` (see
990 // `kernel/sched/debug.c` -> `proc_sched_show_task`).
991 // The raw value lands in `SchedFields::sleep_sum`; the
992 // capture site at `capture_thread_at_with_tally`
993 // subtracts `sum_block_runtime` to derive
994 // `ThreadState::voluntary_sleep_ns` — the kernel
995 // double-counts block under sum_sleep_runtime, so the
996 // raw value is not surfaced in ThreadState. The kernel
997 // does not emit a `sleep_count` counterpart;
998 // `nr_wakeups` (matched above) covers the wake-side
999 // event tally.
1000 "sum_sleep_runtime" => out.sleep_sum = parse_dotted(value),
1001 "sleep_max" => out.sleep_max = parse_dotted(value),
1002 // Kernel emits `sum_block_runtime`; the matching
1003 // ThreadState field is `block_sum` for symmetry with
1004 // the other `*_sum` fields. There is no `block_count`
1005 // counterpart from the kernel — the schedstat printout
1006 // pairs `wait_sum/wait_count` and `iowait_sum/iowait_count`
1007 // but `sum_block_runtime` has no per-event counter.
1008 "sum_block_runtime" => out.block_sum = parse_dotted(value),
1009 "block_max" => out.block_max = parse_dotted(value),
1010 "iowait_sum" => out.iowait_sum = parse_dotted(value),
1011 "iowait_count" => out.iowait_count = parsed_u64(),
1012 "exec_max" => out.exec_max = parse_dotted(value),
1013 "slice_max" => out.slice_max = parse_dotted(value),
1014 // PN_SCHEDSTAT dotted ns; CONFIG_SCHED_CORE-gated. Same
1015 // ms.ns reconstruction as wait_sum / block_sum.
1016 "core_forceidle_sum" => out.core_forceidle_sum = parse_dotted(value),
1017 // P plain integer in ns. The kernel emits this only
1018 // for fair-policy tasks (`fair_policy(p->policy)` in
1019 // `proc_sched_show_task()`); for other policies the line is absent
1020 // and `parsed_u64()` collapses to None.
1021 "slice" => out.fair_slice_ns = parsed_u64(),
1022 _ => {}
1023 }
1024 }
1025 out
1026}
1027
1028/// Read `<proc_root>/<tgid>/task/<tid>/sched` and parse fields.
1029/// Records a `"sched"` failure into `tally` on read error, plus
1030/// per-line negative-dotted-value bumps via `parse_sched`.
1031pub(super) fn read_sched_at_with_tally(
1032 proc_root: &Path,
1033 tgid: i32,
1034 tid: i32,
1035 tally: &mut Option<&mut ParseTally>,
1036) -> SchedFields {
1037 match fs::read_to_string(task_file(proc_root, tgid, tid, "sched")) {
1038 Ok(raw) => parse_sched(&raw, tally),
1039 Err(_) => {
1040 if let Some(t) = tally.as_mut() {
1041 t.record_failure("sched");
1042 }
1043 SchedFields::default()
1044 }
1045 }
1046}
1047
1048/// Parse `/proc/<pid>/smaps_rollup` contents into a key→u64-kB
1049/// map. Format per `__show_smap()`
1050/// (`fs/proc/task_mmu.c`): each kv line is
1051/// `<Name>:<whitespace><u64><whitespace>kB`. The kernel ALSO
1052/// emits a `<addr_start>-<addr_end> ---p <off> XX:XX <inode> [rollup]`
1053/// header line (built by `show_vma_header_prefix()` then
1054/// `seq_puts(m, "[rollup]\n")` in `show_smaps_rollup()`). That
1055/// header CONTAINS a `:` (the device-major:minor pair `XX:XX`),
1056/// so a naive `split_once(':')` would mis-extract a junk key
1057/// (the whitespace-laden address range + flags + offset prefix)
1058/// with value 0 (the minor-device integer parses as the first
1059/// whitespace token of the value side). Real smaps_rollup keys
1060/// are single-word identifiers (Rss, Pss, Pss_Dirty, etc.) that
1061/// never contain whitespace or `-`; the address-range header
1062/// always contains both. Reject any line whose pre-`:` segment
1063/// carries either character.
1064///
1065/// Lines whose value field doesn't parse as u64 are silently
1066/// dropped (best-effort, matching the absent-counter contract).
1067pub(super) fn parse_smaps_rollup(raw: &str) -> BTreeMap<String, u64> {
1068 let mut out = BTreeMap::new();
1069 for line in raw.lines() {
1070 let Some((key, value)) = line.split_once(':') else {
1071 continue;
1072 };
1073 let key_trimmed = key.trim();
1074 // Header-line guard: real smaps_rollup keys never
1075 // contain whitespace or `-`. Address-range headers
1076 // (`<addr_start>-<addr_end> ---p <off> XX:XX <inode>
1077 // [rollup]`) carry both.
1078 if key_trimmed.contains(char::is_whitespace) || key_trimmed.contains('-') {
1079 continue;
1080 }
1081 // Value field: whitespace-prefixed integer + " kB" suffix
1082 // (or rarely no suffix on a future kernel addition). The
1083 // first whitespace-token after trimming IS the integer;
1084 // dropping the unit suffix happens for free via
1085 // `split_ascii_whitespace().next()`.
1086 let Some(n_str) = value.split_ascii_whitespace().next() else {
1087 continue;
1088 };
1089 let Ok(n) = n_str.parse::<u64>() else {
1090 continue;
1091 };
1092 out.insert(key_trimmed.to_string(), n);
1093 }
1094 out
1095}
1096
1097/// Read `<proc_root>/<tgid>/task/<tid>/smaps_rollup` for the
1098/// thread leader (tid == tgid) and parse it. Non-leader threads
1099/// short-circuit to an empty map: the underlying mm_struct is
1100/// shared per-tgid, so reading from any thread yields identical
1101/// values, and capturing once per tgid avoids redundant
1102/// per-thread work. Records a `"smaps_rollup"` failure into
1103/// `tally` on read error.
1104///
1105/// Permission semantics: `/proc/<pid>/smaps_rollup` requires
1106/// `CAP_SYS_PTRACE` for processes the caller doesn't own (PID 1
1107/// being the canonical example). Read failure is treated as
1108/// best-effort — empty map, tally bump, no panic. Older kernels
1109/// (pre-4.14) lack the file entirely; same handling.
1110pub(super) fn read_smaps_rollup_at_with_tally(
1111 proc_root: &Path,
1112 tgid: i32,
1113 tid: i32,
1114 tally: &mut Option<&mut ParseTally>,
1115) -> BTreeMap<String, u64> {
1116 if tid != tgid {
1117 // Leader-dedup: non-leader threads share the same
1118 // mm_struct, so the file would yield identical values.
1119 // Skip the read entirely.
1120 return BTreeMap::new();
1121 }
1122 match fs::read_to_string(task_file(proc_root, tgid, tid, "smaps_rollup")) {
1123 Ok(raw) => parse_smaps_rollup(&raw),
1124 Err(_) => {
1125 if let Some(t) = tally.as_mut() {
1126 t.record_failure("smaps_rollup");
1127 }
1128 BTreeMap::new()
1129 }
1130 }
1131}
1132
1133/// Parse cgroup v2 `cpu.stat`. Format is lines of `key value`
1134/// (space-separated, not `key: value`).
1135pub(super) fn parse_cpu_stat(raw: &str) -> (Option<u64>, Option<u64>, Option<u64>) {
1136 let mut usage = None;
1137 let mut throttled = None;
1138 let mut throttled_usec = None;
1139 for line in raw.lines() {
1140 let mut parts = line.split_ascii_whitespace();
1141 let Some(key) = parts.next() else { continue };
1142 let Some(value) = parts.next() else { continue };
1143 let parsed = value.parse::<u64>().ok();
1144 match key {
1145 "usage_usec" => usage = parsed,
1146 "nr_throttled" => throttled = parsed,
1147 "throttled_usec" => throttled_usec = parsed,
1148 _ => {}
1149 }
1150 }
1151 (usage, throttled, throttled_usec)
1152}
1153
1154/// Parse a cgroup v2 key-value file (one `<key> <u64>` per
1155/// line). Used for `memory.stat` and `memory.events`. Lines
1156/// the parser cannot fully decompose into a key + u64 are
1157/// silently skipped — a future kernel that introduces non-u64
1158/// values won't break the parser, just elide the offending key.
1159pub(super) fn parse_kv_counters(raw: &str) -> BTreeMap<String, u64> {
1160 let mut out = BTreeMap::new();
1161 for line in raw.lines() {
1162 let mut parts = line.split_ascii_whitespace();
1163 let Some(key) = parts.next() else { continue };
1164 let Some(value) = parts.next() else { continue };
1165 let Ok(parsed) = value.parse::<u64>() else {
1166 continue;
1167 };
1168 out.insert(key.to_string(), parsed);
1169 }
1170 out
1171}
1172
1173/// Parse a single-line LIMIT cgroup file (e.g. `memory.max`,
1174/// `memory.high`, `pids.max`). The literal token `max` means
1175/// "no limit" and yields `None`; a numeric value yields
1176/// `Some(u64)`. Whitespace-only or malformed input also yields
1177/// `None` (best-effort, matching the absent-counter contract).
1178///
1179/// Caller MUST NOT use this for FLOOR files (`memory.low`,
1180/// `memory.min`) — for floors, the literal token `max` means
1181/// "maximum protection", not "no floor", which is the semantic
1182/// opposite. Use [`parse_floor_value`] there instead.
1183pub(super) fn parse_max_or_u64(raw: &str) -> Option<u64> {
1184 let trimmed = raw.trim();
1185 if trimmed == "max" {
1186 return None;
1187 }
1188 trimmed.parse::<u64>().ok()
1189}
1190
1191/// Parse a single-line FLOOR cgroup file (`memory.low`,
1192/// `memory.min`). The literal token `max` means
1193/// "maximum protection" — yields `Some(u64::MAX)` rather than
1194/// `None`, because FLOORS use `None` to mean "absent file"
1195/// only. A numeric value yields `Some(u64)`; whitespace-only or
1196/// malformed input yields `None` (absent-counter contract).
1197///
1198/// The semantic asymmetry vs. [`parse_max_or_u64`] is critical:
1199/// for limits, "max" is the absence of a cap (collapse to
1200/// `None`); for floors, "max" is a fully-protected floor (it
1201/// must NOT collapse to "no floor"). `merge_min_option` then
1202/// correctly picks `min(u64::MAX, 5G) = 5G` instead of None
1203/// when one contributor has full protection and another has a
1204/// concrete protection.
1205pub(super) fn parse_floor_value(raw: &str) -> Option<u64> {
1206 let trimmed = raw.trim();
1207 if trimmed == "max" {
1208 return Some(u64::MAX);
1209 }
1210 trimmed.parse::<u64>().ok()
1211}
1212
1213/// Parse `cpu.max` (one line, two whitespace-separated tokens:
1214/// `<quota|max> <period>`). Returns `(quota, period)` where
1215/// `quota` is `None` for the literal `max` token (no CFS
1216/// bandwidth cap) and `Some(usec)` otherwise; `period` defaults
1217/// to the kernel default of 100_000 µs when missing or
1218/// malformed.
1219pub(super) fn parse_cpu_max(raw: &str) -> (Option<u64>, u64) {
1220 let mut parts = raw.split_ascii_whitespace();
1221 let quota_token = parts.next();
1222 let period_token = parts.next();
1223 let quota = quota_token.and_then(parse_max_or_u64_str);
1224 let period = period_token
1225 .and_then(|s| s.parse::<u64>().ok())
1226 .unwrap_or(CPU_MAX_DEFAULT_PERIOD_US);
1227 (quota, period)
1228}
1229
1230/// Helper for [`parse_cpu_max`]: route a single token through
1231/// the same `max`-vs-u64 disambiguation as [`parse_max_or_u64`]
1232/// without committing to a string-trimmed input shape.
1233pub(super) fn parse_max_or_u64_str(s: &str) -> Option<u64> {
1234 if s == "max" {
1235 return None;
1236 }
1237 s.parse::<u64>().ok()
1238}
1239
/// Default CFS bandwidth period, in microseconds, used when
/// `cpu.max` is absent or its period token is unreadable: it is
/// the fallback period in both [`parse_cpu_max`] and
/// [`read_cgroup_stats_at`]. Matches the kernel default
/// returned by `default_bw_period_us()`
/// (`kernel/sched/sched.h`); child cgroups inherit this when
/// `cpu.max` is unset.
pub(super) const CPU_MAX_DEFAULT_PERIOD_US: u64 = 100_000;
1246
1247/// Populate a [`CgroupStats`] by reading the cgroup v2 files
1248/// for `path` under `cgroup_root`. Missing files collapse to
1249/// the struct's `Default` (zero / `None` per field semantics) —
1250/// the root cgroup is missing most knob files, and child
1251/// cgroups on hosts without `pids` enabled in
1252/// `cgroup.subtree_control` are also expected to lack
1253/// `pids.{current,max}`.
1254pub(super) fn read_cgroup_stats_at(cgroup_root: &Path, path: &str) -> CgroupStats {
1255 let relative = path.strip_prefix('/').unwrap_or(path);
1256 let dir = if relative.is_empty() {
1257 cgroup_root.to_path_buf()
1258 } else {
1259 cgroup_root.join(relative)
1260 };
1261
1262 let (usage, throttled, throttled_usec) = fs::read_to_string(dir.join("cpu.stat"))
1263 .ok()
1264 .as_deref()
1265 .map(parse_cpu_stat)
1266 .unwrap_or((None, None, None));
1267 let (max_quota_us, max_period_us) = fs::read_to_string(dir.join("cpu.max"))
1268 .ok()
1269 .as_deref()
1270 .map(parse_cpu_max)
1271 .unwrap_or((None, CPU_MAX_DEFAULT_PERIOD_US));
1272 let weight = fs::read_to_string(dir.join("cpu.weight"))
1273 .ok()
1274 .and_then(|s| s.trim().parse::<u64>().ok());
1275 let weight_nice = fs::read_to_string(dir.join("cpu.weight.nice"))
1276 .ok()
1277 .and_then(|s| s.trim().parse::<i32>().ok());
1278
1279 let memory_current = fs::read_to_string(dir.join("memory.current"))
1280 .ok()
1281 .and_then(|s| s.trim().parse::<u64>().ok())
1282 .unwrap_or(0);
1283 let memory_max = fs::read_to_string(dir.join("memory.max"))
1284 .ok()
1285 .as_deref()
1286 .and_then(parse_max_or_u64);
1287 let memory_high = fs::read_to_string(dir.join("memory.high"))
1288 .ok()
1289 .as_deref()
1290 .and_then(parse_max_or_u64);
1291 let memory_low = fs::read_to_string(dir.join("memory.low"))
1292 .ok()
1293 .as_deref()
1294 .and_then(parse_floor_value);
1295 let memory_min = fs::read_to_string(dir.join("memory.min"))
1296 .ok()
1297 .as_deref()
1298 .and_then(parse_floor_value);
1299 let memory_stat = fs::read_to_string(dir.join("memory.stat"))
1300 .ok()
1301 .as_deref()
1302 .map(parse_kv_counters)
1303 .unwrap_or_default();
1304 let memory_events = fs::read_to_string(dir.join("memory.events"))
1305 .ok()
1306 .as_deref()
1307 .map(parse_kv_counters)
1308 .unwrap_or_default();
1309
1310 let pids_current = fs::read_to_string(dir.join("pids.current"))
1311 .ok()
1312 .and_then(|s| s.trim().parse::<u64>().ok());
1313 let pids_max = fs::read_to_string(dir.join("pids.max"))
1314 .ok()
1315 .as_deref()
1316 .and_then(parse_max_or_u64);
1317
1318 let psi = read_cgroup_psi_at(cgroup_root, path);
1319
1320 CgroupStats {
1321 cpu: CgroupCpuStats {
1322 usage_usec: usage.unwrap_or(0),
1323 nr_throttled: throttled.unwrap_or(0),
1324 throttled_usec: throttled_usec.unwrap_or(0),
1325 max_quota_us,
1326 max_period_us,
1327 weight,
1328 weight_nice,
1329 },
1330 memory: CgroupMemoryStats {
1331 current: memory_current,
1332 max: memory_max,
1333 high: memory_high,
1334 low: memory_low,
1335 min: memory_min,
1336 stat: memory_stat,
1337 events: memory_events,
1338 },
1339 pids: CgroupPidsStats {
1340 current: pids_current,
1341 max: pids_max,
1342 },
1343 psi,
1344 }
1345}