ktstr/ctprof/
parse.rs

1//! /proc parsers and tallying readers extracted from
2//! `super::mod.rs`. Holds:
3//! - `parse_psi` / `parse_centi_percent` and the `read_*psi_at`
4//!   helpers that wrap them
5//! - `read_sched_ext_sysfs_at` + `read_sysfs_u64`
6//! - `parse_stat` / `parse_schedstat` / `parse_io` / `parse_status`
7//!   plus their `read_*_at_with_tally` wrappers
8//! - `parse_cgroup_v2` / `read_cgroup_at*`
9//! - `parse_sched` and the `parsed_ns_from_dotted` half-millisecond
10//!   recovery (with the `ParseDottedNs` discriminator)
11//! - `parse_smaps_rollup` / `read_smaps_rollup_at_with_tally`
12//! - `parse_cpu_stat` / `parse_kv_counters` / `parse_max_or_u64*` /
13//!   `parse_floor_value` / `parse_cpu_max`
14//! - `read_cgroup_stats_at` — opens the cgroup v2 files actually
15//!   read by capture: `cpu.stat`, `cpu.max`, `cpu.weight`,
16//!   `cpu.weight.nice`, `memory.current`, `memory.max`,
17//!   `memory.high`, `memory.low`, `memory.min`, `memory.stat`,
18//!   `memory.events`, `pids.current`, `pids.max`, plus the
19//!   `<cgroup>.pressure` PSI files via `read_cgroup_psi_at`.
20//!   `memory.swap.current`, `memory.peak`, `memory.zswap.current`,
//!   `pids.peak`, and `io.stat` are NOT read — the cgroup capture
//!   surface is intentionally limited to the v2 fields that the
//!   comparator schemas and the rest of the pipeline consume.
24//!
25//! ## Visibility
26//!
27//! The parsers and `read_*` helpers are `pub(super)` so the capture
28//! pipeline in `mod.rs` can call them without re-exporting through
29//! the public API. The serialization surface
30//! ([`CtprofSnapshot::load`](super::CtprofSnapshot::load) /
31//! [`write`](super::CtprofSnapshot::write)) and the snapshot
32//! constants stay `pub` on the parent because they ARE part of the
33//! ktstr public API, consumed by `cargo ktstr ctprof compare` and
34//! by the snapshot-loader crate consumers.
35
36use super::*;
37use std::collections::BTreeMap;
38use std::path::Path;
39
40/// Parse one PSI file's contents. The kernel emits one or two
41/// lines (`some` then `full`), each formatted by `seq_printf` in
42/// `psi_show()` (`kernel/sched/psi.c`). Lines are tokenized by whitespace;
43/// each token is `key=value`. Unknown keys are ignored so a
44/// future kernel that adds a 4th avg or new field doesn't break
45/// the parser. Missing fields default to 0 (matching the
46/// absent-counter contract used elsewhere in this module).
47pub(super) fn parse_psi(raw: &str) -> PsiResource {
48    let mut out = PsiResource::default();
49    for line in raw.lines() {
50        let mut tokens = line.split_whitespace();
51        let Some(prefix) = tokens.next() else {
52            continue;
53        };
54        let half = match prefix {
55            "some" => &mut out.some,
56            "full" => &mut out.full,
57            _ => continue,
58        };
59        for tok in tokens {
60            let Some((key, value)) = tok.split_once('=') else {
61                continue;
62            };
63            match key {
64                "avg10" => half.avg10 = parse_centi_percent(value),
65                "avg60" => half.avg60 = parse_centi_percent(value),
66                "avg300" => half.avg300 = parse_centi_percent(value),
67                "total" => half.total_usec = value.parse::<u64>().unwrap_or(0),
68                _ => {}
69            }
70        }
71    }
72    out
73}
74
75/// Convert `"N.NN"` (kernel `%lu.%02lu` format from `psi_show()`)
76/// to `N * 100 + NN` (centi-percent integer). On malformed input
77/// returns 0, matching the absent-counter default contract.
78/// Saturates at u16::MAX to guard against pathological input.
79///
80/// The kernel always emits a 2-digit zero-padded fraction
81/// (`%02lu`), but a robust parser zero-pads its own input to
82/// exactly 2 digits before combining: a stray `"1.5"` (one
83/// fractional digit) must read as `150` (1.50%), not `105`
84/// (1.05%); a stray `"1.501"` (three fractional digits) is
85/// truncated to `1.50` rather than producing
86/// `1*100 + 501 = 601`. Mirrors the
87/// [`parsed_ns_from_dotted`] helper's zero-pad-to-six discipline.
88pub(super) fn parse_centi_percent(s: &str) -> u16 {
89    let (int_part, frac_part) = s.split_once('.').unwrap_or((s, ""));
90    let Ok(int) = int_part.parse::<u32>() else {
91        return 0;
92    };
93    let frac = if frac_part.is_empty() {
94        0
95    } else {
96        // Zero-pad-to-2 then truncate-to-2: "5" → "50", "501" →
97        // "50". Matches the kernel's `%02lu` format width
98        // exactly so a parser-side roundtrip can never under- or
99        // over-count the fractional weight.
100        let padded: String = frac_part
101            .chars()
102            .chain(std::iter::repeat('0'))
103            .take(2)
104            .collect();
105        padded.parse::<u32>().unwrap_or(0)
106    };
107    let combined = int.saturating_mul(100).saturating_add(frac);
108    combined.try_into().unwrap_or(u16::MAX)
109}
110
111/// Read host-level PSI files (`<proc_root>/pressure/{cpu,memory,io,irq}`)
112/// and populate a [`Psi`] bundle. Each file is read independently;
113/// absent files (older kernels missing irq.pressure, or hosts
114/// with CONFIG_PSI off) collapse to the all-zero default per the
115/// absent-counter contract.
116///
117/// PSI readers (this fn, `read_cgroup_psi_at`) and the
118/// `read_sched_ext_sysfs_at` reader deliberately omit the
119/// `ParseTally` argument that the per-thread procfs readers
120/// thread through. Their build-gate signal is presence of the
121/// containing directory (`/proc/pressure/`,
122/// `/sys/kernel/sched_ext/`): an absent directory means the
123/// kernel feature is off, which is a host-property fact rather
124/// than a per-tid parse failure, and the snapshot's all-zero
125/// default already encodes the absence. Threading these
126/// readers into the tally would multiply the failure tally by
127/// the worker count without adding any operator-actionable
128/// signal beyond what the absent fields already convey.
129pub(super) fn read_host_psi_at(proc_root: &Path) -> Psi {
130    let pressure_dir = proc_root.join("pressure");
131    Psi {
132        cpu: read_psi_file_at(&pressure_dir.join("cpu")),
133        memory: read_psi_file_at(&pressure_dir.join("memory")),
134        io: read_psi_file_at(&pressure_dir.join("io")),
135        irq: read_psi_file_at(&pressure_dir.join("irq")),
136    }
137}
138
139/// Read global sched_ext sysfs state from
140/// `<sys_root>/kernel/sched_ext/`. Returns `None` when the
141/// directory itself is absent (CONFIG_SCHED_CLASS_EXT=n
142/// kernels never expose it). Per-file misses default the
143/// affected field to zero / empty string per the
144/// absent-counter contract — a future kernel that adds new
145/// global attrs (and that we haven't surfaced as fields yet)
146/// won't break the parser; old kernels missing one or more of
147/// the existing five collapse cleanly.
148pub(super) fn read_sched_ext_sysfs_at(sys_root: &Path) -> Option<SchedExtSysfs> {
149    let dir = sys_root.join("kernel").join("sched_ext");
150    // No `tally` arg: directory presence (Option<SchedExtSysfs>)
151    // is THE not-built signal; per-attr misses collapse silently
152    // per the absent-counter contract.
153    if !dir.exists() {
154        return None;
155    }
156    Some(SchedExtSysfs {
157        state: fs::read_to_string(dir.join("state"))
158            .map(|s| s.trim().to_string())
159            .unwrap_or_default(),
160        switch_all: read_sysfs_u64(&dir.join("switch_all")),
161        nr_rejected: read_sysfs_u64(&dir.join("nr_rejected")),
162        hotplug_seq: read_sysfs_u64(&dir.join("hotplug_seq")),
163        enable_seq: read_sysfs_u64(&dir.join("enable_seq")),
164    })
165}
166
167/// Read a single-line u64 sysfs file. Trims trailing newline,
168/// parses, defaults to 0 on read or parse failure (matches the
169/// absent-counter contract).
170pub(super) fn read_sysfs_u64(path: &Path) -> u64 {
171    fs::read_to_string(path)
172        .ok()
173        .and_then(|s| s.trim().parse::<u64>().ok())
174        .unwrap_or(0)
175}
176
177/// Read per-cgroup PSI files (`<cgroup>/{cpu,memory,io,irq}.pressure`)
178/// and populate a [`Psi`] bundle. The four files are exposed by
179/// `cgroup_psi_files[]` (`kernel/cgroup/cgroup.c`); the per-cgroup interface
180/// uses the `<resource>.pressure` filename pattern rather than
181/// the host-level `pressure/<resource>` directory layout.
182pub(super) fn read_cgroup_psi_at(cgroup_root: &Path, path: &str) -> Psi {
183    let relative = path.strip_prefix('/').unwrap_or(path);
184    let dir = if relative.is_empty() {
185        cgroup_root.to_path_buf()
186    } else {
187        cgroup_root.join(relative)
188    };
189    Psi {
190        cpu: read_psi_file_at(&dir.join("cpu.pressure")),
191        memory: read_psi_file_at(&dir.join("memory.pressure")),
192        io: read_psi_file_at(&dir.join("io.pressure")),
193        irq: read_psi_file_at(&dir.join("irq.pressure")),
194    }
195}
196
197/// Read one PSI file by path. Absent files or read errors
198/// collapse to a default-zero [`PsiResource`].
199pub(super) fn read_psi_file_at(path: &Path) -> PsiResource {
200    fs::read_to_string(path)
201        .ok()
202        .as_deref()
203        .map(parse_psi)
204        .unwrap_or_default()
205}
206
207impl CtprofSnapshot {
208    /// Load a snapshot from a zstd-compressed JSON file.
209    ///
210    /// Errors propagate via [`anyhow`] with the source path in the
211    /// context chain so a malformed file surfaces an actionable
212    /// message rather than a generic deserialize error. The loader
213    /// does not validate that `threads` is non-empty — an empty
214    /// snapshot is a legitimate edge case (host idle, capture
215    /// filter excluded every thread) and the comparison engine
216    /// handles it by emitting an empty diff.
217    ///
218    /// The decompression step is bounded by
219    /// `MAX_DECOMPRESSED_SNAPSHOT_BYTES` — a payload that
220    /// decompresses past that ceiling surfaces an error rather
221    /// than allocating unbounded memory, guarding against a
222    /// hostile zstd payload (zstd compresses pathologically well
223    /// on repeated bytes).
224    pub fn load(path: &std::path::Path) -> anyhow::Result<Self> {
225        use anyhow::Context;
226        let bytes = std::fs::read(path)
227            .with_context(|| format!("read ctprof snapshot from {}", path.display()))?;
228        let json = decompress_capped(&bytes, MAX_DECOMPRESSED_SNAPSHOT_BYTES)
229            .with_context(|| format!("zstd decompress ctprof snapshot {}", path.display()))?;
230        let snap: CtprofSnapshot = serde_json::from_slice(&json).with_context(|| {
231            format!(
232                "parse ctprof snapshot JSON from {} (did the capture format change?)",
233                path.display(),
234            )
235        })?;
236        Ok(snap)
237    }
238
239    /// Write a snapshot as zstd-compressed JSON.
240    ///
241    /// Used by the capture layer; exposed from this module so that
242    /// both compare-side tests and the capture binary share one
243    /// on-disk shape. Compression level `3` mirrors the ktstr
244    /// remote-cache convention — adequate ratio at fast speed —
245    /// and is not tunable because ctprof captures are small
246    /// enough that further compression produces diminishing
247    /// returns on I/O.
248    pub fn write(&self, path: &std::path::Path) -> anyhow::Result<()> {
249        use anyhow::Context;
250        let json = serde_json::to_vec(self).context("serialize ctprof snapshot to JSON")?;
251        let compressed =
252            zstd::encode_all(json.as_slice(), 3).context("zstd compress ctprof snapshot")?;
253        std::fs::write(path, compressed)
254            .with_context(|| format!("write ctprof snapshot to {}", path.display()))?;
255        Ok(())
256    }
257}
258
259/// Decompress a zstd payload into a `Vec<u8>` capped at
260/// `max_decompressed` bytes — bombing out with an error if the
261/// payload would expand past the ceiling. Reads through
262/// `Read::take(cap + 1)` so a payload that decompresses to
263/// exactly `cap` bytes is accepted while one that produces
264/// `cap + 1` bytes (or more) is rejected — the +1 sentinel
265/// distinguishes "EOF coincided with the cap" from "more data
266/// behind the cap".
267pub(super) fn decompress_capped(bytes: &[u8], max_decompressed: u64) -> anyhow::Result<Vec<u8>> {
268    use std::io::Read;
269    let decoder = zstd::stream::read::Decoder::new(bytes)?;
270    let mut out = Vec::new();
271    decoder
272        .take(max_decompressed.saturating_add(1))
273        .read_to_end(&mut out)?;
274    if out.len() as u64 > max_decompressed {
275        anyhow::bail!(
276            "zstd-decompressed payload exceeds the {}-byte cap (decompression-bomb guard)",
277            max_decompressed,
278        );
279    }
280    Ok(out)
281}
282
283// ---------------------------------------------------------------
284// Capture layer: procfs readers + host walk.
285// ---------------------------------------------------------------
286
/// Canonical file extension for a serialized snapshot, written
/// without a leading dot.
///
/// `dead_code` allow: no production code references this
/// constant — the only reference is a roundtrip test.
/// [`write`](CtprofSnapshot::write) and
/// [`CtprofSnapshot::load`] take a caller-supplied path and
/// neither constructs the extension (the CLI accepts any path
/// the operator supplies and the renderer reads via
/// [`CtprofSnapshot::load`]). Kept as a named constant so a future
/// caller that needs to construct paths from scratch has the
/// canonical token available without re-typing the literal.
#[allow(dead_code)]
pub const SNAPSHOT_EXTENSION: &str = "ctprof.zst";

/// Decompressed-size ceiling for [`CtprofSnapshot::load`], in
/// bytes (256 MiB).
///
/// Bounds the allocation a malicious or corrupted zstd payload
/// can force, since zstd compresses pathologically well on
/// repeated bytes (a few-KiB compressed blob can decompress to
/// gigabytes). 256 MiB covers any realistic production snapshot
/// (typical hosts run 1K-100K live threads) while bounding
/// worst-case allocation against hostile zstd payloads.
/// Public so a downstream consumer can size buffers against the
/// same ceiling without hardcoding the value.
pub const MAX_DECOMPRESSED_SNAPSHOT_BYTES: u64 = 256 * 1024 * 1024;

/// Default procfs root on Linux. The `_at` readers accept any
/// `&Path` so tests stage a synthetic tree under a tempdir; the
/// public readers delegate to those with this default.
pub const DEFAULT_PROC_ROOT: &str = "/proc";

/// Default cgroup v2 mount point. Like the procfs root, the
/// cgroup readers in this module take the root as a `&Path`
/// argument rather than reading this constant directly.
pub const DEFAULT_CGROUP_ROOT: &str = "/sys/fs/cgroup";

/// Default sysfs root. Tests pass a tempdir so they don't read
/// the live `/sys` tree (which would produce nondeterministic
/// `sched_ext` state depending on the host kernel config). The
/// public capture entry points pass this constant to read the
/// real sysfs tree at runtime.
pub const DEFAULT_SYS_ROOT: &str = "/sys";
326
327pub(super) fn task_file(proc_root: &Path, tgid: i32, tid: i32, leaf: &str) -> PathBuf {
328    proc_root
329        .join(tgid.to_string())
330        .join("task")
331        .join(tid.to_string())
332        .join(leaf)
333}
334
335pub(super) fn proc_file(proc_root: &Path, tgid: i32, leaf: &str) -> PathBuf {
336    proc_root.join(tgid.to_string()).join(leaf)
337}
338
339/// Map a numeric scheduling policy (as it appears in
340/// `/proc/<tgid>/task/<tid>/stat` field 41) to the canonical
341/// kernel identifier string. Unknown integers render as
342/// `"SCHED_UNKNOWN(<n>)"` rather than dropping the value so
343/// diff output still surfaces a novel policy from a future
344/// kernel.
345pub(super) fn policy_name(policy: i32) -> String {
346    match policy {
347        libc::SCHED_OTHER => "SCHED_OTHER".to_string(),
348        libc::SCHED_FIFO => "SCHED_FIFO".to_string(),
349        libc::SCHED_RR => "SCHED_RR".to_string(),
350        libc::SCHED_BATCH => "SCHED_BATCH".to_string(),
351        libc::SCHED_IDLE => "SCHED_IDLE".to_string(),
352        // `SCHED_DEADLINE` = 6, `SCHED_EXT` = 7 — neither is
353        // exposed by the libc crate as of this writing; use the
354        // kernel-canonical numeric codes.
355        6 => "SCHED_DEADLINE".to_string(),
356        7 => "SCHED_EXT".to_string(),
357        other => format!("SCHED_UNKNOWN({other})"),
358    }
359}
360
361/// Enumerate every numeric directory under the procfs root
362/// (live tgids). Returns sorted ids so snapshot ordering is
363/// deterministic. Empty vec on read failure.
364pub(super) fn iter_tgids_at(proc_root: &Path) -> Vec<i32> {
365    let Ok(entries) = fs::read_dir(proc_root) else {
366        return Vec::new();
367    };
368    let mut tgids: Vec<i32> = entries
369        .filter_map(|e| e.ok())
370        .filter_map(|e| e.file_name().to_str().and_then(|s| s.parse::<i32>().ok()))
371        .filter(|&p| p > 0)
372        .collect();
373    tgids.sort_unstable();
374    tgids
375}
376
377/// Enumerate tids under `<proc_root>/<tgid>/task`. Empty vec on
378/// read failure (process exited between enumeration and this
379/// call).
380pub(super) fn iter_task_ids_at(proc_root: &Path, tgid: i32) -> Vec<i32> {
381    let path = proc_root.join(tgid.to_string()).join("task");
382    let Ok(entries) = fs::read_dir(&path) else {
383        return Vec::new();
384    };
385    let mut tids: Vec<i32> = entries
386        .filter_map(|e| e.ok())
387        .filter_map(|e| e.file_name().to_str().and_then(|s| s.parse::<i32>().ok()))
388        .filter(|&t| t > 0)
389        .collect();
390    tids.sort_unstable();
391    tids
392}
393
394/// Read `<proc_root>/<tgid>/comm` trimmed. `None` on read
395/// failure or empty content.
396pub(super) fn read_process_comm_at(proc_root: &Path, tgid: i32) -> Option<String> {
397    let raw = fs::read_to_string(proc_file(proc_root, tgid, "comm")).ok()?;
398    let trimmed = raw.trim();
399    if trimmed.is_empty() {
400        None
401    } else {
402        Some(trimmed.to_string())
403    }
404}
405
406/// Read `<proc_root>/<tgid>/task/<tid>/comm` trimmed. `None`
407/// on read failure or empty content.
408pub(super) fn read_thread_comm_at(proc_root: &Path, tgid: i32, tid: i32) -> Option<String> {
409    let raw = fs::read_to_string(task_file(proc_root, tgid, tid, "comm")).ok()?;
410    let trimmed = raw.trim();
411    if trimmed.is_empty() {
412        None
413    } else {
414        Some(trimmed.to_string())
415    }
416}
417
418/// Selected fields parsed out of `/proc/<tgid>/task/<tid>/stat`.
419#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
420pub(super) struct StatFields {
421    pub(super) minflt: Option<u64>,
422    pub(super) majflt: Option<u64>,
423    pub(super) utime_clock_ticks: Option<u64>,
424    pub(super) stime_clock_ticks: Option<u64>,
425    /// Field 18: kernel-internal priority (signed, distinct
426    /// from `nice`). `seq_put_decimal_ll(m, " ", priority)` in
427    /// `do_task_stat()` (`fs/proc/array.c`); the value is the post-bias
428    /// scheduler priority (`task_prio(task)`).
429    pub(super) priority: Option<i32>,
430    pub(super) nice: Option<i32>,
431    pub(super) start_time_clock_ticks: Option<u64>,
432    pub(super) processor: Option<i32>,
433    /// Field 40: real-time priority. `seq_put_decimal_ull(m,
434    /// " ", task->rt_priority)` in `do_task_stat()` (`fs/proc/array.c`).
435    /// Stored as `u32` to match `unsigned int
436    /// task_struct::rt_priority` from `include/linux/sched.h`;
437    /// non-zero only when the task runs SCHED_FIFO / SCHED_RR.
438    pub(super) rt_priority: Option<u32>,
439    pub(super) policy: Option<i32>,
440}
441
442/// Pure parser for `/proc/<tgid>/task/<tid>/stat`. Per `proc(5)`,
443/// field 2 (`comm`) is wrapped in parens and may contain
444/// whitespace or `)`; every later field is indexed relative to
445/// the LAST `)` in the line. Tail offsets (0-indexed from the
446/// token past the final `)`):
447///
448/// | field | name                  | tail index |
449/// |-------|-----------------------|------------|
450/// | 10    | minflt                | 7          |
451/// | 12    | majflt                | 9          |
452/// | 14    | utime                 | 11         |
453/// | 15    | stime                 | 12         |
454/// | 18    | priority              | 15         |
455/// | 19    | nice                  | 16         |
456/// | 22    | starttime             | 19         |
457/// | 39    | processor             | 36         |
458/// | 40    | rt_priority           | 37         |
459/// | 41    | policy                | 38         |
460///
461/// Field 42 (`delayacct_blkio_ticks`) is intentionally NOT
462/// parsed — `blkio_delay_total_ns` from the taskstats genetlink
463/// path supersedes it (ns precision vs USER_HZ ticks; both gated
464/// by `CONFIG_TASK_DELAY_ACCT`, but the netlink path delivers
465/// the same data without the procfs USER_HZ truncation).
466///
467/// Missing fields return `None` individually so a short line
468/// (tid exited mid-read, stat truncated) degrades gracefully.
469pub(super) fn parse_stat(raw: &str) -> StatFields {
470    let Some(line) = raw.lines().next() else {
471        return StatFields::default();
472    };
473    let Some(last_close) = line.rfind(')') else {
474        return StatFields::default();
475    };
476    let Some(tail) = line.get(last_close + 1..) else {
477        return StatFields::default();
478    };
479    let parts: Vec<&str> = tail.split_ascii_whitespace().collect();
480    let get_u64 = |idx: usize| parts.get(idx).and_then(|s| s.parse::<u64>().ok());
481    let get_u32 = |idx: usize| parts.get(idx).and_then(|s| s.parse::<u32>().ok());
482    let get_i32 = |idx: usize| parts.get(idx).and_then(|s| s.parse::<i32>().ok());
483    StatFields {
484        minflt: get_u64(7),
485        majflt: get_u64(9),
486        utime_clock_ticks: get_u64(11),
487        stime_clock_ticks: get_u64(12),
488        priority: get_i32(15),
489        nice: get_i32(16),
490        start_time_clock_ticks: get_u64(19),
491        processor: get_i32(36),
492        rt_priority: get_u32(37),
493        policy: get_i32(38),
494    }
495}
496
497/// Read `<proc_root>/<tgid>/task/<tid>/stat` and parse fields.
498/// Records a `"stat"` failure into `tally` on read error so the
499/// per-snapshot [`CtprofParseSummary`] surfaces the dominant
500/// procfs read-failure category. `tally: &mut None` skips the
501/// recording (the synthetic-tree test pattern).
502pub(super) fn read_stat_at_with_tally(
503    proc_root: &Path,
504    tgid: i32,
505    tid: i32,
506    tally: &mut Option<&mut ParseTally>,
507) -> StatFields {
508    match fs::read_to_string(task_file(proc_root, tgid, tid, "stat")) {
509        Ok(raw) => parse_stat(&raw),
510        Err(_) => {
511            if let Some(t) = tally.as_mut() {
512                t.record_failure("stat");
513            }
514            StatFields::default()
515        }
516    }
517}
518
519/// Parse the three leading u64 fields from a single-line
520/// `/proc/<tgid>/task/<tid>/schedstat` — `(run_time_ns,
521/// wait_time_ns, timeslices)`. Missing fields drop individually.
522pub(super) fn parse_schedstat(raw: &str) -> (Option<u64>, Option<u64>, Option<u64>) {
523    let Some(line) = raw.lines().next() else {
524        return (None, None, None);
525    };
526    let mut parts = line.split_ascii_whitespace();
527    let run = parts.next().and_then(|s| s.parse::<u64>().ok());
528    let wait = parts.next().and_then(|s| s.parse::<u64>().ok());
529    let slices = parts.next().and_then(|s| s.parse::<u64>().ok());
530    (run, wait, slices)
531}
532
533/// Read `<proc_root>/<tgid>/task/<tid>/schedstat`. Three-tuple
534/// of `Option<u64>` — when `CONFIG_SCHED_INFO` is off the proc
535/// entry is absent (the registration is `#ifdef CONFIG_SCHED_INFO`
536/// in `fs/proc/base.c`), so the read fails with ENOENT and yields
537/// all-`None`; the kernel's "0 0 0" zero-fill branch is dead code
538/// for this file (it compiles only when `CONFIG_SCHED_INFO` is on).
539/// `CONFIG_SCHEDSTATS` selects `CONFIG_SCHED_INFO`, so a SCHEDSTATS
540/// kernel always has the file; SCHED_INFO is the minimal gate.
541/// Records a `"schedstat"` failure on read error when a tally is
542/// supplied.
543pub(super) fn read_schedstat_at_with_tally(
544    proc_root: &Path,
545    tgid: i32,
546    tid: i32,
547    tally: &mut Option<&mut ParseTally>,
548) -> (Option<u64>, Option<u64>, Option<u64>) {
549    match fs::read_to_string(task_file(proc_root, tgid, tid, "schedstat")) {
550        Ok(raw) => parse_schedstat(&raw),
551        Err(_) => {
552            if let Some(t) = tally.as_mut() {
553                t.record_failure("schedstat");
554            }
555            (None, None, None)
556        }
557    }
558}
559
560#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
561pub(super) struct IoFields {
562    pub(super) rchar: Option<u64>,
563    pub(super) wchar: Option<u64>,
564    pub(super) syscr: Option<u64>,
565    pub(super) syscw: Option<u64>,
566    pub(super) read_bytes: Option<u64>,
567    pub(super) write_bytes: Option<u64>,
568    pub(super) cancelled_write_bytes: Option<u64>,
569}
570
571/// Parse `/proc/<tgid>/task/<tid>/io` (line-oriented
572/// `key: value` format).
573pub(super) fn parse_io(raw: &str) -> IoFields {
574    let mut out = IoFields::default();
575    for line in raw.lines() {
576        let Some((key, value)) = line.split_once(':') else {
577            continue;
578        };
579        let parsed = value.trim().parse::<u64>().ok();
580        match key.trim() {
581            "rchar" => out.rchar = parsed,
582            "wchar" => out.wchar = parsed,
583            "syscr" => out.syscr = parsed,
584            "syscw" => out.syscw = parsed,
585            "read_bytes" => out.read_bytes = parsed,
586            "write_bytes" => out.write_bytes = parsed,
587            "cancelled_write_bytes" => out.cancelled_write_bytes = parsed,
588            _ => {}
589        }
590    }
591    out
592}
593
594/// Read `<proc_root>/<tgid>/task/<tid>/io` and parse fields.
595/// Records an `"io"` failure into `tally` on read error (kernel
596/// without `CONFIG_TASK_IO_ACCOUNTING` or per-tid race).
597pub(super) fn read_io_at_with_tally(
598    proc_root: &Path,
599    tgid: i32,
600    tid: i32,
601    tally: &mut Option<&mut ParseTally>,
602) -> IoFields {
603    match fs::read_to_string(task_file(proc_root, tgid, tid, "io")) {
604        Ok(raw) => parse_io(&raw),
605        Err(_) => {
606            if let Some(t) = tally.as_mut() {
607                t.record_failure("io");
608            }
609            IoFields::default()
610        }
611    }
612}
613
/// Selected fields parsed out of
/// `/proc/<tgid>/task/<tid>/status` by `parse_status`. Every
/// field is `None` when its line is absent from the input.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub(super) struct StatusFields {
    /// Value of the `voluntary_ctxt_switches:` line; `None` when
    /// the line is absent or the value is not a `u64`.
    pub(super) voluntary_csw: Option<u64>,
    /// Value of the `nonvoluntary_ctxt_switches:` line; `None`
    /// when the line is absent or the value is not a `u64`.
    pub(super) nonvoluntary_csw: Option<u64>,
    /// First non-whitespace character of the `State:` line value.
    /// Real kernel chars are `R` / `S` / `D` / `T` / `t` / `X` /
    /// `Z` / `P` / `I` (see `fs/proc/array.c::task_state_array`).
    /// `None` when the line is absent or blank — the capture site
    /// collapses to `'~'` (via `default_state_char`) which sorts
    /// strictly after every real kernel char in lex order, so
    /// the [`crate::ctprof_compare::AggRule::ModeChar`]
    /// lex-smallest-wins tiebreak picks a real letter when one
    /// is present.
    pub(super) state: Option<char>,
    /// `Cpus_allowed_list:` as a parsed sorted vec. Kept separate
    /// from the `sched_getaffinity` reader because status-file
    /// reads attribute to the target task without a syscall
    /// round-trip — useful when the caller cannot hold a pid
    /// long enough for the syscall without a race.
    pub(super) cpus_allowed: Option<Vec<u32>>,
    /// `Threads:` value — `signal_struct->nr_threads` snapshot
    /// per `task_sig()` (`fs/proc/array.c`). Identical across every thread
    /// of the same tgid. The capture site dedups by populating
    /// [`ThreadState::nr_threads`] only on tid == tgid threads
    /// (see `capture_thread_at_with_tally`).
    pub(super) nr_threads: Option<u64>,
}
641
642pub(super) fn parse_status(raw: &str) -> StatusFields {
643    let mut out = StatusFields::default();
644    for line in raw.lines() {
645        let Some((key, value)) = line.split_once(':') else {
646            continue;
647        };
648        let value = value.trim();
649        match key.trim() {
650            // Kernel emits `State:\t<C> (<long>)` where <C> is the
651            // single-letter code from `task_state_array`
652            // (R/S/D/T/t/X/Z/P/I — nine codes, including the
653            // off-by-default `P` parked state). First non-whitespace
654            // char of the trimmed value is the letter;
655            // `value.chars().next()` produces `None` only on a truly
656            // empty line (which the split_once guards against
657            // already).
658            "State" => {
659                out.state = value.chars().next();
660            }
661            "voluntary_ctxt_switches" => {
662                out.voluntary_csw = value.parse::<u64>().ok();
663            }
664            "nonvoluntary_ctxt_switches" => {
665                out.nonvoluntary_csw = value.parse::<u64>().ok();
666            }
667            "Cpus_allowed_list" => {
668                out.cpus_allowed = crate::cpu_util::parse_cpu_list(value);
669            }
670            // `Threads:\t<num_threads>\n` per
671            // `task_sig()` (`fs/proc/array.c`). Same value across every
672            // thread of the same tgid; capture-side dedup picks
673            // only the leader thread to avoid double-counting.
674            "Threads" => {
675                out.nr_threads = value.parse::<u64>().ok();
676            }
677            _ => {}
678        }
679    }
680    out
681}
682
683/// Read `<proc_root>/<tgid>/task/<tid>/status` and parse fields.
684/// Records a `"status"` failure into `tally` on read error.
685pub(super) fn read_status_at_with_tally(
686    proc_root: &Path,
687    tgid: i32,
688    tid: i32,
689    tally: &mut Option<&mut ParseTally>,
690) -> StatusFields {
691    match fs::read_to_string(task_file(proc_root, tgid, tid, "status")) {
692        Ok(raw) => parse_status(&raw),
693        Err(_) => {
694            if let Some(t) = tally.as_mut() {
695                t.record_failure("status");
696            }
697            StatusFields::default()
698        }
699    }
700}
701
/// Read the cgroup v2 path from
/// `<proc_root>/<tgid>/task/<tid>/cgroup`.
///
/// Format per `cgroup(7)`: one line per hierarchy, shape
/// `hid:controllers:path`. The unified (v2) hierarchy is keyed
/// `0::<path>`; mixed-mode hosts expose legacy v1 hierarchies
/// alongside, which this reader skips. Returns `None` on read
/// failure or when no v2 line is present.
///
/// Test-only convenience that runs the tallying reader with no
/// tally attached — production callers pipe through
/// [`read_cgroup_at_with_tally`] so per-tid failures surface in
/// `parse_summary`.
#[cfg(test)]
pub(super) fn read_cgroup_at(proc_root: &Path, tgid: i32, tid: i32) -> Option<String> {
    let mut no_tally = None;
    read_cgroup_at_with_tally(proc_root, tgid, tid, &mut no_tally)
}
715
716/// Records a `"cgroup"`
717/// failure on read error (file absent — typical when the tid
718/// exited mid-capture).
719pub(super) fn read_cgroup_at_with_tally(
720    proc_root: &Path,
721    tgid: i32,
722    tid: i32,
723    tally: &mut Option<&mut ParseTally>,
724) -> Option<String> {
725    match fs::read_to_string(task_file(proc_root, tgid, tid, "cgroup")) {
726        Ok(raw) => parse_cgroup_v2(&raw),
727        Err(_) => {
728            if let Some(t) = tally.as_mut() {
729                t.record_failure("cgroup");
730            }
731            None
732        }
733    }
734}
735
736pub(super) fn parse_cgroup_v2(raw: &str) -> Option<String> {
737    for line in raw.lines() {
738        if let Some(rest) = line.strip_prefix("0::") {
739            let trimmed = rest.trim();
740            if !trimmed.is_empty() {
741                return Some(trimmed.to_string());
742            }
743        }
744    }
745    None
746}
747
748#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
749pub(super) struct SchedFields {
750    pub(super) nr_wakeups: Option<u64>,
751    pub(super) nr_wakeups_local: Option<u64>,
752    pub(super) nr_wakeups_remote: Option<u64>,
753    pub(super) nr_wakeups_sync: Option<u64>,
754    pub(super) nr_wakeups_migrate: Option<u64>,
755    pub(super) nr_wakeups_affine: Option<u64>,
756    pub(super) nr_wakeups_affine_attempts: Option<u64>,
757    pub(super) nr_migrations: Option<u64>,
758    pub(super) nr_forced_migrations: Option<u64>,
759    pub(super) nr_failed_migrations_affine: Option<u64>,
760    pub(super) nr_failed_migrations_running: Option<u64>,
761    pub(super) nr_failed_migrations_hot: Option<u64>,
762    pub(super) wait_sum: Option<u64>,
763    pub(super) wait_count: Option<u64>,
764    pub(super) wait_max: Option<u64>,
765    pub(super) sleep_sum: Option<u64>,
766    pub(super) sleep_max: Option<u64>,
767    pub(super) block_sum: Option<u64>,
768    pub(super) block_max: Option<u64>,
769    pub(super) iowait_sum: Option<u64>,
770    pub(super) iowait_count: Option<u64>,
771    pub(super) exec_max: Option<u64>,
772    pub(super) slice_max: Option<u64>,
773    /// `core_forceidle_sum` from `/proc/<tid>/sched`, emitted via
774    /// `PN_SCHEDSTAT(core_forceidle_sum)` in
775    /// `proc_sched_show_task()` (`kernel/sched/debug.c`), build-gated on
776    /// `CONFIG_SCHED_CORE`. Emission additionally lives inside
777    /// the `if (schedstat_enabled())` block in
778    /// `proc_sched_show_task()` (`kernel/sched/debug.c`), so on a host with schedstat
779    /// off at runtime the line is absent and the parser arm
780    /// never fires — leaving the field at `None`.
781    /// Dotted ms.ns format like the other PN_SCHEDSTAT fields —
782    /// reconstructed to full ns via [`parsed_ns_from_dotted`]. Counts
783    /// time the task forced its SMT sibling idle for core-scheduling.
784    /// `None` on kernels without `CONFIG_SCHED_CORE`, on hosts
785    /// with schedstat disabled at runtime, or for tasks whose
786    /// SMT cohort never accumulated forceidle.
787    pub(super) core_forceidle_sum: Option<u64>,
788    /// `se.slice` from `/proc/<tid>/sched`, emitted via
789    /// `P(se.slice)` in `proc_sched_show_task()` (`kernel/sched/debug.c`). Plain
790    /// `%lld` integer (NOT dotted ns; the `P` macro uses
791    /// `%lld`, not `PN`'s `%lld.%06ld`). Per-thread
792    /// `p->se.slice` in nanoseconds. For fair-class tasks
793    /// (SCHED_NORMAL / SCHED_BATCH) it is the instantaneous
794    /// slice CFS is currently running the task with; for
795    /// SCHED_EXT tasks it reflects stale `p->se.slice` state
796    /// because ext-class schedulers maintain slice in
797    /// `p->scx.slice` and do not refresh `p->se.slice`. The
798    /// kernel emits this line ONLY when `fair_policy(p->policy)`
799    /// holds, which (per `normal_policy()`/`fair_policy()` (`kernel/sched/sched.h`)) is
800    /// true for SCHED_NORMAL, SCHED_BATCH, AND — under
801    /// `CONFIG_SCHED_CLASS_EXT` — SCHED_EXT. `None` for
802    /// SCHED_DEADLINE / SCHED_RR / SCHED_FIFO / SCHED_IDLE.
803    pub(super) fair_slice_ns: Option<u64>,
804    pub(super) ext_enabled: Option<bool>,
805}
806
807/// Outcome of [`parsed_ns_from_dotted`]. Distinguishes the two
808/// failure modes the caller may want to treat separately:
809/// [`Self::Negative`] (kernel emitted a value with a leading
810/// `-`, observable on clock-skew / suspend-resume hosts) is
811/// counted into [`CtprofParseSummary::negative_dotted_values`]
812/// so an operator can see that the snapshot's schedstat values
813/// are routinely negative-and-zeroed; [`Self::Malformed`]
814/// (non-numeric, empty, overflow) is the every-other failure
815/// mode and stays silent (the data source is ill-formed in a way
816/// the operator can't act on).
817#[derive(Debug, Clone, Copy, PartialEq, Eq)]
818pub(super) enum ParseDottedNs {
819    /// Trimmed input started with `-` — the kernel's PN_SCHEDSTAT
820    /// `%Ld.%06ld` format emitted a negative integer part. The
821    /// `parse::<u64>()` rejection is by design (u64 cannot
822    /// represent the sign) but the SIGNAL is meaningful: a
823    /// negative schedstat field is rare and worth surfacing
824    /// rather than silently zeroing.
825    ///
826    /// Note: `-0.000000` would also route here, but is
827    /// unreachable from real kernel output — `SPLIT_NS(0)`
828    /// yields `(0, 0)` which `%Ld.%06ld` formats as
829    /// `0.000000` with no leading sign. The parser still
830    /// classifies the unreachable shape as `Negative` rather
831    /// than special-casing it; a fixture that synthesizes
832    /// `-0.000000` directly will land in this variant.
833    Negative,
834    /// Otherwise unparseable: non-numeric integer or fractional
835    /// part, empty input, or u64 overflow on the
836    /// `ms * 1_000_000 + ns_remainder` reconstruction.
837    Malformed,
838}
839
840/// Parse a `PN_SCHEDSTAT`-emitted dotted nanosecond value into
841/// full ns. The kernel formats schedstat fractional fields via
842/// `__PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS(F))`
843/// where `SPLIT_NS(x) = (x / 1_000_000, x % 1_000_000)` — the
844/// integer part is MILLISECONDS, the 6-digit fractional part is
845/// the NANOSECOND remainder within a millisecond. Reconstructing
846/// the original ns value is `ms * 1_000_000 + ns_remainder`.
847///
848/// Tolerates fractional widths other than 6 (some test fixtures
849/// emit `5000.25` or `7.999`) by zero-padding the right side
850/// before parsing — `.25` becomes `.250000` (=250_000 ns), `.999`
851/// becomes `.999000` (=999_000 ns). Truncates fractional widths
852/// >6 to the first 6 digits.
853///
854/// Returns `Err(ParseDottedNs::Negative)` when EITHER:
855/// - the trimmed integer part starts with `-` (kernel emitted
856///   `-5.000000` for a magnitude ≥ 1ms negative SPLIT_NS: for
857///   x < 0, `nsec_high` returns `-nsec` — a negative long long
858///   — so `%14Ld` on the integer side carries the sign), OR
859/// - the trimmed fractional part starts with `-`. The kernel
860///   never emits this shape: `nsec_low` (kernel/sched/debug.c)
861///   negates via `nsec = -nsec` before `do_div` and returns an
862///   unsigned `long`, so `%06ld` on the fractional side is
863///   always positive and a sub-millisecond negative prints as
864///   `0.000500` — indistinguishable from a genuine positive.
865///   This fractional-side check therefore catches no real
866///   kernel output; it defends only against synthetic/fixture
867///   input that injects a `0.-NNNNNN` shape.
868///
869/// The caller records the bump in the per-snapshot
870/// [`CtprofParseSummary::negative_dotted_values`] before
871/// folding to zero. Returns `Err(ParseDottedNs::Malformed)`
872/// for any other parse failure (non-numeric, empty, overflow);
873/// the caller folds to zero silently per the best-effort capture
874/// contract.
875///
876/// The bare-integer (no dot) branch parses the value as raw ns
877/// — used for test fixtures and graceful degradation; the
878/// kernel's PN_SCHEDSTAT format always emits the dotted form.
879/// Same negative-vs-malformed split applies to the bare-integer
880/// branch so a stray bare-integer negative is also tallied.
881pub(super) fn parsed_ns_from_dotted(value: &str) -> Result<u64, ParseDottedNs> {
882    if let Some((ms_str, ns_str)) = value.split_once('.') {
883        let ms_trimmed = ms_str.trim();
884        if ms_trimmed.starts_with('-') {
885            return Err(ParseDottedNs::Negative);
886        }
887        // Fractional-side negative: the kernel never emits this
888        // shape. `nsec_low` (kernel/sched/debug.c) does
889        // `nsec = -nsec` before `do_div` and returns an unsigned
890        // `long`, so `%06ld` on the fractional side is always
891        // positive — a sub-millisecond negative prints as
892        // `0.000500`, not `0.-000500`. This check defends only
893        // against synthetic/fixture input that injects a
894        // sign-only fractional. Check it BEFORE the
895        // chars().take(6) truncation would otherwise swallow a
896        // sign-only fractional like `-`.
897        if ns_str.trim_start().starts_with('-') {
898            return Err(ParseDottedNs::Negative);
899        }
900        let ms = ms_trimmed
901            .parse::<u64>()
902            .map_err(|_| ParseDottedNs::Malformed)?;
903        let ns_part: String = ns_str.chars().take(6).collect();
904        let padded = format!("{:0<6}", ns_part);
905        let ns = padded
906            .parse::<u64>()
907            .map_err(|_| ParseDottedNs::Malformed)?;
908        ms.checked_mul(1_000_000)
909            .and_then(|x| x.checked_add(ns))
910            .ok_or(ParseDottedNs::Malformed)
911    } else {
912        let trimmed = value.trim();
913        if trimmed.starts_with('-') {
914            return Err(ParseDottedNs::Negative);
915        }
916        trimmed.parse::<u64>().map_err(|_| ParseDottedNs::Malformed)
917    }
918}
919
920/// Parse `/proc/<tgid>/task/<tid>/sched`. The file is registered
921/// unconditionally (always present); the schedstat-prefixed fields
922/// this reads are emitted only under `CONFIG_SCHEDSTATS` (the
923/// kernel's `if (schedstat_enabled())` block) and are absent
924/// otherwise. Format is many lines of `key : value`
925/// where the key is dot-delimited (`se.statistics.nr_wakeups`);
926/// different kernel versions use `se.statistics.`, `stats.`,
927/// or bare names. The reader matches on the LAST dot-delimited
928/// segment to absorb that variation.
929///
930/// PN_SCHEDSTAT fields (`wait_sum`, `sum_sleep_runtime`,
931/// `sum_block_runtime`, `iowait_sum`) emit a `ms.ns_remainder`
932/// dotted format — reconstructed to full ns via
933/// [`parsed_ns_from_dotted`]. P_SCHEDSTAT fields
934/// (`wait_count`, `iowait_count`, `nr_wakeups*`,
935/// `nr_migrations`) emit plain integers — parsed as `u64`.
936///
937/// `tally`, when supplied, records each negative dotted-ns parse
938/// outcome via [`ParseTally::record_negative_dotted`] so the
939/// per-snapshot summary surfaces the rate at which schedstat
940/// fields were silently zeroed. `&mut None` skips the recording —
941/// the synthetic-tree test path that doesn't carry a tally.
942pub(super) fn parse_sched(raw: &str, tally: &mut Option<&mut ParseTally>) -> SchedFields {
943    let mut out = SchedFields::default();
944    let mut parse_dotted = |value: &str| -> Option<u64> {
945        match parsed_ns_from_dotted(value) {
946            Ok(v) => Some(v),
947            Err(ParseDottedNs::Negative) => {
948                if let Some(t) = tally.as_mut() {
949                    t.record_negative_dotted();
950                }
951                None
952            }
953            Err(ParseDottedNs::Malformed) => None,
954        }
955    };
956    for line in raw.lines() {
957        let Some((key, value)) = line.split_once(':') else {
958            continue;
959        };
960        let key = key.trim();
961        let value = value.trim();
962        let parsed_u64 = || value.parse::<u64>().ok();
963        // `ext.enabled` is the only key the kernel emits with a
964        // literal dot in the variable name (every other dot is a
965        // namespace prefix like `se.statistics.`). Match on the full
966        // key BEFORE the rsplit-on-dot fallback so a future kernel
967        // line ending in `.enabled` cannot collide.
968        if key == "ext.enabled" {
969            out.ext_enabled = value.parse::<u64>().ok().map(|n| n != 0);
970            continue;
971        }
972        let short = key.rsplit('.').next().unwrap_or(key);
973        match short {
974            "nr_wakeups" => out.nr_wakeups = parsed_u64(),
975            "nr_wakeups_local" => out.nr_wakeups_local = parsed_u64(),
976            "nr_wakeups_remote" => out.nr_wakeups_remote = parsed_u64(),
977            "nr_wakeups_sync" => out.nr_wakeups_sync = parsed_u64(),
978            "nr_wakeups_migrate" => out.nr_wakeups_migrate = parsed_u64(),
979            "nr_wakeups_affine" => out.nr_wakeups_affine = parsed_u64(),
980            "nr_wakeups_affine_attempts" => out.nr_wakeups_affine_attempts = parsed_u64(),
981            "nr_migrations" => out.nr_migrations = parsed_u64(),
982            "nr_forced_migrations" => out.nr_forced_migrations = parsed_u64(),
983            "nr_failed_migrations_affine" => out.nr_failed_migrations_affine = parsed_u64(),
984            "nr_failed_migrations_running" => out.nr_failed_migrations_running = parsed_u64(),
985            "nr_failed_migrations_hot" => out.nr_failed_migrations_hot = parsed_u64(),
986            "wait_sum" => out.wait_sum = parse_dotted(value),
987            "wait_count" => out.wait_count = parsed_u64(),
988            "wait_max" => out.wait_max = parse_dotted(value),
989            // Kernel emits `sum_sleep_runtime` (see
990            // `kernel/sched/debug.c` -> `proc_sched_show_task`).
991            // The raw value lands in `SchedFields::sleep_sum`; the
992            // capture site at `capture_thread_at_with_tally`
993            // subtracts `sum_block_runtime` to derive
994            // `ThreadState::voluntary_sleep_ns` — the kernel
995            // double-counts block under sum_sleep_runtime, so the
996            // raw value is not surfaced in ThreadState. The kernel
997            // does not emit a `sleep_count` counterpart;
998            // `nr_wakeups` (matched above) covers the wake-side
999            // event tally.
1000            "sum_sleep_runtime" => out.sleep_sum = parse_dotted(value),
1001            "sleep_max" => out.sleep_max = parse_dotted(value),
1002            // Kernel emits `sum_block_runtime`; the matching
1003            // ThreadState field is `block_sum` for symmetry with
1004            // the other `*_sum` fields. There is no `block_count`
1005            // counterpart from the kernel — the schedstat printout
1006            // pairs `wait_sum/wait_count` and `iowait_sum/iowait_count`
1007            // but `sum_block_runtime` has no per-event counter.
1008            "sum_block_runtime" => out.block_sum = parse_dotted(value),
1009            "block_max" => out.block_max = parse_dotted(value),
1010            "iowait_sum" => out.iowait_sum = parse_dotted(value),
1011            "iowait_count" => out.iowait_count = parsed_u64(),
1012            "exec_max" => out.exec_max = parse_dotted(value),
1013            "slice_max" => out.slice_max = parse_dotted(value),
1014            // PN_SCHEDSTAT dotted ns; CONFIG_SCHED_CORE-gated. Same
1015            // ms.ns reconstruction as wait_sum / block_sum.
1016            "core_forceidle_sum" => out.core_forceidle_sum = parse_dotted(value),
1017            // P plain integer in ns. The kernel emits this only
1018            // for fair-policy tasks (`fair_policy(p->policy)` in
1019            // `proc_sched_show_task()`); for other policies the line is absent
1020            // and `parsed_u64()` collapses to None.
1021            "slice" => out.fair_slice_ns = parsed_u64(),
1022            _ => {}
1023        }
1024    }
1025    out
1026}
1027
1028/// Read `<proc_root>/<tgid>/task/<tid>/sched` and parse fields.
1029/// Records a `"sched"` failure into `tally` on read error, plus
1030/// per-line negative-dotted-value bumps via `parse_sched`.
1031pub(super) fn read_sched_at_with_tally(
1032    proc_root: &Path,
1033    tgid: i32,
1034    tid: i32,
1035    tally: &mut Option<&mut ParseTally>,
1036) -> SchedFields {
1037    match fs::read_to_string(task_file(proc_root, tgid, tid, "sched")) {
1038        Ok(raw) => parse_sched(&raw, tally),
1039        Err(_) => {
1040            if let Some(t) = tally.as_mut() {
1041                t.record_failure("sched");
1042            }
1043            SchedFields::default()
1044        }
1045    }
1046}
1047
1048/// Parse `/proc/<pid>/smaps_rollup` contents into a key→u64-kB
1049/// map. Format per `__show_smap()`
1050/// (`fs/proc/task_mmu.c`): each kv line is
1051/// `<Name>:<whitespace><u64><whitespace>kB`. The kernel ALSO
1052/// emits a `<addr_start>-<addr_end> ---p <off> XX:XX <inode> [rollup]`
1053/// header line (built by `show_vma_header_prefix()` then
1054/// `seq_puts(m, "[rollup]\n")` in `show_smaps_rollup()`). That
1055/// header CONTAINS a `:` (the device-major:minor pair `XX:XX`),
1056/// so a naive `split_once(':')` would mis-extract a junk key
1057/// (the whitespace-laden address range + flags + offset prefix)
1058/// with value 0 (the minor-device integer parses as the first
1059/// whitespace token of the value side). Real smaps_rollup keys
1060/// are single-word identifiers (Rss, Pss, Pss_Dirty, etc.) that
1061/// never contain whitespace or `-`; the address-range header
1062/// always contains both. Reject any line whose pre-`:` segment
1063/// carries either character.
1064///
1065/// Lines whose value field doesn't parse as u64 are silently
1066/// dropped (best-effort, matching the absent-counter contract).
1067pub(super) fn parse_smaps_rollup(raw: &str) -> BTreeMap<String, u64> {
1068    let mut out = BTreeMap::new();
1069    for line in raw.lines() {
1070        let Some((key, value)) = line.split_once(':') else {
1071            continue;
1072        };
1073        let key_trimmed = key.trim();
1074        // Header-line guard: real smaps_rollup keys never
1075        // contain whitespace or `-`. Address-range headers
1076        // (`<addr_start>-<addr_end> ---p <off> XX:XX <inode>
1077        // [rollup]`) carry both.
1078        if key_trimmed.contains(char::is_whitespace) || key_trimmed.contains('-') {
1079            continue;
1080        }
1081        // Value field: whitespace-prefixed integer + " kB" suffix
1082        // (or rarely no suffix on a future kernel addition). The
1083        // first whitespace-token after trimming IS the integer;
1084        // dropping the unit suffix happens for free via
1085        // `split_ascii_whitespace().next()`.
1086        let Some(n_str) = value.split_ascii_whitespace().next() else {
1087            continue;
1088        };
1089        let Ok(n) = n_str.parse::<u64>() else {
1090            continue;
1091        };
1092        out.insert(key_trimmed.to_string(), n);
1093    }
1094    out
1095}
1096
1097/// Read `<proc_root>/<tgid>/task/<tid>/smaps_rollup` for the
1098/// thread leader (tid == tgid) and parse it. Non-leader threads
1099/// short-circuit to an empty map: the underlying mm_struct is
1100/// shared per-tgid, so reading from any thread yields identical
1101/// values, and capturing once per tgid avoids redundant
1102/// per-thread work. Records a `"smaps_rollup"` failure into
1103/// `tally` on read error.
1104///
1105/// Permission semantics: `/proc/<pid>/smaps_rollup` requires
1106/// `CAP_SYS_PTRACE` for processes the caller doesn't own (PID 1
1107/// being the canonical example). Read failure is treated as
1108/// best-effort — empty map, tally bump, no panic. Older kernels
1109/// (pre-4.14) lack the file entirely; same handling.
1110pub(super) fn read_smaps_rollup_at_with_tally(
1111    proc_root: &Path,
1112    tgid: i32,
1113    tid: i32,
1114    tally: &mut Option<&mut ParseTally>,
1115) -> BTreeMap<String, u64> {
1116    if tid != tgid {
1117        // Leader-dedup: non-leader threads share the same
1118        // mm_struct, so the file would yield identical values.
1119        // Skip the read entirely.
1120        return BTreeMap::new();
1121    }
1122    match fs::read_to_string(task_file(proc_root, tgid, tid, "smaps_rollup")) {
1123        Ok(raw) => parse_smaps_rollup(&raw),
1124        Err(_) => {
1125            if let Some(t) = tally.as_mut() {
1126                t.record_failure("smaps_rollup");
1127            }
1128            BTreeMap::new()
1129        }
1130    }
1131}
1132
1133/// Parse cgroup v2 `cpu.stat`. Format is lines of `key value`
1134/// (space-separated, not `key: value`).
1135pub(super) fn parse_cpu_stat(raw: &str) -> (Option<u64>, Option<u64>, Option<u64>) {
1136    let mut usage = None;
1137    let mut throttled = None;
1138    let mut throttled_usec = None;
1139    for line in raw.lines() {
1140        let mut parts = line.split_ascii_whitespace();
1141        let Some(key) = parts.next() else { continue };
1142        let Some(value) = parts.next() else { continue };
1143        let parsed = value.parse::<u64>().ok();
1144        match key {
1145            "usage_usec" => usage = parsed,
1146            "nr_throttled" => throttled = parsed,
1147            "throttled_usec" => throttled_usec = parsed,
1148            _ => {}
1149        }
1150    }
1151    (usage, throttled, throttled_usec)
1152}
1153
1154/// Parse a cgroup v2 key-value file (one `<key> <u64>` per
1155/// line). Used for `memory.stat` and `memory.events`. Lines
1156/// the parser cannot fully decompose into a key + u64 are
1157/// silently skipped — a future kernel that introduces non-u64
1158/// values won't break the parser, just elide the offending key.
1159pub(super) fn parse_kv_counters(raw: &str) -> BTreeMap<String, u64> {
1160    let mut out = BTreeMap::new();
1161    for line in raw.lines() {
1162        let mut parts = line.split_ascii_whitespace();
1163        let Some(key) = parts.next() else { continue };
1164        let Some(value) = parts.next() else { continue };
1165        let Ok(parsed) = value.parse::<u64>() else {
1166            continue;
1167        };
1168        out.insert(key.to_string(), parsed);
1169    }
1170    out
1171}
1172
1173/// Parse a single-line LIMIT cgroup file (e.g. `memory.max`,
1174/// `memory.high`, `pids.max`). The literal token `max` means
1175/// "no limit" and yields `None`; a numeric value yields
1176/// `Some(u64)`. Whitespace-only or malformed input also yields
1177/// `None` (best-effort, matching the absent-counter contract).
1178///
1179/// Caller MUST NOT use this for FLOOR files (`memory.low`,
1180/// `memory.min`) — for floors, the literal token `max` means
1181/// "maximum protection", not "no floor", which is the semantic
1182/// opposite. Use [`parse_floor_value`] there instead.
1183pub(super) fn parse_max_or_u64(raw: &str) -> Option<u64> {
1184    let trimmed = raw.trim();
1185    if trimmed == "max" {
1186        return None;
1187    }
1188    trimmed.parse::<u64>().ok()
1189}
1190
1191/// Parse a single-line FLOOR cgroup file (`memory.low`,
1192/// `memory.min`). The literal token `max` means
1193/// "maximum protection" — yields `Some(u64::MAX)` rather than
1194/// `None`, because FLOORS use `None` to mean "absent file"
1195/// only. A numeric value yields `Some(u64)`; whitespace-only or
1196/// malformed input yields `None` (absent-counter contract).
1197///
1198/// The semantic asymmetry vs. [`parse_max_or_u64`] is critical:
1199/// for limits, "max" is the absence of a cap (collapse to
1200/// `None`); for floors, "max" is a fully-protected floor (it
1201/// must NOT collapse to "no floor"). `merge_min_option` then
1202/// correctly picks `min(u64::MAX, 5G) = 5G` instead of None
1203/// when one contributor has full protection and another has a
1204/// concrete protection.
1205pub(super) fn parse_floor_value(raw: &str) -> Option<u64> {
1206    let trimmed = raw.trim();
1207    if trimmed == "max" {
1208        return Some(u64::MAX);
1209    }
1210    trimmed.parse::<u64>().ok()
1211}
1212
1213/// Parse `cpu.max` (one line, two whitespace-separated tokens:
1214/// `<quota|max> <period>`). Returns `(quota, period)` where
1215/// `quota` is `None` for the literal `max` token (no CFS
1216/// bandwidth cap) and `Some(usec)` otherwise; `period` defaults
1217/// to the kernel default of 100_000 µs when missing or
1218/// malformed.
1219pub(super) fn parse_cpu_max(raw: &str) -> (Option<u64>, u64) {
1220    let mut parts = raw.split_ascii_whitespace();
1221    let quota_token = parts.next();
1222    let period_token = parts.next();
1223    let quota = quota_token.and_then(parse_max_or_u64_str);
1224    let period = period_token
1225        .and_then(|s| s.parse::<u64>().ok())
1226        .unwrap_or(CPU_MAX_DEFAULT_PERIOD_US);
1227    (quota, period)
1228}
1229
1230/// Helper for [`parse_cpu_max`]: route a single token through
1231/// the same `max`-vs-u64 disambiguation as [`parse_max_or_u64`]
1232/// without committing to a string-trimmed input shape.
1233pub(super) fn parse_max_or_u64_str(s: &str) -> Option<u64> {
1234    if s == "max" {
1235        return None;
1236    }
1237    s.parse::<u64>().ok()
1238}
1239
1240/// Default CFS bandwidth period when `cpu.max` is absent or its
1241/// period token is unreadable. Matches the kernel default
1242/// returned by `default_bw_period_us()`
1243/// (`kernel/sched/sched.h`); child cgroups inherit this when
1244/// `cpu.max` is unset.
1245pub(super) const CPU_MAX_DEFAULT_PERIOD_US: u64 = 100_000;
1246
1247/// Populate a [`CgroupStats`] by reading the cgroup v2 files
1248/// for `path` under `cgroup_root`. Missing files collapse to
1249/// the struct's `Default` (zero / `None` per field semantics) —
1250/// the root cgroup is missing most knob files, and child
1251/// cgroups on hosts without `pids` enabled in
1252/// `cgroup.subtree_control` are also expected to lack
1253/// `pids.{current,max}`.
1254pub(super) fn read_cgroup_stats_at(cgroup_root: &Path, path: &str) -> CgroupStats {
1255    let relative = path.strip_prefix('/').unwrap_or(path);
1256    let dir = if relative.is_empty() {
1257        cgroup_root.to_path_buf()
1258    } else {
1259        cgroup_root.join(relative)
1260    };
1261
1262    let (usage, throttled, throttled_usec) = fs::read_to_string(dir.join("cpu.stat"))
1263        .ok()
1264        .as_deref()
1265        .map(parse_cpu_stat)
1266        .unwrap_or((None, None, None));
1267    let (max_quota_us, max_period_us) = fs::read_to_string(dir.join("cpu.max"))
1268        .ok()
1269        .as_deref()
1270        .map(parse_cpu_max)
1271        .unwrap_or((None, CPU_MAX_DEFAULT_PERIOD_US));
1272    let weight = fs::read_to_string(dir.join("cpu.weight"))
1273        .ok()
1274        .and_then(|s| s.trim().parse::<u64>().ok());
1275    let weight_nice = fs::read_to_string(dir.join("cpu.weight.nice"))
1276        .ok()
1277        .and_then(|s| s.trim().parse::<i32>().ok());
1278
1279    let memory_current = fs::read_to_string(dir.join("memory.current"))
1280        .ok()
1281        .and_then(|s| s.trim().parse::<u64>().ok())
1282        .unwrap_or(0);
1283    let memory_max = fs::read_to_string(dir.join("memory.max"))
1284        .ok()
1285        .as_deref()
1286        .and_then(parse_max_or_u64);
1287    let memory_high = fs::read_to_string(dir.join("memory.high"))
1288        .ok()
1289        .as_deref()
1290        .and_then(parse_max_or_u64);
1291    let memory_low = fs::read_to_string(dir.join("memory.low"))
1292        .ok()
1293        .as_deref()
1294        .and_then(parse_floor_value);
1295    let memory_min = fs::read_to_string(dir.join("memory.min"))
1296        .ok()
1297        .as_deref()
1298        .and_then(parse_floor_value);
1299    let memory_stat = fs::read_to_string(dir.join("memory.stat"))
1300        .ok()
1301        .as_deref()
1302        .map(parse_kv_counters)
1303        .unwrap_or_default();
1304    let memory_events = fs::read_to_string(dir.join("memory.events"))
1305        .ok()
1306        .as_deref()
1307        .map(parse_kv_counters)
1308        .unwrap_or_default();
1309
1310    let pids_current = fs::read_to_string(dir.join("pids.current"))
1311        .ok()
1312        .and_then(|s| s.trim().parse::<u64>().ok());
1313    let pids_max = fs::read_to_string(dir.join("pids.max"))
1314        .ok()
1315        .as_deref()
1316        .and_then(parse_max_or_u64);
1317
1318    let psi = read_cgroup_psi_at(cgroup_root, path);
1319
1320    CgroupStats {
1321        cpu: CgroupCpuStats {
1322            usage_usec: usage.unwrap_or(0),
1323            nr_throttled: throttled.unwrap_or(0),
1324            throttled_usec: throttled_usec.unwrap_or(0),
1325            max_quota_us,
1326            max_period_us,
1327            weight,
1328            weight_nice,
1329        },
1330        memory: CgroupMemoryStats {
1331            current: memory_current,
1332            max: memory_max,
1333            high: memory_high,
1334            low: memory_low,
1335            min: memory_min,
1336            stat: memory_stat,
1337            events: memory_events,
1338        },
1339        pids: CgroupPidsStats {
1340            current: pids_current,
1341            max: pids_max,
1342        },
1343        psi,
1344    }
1345}