ktstr/
cpu_util.rs

1//! CPU-affinity utilities shared across the crate.
2//!
3//! Two helpers for reading and parsing per-task CPU affinity:
4//!
5//! - [`parse_cpu_list`] decodes the kernel cpulist string format
6//!   (`"0-3,5,7-9"`) emitted by `/proc/<pid>/status:Cpus_allowed_list`
7//!   and `/sys/devices/system/cpu/online`.
8//! - [`read_affinity`] calls `sched_getaffinity(2)` with a
9//!   dynamically-sized buffer so `CONFIG_NR_CPUS > 1024` hosts are
10//!   handled correctly (libc's fixed `cpu_set_t` is only 1024 bits).
11//!
12//! Both produce sorted-deduped `Vec<u32>` of CPU ids and route
13//! garbled / over-cap input to `None`. Used by the per-thread
14//! profiler (ctprof) AND the VM topology planner
15//! (vmm::host_topology) — the function shape is generic enough that
16//! either subsystem could have owned it; keeping the impls here so
17//! neither has to depend on the other for a CPU-list helper.
18//!
19//! # Why this is NOT [`crate::topology::parse_cpu_list`]
20//!
21//! [`crate::topology`] carries its own `parse_cpu_list` (returns
22//! `Result<Vec<usize>>`) and `parse_cpu_list_lenient` (returns
23//! `Vec<usize>`, never fails). The split is deliberate, not a
24//! duplicate to consolidate:
25//!
26//! - **Threat model.** This module's parser ingests `/proc/<tid>/status`
27//!   data captured from arbitrary tasks on the host. A hostile or
28//!   corrupt `Cpus_allowed_list:` value like `0-4294967295` would
29//!   allocate 16 GiB without the `MAX_CPU_RANGE_EXPANSION` cap.
30//!   The topology parser ingests operator-supplied VM config —
31//!   no untrusted-input concerns, no cap needed.
32//! - **Return shape.** `Option<Vec<u32>>` here vs
33//!   `Result<Vec<usize>>` / `Vec<usize>` in topology. The capture
34//!   path needs to distinguish "no data" (None) from "data but
35//!   garbled" (also None for now, with an explicit comment); the
36//!   topology path needs `anyhow::Error` for upstream `?`
37//!   propagation and `Vec<usize>` to interop with sysfs APIs that
38//!   speak `usize`.
39//! - **Dedup semantics.** This module dedups duplicates produced
40//!   by overlapping ranges (`0-2,1` → `[0,1,2]`); the topology
41//!   parser preserves duplicates so callers detecting operator
42//!   config errors (e.g. accidentally listing the same CPU
43//!   twice) can surface them.
44//!
45//! Unifying the two behind a generic helper would require either
46//! collapsing one set of invariants into the other or carrying
47//! both behaviors through a config struct — neither produces a
48//! cleaner end result than the current cohabitation.
49
50use libc;
51
52/// Parse a cpulist string of the form `"0-3,5,7-9"` into a
53/// sorted deduped vec of CPU ids. `None` on empty input or any
54/// malformed token (partial results are rejected so the caller
55/// can distinguish "no data" from "data but garbled").
56///
57/// # Range expansion cap
58///
59/// A single `lo-hi` token that would expand to more than
60/// 65 536 CPUs is rejected as malformed. Without this gate a
61/// hostile or corrupted `Cpus_allowed_list:` value like
62/// `0-4294967295` would allocate 16 GiB for the expansion vec
63/// and crash the capture (or OOM the process). The cap is far
64/// above any realistic `CONFIG_NR_CPUS` (current Linux defaults
65/// top out at a few thousand; even `NR_CPUS=8192` builds stay
66/// inside this bound), so legitimate input is never rejected.
67pub fn parse_cpu_list(s: &str) -> Option<Vec<u32>> {
68    /// Upper bound on the number of CPUs a single `lo-hi` token
69    /// can expand to. 64 Ki — orders of magnitude above any
70    /// in-production `NR_CPUS` — leaves headroom for future
71    /// large-NUMA hosts while capping the worst-case allocation
72    /// at 256 KiB (64 Ki × u32 = 256 KiB).
73    const MAX_CPU_RANGE_EXPANSION: u64 = 65_536;
74
75    let s = s.trim();
76    if s.is_empty() {
77        return None;
78    }
79    let mut out: Vec<u32> = Vec::new();
80    for token in s.split(',') {
81        let token = token.trim();
82        if token.is_empty() {
83            continue;
84        }
85        if let Some((lo, hi)) = token.split_once('-') {
86            let lo: u32 = lo.parse().ok()?;
87            let hi: u32 = hi.parse().ok()?;
88            if hi < lo {
89                return None;
90            }
91            // Guard against hostile / corrupt range expansions.
92            // Use u64 arithmetic so the `hi - lo + 1` compute
93            // cannot overflow even at u32::MAX. Reject rather
94            // than clamp so the caller's "no data vs data but
95            // garbled" distinction stays intact.
96            let span = (hi as u64) - (lo as u64) + 1;
97            if span > MAX_CPU_RANGE_EXPANSION {
98                return None;
99            }
100            for c in lo..=hi {
101                out.push(c);
102            }
103        } else {
104            out.push(token.parse::<u32>().ok()?);
105        }
106    }
107    out.sort_unstable();
108    out.dedup();
109    Some(out)
110}
111
112/// Read the effective CPU affinity for a task via the
113/// `sched_getaffinity(2)` syscall. The kernel gates
114/// `sched_getaffinity` on `security_task_getscheduler(p)` only —
115/// under the default DAC config this is unrestricted (any task may
116/// read any other task's affinity); an active LSM (SELinux/Yama)
117/// may return EPERM. Returns sorted CPU ids.
118/// `None` on syscall failure (EPERM, ESRCH) or when the kernel's
119/// mask exceeds [`AFFINITY_MAX_BITS`] (hosts beyond 262144 CPUs).
120///
121/// # Dynamic buffer sizing
122///
123/// The kernel's `SYSCALL_DEFINE3(sched_getaffinity)`
124/// (`kernel/sched/syscalls.c`) rejects a caller buffer shorter
125/// than `nr_cpu_ids / BITS_PER_BYTE` with `EINVAL`. The x86_64
126/// `CONFIG_NR_CPUS` maximum is 8192 (`NR_CPUS_RANGE_END` with
127/// `CPUMASK_OFFSTACK`; without it the max is 512); other
128/// architectures may allow higher (large NUMA / partitioning
129/// hardware). libc's fixed [`libc::cpu_set_t`] is only 1024 bits
130/// wide, so calling `sched_getaffinity` with
131/// `size_of::<cpu_set_t>()` against a `CONFIG_NR_CPUS > 1024`
132/// kernel fails EINVAL even when the caller has legitimate
133/// access.
134///
135/// This helper avoids the cap by allocating a dynamically-sized
136/// `Vec<c_ulong>` (an array of kernel `unsigned long` — the
137/// wire format the syscall expects, aligned and byte-length a
138/// multiple of `sizeof(unsigned long)` per the kernel's second
139/// validation). On EINVAL the buffer doubles and the call
140/// retries, capped at [`AFFINITY_MAX_BITS`] = 262144 (32 KiB of
141/// mask data — covers every real-world `CONFIG_NR_CPUS` setting
142/// and bounds the worst-case allocation).
143///
144/// # Error-class handling
145///
146/// - `EINVAL` → buffer too small. Double and retry until the
147///   ceiling is reached, then surface None.
148/// - `EPERM` / `ESRCH` → real access / process-identity failures.
149///   Return None so the caller falls back to the procfs
150///   `Cpus_allowed_list:` path. That field is emitted in
151///   `/proc/<tid>/status` and is governed by procfs DAC (open /
152///   directory-traversal permission), not the syscall's
153///   `security_task_getscheduler` LSM hook, so it can succeed
154///   where an active LSM denied the syscall.
155/// - Any other error → return None. The procfs fallback will
156///   produce the correct value or its own None.
157///
158/// Without this split, the previous implementation collapsed
159/// every error to None indistinguishably — EINVAL on a
160/// \>1024-CPU host was treated the same as EPERM, and every
161/// caller had to rely on the procfs fallback for correctness,
162/// making the syscall path effectively useless on the very
163/// hosts where affinity data matters most (1000-plus-CPU NUMA
164/// boxes).
165pub fn read_affinity(tid: i32) -> Option<Vec<u32>> {
166    let mut bits = AFFINITY_INITIAL_BITS;
167    loop {
168        let mut buffer = affinity_zeroed_buffer(bits);
169        let bytes = std::mem::size_of_val(buffer.as_slice());
170        // SAFETY: `buffer.as_mut_ptr()` produces a live pointer
171        // valid for `bytes` writes; the kernel writes at most
172        // `min(bytes, cpumask_size)` and returns the actual byte
173        // count. `bits` is always a multiple of
174        // `c_ulong::BITS`, so `bytes` satisfies the kernel's
175        // alignment validation (`len & (sizeof(unsigned long)-1)
176        // == 0`).
177        let ret = unsafe {
178            libc::syscall(
179                libc::SYS_sched_getaffinity,
180                tid as libc::pid_t,
181                bytes,
182                buffer.as_mut_ptr(),
183            )
184        };
185        if ret >= 0 {
186            // ret carries the actual byte count the kernel
187            // wrote. Bits beyond `ret * 8` were not touched and
188            // stay at the zero-init value above — safe to
189            // iterate the full buffer, but tightening the bound
190            // avoids wasted work on small masks inside a large
191            // buffer.
192            let written_bytes = ret as usize;
193            return extract_cpus_from_mask(&buffer, written_bytes);
194        }
195        // Error path: classify via errno.
196        let errno = std::io::Error::last_os_error().raw_os_error();
197        // Only EINVAL warrants a retry — it signals "buffer too
198        // small" under the kernel's
199        // `(len * BITS_PER_BYTE) < nr_cpu_ids` check. Every other
200        // error (EPERM permission denied, ESRCH process gone,
201        // EFAULT bad pointer, etc.) is terminal.
202        if errno != Some(libc::EINVAL) {
203            return None;
204        }
205        let Some(next) = affinity_next_bits(bits) else {
206            // Ceiling reached without success — the host claims
207            // more CPUs than the helper is willing to allocate
208            // for. Surface None so the caller falls back to the
209            // procfs string form, which has no bit-count cap.
210            return None;
211        };
212        bits = next;
213    }
214}
215
216/// Initial number of CPU bits the affinity buffer starts at.
217/// 8192 is the x86_64 `CONFIG_NR_CPUS` ceiling (`NR_CPUS_RANGE_END`
218/// with `CPUMASK_OFFSTACK`; also the `MAXSMP` default), so no
219/// x86_64 host exceeds it and the overwhelming majority resolve
220/// on the first syscall.
221pub const AFFINITY_INITIAL_BITS: usize = 8192;
222
223/// Maximum number of CPU bits [`read_affinity`] is willing to
224/// allocate for. 262144 bits = 32 KiB of mask data, well above
225/// the largest in-production `CONFIG_NR_CPUS` this project
226/// targets. Capping bounds the worst-case allocation and
227/// bounds the retry loop's iteration count
228/// (`log2(AFFINITY_MAX_BITS / AFFINITY_INITIAL_BITS)` = 5
229/// doublings).
230pub const AFFINITY_MAX_BITS: usize = 262144;
231
232/// Given the current buffer size in bits, return the size for
233/// the next retry attempt — double the current size, rejecting
234/// any step that would exceed [`AFFINITY_MAX_BITS`]. Returns
235/// `None` when the ceiling has been reached and no further
236/// retry is allowed.
237///
238/// Extracted from [`read_affinity`] so the loop-termination
239/// policy is unit-testable without syscall dispatch.
240pub(crate) fn affinity_next_bits(current_bits: usize) -> Option<usize> {
241    let doubled = current_bits.checked_mul(2)?;
242    if doubled > AFFINITY_MAX_BITS {
243        None
244    } else {
245        Some(doubled)
246    }
247}
248
249/// Allocate a zeroed buffer of `c_ulong` words sized to hold
250/// `bits` CPU-mask bits. The kernel's
251/// `sys_sched_getaffinity` rejects any `len & (sizeof(unsigned
252/// long)-1) != 0`, so the buffer is allocated in whole-word
253/// units.
254///
255/// Extracted so [`read_affinity`]'s reset-on-retry contract is
256/// visible (a fresh zeroed buffer per attempt prevents stale
257/// bits from a truncated earlier read leaking into the current
258/// attempt's iteration).
259fn affinity_zeroed_buffer(bits: usize) -> Vec<libc::c_ulong> {
260    let word_bits = libc::c_ulong::BITS as usize;
261    let words = bits.div_ceil(word_bits);
262    vec![0 as libc::c_ulong; words]
263}
264
265/// Walk a successfully-filled cpu-mask buffer and return the
266/// sorted list of set CPU ids, or `None` when no bits were set
267/// (the kernel writes a mask with at least one bit for any
268/// task that was dispatchable at all; an all-zero mask would
269/// imply the task has been taken off every CPU, which the
270/// kernel does not expose as a valid affinity — surface None
271/// rather than `Some(vec![])` so downstream callers can tell
272/// "no data" from "legitimately empty mask").
273///
274/// `written_bytes` is the byte count the syscall reported; we
275/// iterate only that range so a small mask inside a large
276/// buffer does not scan past what the kernel actually wrote.
277fn extract_cpus_from_mask(buffer: &[libc::c_ulong], written_bytes: usize) -> Option<Vec<u32>> {
278    let word_bytes = std::mem::size_of::<libc::c_ulong>();
279    let word_bits = libc::c_ulong::BITS as usize;
280    let written_words = written_bytes / word_bytes;
281    let mut cpus: Vec<u32> = Vec::new();
282    for (word_idx, &word) in buffer.iter().take(written_words).enumerate() {
283        if word == 0 {
284            continue;
285        }
286        for bit in 0..word_bits {
287            if word & (1 as libc::c_ulong) << bit != 0 {
288                let cpu = word_idx * word_bits + bit;
289                cpus.push(cpu as u32);
290            }
291        }
292    }
293    if cpus.is_empty() { None } else { Some(cpus) }
294}
295
296#[cfg(test)]
297mod tests {
298    use super::*;
299
300    #[test]
301    fn parse_cpu_list_accepts_ranges_singletons_and_mixtures() {
302        assert_eq!(parse_cpu_list("0-3").unwrap(), vec![0, 1, 2, 3]);
303        assert_eq!(parse_cpu_list("5").unwrap(), vec![5]);
304        assert_eq!(parse_cpu_list("0,2,4").unwrap(), vec![0, 2, 4]);
305        assert_eq!(parse_cpu_list("0-2,4,6-7").unwrap(), vec![0, 1, 2, 4, 6, 7]);
306    }
307
308    #[test]
309    fn parse_cpu_list_rejects_malformed_input() {
310        assert!(parse_cpu_list("").is_none());
311        assert!(parse_cpu_list("5-3").is_none());
312        assert!(parse_cpu_list("abc").is_none());
313        assert!(parse_cpu_list("0-").is_none());
314        assert!(parse_cpu_list("-3").is_none());
315    }
316
317    #[test]
318    fn parse_cpu_list_dedups_and_sorts() {
319        assert_eq!(parse_cpu_list("3,0-2,1,2").unwrap(), vec![0, 1, 2, 3]);
320    }
321
322    /// A range whose expansion would exceed 64 Ki CPUs is
323    /// rejected as malformed rather than allocating
324    /// gigabytes. Without the `span > MAX_CPU_RANGE_EXPANSION`
325    /// gate, a hostile or corrupt `Cpus_allowed_list:` value
326    /// like `0-4294967295` would try to push 4 billion u32s
327    /// into a Vec and either OOM the process or crash the
328    /// capture. The cap sits orders of magnitude above any
329    /// realistic `CONFIG_NR_CPUS` so legitimate inputs are
330    /// never rejected.
331    #[test]
332    fn parse_cpu_list_rejects_huge_range() {
333        // Malicious u32::MAX range — cap bites.
334        assert_eq!(parse_cpu_list("0-4294967295"), None);
335        // Just above the 64 Ki cap — still rejected.
336        assert_eq!(parse_cpu_list("0-65536"), None);
337        // At the cap — accepted (65_536 elements, the inclusive
338        // `lo..=hi` boundary: 0 through 65_535).
339        let at_cap = parse_cpu_list("0-65535").unwrap();
340        assert_eq!(at_cap.len(), 65_536);
341        // A realistic large-CPU range (e.g. 8192-way host) is
342        // well under the cap and passes.
343        let realistic = parse_cpu_list("0-8191").unwrap();
344        assert_eq!(realistic.len(), 8192);
345    }
346
347    /// parse_cpu_list on a single-CPU range (`"5-5"`) must return
348    /// a 1-element vec. `lo == hi` is the boundary of the inclusive
349    /// range expansion — a regression that skipped the `lo == hi`
350    /// case (e.g. `lo < hi` instead of `lo <= hi` in the loop)
351    /// would drop the single element.
352    #[test]
353    fn parse_cpu_list_single_element_range_lo_equals_hi() {
354        assert_eq!(parse_cpu_list("5-5").unwrap(), vec![5]);
355        // Also pin at the cap boundary and bottom edge.
356        assert_eq!(parse_cpu_list("0-0").unwrap(), vec![0]);
357    }
358
359    /// parse_cpu_list with a trailing comma (`"0,1,"`) must succeed
360    /// and drop the empty token — the tokenizer has a dedicated
361    /// `if token.is_empty() { continue }` arm precisely for this
362    /// case. A user-pasted cpulist sometimes carries a stray comma
363    /// from copy+paste; rejecting it would be a usability
364    /// regression.
365    #[test]
366    fn parse_cpu_list_trailing_comma_accepted() {
367        assert_eq!(parse_cpu_list("0,1,").unwrap(), vec![0, 1]);
368        // Also the leading-comma case — same codepath.
369        assert_eq!(parse_cpu_list(",0,1").unwrap(), vec![0, 1]);
370    }
371
372    /// `affinity_next_bits` doubles the buffer until the
373    /// [`AFFINITY_MAX_BITS`] ceiling bites, then returns `None`
374    /// to signal "give up". Pins the exact sequence 8192 →
375    /// 16384 → 32768 → 65536 → 131072 → 262144 → None so a
376    /// regression that replaced `checked_mul(2)` with `+= step`
377    /// (or otherwise changed the growth curve) surfaces here.
378    #[test]
379    fn affinity_next_bits_doubles_until_ceiling() {
380        assert_eq!(AFFINITY_INITIAL_BITS, 8192);
381        assert_eq!(AFFINITY_MAX_BITS, 262144);
382        // Full doubling chain from the initial size to the cap.
383        let mut cur = AFFINITY_INITIAL_BITS;
384        let expected = [16384usize, 32768, 65536, 131072, 262144];
385        for &want in &expected {
386            let next = affinity_next_bits(cur).expect("doubling must succeed below ceiling");
387            assert_eq!(next, want, "expected {want}, got {next}");
388            cur = next;
389        }
390        // At the cap, the next step would be 524288 > 262144 — return None.
391        assert_eq!(
392            affinity_next_bits(AFFINITY_MAX_BITS),
393            None,
394            "at the ceiling, no further retry must be allowed",
395        );
396    }
397
398    /// A single-set-bit mask in the first word must be extracted
399    /// to exactly that CPU id. Pins the word_idx*word_bits +
400    /// bit offset arithmetic against off-by-one drift.
401    #[test]
402    fn extract_cpus_from_mask_single_bit_in_first_word() {
403        let mut buf = vec![0 as libc::c_ulong; 4];
404        // Set CPU 5 in word 0.
405        buf[0] = (1 as libc::c_ulong) << 5;
406        let bytes = std::mem::size_of_val(buf.as_slice());
407        let cpus = extract_cpus_from_mask(&buf, bytes).expect("non-empty mask");
408        assert_eq!(cpus, vec![5]);
409    }
410
411    /// A bit set in a NON-first word must be offset by
412    /// word_bits * word_idx. Guards against a regression that
413    /// dropped the `word_idx * word_bits` term and reported the
414    /// bit position within the word instead of the absolute CPU
415    /// id.
416    #[test]
417    fn extract_cpus_from_mask_offset_bit_in_later_word() {
418        let word_bits = libc::c_ulong::BITS as usize;
419        let mut buf = vec![0 as libc::c_ulong; 4];
420        // Set CPU (2 * word_bits + 3) in word 2, bit 3.
421        buf[2] = (1 as libc::c_ulong) << 3;
422        let bytes = std::mem::size_of_val(buf.as_slice());
423        let cpus = extract_cpus_from_mask(&buf, bytes).expect("non-empty mask");
424        let expected = (2 * word_bits + 3) as u32;
425        assert_eq!(cpus, vec![expected]);
426    }
427
428    /// `written_bytes` tighter than the buffer size must stop
429    /// iteration at that byte count — bits beyond it belong to
430    /// caller-zeroed padding and a kernel that returned a
431    /// smaller mask than our buffer doesn't promise their shape.
432    /// Pins that a stale bit planted past `written_bytes` is
433    /// NOT harvested.
434    #[test]
435    fn extract_cpus_from_mask_respects_written_bytes() {
436        let mut buf = vec![0 as libc::c_ulong; 4];
437        // Plant CPU bits in word 0 AND word 3; tell the
438        // extractor only word 0 was written by the kernel.
439        buf[0] = (1 as libc::c_ulong) << 7; // CPU 7
440        buf[3] = 1 as libc::c_ulong; // would-be CPU 3*word_bits
441        let one_word_bytes = std::mem::size_of::<libc::c_ulong>();
442        let cpus = extract_cpus_from_mask(&buf, one_word_bytes).expect("non-empty mask");
443        // Only the bit in the first (kernel-written) word comes back.
444        assert_eq!(cpus, vec![7]);
445    }
446
447    /// Empty mask (every word zero) → `None`. Pins the
448    /// "Some(vec![]) is NOT a valid return" invariant — any
449    /// caller that dispatches on `.is_some()` must be able to
450    /// trust that a Some carries at least one CPU.
451    #[test]
452    fn extract_cpus_from_mask_empty_buffer_returns_none() {
453        let buf = vec![0 as libc::c_ulong; 4];
454        let bytes = std::mem::size_of_val(buf.as_slice());
455        assert_eq!(extract_cpus_from_mask(&buf, bytes), None);
456    }
457
458    /// `affinity_zeroed_buffer` rounds UP to whole words so the
459    /// byte length satisfies the kernel's
460    /// `len & (sizeof(unsigned long)-1) == 0` alignment check.
461    /// An off-by-one in the `div_ceil` would produce a
462    /// non-multiple-of-word-size buffer and the syscall would
463    /// reject with EINVAL forever (retry loop would churn but
464    /// never succeed).
465    #[test]
466    fn affinity_zeroed_buffer_rounds_up_and_is_zeroed() {
467        let word_bits = libc::c_ulong::BITS as usize;
468        // Ask for exactly one word — get exactly one word.
469        let exact = affinity_zeroed_buffer(word_bits);
470        assert_eq!(exact.len(), 1);
471        // Ask for one bit more than a word — get two words.
472        let over = affinity_zeroed_buffer(word_bits + 1);
473        assert_eq!(over.len(), 2);
474        // Initial bits → 8192 / word_bits words.
475        let init = affinity_zeroed_buffer(AFFINITY_INITIAL_BITS);
476        assert_eq!(init.len(), AFFINITY_INITIAL_BITS / word_bits);
477        // Every slot must be zeroed.
478        assert!(init.iter().all(|&w| w == 0));
479    }
480
481    /// Smoke test against the real syscall for the current
482    /// process — `read_affinity(getpid())` must succeed and
483    /// return at least one CPU. The test process always has an
484    /// affinity set (the kernel never runs a task off all
485    /// CPUs), so None here signals a regression in the retry
486    /// loop / errno classification.
487    ///
488    /// Distinct from the per-thread capture-path test in
489    /// ctprof — this test focuses on `read_affinity` in
490    /// isolation so a failure localizes to the fn's own logic
491    /// rather than a capture-path wiring issue.
492    #[test]
493    fn read_affinity_for_self_returns_at_least_one_cpu() {
494        let pid = std::process::id() as i32;
495        let cpus = read_affinity(pid).expect("own affinity must resolve");
496        assert!(
497            !cpus.is_empty(),
498            "self affinity must carry at least one CPU"
499        );
500        // CPUs come out sorted.
501        let mut sorted = cpus.clone();
502        sorted.sort_unstable();
503        assert_eq!(cpus, sorted, "cpus must be returned sorted ascending");
504    }
505}