ktstr/vmm/host_topology/mod.rs
1//! Host CPU topology discovery for performance_mode.
2//!
3//! Wraps [`TestTopology`](crate::topology::TestTopology) for LLC-aware
4//! vCPU pinning and host resource validation.
5
6use anyhow::{Context, Result};
7
8// Advisory flock primitives live in `crate::flock` so both LLC +
9// per-CPU coordination here and per-cache-entry coordination in
10// `crate::cache` share one `try_flock` implementation (with a single
11// `O_CLOEXEC` source of truth) plus one `HolderInfo` /proc/locks
12// parser. Re-importing the names keeps existing in-module call sites
13// (production + `super::*` tests) compiling unchanged.
14use crate::flock::{FlockMode, try_flock};
15
/// Transient resource shortage: the LLC slots or CPUs a run needs are
/// currently unavailable.
///
/// Recover it from an `anyhow::Error` with
/// `anyhow::Error::downcast_ref::<ResourceContention>()` to tell it apart
/// from fatal errors.
#[derive(Debug)]
pub struct ResourceContention {
    /// Human-readable explanation; `Display` prints exactly this text.
    pub reason: String,
}

impl std::fmt::Display for ResourceContention {
    /// Writes `reason` verbatim, with no prefix or decoration.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.reason)
    }
}

impl std::error::Error for ResourceContention {}
31
/// This host cannot realize the requested topology, and retrying does not
/// change that.
///
/// Sources of this error:
///
/// - The x86_64 VM-creation caps (guest RAM top above the host MAXPHYADDR,
///   vCPU count above KVM_CAP_MAX_VCPUS, or max APIC id at/above
///   KVM_CAP_MAX_VCPU_ID). These fire for ANY VM of this shape, perf-mode
///   or not, so the test cannot run here and is surfaced as a SKIP.
/// - The `performance_mode` planner (`compute_pinning`) when the host has
///   too few physical CPUs / LLC groups. That perf-mode caller RE-MAPS the
///   error to [`PerfModeUnavailable`] (a host-insufficiency: skip by
///   default, fail under `KTSTR_NO_SKIP_MODE`).
/// - `resolve_cpu_budget`, when an author's per-test `cpu_budget` exceeds
///   the allowed-CPU count. This is the author-attribute half of a
///   provenance split (a capability requirement a bigger host satisfies →
///   skip); the operator-knob half is [`CpuBudgetUnsatisfiable`] (a
///   concrete `--cpu-cap` number the host cannot satisfy → hard fail).
///
/// Not to be confused with [`ResourceContention`], which is a transient
/// slot/resource shortage that a retry resolves (→ skip). A too-small host
/// is permanent: the operator must provision different hardware or narrow
/// the topology rather than retry.
///
/// Downcast via `anyhow::Error::downcast_ref::<TopologyInsufficient>()`
/// (chain-aware: the `#[ktstr_test]` dispatch and `skip_on_contention!`
/// walk the full error chain, so an instance wrapped with `.context(...)`
/// is still recognised). The typed error replaced a fragile message
/// string-match (`"need"` + `"LLC"`/`"CPU"`) that would misclassify any
/// unrelated error that happened to contain those words.
#[derive(Debug)]
pub struct TopologyInsufficient {
    /// Human-readable explanation; `Display` prints exactly this text.
    pub reason: String,
}

impl std::fmt::Display for TopologyInsufficient {
    /// Writes `reason` verbatim, with no prefix or decoration.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.reason)
    }
}

impl std::error::Error for TopologyInsufficient {}
68
/// The host cannot honor the `performance_mode` guarantee — an exclusive
/// host LLC for the test's virtual LLC topology plus a service CPU — and
/// retrying does not change that.
///
/// This is a permanent host-insufficiency (e.g. a single-LLC host whose
/// LLC spans every CPU, so LLC + 1 service never fits). It is handled like
/// [`TopologyInsufficient`] / [`ResourceContention`]:
///
/// - SKIP by default. The VM never runs unisolated — it errors at build —
///   so a visible skip informs the operator without reddening CI on a host
///   that can never satisfy perf-mode.
/// - Hard FAIL under `KTSTR_NO_SKIP_MODE`, for runs that demand perf-mode
///   execution.
///
/// The remedy is the same either way: provision a host with a spare
/// LLC/CPU, narrow the topology, or drop `--perf-mode`.
///
/// Downcast via `anyhow::Error::downcast_ref::<PerfModeUnavailable>()`
/// (chain-aware: the dispatch + macro predicates walk the full error
/// chain, so an instance wrapped with `.context(...)` is still
/// recognised).
#[derive(Debug)]
pub struct PerfModeUnavailable {
    /// Human-readable explanation; `Display` prints exactly this text.
    pub reason: String,
}

impl std::fmt::Display for PerfModeUnavailable {
    /// Writes `reason` verbatim, with no prefix or decoration.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.reason)
    }
}

impl std::error::Error for PerfModeUnavailable {}
96
/// An operator-supplied `--cpu-cap N` (or `KTSTR_CPU_CAP`) that the host
/// cannot satisfy because N exceeds the CPUs this process is allowed on.
///
/// A HARD ERROR, not a skip: the operator typed a concrete number that
/// does not exist on this host, i.e. a user-input error.
///
/// This is the OPERATOR-knob half of a provenance split. The other half —
/// an author's per-test `cpu_budget` over the allowance — is a
/// [`TopologyInsufficient`] SKIP (a capability request a bigger host would
/// satisfy), raised in `resolve_cpu_budget`. Contrast also
/// [`ResourceContention`]: a transient shortage of an
/// otherwise-satisfiable budget (→ skip/retry).
///
/// Downcast via `anyhow::Error::downcast_ref::<CpuBudgetUnsatisfiable>()`
/// (chain-aware).
#[derive(Debug)]
pub struct CpuBudgetUnsatisfiable {
    /// Human-readable explanation; `Display` prints exactly this text.
    pub reason: String,
}

impl std::fmt::Display for CpuBudgetUnsatisfiable {
    /// Writes `reason` verbatim, with no prefix or decoration.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.reason)
    }
}

impl std::error::Error for CpuBudgetUnsatisfiable {}
120
/// The requested topology does not fit this VMM's static device layout.
/// The limit is host-INDEPENDENT, so neither a retry nor a different host
/// changes the outcome.
///
/// Concretely: the aarch64 vCPU count exceeds `MAX_VCPUS` (the capacity of
/// the statically sized GICv3 redistributor MMIO window). With more vCPUs
/// the redistributor region overruns the device MMIO window and shadows
/// serial/virtio.
///
/// A HARD ERROR, and the deliberate counterpart of
/// [`TopologyInsufficient`]:
///
/// - `TopologyInsufficient` is host-DEPENDENT — the VM cannot boot on
///   *this* host, but a bigger host could → skip.
/// - `TopologyUnrepresentable` is a fixed VMM-layout limit that no aarch64
///   host can satisfy under this VMM, so it is a test misconfiguration:
///   the author must narrow the topology, not provision different
///   hardware.
///
/// Routing: goes to `EXIT_FAIL` through a DEDICATED hard-fail arm (the
/// `is_topology_unrepresentable` predicate) in both `result_to_exit_code`
/// and the `#[ktstr_test]` macro body. That arm sits ABOVE the
/// `expect_err` inversion and the skip arms, mirroring
/// `CpuBudgetUnsatisfiable` (the other dedicated hard-fail). The placement
/// is what makes it fail even in an `expect_err` test (the generic
/// `expect_err` arm would otherwise invert it to a pass) and keeps it out
/// of the `skip_on_contention!` / `is_topology_insufficient` skip paths,
/// so the misconfiguration can never masquerade as the expected failure or
/// be turned into a skip.
///
/// Downcast via `anyhow::Error::downcast_ref::<TopologyUnrepresentable>()`
/// (chain-aware: walks `e.chain()`, so an instance wrapped with
/// `.context(...)` is still recognised) to identify it programmatically —
/// e.g. tests asserting the over-`MAX_VCPUS` bail is this hard-fault and
/// not a bare string-matched error.
// Only aarch64 (the GICv3-layout over-MAX_VCPUS bail in aarch64::kvm) and
// the cross-arch routing tests construct this type; a non-aarch64 lib-only
// build has no construction site. The dead-code lint therefore stays live
// on aarch64 (where the bail MUST construct it — a real regression if it
// stops) and is silenced only off-arch.
#[cfg_attr(not(target_arch = "aarch64"), allow(dead_code))]
#[derive(Debug)]
pub struct TopologyUnrepresentable {
    /// Human-readable explanation; `Display` prints exactly this text.
    pub reason: String,
}

impl std::fmt::Display for TopologyUnrepresentable {
    /// Writes `reason` verbatim, with no prefix or decoration.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.reason)
    }
}

impl std::error::Error for TopologyUnrepresentable {}
167
/// A physical LLC (last-level cache) group on the host: the set of CPUs
/// that share one LLC.
///
/// The struct stores no identifier of its own. Code in this module refers
/// to a group by its position in [`HostTopology::llc_groups`] (the
/// "LLC index").
#[derive(Debug, Clone)]
pub struct LlcGroup {
    /// Host CPU ids sharing this LLC.
    pub cpus: Vec<usize>,
}
174
/// Host CPU topology: LLC groups, NUMA nodes, and online CPU set.
///
/// Built by [`HostTopology::from_sysfs`], or handed out as a clone of the
/// process-wide cached copy by [`HostTopology::cached`].
#[derive(Debug, Clone)]
pub struct HostTopology {
    /// LLC groups indexed by their order of discovery. The position in
    /// this vector is the "LLC index" used by the placement queries and by
    /// [`PinningPlan::llc_indices`].
    pub llc_groups: Vec<LlcGroup>,
    /// All online CPUs.
    pub online_cpus: Vec<usize>,
    /// NUMA node ID for each online CPU, keyed by CPU ID.
    /// CPUs not in the map default to node 0.
    pub cpu_to_node: std::collections::HashMap<usize, usize>,
    /// LLC indices grouped by their NUMA node. Memoized at construction
    /// time from `llc_groups + cpu_to_node` so repeated NUMA-aware
    /// placement queries (perf-mode rotation, `--cpu-cap` consolidation
    /// PLAN) don't re-walk every LLC's CPU list on every call. Access
    /// via [`HostTopology::host_llcs_by_numa_node`]. `BTreeMap` (not
    /// `HashMap`) for deterministic iteration order — two ktstr
    /// invocations on the same host MUST produce identical LLC
    /// selections so their ACQUIRE phases converge on the same indices.
    pub(crate) host_node_llcs: std::collections::BTreeMap<usize, Vec<usize>>,
}
195
/// Pinning plan: maps each vCPU index to a host CPU, plus a dedicated
/// CPU for service threads (monitor, watchdog).
///
/// Produced by [`HostTopology::compute_pinning`].
#[derive(Debug)]
pub struct PinningPlan {
    /// `(vcpu_index, host_cpu)` pairs, one per vCPU.
    pub assignments: Vec<(u32, usize)>,
    /// Dedicated host CPU for monitor/watchdog threads. Set when
    /// `reserve_service_cpu` is true in `compute_pinning`; `None`
    /// otherwise.
    pub service_cpu: Option<usize>,
    /// Host LLC group indices used by this plan, sorted ascending and
    /// de-duplicated.
    pub llc_indices: Vec<usize>,
    /// Held flock fds for resource reservation. Dropped when the plan
    /// (and the KtstrVm holding it) is dropped, releasing all locks.
    /// `compute_pinning` itself leaves this empty.
    #[allow(dead_code)] // RAII: flock fds released on Drop, not read after construction.
    pub(crate) locks: Vec<std::os::fd::OwnedFd>,
}
212
/// Process-wide cache backing [`HostTopology::cached`].
///
/// Written only after a successful sysfs probe — a failed probe leaves
/// the cell empty, so the next call retries instead of poisoning the
/// cache. Once set, the stored value is never replaced (`OnceLock`
/// accepts a single write).
static CACHED_HOST_TOPOLOGY: std::sync::OnceLock<HostTopology> = std::sync::OnceLock::new();
217
218impl HostTopology {
219 /// Read host topology from sysfs via [`TestTopology::from_system()`](crate::topology::TestTopology::from_system).
220 pub fn from_sysfs() -> Result<Self> {
221 let topo = crate::topology::TestTopology::from_system()
222 .context("read host topology from sysfs")?;
223 let online_cpus = topo.all_cpus().to_vec();
224 let llc_groups: Vec<LlcGroup> = topo
225 .llcs()
226 .iter()
227 .map(|llc| LlcGroup {
228 cpus: llc.cpus().to_vec(),
229 })
230 .collect();
231 let cpu_to_node: std::collections::HashMap<usize, usize> = topo
232 .llcs()
233 .iter()
234 .flat_map(|llc| llc.cpus().iter().map(|&cpu| (cpu, llc.numa_node())))
235 .collect();
236 let host_node_llcs = Self::compute_host_node_llcs(&llc_groups, &cpu_to_node);
237 Ok(Self {
238 llc_groups,
239 online_cpus,
240 cpu_to_node,
241 host_node_llcs,
242 })
243 }
244
245 /// Return a cached host topology, populating the cache on first
246 /// successful call. Failed reads retry on the next call — the
247 /// cache only stores success so a transient sysfs issue at
248 /// process start doesn't poison every subsequent build().
249 pub fn cached() -> Result<Self> {
250 if let Some(topo) = CACHED_HOST_TOPOLOGY.get() {
251 return Ok(topo.clone());
252 }
253 let topo = Self::from_sysfs()?;
254 let _ = CACHED_HOST_TOPOLOGY.set(topo.clone());
255 Ok(topo)
256 }
257
258 /// Build a synthetic `HostTopology` from `(cpu_list, node_id)`
259 /// pairs for tests. One pair per LLC group; within a pair the
260 /// `cpu_list` becomes the group's CPUs and the `node_id` is the
261 /// NUMA node every CPU in that group is assigned to.
262 /// `online_cpus` is the flattened concatenation of every group's
263 /// CPUs in input order; `cpu_to_node` is built by broadcasting
264 /// each group's node over its CPUs; `host_node_llcs` goes through
265 /// the same [`compute_host_node_llcs`] path production uses, so
266 /// tests never diverge from the sysfs-derived memoization.
267 ///
268 /// Intended for test fixtures that want a deterministic in-memory
269 /// topology without stubbing `/sys/devices/system/cpu/*`.
270 /// Previously this logic was duplicated across three helper
271 /// functions (`synthetic_topo`, `synthetic_topo_numa`,
272 /// `synth_host_topo`) — consolidated here so the
273 /// `HostTopology` invariant is maintained in one place. The
274 /// `#[cfg(test)]` gate keeps the symbol out of release builds.
275 #[cfg(test)]
276 pub(crate) fn new_for_tests(groups: &[(Vec<usize>, usize)]) -> Self {
277 let llc_groups: Vec<LlcGroup> = groups
278 .iter()
279 .map(|(cpus, _)| LlcGroup { cpus: cpus.clone() })
280 .collect();
281 let cpu_to_node: std::collections::HashMap<usize, usize> = groups
282 .iter()
283 .flat_map(|(cpus, node)| cpus.iter().map(move |&cpu| (cpu, *node)))
284 .collect();
285 let online_cpus: Vec<usize> = groups
286 .iter()
287 .flat_map(|(cpus, _)| cpus.iter().copied())
288 .collect();
289 let host_node_llcs = HostTopology::compute_host_node_llcs(&llc_groups, &cpu_to_node);
290 HostTopology {
291 llc_groups,
292 online_cpus,
293 cpu_to_node,
294 host_node_llcs,
295 }
296 }
297
298 /// Compute the memoized `host_node_llcs` map from `llc_groups` +
299 /// `cpu_to_node`. Uses the same majority-vote NUMA-assignment rule
300 /// as [`Self::llc_numa_node`], so the memoized map and the one-off query
301 /// method never disagree. Separate fn (not inlined) so
302 /// `from_sysfs` and synthetic-test constructors share one path.
303 fn compute_host_node_llcs(
304 llc_groups: &[LlcGroup],
305 cpu_to_node: &std::collections::HashMap<usize, usize>,
306 ) -> std::collections::BTreeMap<usize, Vec<usize>> {
307 let mut node_llcs: std::collections::BTreeMap<usize, Vec<usize>> =
308 std::collections::BTreeMap::new();
309 for (idx, group) in llc_groups.iter().enumerate() {
310 // Majority-vote NUMA node for this LLC — matches
311 // `llc_numa_node` exactly. We inline the logic here rather
312 // than calling the method because we don't yet have `self`.
313 let mut counts: std::collections::HashMap<usize, usize> =
314 std::collections::HashMap::new();
315 for &cpu in &group.cpus {
316 let node = cpu_to_node.get(&cpu).copied().unwrap_or(0);
317 *counts.entry(node).or_insert(0) += 1;
318 }
319 let node = counts
320 .into_iter()
321 .max_by_key(|&(_, count)| count)
322 .map(|(node, _)| node)
323 .unwrap_or(0);
324 node_llcs.entry(node).or_default().push(idx);
325 }
326 // Within-node LLC ordering: ascending llc_idx. Callers that
327 // walk `host_node_llcs[node]` rely on this for deterministic
328 // output — two ktstr invocations with identical topology see
329 // the same walk order.
330 for llcs in node_llcs.values_mut() {
331 llcs.sort_unstable();
332 }
333 node_llcs
334 }
335
336 /// Maximum cores per LLC group on the host.
337 pub fn max_cores_per_llc(&self) -> usize {
338 self.llc_groups
339 .iter()
340 .map(|g| g.cpus.len())
341 .max()
342 .unwrap_or(0)
343 }
344
    /// Total available host CPUs: the number of entries in `online_cpus`.
    pub fn total_cpus(&self) -> usize {
        self.online_cpus.len()
    }
349
    // ------------------------------------------------------------------
    // Shared NUMA-placement primitives
    // ------------------------------------------------------------------
    //
    // Three small topology queries, in declaration order:
    //
    //   1. `host_llcs_by_numa_node`        — the memoized node → LLC map
    //   2. `numa_nodes_with_capacity`      — eligibility-by-capacity
    //   3. `numa_nodes_sorted_by_distance` — nearest-by-distance ordering
    //
    // Used by the existing perf-mode pinning path
    // (`numa_aware_llc_order`) AND the `--cpu-cap` consolidation
    // PLAN phase. Both callers implement DIFFERENT selection algorithms
    // on top of these queries:
    //
    // - Perf-mode distributes virtual NUMA nodes across host NUMA
    //   nodes with modulo rotation; uses primitive 2
    //   (`numa_nodes_with_capacity`). No distance lookup.
    // - Consolidation seeds from a scored LLC list then greedily
    //   expands within the seed's node, spilling to nearest-by-distance
    //   when needed; uses primitive 3 (`numa_nodes_sorted_by_distance`)
    //   plus `llc_numa_node`.
    //
    // Kept as small orthogonal queries rather than a single mega-selector
    // — the two algorithms genuinely do different things, but they both
    // build on the same three topology lookups.
369
    /// Memoized map of NUMA node → LLC indices on that node. Returned
    /// by reference so callers can iterate without cloning; `BTreeMap`
    /// gives deterministic iteration so two invocations on identical
    /// topologies produce identical walks.
    ///
    /// In-tree callers currently reach the same data via
    /// [`Self::numa_nodes_sorted_by_distance`] and [`Self::numa_nodes_with_capacity`]
    /// — both iterate `host_node_llcs` internally — so this accessor
    /// has no direct consumer today. Kept as a stable handle for
    /// future callers (e.g. a planned `ktstr topo --json` NUMA
    /// section) and downstream tooling that wants the raw map.
    // No in-tree caller yet (see above), hence the lint exemption.
    #[allow(dead_code)]
    pub(crate) fn host_llcs_by_numa_node(&self) -> &std::collections::BTreeMap<usize, Vec<usize>> {
        &self.host_node_llcs
    }
385
386 /// Return every NUMA node that has `>= min_llcs` LLCs, paired with
387 /// that node's LLC-index slice. Callers filter through this when
388 /// their algorithm requires per-node capacity guarantees (perf-mode
389 /// passes `ceil(llcs/numa_nodes)` so any guest node can land on any
390 /// host node; consolidation passes 1 so every node with at least
391 /// one free LLC is a valid spill candidate). Iteration order
392 /// follows the underlying `BTreeMap` — ascending by node id.
393 pub(crate) fn numa_nodes_with_capacity(&self, min_llcs: usize) -> Vec<(usize, &Vec<usize>)> {
394 self.host_node_llcs
395 .iter()
396 .filter(|(_, llcs)| llcs.len() >= min_llcs)
397 .map(|(&node, llcs)| (node, llcs))
398 .collect()
399 }
400
401 /// Return NUMA node ids sorted by distance from `anchor` ascending,
402 /// with unreachable nodes (distance 255 per Linux convention)
403 /// demoted to the end. Caller supplies the distance lookup via
404 /// `distance_fn` so this primitive stays independent of any
405 /// specific distance source — consolidation threads
406 /// `TestTopology::numa_distance` through a closure, while callers
407 /// without a distance matrix can pass
408 /// `|from, to| if from == to { 10 } else { 20 }` for a trivial
409 /// near/far split.
410 ///
411 /// `anchor` is included in the output (distance to self = 10 on
412 /// the Linux convention, sorting first). Nodes without any LLCs
413 /// on this host are skipped — spilling to an empty node has no
414 /// value.
415 pub(crate) fn numa_nodes_sorted_by_distance(
416 &self,
417 anchor: usize,
418 distance_fn: impl Fn(usize, usize) -> u8,
419 ) -> Vec<usize> {
420 let mut nodes: Vec<(usize, u8)> = self
421 .host_node_llcs
422 .keys()
423 .map(|&node| (node, distance_fn(anchor, node)))
424 .collect();
425 // Sort: unreachable (255) last; among reachable, ascending
426 // distance; ties broken by ascending node id via the stable
427 // sort applied over a pre-sorted (BTreeMap-ordered) input.
428 nodes.sort_by(|a, b| {
429 let a_unreachable = a.1 == 255;
430 let b_unreachable = b.1 == 255;
431 match (a_unreachable, b_unreachable) {
432 (true, false) => std::cmp::Ordering::Greater,
433 (false, true) => std::cmp::Ordering::Less,
434 _ => a.1.cmp(&b.1),
435 }
436 });
437 nodes.into_iter().map(|(node, _)| node).collect()
438 }
439
440 /// NUMA node for a host LLC group, determined by majority vote of
441 /// its CPUs' NUMA assignments. Returns 0 when the map is empty
442 /// (single-node systems).
443 ///
444 /// Production callers pre-compute the node-to-LLC mapping once at
445 /// [`HostTopology::from_sysfs`] via
446 /// [`compute_host_node_llcs`](Self::compute_host_node_llcs)
447 /// (memoized in [`host_node_llcs`](Self::host_node_llcs)); use
448 /// [`Self::host_llcs_by_numa_node`](Self::host_llcs_by_numa_node) to
449 /// iterate the pre-built map. This method stays exposed for
450 /// external callers (future `ktstr locks` NUMA column + any
451 /// downstream tooling that needs a single-LLC lookup) and
452 /// synthetic-topology tests that assert per-LLC node assignment.
453 pub fn llc_numa_node(&self, llc_idx: usize) -> usize {
454 let group = &self.llc_groups[llc_idx];
455 let mut counts: std::collections::HashMap<usize, usize> = std::collections::HashMap::new();
456 for &cpu in &group.cpus {
457 let node = self.cpu_to_node.get(&cpu).copied().unwrap_or(0);
458 *counts.entry(node).or_insert(0) += 1;
459 }
460 counts
461 .into_iter()
462 .max_by_key(|&(_, count)| count)
463 .map(|(node, _)| node)
464 .unwrap_or(0)
465 }
466
467 /// Compute a pinning plan that maps virtual LLCs to physical LLC groups.
468 ///
469 /// Each virtual LLC's vCPUs are assigned to cores within a single physical LLC.
470 /// `llc_offset` rotates the starting LLC group so concurrent VMs pin to
471 /// different physical cores. When `reserve_service_cpu` is true, one
472 /// additional host CPU is reserved for service threads (monitor, watchdog).
473 ///
474 /// When `topo.numa_nodes > 1`, virtual LLCs are grouped by guest NUMA
475 /// node and each group is placed on host LLCs within the same physical
476 /// NUMA node. Falls back to sequential placement when the host lacks
477 /// enough NUMA-aligned LLCs.
478 ///
479 /// Returns an error if the host cannot satisfy the topology.
480 pub fn compute_pinning(
481 &self,
482 topo: &super::topology::Topology,
483 reserve_service_cpu: bool,
484 llc_offset: usize,
485 ) -> Result<PinningPlan> {
486 let cores = topo.cores_per_llc;
487 let threads = topo.threads_per_core;
488 let llcs = topo.llcs;
489 let vcpus_per_llc = cores * threads;
490 let total_vcpus = llcs * vcpus_per_llc;
491 let total_needed = total_vcpus as usize + if reserve_service_cpu { 1 } else { 0 };
492
493 if total_needed > self.total_cpus() {
494 return Err(anyhow::Error::new(TopologyInsufficient {
495 reason: format!(
496 "performance_mode: need {} CPUs ({} vCPUs + {} service) \
497 but only {} host CPUs available",
498 total_needed,
499 total_vcpus,
500 if reserve_service_cpu { 1 } else { 0 },
501 self.total_cpus(),
502 ),
503 }));
504 }
505
506 let num_llcs = self.llc_groups.len();
507 if llcs as usize > num_llcs {
508 return Err(anyhow::Error::new(TopologyInsufficient {
509 reason: format!(
510 "performance_mode: need {} LLCs for {} virtual LLCs, \
511 but host has {} LLC groups",
512 llcs, llcs, num_llcs,
513 ),
514 }));
515 }
516
517 // Build the virtual-to-host LLC index mapping. When numa_nodes > 1,
518 // try to place each guest NUMA node's LLCs on host LLCs within
519 // the same physical NUMA node.
520 let llc_order = self.numa_aware_llc_order(topo.numa_nodes, llcs, llc_offset);
521
522 let mut assignments = Vec::with_capacity(total_vcpus as usize);
523 let mut used_cpus = std::collections::HashSet::new();
524
525 for llc in 0..llcs {
526 let llc_idx = llc_order[llc as usize];
527 let group = &self.llc_groups[llc_idx];
528 let available: Vec<usize> = group
529 .cpus
530 .iter()
531 .copied()
532 .filter(|c| !used_cpus.contains(c))
533 .collect();
534
535 if available.len() < vcpus_per_llc as usize {
536 return Err(anyhow::Error::new(TopologyInsufficient {
537 reason: format!(
538 "performance_mode: LLC group {} has {} available CPUs, \
539 need {} for virtual LLC {}",
540 llc_idx,
541 available.len(),
542 vcpus_per_llc,
543 llc,
544 ),
545 }));
546 }
547
548 for vcpu_in_llc in 0..vcpus_per_llc {
549 let vcpu_id = llc * vcpus_per_llc + vcpu_in_llc;
550 let host_cpu = available[vcpu_in_llc as usize];
551 used_cpus.insert(host_cpu);
552 assignments.push((vcpu_id, host_cpu));
553 }
554 }
555
556 let service_cpu = if reserve_service_cpu {
557 let cpu = self
558 .online_cpus
559 .iter()
560 .copied()
561 .find(|c| !used_cpus.contains(c));
562 // Defensive: the total-CPU check above already folds the +1
563 // service CPU into `total_needed`, so a passing host always
564 // has at least one online CPU beyond the assigned vCPUs and
565 // this never fires today. Typed as TopologyInsufficient (not
566 // plain anyhow) so that if a future refactor of that check ever
567 // lets it through, it is handled identically to its three
568 // sibling shortfall checks: the perf-mode caller
569 // (acquire_slot_with_locks) re-maps every compute_pinning
570 // TopologyInsufficient to PerfModeUnavailable (a host-insufficiency
571 // skip, fail under KTSTR_NO_SKIP_MODE), and
572 // the non-perf caller passes reserve_service_cpu=false so this
573 // site is unreachable there.
574 if cpu.is_none() {
575 return Err(anyhow::Error::new(TopologyInsufficient {
576 reason: format!(
577 "performance_mode: no free host CPU for service threads \
578 after assigning {total_vcpus} vCPUs"
579 ),
580 }));
581 }
582 cpu
583 } else {
584 None
585 };
586
587 // Deduplicate LLC indices (multiple virtual LLCs may map to the
588 // same host LLC at different offsets, but that's prevented by the
589 // used_cpus check above — each virtual LLC consumes distinct CPUs).
590 let mut llc_indices = llc_order;
591 llc_indices.sort_unstable();
592 llc_indices.dedup();
593
594 Ok(PinningPlan {
595 assignments,
596 service_cpu,
597 llc_indices,
598 locks: Vec::new(),
599 })
600 }
601
602 /// Build the virtual LLC to host LLC index mapping.
603 ///
604 /// Falls back to sequential offset mapping when any of these hold:
605 /// `numa_nodes == 0` (avoids divide-by-zero), `numa_nodes == 1`
606 /// (no NUMA-awareness needed), `cpu_to_node` is empty (no NUMA
607 /// map available), `llcs < numa_nodes` (base-per-node would be 0
608 /// and leave guest nodes empty), or the host lacks enough
609 /// NUMA-aligned LLCs.
610 ///
611 /// Otherwise, distributes `llcs` across `numa_nodes` guest nodes:
612 /// the first `llcs % numa_nodes` guest nodes receive
613 /// `base + 1 = ceil(llcs / numa_nodes)` LLCs each; the rest
614 /// receive `base = floor(llcs / numa_nodes)` LLCs. This preserves
615 /// the remainder that floor-only division would silently drop
616 /// (e.g. `llcs=5, numa_nodes=2` yields counts 3+2 = 5).
617 /// Eligibility requires each host NUMA node to supply at least
618 /// `ceil(llcs / numa_nodes)` (the max any single guest node will
619 /// claim) — stricter than the prior floor-based check, so the
620 /// "+1" guest nodes always land on a node with capacity.
621 ///
622 /// Implementation composes [`Self::numa_nodes_with_capacity`],
623 /// which iterates the memoized `host_node_llcs` map. The
624 /// `--cpu-cap` consolidation PLAN phase instead composes
625 /// [`Self::numa_nodes_sorted_by_distance`] plus
626 /// [`Self::llc_numa_node`], so the two callers share the memoized
627 /// `host_node_llcs` map rather than the same accessor calls. The
628 /// two callers' SELECTION algorithms also differ: perf-mode does
629 /// modulo rotation of guest onto host nodes; consolidation does
630 /// score-driven greedy expansion.
631 pub(crate) fn numa_aware_llc_order(
632 &self,
633 numa_nodes: u32,
634 llcs: u32,
635 llc_offset: usize,
636 ) -> Vec<usize> {
637 let num_host_llcs = self.llc_groups.len();
638
639 // Sequential fallback used by the degenerate cases below.
640 let sequential_fallback = || -> Vec<usize> {
641 (0..llcs as usize)
642 .map(|i| (i + llc_offset) % num_host_llcs)
643 .collect()
644 };
645
646 // Defensive: zero NUMA nodes would divide-by-zero below. Also
647 // handles the single-node case (no NUMA-awareness needed) and
648 // the "cpu_to_node map unavailable" case.
649 if numa_nodes == 0 || numa_nodes == 1 || self.cpu_to_node.is_empty() {
650 return sequential_fallback();
651 }
652
653 // If the guest has fewer LLCs than NUMA nodes, a per-node base
654 // of 0 would leave some guest nodes empty. Fall back rather
655 // than silently dropping those nodes' LLCs.
656 if llcs < numa_nodes {
657 return sequential_fallback();
658 }
659
660 // Distribute LLCs across guest NUMA nodes. Integer division
661 // alone drops the remainder (e.g. llcs=5, numa_nodes=2 gave
662 // 2 per node = 4 LLCs assigned, 5th dropped). Fix: the first
663 // `remainder` nodes get `base + 1`, the rest get `base`.
664 let base_per_node = (llcs / numa_nodes) as usize;
665 let remainder = (llcs % numa_nodes) as usize;
666 // Ceiling-per-node — the largest count any single guest node
667 // will claim. Host NUMA nodes must supply at least this many
668 // to remain eligible.
669 let max_per_node = base_per_node + if remainder > 0 { 1 } else { 0 };
670
671 // Collect host NUMA nodes that can supply the ceiling (max)
672 // per-node count — so any guest node can land there regardless
673 // of whether it's one of the `remainder` "+1" nodes. Shared
674 // primitive: `numa_nodes_with_capacity` filters the memoized
675 // group-by-node map.
676 let eligible_nodes = self.numa_nodes_with_capacity(max_per_node);
677
678 // Need at least numa_nodes distinct host NUMA nodes with enough
679 // LLCs each.
680 if eligible_nodes.len() < numa_nodes as usize {
681 return sequential_fallback();
682 }
683
684 // Assign guest NUMA nodes to host NUMA nodes, rotating by
685 // llc_offset to spread concurrent VMs.
686 let mut order = Vec::with_capacity(llcs as usize);
687 let node_offset = llc_offset / max_per_node.max(1);
688 for guest_node in 0..numa_nodes as usize {
689 let host_idx = (guest_node + node_offset) % eligible_nodes.len();
690 let (_, host_llcs) = &eligible_nodes[host_idx];
691 let within_offset = llc_offset % host_llcs.len();
692 // First `remainder` guest nodes get `base + 1` LLCs; rest
693 // get `base`. Total assigned == llcs (remainder preserved).
694 let count = if guest_node < remainder {
695 base_per_node + 1
696 } else {
697 base_per_node
698 };
699 for i in 0..count {
700 let llc_idx = host_llcs[(i + within_offset) % host_llcs.len()];
701 order.push(llc_idx);
702 }
703 }
704
705 order
706 }
707}
708
/// Lock mode for LLC reservation.
///
/// Selects which kind of advisory lock is taken on an LLC lockfile; see
/// [`acquire_resource_locks`] for the per-mode flock flags.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlcLockMode {
    /// Exclusive access to the entire LLC (performance_mode tests).
    /// Returns unavailable when any shared or exclusive holder exists.
    Exclusive,
    /// Shared access to the LLC (non-perf pinned tests).
    /// Multiple shared holders coexist; returns unavailable when
    /// exclusive holder exists.
    // NOTE(review): the lint exemption suggests some build configuration
    // never constructs this variant — confirm against the callers.
    #[allow(dead_code)]
    Shared,
}
721
/// Resource lock acquisition outcome, as returned by
/// [`acquire_resource_locks`].
#[derive(Debug)]
pub enum LockOutcome {
    /// All locks acquired successfully.
    Acquired {
        /// LLC offset consumed; read only by the locking test fixtures.
        /// `acquire_resource_locks` fills it with the first entry of the
        /// LLC index list it was given (0 when that list is empty).
        #[allow(dead_code)]
        llc_offset: usize,
        /// The held lock fds. Each lock is released when its fd is
        /// dropped.
        locks: Vec<std::os::fd::OwnedFd>,
    },
    /// Resources busy. The inner string carries the diagnostic reason
    /// surfaced to test fixtures; production callers only match the
    /// variant tag.
    Unavailable(#[allow(dead_code)] String),
}
737
738/// Acquire resource locks for a pinning plan (non-blocking).
739///
740/// **LLC locks** (`{lock_dir}/ktstr-llc-{N}.lock`):
741/// - `Exclusive`: `flock(LOCK_EX | LOCK_NB)` — sole access to the LLC.
742/// - `Shared`: `flock(LOCK_SH | LOCK_NB)` — multiple holders coexist.
743///
744/// **CPU locks** (`{lock_dir}/ktstr-cpu-{C}.lock`):
745/// - Always `flock(LOCK_EX | LOCK_NB)` — exclusive per CPU.
746/// - Skipped for `Exclusive` LLC mode (the LLC lock already provides
747/// exclusivity over all CPUs in the group).
748///
749/// Single non-blocking attempt. Returns `LockOutcome::Unavailable`
750/// immediately when any resource is busy. Callers rely on nextest
751/// retry backoff for contention resolution.
752///
753/// `KTSTR_CARGO_TEST_MODE` short-circuits the entire flock dance and
754/// returns `Acquired` with an empty fd list — bare `cargo test`
755/// invocations don't share the cross-process LLC reservation
756/// contract that nextest / `cargo ktstr test` peers rely on. Tests
757/// run on whatever CPUs the OS schedules them onto.
758pub fn acquire_resource_locks(
759 plan: &PinningPlan,
760 llc_indices: &[usize],
761 llc_mode: LlcLockMode,
762) -> Result<LockOutcome> {
763 if crate::cargo_test_mode::cargo_test_mode_active() {
764 return Ok(LockOutcome::Acquired {
765 llc_offset: llc_indices.first().copied().unwrap_or(0),
766 locks: Vec::new(),
767 });
768 }
769 match try_acquire_all(plan, llc_indices, llc_mode) {
770 Ok(locks) => Ok(LockOutcome::Acquired {
771 llc_offset: llc_indices.first().copied().unwrap_or(0),
772 locks,
773 }),
774 Err(reason) => Ok(LockOutcome::Unavailable(reason)),
775 }
776}
777
778/// Compose the LLC lockfile prefix from the resolved lock directory.
779/// Returns `{lock_dir}/ktstr-llc-`.
780fn llc_lock_prefix() -> String {
781 format!("{}/ktstr-llc-", crate::cache::resolve_lock_dir().display())
782}
783
784/// Compose the per-CPU lockfile prefix from the resolved lock directory.
785/// Returns `{lock_dir}/ktstr-cpu-`.
786fn cpu_lock_prefix() -> String {
787 format!("{}/ktstr-cpu-", crate::cache::resolve_lock_dir().display())
788}
789
#[cfg(test)]
thread_local! {
    /// Thread-local override for the LLC lock prefix, consulted by
    /// `llc_lock_path` before the production prefix. Tests set this
    /// to a per-test tempdir so the acquire path operates on its
    /// own lockfile pool instead of padding the `LlcGroup` vector
    /// to 90,000+ entries just to avoid collision with production
    /// indices at 0..<host-llcs>. See tests `acquire_llc_plan_*`
    /// that build a small synth topo and point the prefix at a
    /// `TempDir`. `None` (the initial value) means "no override".
    static LLC_LOCK_PREFIX_OVERRIDE: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };

    /// Thread-local override for the per-CPU lock prefix, consulted
    /// by `cpu_lock_path`. Symmetric with `LLC_LOCK_PREFIX_OVERRIDE`:
    /// `None` (the initial value) means "no override".
    static CPU_LOCK_PREFIX_OVERRIDE: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };
}
807
808/// Compose the LLC lockfile path for `llc_idx`. Production resolves
809/// via `KTSTR_LOCK_DIR` (fallback `/tmp`); tests can override the
810/// prefix via `LLC_LOCK_PREFIX_OVERRIDE` to keep their lockfile
811/// pool isolated.
812fn llc_lock_path(llc_idx: usize) -> String {
813 #[cfg(test)]
814 {
815 if let Some(p) = LLC_LOCK_PREFIX_OVERRIDE.with(|p| p.borrow().clone()) {
816 return format!("{p}{llc_idx}.lock");
817 }
818 }
819 format!("{}{llc_idx}.lock", llc_lock_prefix())
820}
821
822/// Compose the per-CPU lockfile path for `cpu`. Symmetric with
823/// [`llc_lock_path`] — production resolves via `KTSTR_LOCK_DIR`;
824/// tests can override via `CPU_LOCK_PREFIX_OVERRIDE`.
825fn cpu_lock_path(cpu: usize) -> String {
826 #[cfg(test)]
827 {
828 if let Some(p) = CPU_LOCK_PREFIX_OVERRIDE.with(|p| p.borrow().clone()) {
829 return format!("{p}{cpu}.lock");
830 }
831 }
832 format!("{}{cpu}.lock", cpu_lock_prefix())
833}
834
835/// Try to acquire all resource locks (all-or-nothing).
836/// Returns the held fds on success, or an error string describing
837/// which resource was busy.
838fn try_acquire_all(
839 plan: &PinningPlan,
840 llc_indices: &[usize],
841 llc_mode: LlcLockMode,
842) -> std::result::Result<Vec<std::os::fd::OwnedFd>, String> {
843 let flock_mode = match llc_mode {
844 LlcLockMode::Exclusive => FlockMode::Exclusive,
845 LlcLockMode::Shared => FlockMode::Shared,
846 };
847 let mut locks = Vec::new();
848
849 // Lock LLC files.
850 for &llc_idx in llc_indices {
851 let path = llc_lock_path(llc_idx);
852 match try_flock(&path, flock_mode) {
853 Ok(Some(fd)) => locks.push(fd),
854 Ok(None) => return Err(format!("LLC {llc_idx} busy")),
855 Err(e) => return Err(format!("LLC {llc_idx}: {e}")),
856 }
857 }
858
859 // Per-CPU locks: skip for exclusive LLC mode (the LLC lock covers
860 // all CPUs in the group).
861 if llc_mode != LlcLockMode::Exclusive {
862 for &(_vcpu, host_cpu) in &plan.assignments {
863 let path = cpu_lock_path(host_cpu);
864 match try_flock(&path, FlockMode::Exclusive) {
865 Ok(Some(fd)) => locks.push(fd),
866 Ok(None) => return Err(format!("CPU {host_cpu} busy")),
867 Err(e) => return Err(format!("CPU {host_cpu}: {e}")),
868 }
869 }
870 if let Some(cpu) = plan.service_cpu {
871 let path = cpu_lock_path(cpu);
872 match try_flock(&path, FlockMode::Exclusive) {
873 Ok(Some(fd)) => locks.push(fd),
874 Ok(None) => return Err(format!("service CPU {cpu} busy")),
875 Err(e) => return Err(format!("service CPU {cpu}: {e}")),
876 }
877 }
878 }
879
880 Ok(locks)
881}
882
883/// Diffuse a pid across `[0, max_start)` so adjacent pids do not
884/// land on adjacent offsets. Used by the default-else run-lock path
885/// (`KtstrVm::acquire_default_run_locks`) to pick a starting LLC slot so
886/// two ktstr invocations launching simultaneously don't both probe slot 0
887/// first.
888///
889/// Bare `pid % max_start` collapses adjacent pids onto adjacent
890/// offsets (Linux's pid allocator walks `pid_max` sequentially),
891/// which is the worst spread shape for the common batch-spawn
892/// case: nextest forks N test processes back-to-back, every pid
893/// lands within a small contiguous range, every `pid % max_start`
894/// lands within an equally small contiguous slice of the offset
895/// space, and they all probe overlapping slots on the first
896/// pass. AHasher avalanche on the pid bytes diffuses adjacent
897/// pids across the whole `[0, max_start)` range, so the
898/// slot-rotation loop has a fair chance of finding a free slot
899/// without burning the entire lockfile pool.
900///
901/// The hasher is `ahash::AHasher` keyed with fixed zero seeds
902/// (`RandomState::with_seeds(0, 0, 0, 0)`); a per-run random
903/// seed would defeat reproducibility for unit-test fixtures and
904/// for any future debug logging that wants to confirm "pid X
905/// picks offset Y for slot N".
906///
907/// Caller invariant: `max_start >= 1`. Panics on `max_start == 0`
908/// (modulo-by-zero); callers must enforce this upstream (the
909/// run-lock path floors `max_slots` at 1).
910pub(crate) fn pid_window_offset(pid: u32, max_start: usize) -> usize {
911 use std::hash::{BuildHasher, Hasher};
912 let mut hasher = ahash::RandomState::with_seeds(0, 0, 0, 0).build_hasher();
913 hasher.write(&pid.to_le_bytes());
914 (hasher.finish() as usize) % max_start
915}
916
917// ===========================================================================
918// --cpu-cap PLAN pipeline — CpuCap / LlcSnapshot / LlcPlan + discover/plan/acquire
919// ===========================================================================
920//
921// Entry point [`acquire_llc_plan`] is the single non-perf-mode
922// reservation path: kernel builds and no-perf-mode VMs both call it
923// with or without `--cpu-cap N`. `--cpu-cap` is a CPU-count budget:
924// the planner reserves exactly N host CPUs by walking whole LLCs in
925// contention- / NUMA-aware order and partial-taking the last LLC
926// so `plan.cpus.len() == N`. The flock is per-LLC even when the
927// last LLC is only partially used — coordination with concurrent
928// ktstr peers is unchanged at LLC granularity. When `--cpu-cap`
929// is absent the planner defaults to 30% of the calling process's
930// sched_getaffinity cpuset (see [`default_cpu_budget`] and
931// [`host_allowed_cpus`]) — not 30% of the host's online CPU count,
932// because a CI runner whose parent cgroup pins ktstr to a 4-CPU
933// subset must plan within THAT subset or sched_setaffinity on the
934// resulting mask produces an empty effective set.
935// Perf-mode never reaches this path; it stays on
936// [`acquire_resource_locks`] for its `LOCK_EX` reservation contract.
937//
938// The pipeline has three phases: discover (snapshot holders per
939// LLC, filtered to the process's allowed cpuset), plan (NUMA-aware,
940// consolidation-aware selection), acquire (non-blocking `LOCK_SH`
941// on each selected LLC). Up to ACQUIRE_MAX_TOCTOU_RETRIES retries
942// absorb the window between the discover snapshot and the
943// non-blocking acquire; between retries the loop sleeps for an
944// ascending micro-budget (TOCTOU_RETRY_DELAYS) so a peer that
945// raced us has time to drop its fds before the next snapshot.
946// If every retry fails, the contention is persistent and the
947// caller falls back to nextest-retry / operator-wait.
948
949/// Return the CPUs the calling process is allowed to run on, per
950/// `sched_getaffinity(2)` with a `/proc/self/status` Cpus_allowed_list
951/// fallback. Every consumer of the `--cpu-cap` pipeline plans against
952/// this set instead of `HostTopology::online_cpus` so
953/// `sched_setaffinity` on the plan's CPU list never produces an empty
954/// effective mask under a cgroup-restricted runner (CI hosts, systemd
955/// slices, sudo -u under a limited cpuset).
956///
957/// Returns an empty vec only when BOTH the syscall AND procfs fail —
958/// a pathological host that can't enumerate its own affinity. Callers
959/// treat that as a bail reason, not a fallback "every CPU" permission:
960/// guessing on a misconfigured host is worse than failing visibly.
961///
962/// Tests override the return value via `ALLOWED_CPUS_OVERRIDE` so
963/// the 30% default and allowed-cpu filtering are deterministic in
964/// unit tests regardless of the CI runner's real cpuset.
965pub(crate) fn host_allowed_cpus() -> Vec<usize> {
966 #[cfg(test)]
967 {
968 if let Some(override_set) = ALLOWED_CPUS_OVERRIDE.with(|p| p.borrow().clone()) {
969 return override_set;
970 }
971 }
972 if let Some(cpus) = crate::cpu_util::read_affinity(0) {
973 return cpus.into_iter().map(|c| c as usize).collect();
974 }
975 if let Ok(raw) = std::fs::read_to_string("/proc/self/status") {
976 for line in raw.lines() {
977 if let Some(v) = line.strip_prefix("Cpus_allowed_list:")
978 && let Some(parsed) = crate::cpu_util::parse_cpu_list(v.trim())
979 {
980 return parsed.into_iter().map(|c| c as usize).collect();
981 }
982 }
983 }
984 Vec::new()
985}
986
#[cfg(test)]
thread_local! {
    /// Test-only override for [`host_allowed_cpus`]. Set via
    /// [`AllowedCpusGuard`] to make 30%-of-allowed calculations and
    /// plan filtering deterministic in unit tests. Mirrors the
    /// `LLC_LOCK_PREFIX_OVERRIDE` pattern: `None` (the initial value)
    /// means "no override", `Some(cpus)` is returned verbatim by
    /// `host_allowed_cpus` on this thread.
    pub(crate) static ALLOWED_CPUS_OVERRIDE: std::cell::RefCell<Option<Vec<usize>>> =
        const { std::cell::RefCell::new(None) };
}
996
/// CPU budget used when `--cpu-cap` is not set: 30% of
/// `allowed_cpus`, rounded up, and never less than 1.
///
/// 30% leaves headroom for concurrent peers (tests, builds) while
/// still reserving a non-trivial slice. For any `allowed_cpus >= 1`
/// the rounded-up 30% is already at least 1, so the floor only takes
/// effect for an input of 0; it is kept as defense in depth against
/// future ratio tweaks. The multiplication saturates instead of
/// overflowing on absurdly large inputs.
fn default_cpu_budget(allowed_cpus: usize) -> usize {
    let thirty_percent = allowed_cpus.saturating_mul(30).div_ceil(100);
    std::cmp::max(thirty_percent, 1)
}
1007
1008/// No-perf CPU budget when no explicit `--cpu-cap` (or `cpu_budget` knob) is
1009/// set: at least the VM's own vCPU count, clamped to the allowed cpuset.
1010///
1011/// The rationale is TEST VALIDITY, not boot speed — do not "optimize" this
1012/// back to a flat 30%. A scheduler test measures how the GUEST scheduler
1013/// places tasks across the guest's CPUs. If the VM's vCPU threads are
1014/// oversubscribed on the host (256 vCPUs sharing the 30% default mask is
1015/// ~95 pCPUs = 2.7x), the HOST scheduler time-slices them, so guest vCPUs
1016/// stall for reasons unrelated to the workload — a host-contention confound
1017/// that invalidates the guest-scheduler measurement (the silent-wrong-answer
1018/// class the project guards against). Sizing the budget to `>= vcpus` gives
1019/// the guest's CPUs real host CPUs, so its scheduler view tracks real
1020/// concurrency. (A wide boot also drops ~0.7s as the kernel's parallel AP
1021/// bring-up runs unthrottled, but that is incidental.)
1022///
1023/// Floored at the 30% `default_cpu_budget` so small VMs (vcpus < 30%) keep
1024/// the cross-test concurrency headroom; clamped to `allowed_cpus` so it never
1025/// exceeds the process cpuset. An explicit cap LOWER than vcpus is the
1026/// deliberate opt-in to oversubscribe for contention testing.
1027pub(crate) fn no_perf_cpu_budget(allowed_cpus: usize, vm_vcpus: usize) -> usize {
1028 default_cpu_budget(allowed_cpus).max(vm_vcpus.min(allowed_cpus))
1029}
1030
/// Parsed `--cpu-cap N` value. N is a CPU count: the planner reserves
/// exactly N host CPUs by walking whole LLCs in contention- /
/// NUMA-aware order (filtered to the calling process's allowed
/// cpuset) and partial-taking the last LLC so `plan.cpus.len() == N`.
/// The flock set is still per-LLC (the last LLC is flocked whole
/// even when only a prefix of its CPUs enters `plan.cpus`).
///
/// Lower bound: the value is stored as a `NonZeroUsize`, so a
/// constructed `CpuCap` is always in `1..=usize::MAX`. A cap of 0 is
/// nonsensical (reserving zero CPUs is just "don't run") and is
/// rejected upstream by the CLI layer, but [`CpuCap::new`] enforces
/// it too, so callers can write `CpuCap::new(...)?` without a
/// follow-up bounds check.
///
/// Upper bound: "don't exceed the process's allowed CPU count" is
/// enforced at acquire time via [`CpuCap::effective_count`], because
/// the allowed set is not known until `host_allowed_cpus` reads
/// `sched_getaffinity`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CpuCap {
    // Requested CPU count; non-zero by construction.
    n: std::num::NonZeroUsize,
}
1051
1052impl CpuCap {
1053 /// Construct from a raw `usize` CPU count. Returns `Err` on `0`;
1054 /// `usize::MAX` is accepted here and clamped later by
1055 /// `effective_count`.
1056 pub fn new(n: usize) -> Result<Self> {
1057 std::num::NonZeroUsize::new(n)
1058 .map(|n| CpuCap { n })
1059 .ok_or_else(|| anyhow::anyhow!("--cpu-cap must be ≥ 1 CPU (got 0)"))
1060 }
1061
1062 /// Three-tier resolution: explicit CLI flag wins over env var,
1063 /// which wins over "not set". Returns `None` when neither is present,
1064 /// meaning "use the caller's auto-sized default": the
1065 /// kernel-build/planner path expands `None` to `default_cpu_budget`
1066 /// (30% of the allowed set); the no-perf VM-builder path expands it to
1067 /// `no_perf_cpu_budget` (max(30%, min(vcpus, allowed)), usually the
1068 /// vCPU count).
1069 ///
1070 /// Env var is `KTSTR_CPU_CAP` (integer ≥ 1, CPU count). An empty
1071 /// or unset env var is treated as absent; a non-numeric value
1072 /// OR the numeric value `0` is an error — `KTSTR_CPU_CAP=0`
1073 /// flows through `CpuCap::new(0)` which rejects with "--cpu-cap
1074 /// must be ≥ 1 CPU (got 0)". Zero is not a silent fallback to
1075 /// "no cap"; it surfaces as a parse-time error so typos and
1076 /// scripting mistakes don't accidentally disable the resource
1077 /// contract.
1078 pub fn resolve(cli_flag: Option<usize>) -> Result<Option<CpuCap>> {
1079 if let Some(n) = cli_flag {
1080 return Ok(Some(CpuCap::new(n)?));
1081 }
1082 match std::env::var(crate::KTSTR_CPU_CAP_ENV) {
1083 Ok(s) if s.is_empty() => Ok(None),
1084 Ok(s) => {
1085 let n: usize = s
1086 .parse()
1087 .with_context(|| format!("KTSTR_CPU_CAP is not a valid integer: {s:?}"))?;
1088 Ok(Some(CpuCap::new(n)?))
1089 }
1090 Err(std::env::VarError::NotPresent) => Ok(None),
1091 Err(std::env::VarError::NotUnicode(raw)) => {
1092 anyhow::bail!(
1093 "KTSTR_CPU_CAP contains non-UTF-8 bytes ({} bytes): {raw:?}. \
1094 Set an integer value or unset.",
1095 raw.len(),
1096 )
1097 }
1098 }
1099 }
1100
1101 /// Runtime-bounded cap: returns the inner count unless it exceeds
1102 /// `allowed_cpus` (the calling process's sched_getaffinity cpuset
1103 /// count), in which case a `CpuBudgetUnsatisfiable` hard error (an
1104 /// explicit cap the host cannot satisfy is a FAIL, not a transient
1105 /// skip) steers the caller toward an actionable message. This check
1106 /// lives at acquire time — not at construction — because the allowed
1107 /// set is not known until `host_allowed_cpus` reads the syscall.
1108 pub fn effective_count(&self, allowed_cpus: usize) -> Result<usize> {
1109 let n = self.n.get();
1110 if n > allowed_cpus {
1111 // An explicit --cpu-cap the host cannot satisfy is a hard ERROR
1112 // (the author typed a concrete number that does not exist here),
1113 // not transient contention: CpuBudgetUnsatisfiable, not
1114 // ResourceContention, so it fails rather than skips.
1115 return Err(anyhow::Error::new(CpuBudgetUnsatisfiable {
1116 reason: format!(
1117 "--cpu-cap N = {n} exceeds the {allowed_cpus} CPUs this \
1118 process is allowed on (from sched_getaffinity / \
1119 Cpus_allowed_list). Pick a value ≤ {allowed_cpus}, \
1120 release the cgroup/taskset constraint restricting this \
1121 process, or omit --cpu-cap to use the auto-sized default \
1122 (30% of the allowed set for kernel builds; the vCPU \
1123 count, floored at 30%, for VMs)."
1124 ),
1125 }));
1126 }
1127 Ok(n)
1128 }
1129}
1130
/// Per-LLC discover snapshot: identity + current holder set.
/// Constructed by [`discover_llc_snapshots`] before the PLAN phase.
/// `pub(crate)` so the in-crate PLAN pipeline and this module's tests
/// can construct and inspect it; the `ktstr locks` observational
/// command shares only [`crate::flock::HolderInfo`], not this
/// structure. External callers have no reason to construct one.
///
/// A slice of snapshots may be sparse: DISCOVER emits entries only
/// for LLCs that overlap the allowed cpuset, so an entry's position
/// in the slice is NOT its LLC index. Always key on `llc_idx`.
#[derive(Debug, Clone)]
pub(crate) struct LlcSnapshot {
    /// Host LLC index — matches [`HostTopology::llc_groups`] ordering.
    pub(crate) llc_idx: usize,
    /// Canonical `{lock_dir}/ktstr-llc-{N}.lock` path. Stored so the
    /// ACQUIRE phase doesn't re-format the string per LLC.
    pub(crate) lockfile_path: std::path::PathBuf,
    /// Processes currently holding this LLC's flock (any mode). Empty
    /// when no peer holds the lock, and also when the holder lookup
    /// itself failed (DISCOVER substitutes an empty list on error).
    pub(crate) holders: Vec<crate::flock::HolderInfo>,
    /// `holders.len()`, cached so the PLAN sort can access it without
    /// re-traversing the holder list per candidate.
    pub(crate) holder_count: usize,
}
1152
/// Output of [`acquire_llc_plan`]: the concrete LLC reservation plus
/// every piece of diagnostic context a downstream consumer could
/// want.
///
/// `mems` is the union of NUMA nodes containing the selected CPUs —
/// `BuildSandbox::try_create` writes this to the child cgroup's
/// `cpuset.mems` so memory allocations respect the same NUMA locality
/// the CPU reservation already implies.
///
/// `locks` holds the RAII file descriptors whose `OwnedFd::drop`
/// releases the kernel-side flock; the field is `pub(crate)` because
/// direct manipulation from outside the crate would defeat the drop
/// guarantee. The reservation therefore lasts exactly as long as the
/// `LlcPlan` value is alive.
#[derive(Debug)]
pub struct LlcPlan {
    /// Selected host LLC indices, sorted ASCENDING. Acquire order
    /// matches this slice — two callers with the same target see the
    /// same ordering and converge on the same one-wins-the-others-retry
    /// livelock-proof sequence.
    pub locked_llcs: Vec<usize>,
    /// Flattened host CPU list, sized exactly `target_cpus`. The last
    /// locked LLC may contribute only a prefix of its allowed CPUs.
    /// Preserves LLC ordering: CPUs from `locked_llcs[0]` come
    /// before CPUs from `locked_llcs[1]`, etc.
    pub cpus: Vec<usize>,
    /// Union of NUMA nodes hosting the locked LLCs. When the plan
    /// spans > 1 node (cross-node spill — seed node exhausted, plan
    /// spilled to nearest-by-distance neighbors), `mems`
    /// contains every node — not just the seed node's.
    pub mems: std::collections::BTreeSet<usize>,
    /// Per-LLC discovery trail. Preserved through the lifetime of the
    /// plan so error-formatting (via `acquire_llc_plan`'s final
    /// fresh snapshot) and future `ktstr locks` rendering don't
    /// re-probe `/proc/locks`. In-tree consumers currently re-read
    /// the snapshot only on the TOCTOU failure path; the field is
    /// kept populated so downstream tooling can inspect the
    /// plan-at-acquire holder set without a second pass.
    #[allow(dead_code)] // Diagnostic payload — populated, not yet read in-tree.
    pub(crate) snapshot: Vec<LlcSnapshot>,
    /// RAII flock holders. Dropped when the plan goes out of scope,
    /// releasing each LLC's `LOCK_SH` in declared order (a `Vec`
    /// drops its elements front to back).
    #[allow(dead_code)] // RAII only — Drop releases flocks, no reads.
    pub(crate) locks: Vec<std::os::fd::OwnedFd>,
}
1197
/// Maximum TOCTOU retry budget for the DISCOVER → PLAN → ACQUIRE
/// pipeline. Production sees up to `RETRIES + 1 = 4` attempts: one
/// initial DISCOVER and three retries. Between retries the caller
/// sleeps for an ascending micro-budget (10ms, 50ms, 200ms — see
/// [`TOCTOU_RETRY_DELAYS`]) so two peers that initially raced on the
/// same LLC have time to drop their fds before the next snapshot.
/// Without the sleep the second DISCOVER often sees the same holder
/// state and bails on a transient race; the in-process micro-sleep
/// absorbs that without paying the nextest-retry cost.
///
/// This constant is also the length of [`TOCTOU_RETRY_DELAYS`], so
/// changing it requires adding or removing a delay entry there (the
/// array type enforces this at compile time).
const ACQUIRE_MAX_TOCTOU_RETRIES: u32 = 3;
1208
/// Per-retry sleep durations between DISCOVER attempts. Indexed by
/// the retry index: after attempt 0 fails the loop sleeps
/// `TOCTOU_RETRY_DELAYS[0]`, after attempt 1 fails it sleeps
/// `TOCTOU_RETRY_DELAYS[1]`, etc. Length must equal
/// [`ACQUIRE_MAX_TOCTOU_RETRIES`] — there are exactly that many
/// sleeps before the final attempt that can still bail. The array
/// length is spelled in terms of that constant, so a mismatch is a
/// compile error rather than a runtime index panic.
///
/// The delays ascend (10ms → 50ms → 200ms), for a worst-case total of
/// 260ms spent sleeping across all retries.
const TOCTOU_RETRY_DELAYS: [std::time::Duration; ACQUIRE_MAX_TOCTOU_RETRIES as usize] = [
    std::time::Duration::from_millis(10),
    std::time::Duration::from_millis(50),
    std::time::Duration::from_millis(200),
];
1220
1221/// DISCOVER phase — read-only LLC snapshot.
1222///
1223/// Walks ONLY the LLCs whose CPUs overlap `allowed` (the calling
1224/// process's `sched_getaffinity` cpuset). LLCs entirely outside the
1225/// cpuset are skipped — locking one would never contribute a
1226/// schedulable CPU to `plan.cpus`, and on a heavily-pinned runner
1227/// (CI cgroup with N out of M CPUs allowed) skipping them avoids
1228/// O(host_llcs - allowed_llcs) lockfile materializations and
1229/// /proc/locks lookups per attempt. The PLAN phase still receives a
1230/// snapshot vector indexed by `LlcSnapshot.llc_idx`, not by
1231/// position, so a sparse snapshot set works without any further
1232/// adjustment downstream.
1233///
1234/// For every selected LLC: stat the canonical lockfile (materializing
1235/// it with `O_CREAT | O_CLOEXEC | 0o666` if absent so subsequent
1236/// ACQUIRE has a stable inode), then parse one `/proc/locks` read to
1237/// populate every snapshot's holder list in a single pass. No flock
1238/// acquires — DISCOVER never contends.
1239///
1240/// `mountinfo` is the `/proc/self/mountinfo` text read once per
1241/// `acquire_llc_plan` invocation at [`acquire_llc_plan_with_acquire_fn`]
1242/// and threaded through here so a host with N LLCs pays for exactly
1243/// one mountinfo read per DISCOVER pass (DISCOVER runs once per retry
1244/// attempt — up to ACQUIRE_MAX_TOCTOU_RETRIES+1 — plus once on the
1245/// retry-exhausted diagnostic path, up to 5 passes, hence caching at
1246/// the plan level rather than per snapshot walk).
1247///
1248/// Returns `Ok(snapshots)` on success. Propagates opening + stat
1249/// errors so a missing `/tmp` or permission failure surfaces
1250/// actionably.
1251fn discover_llc_snapshots(
1252 topo: &HostTopology,
1253 allowed: &std::collections::BTreeSet<usize>,
1254 mountinfo: &str,
1255) -> Result<Vec<LlcSnapshot>> {
1256 let mut snapshots: Vec<LlcSnapshot> = Vec::with_capacity(topo.llc_groups.len());
1257 for llc_idx in 0..topo.llc_groups.len() {
1258 // Skip LLCs whose CPUs are entirely outside the calling
1259 // process's allowed cpuset — they cannot contribute a
1260 // schedulable CPU to `plan.cpus`, and locking one would just
1261 // pay for a lockfile + /proc/locks pass without coordination
1262 // value. The sparse snapshot vector keeps llc_idx as the
1263 // identity key, so PLAN's index-based iteration is
1264 // unaffected.
1265 if !topo.llc_groups[llc_idx]
1266 .cpus
1267 .iter()
1268 .any(|c| allowed.contains(c))
1269 {
1270 continue;
1271 }
1272 let path = std::path::PathBuf::from(llc_lock_path(llc_idx));
1273 // Ensure the lockfile inode exists so `read_holders_with_mountinfo`
1274 // can key /proc/locks lookups on it. Deliberately takes no
1275 // flock — DISCOVER is observational. Also runs the NFS/FUSE
1276 // reject check inside `materialize`, so a misconfigured
1277 // `/tmp` mount surfaces here instead of silently at ACQUIRE
1278 // time.
1279 crate::flock::materialize(&path)?;
1280 let holders =
1281 crate::flock::read_holders_with_mountinfo(&path, mountinfo).unwrap_or_default();
1282 let holder_count = holders.len();
1283 snapshots.push(LlcSnapshot {
1284 llc_idx,
1285 lockfile_path: path,
1286 holders,
1287 holder_count,
1288 });
1289 }
1290 Ok(snapshots)
1291}
1292
1293/// PLAN phase — NUMA-aware placement over discover snapshots.
1294///
1295/// Composite sort driven by three ordered keys:
1296/// 1. Consolidation — prefer LLCs already holding peers.
1297/// 2. NUMA locality — after seeding on the highest-scored LLC's
1298/// node, greedily fill the seed node before spilling.
1299/// 3. LLC index ASC — tiebreak + final ACQUIRE ordering for livelock
1300/// safety.
1301///
1302/// `target_cpus` is the exact number of allowed CPUs the plan
1303/// reserves. The walk selects whole LLCs (filtered to their
1304/// allowed-CPU overlap) until the accumulated contribution meets
1305/// the budget. The LAST selected LLC may contribute more allowed
1306/// CPUs than the remaining budget needs; the materialization layer
1307/// at [`acquire_llc_plan_with_acquire_fn`] takes only the needed
1308/// prefix of that LLC's allowed CPUs into `plan.cpus`. The flock
1309/// is always held at LLC granularity — coordination with concurrent
1310/// ktstr peers happens per-LLC, regardless of how many of the LLC's
1311/// CPUs are consumed here. LLCs whose CPUs are all outside
1312/// `allowed` are skipped entirely — locking one would never
1313/// contribute a schedulable CPU to `plan.cpus`.
1314///
1315/// Distance fallback: callers without a distance matrix pass a closure
1316/// that returns `10` for equal nodes and `20` otherwise — primitive 3
1317/// keeps the spill order reasonable even on hosts whose
1318/// `/sys/devices/system/node/*/distance` is unavailable.
1319fn plan_from_snapshots(
1320 snapshots: &[LlcSnapshot],
1321 target_cpus: usize,
1322 topo: &HostTopology,
1323 allowed: &std::collections::BTreeSet<usize>,
1324 distance_fn: impl Fn(usize, usize) -> u8,
1325) -> Vec<usize> {
1326 if target_cpus == 0 {
1327 return Vec::new();
1328 }
1329
1330 // Allowed-CPU count contributed by each LLC. An LLC with zero
1331 // overlap contributes no schedulable CPUs to `plan.cpus`, so
1332 // reserving it adds a useless flock and no planning value — drop
1333 // those up front so every subsequent walk only considers
1334 // candidates that can actually carry budget.
1335 let llc_allowed_cpus = |idx: usize| -> usize {
1336 topo.llc_groups[idx]
1337 .cpus
1338 .iter()
1339 .filter(|c| allowed.contains(c))
1340 .count()
1341 };
1342 let total_allowed_in_llcs: usize = (0..snapshots.len()).map(llc_allowed_cpus).sum();
1343 if target_cpus >= total_allowed_in_llcs {
1344 // Budget ≥ sum of per-LLC contributions: select every LLC
1345 // that has at least one allowed CPU, in ascending order.
1346 // Short-circuits the scoring walk when the cap degenerates
1347 // to "reserve everything we can schedule on."
1348 let mut all: Vec<usize> = (0..snapshots.len())
1349 .filter(|&idx| llc_allowed_cpus(idx) > 0)
1350 .collect();
1351 all.sort_unstable();
1352 return all;
1353 }
1354
1355 // Step a: partition + sort. Only LLCs with at least one allowed
1356 // CPU are eligible — locking an out-of-cpuset LLC is useless.
1357 // Consolidation candidates first (holder_count DESC, llc_idx ASC);
1358 // fresh candidates after, sorted by llc_idx ASC. A single
1359 // composite sort would do the same work but the two-partition
1360 // form is easier to read and lets future "prefer consolidation
1361 // only if score ≥ threshold" tweaks slot in.
1362 let eligible = |s: &&LlcSnapshot| -> bool { llc_allowed_cpus(s.llc_idx) > 0 };
1363 let mut consolidation: Vec<&LlcSnapshot> = snapshots
1364 .iter()
1365 .filter(|s| s.holder_count > 0)
1366 .filter(eligible)
1367 .collect();
1368 let mut fresh: Vec<&LlcSnapshot> = snapshots
1369 .iter()
1370 .filter(|s| s.holder_count == 0)
1371 .filter(eligible)
1372 .collect();
1373 consolidation.sort_by(|a, b| {
1374 b.holder_count
1375 .cmp(&a.holder_count)
1376 .then(a.llc_idx.cmp(&b.llc_idx))
1377 });
1378 fresh.sort_by_key(|s| s.llc_idx);
1379 let ranked: Vec<&LlcSnapshot> = consolidation.into_iter().chain(fresh).collect();
1380 if ranked.is_empty() {
1381 // No LLC on this host overlaps the caller's allowed cpuset.
1382 // Bail upstream handles this as ResourceContention; here we
1383 // just return empty so the caller can surface the diagnostic.
1384 return Vec::new();
1385 }
1386
1387 // Step b: seed. Highest-scored eligible LLC; its NUMA node
1388 // anchors the greedy expansion.
1389 let seed = ranked[0];
1390 let seed_node = topo.llc_numa_node(seed.llc_idx);
1391
1392 // Step c–d: walk seed-node LLCs first, then spill to
1393 // nearest-by-distance nodes. Primitives 1 + 3 drive the node
1394 // ordering; the per-node LLC lists come from primitive 1. Within
1395 // each node, we still honour the composite score by walking
1396 // `ranked` and skipping LLCs not on the current target node.
1397 // Accumulation is by allowed-CPU contribution — an LLC with 4
1398 // CPUs of which 2 are in `allowed` counts as 2 toward the
1399 // budget and the other 2 never appear in `plan.cpus`.
1400 let node_order = topo.numa_nodes_sorted_by_distance(seed_node, distance_fn);
1401 let mut selected: Vec<usize> = Vec::new();
1402 let mut picked: std::collections::HashSet<usize> = std::collections::HashSet::new();
1403 let mut accumulated: usize = 0;
1404 for node in node_order {
1405 if accumulated >= target_cpus {
1406 break;
1407 }
1408 // Ranked walk, taking every candidate on this node in
1409 // score-order until we've filled `target_cpus` or exhausted
1410 // the node.
1411 for snap in &ranked {
1412 if accumulated >= target_cpus {
1413 break;
1414 }
1415 if picked.contains(&snap.llc_idx) {
1416 continue;
1417 }
1418 if topo.llc_numa_node(snap.llc_idx) != node {
1419 continue;
1420 }
1421 selected.push(snap.llc_idx);
1422 picked.insert(snap.llc_idx);
1423 accumulated += llc_allowed_cpus(snap.llc_idx);
1424 }
1425 }
1426
1427 // Step e: livelock-proof acquire order — ascending index.
1428 selected.sort_unstable();
1429 selected
1430}
1431
1432/// ACQUIRE phase — non-blocking `LOCK_SH` on every selected LLC.
1433///
1434/// All-or-nothing. A single `EWOULDBLOCK` releases every held fd (via
1435/// `drop(locks)`) and returns `Ok(None)` so the caller re-runs
1436/// discover + plan with a fresh snapshot. Non-retryable errors
1437/// (unexpected errno, path open failures) propagate unchanged.
1438fn try_acquire_llc_plan_locks(
1439 selected: &[usize],
1440 snapshots: &[LlcSnapshot],
1441) -> Result<Option<Vec<std::os::fd::OwnedFd>>> {
1442 let mut locks: Vec<std::os::fd::OwnedFd> = Vec::with_capacity(selected.len());
1443 for &idx in selected {
1444 let snap = snapshots
1445 .iter()
1446 .find(|s| s.llc_idx == idx)
1447 .expect("selected index must come from snapshots — plan invariant");
1448 match crate::flock::try_flock(&snap.lockfile_path, FlockMode::Shared)? {
1449 Some(fd) => locks.push(fd),
1450 None => {
1451 // Drop previously-held fds so the peer racing us sees
1452 // a consistent post-bail state, then signal "retry".
1453 drop(locks);
1454 return Ok(None);
1455 }
1456 }
1457 }
1458 Ok(Some(locks))
1459}
1460
1461/// Entry point for the `--cpu-cap` PLAN pipeline.
1462///
1463/// Runs DISCOVER → PLAN → ACQUIRE with up to
1464/// [`ACQUIRE_MAX_TOCTOU_RETRIES`] retries (each separated by a
1465/// per-retry sleep from [`TOCTOU_RETRY_DELAYS`]). On
1466/// success returns an [`LlcPlan`] holding the selected LLCs, their
1467/// flattened CPUs (intersected with the calling process's allowed
1468/// cpuset), the derived `mems` set, the diagnostic snapshot, and the
1469/// RAII flock handles.
1470///
1471/// `cpu_cap == None` means "reserve 30% of the allowed-CPU set" (see
1472/// [`default_cpu_budget`]). `cpu_cap == Some(cap)` where
1473/// `cap > allowed_cpus` errors at acquire time via
1474/// [`CpuCap::effective_count`]. The allowed-CPU set comes from
1475/// [`host_allowed_cpus`] — `sched_getaffinity(0)` with a procfs
1476/// fallback — so plans are always schedulable under cgroup-restricted
1477/// runners (CI hosts, systemd slices, sudo under a limited cpuset).
1478///
1479/// Consolidation uses the host distance matrix from [`crate::topology::TestTopology`]
1480/// so spill order matches actual NUMA cost. Hosts whose
1481/// `/sys/devices/system/node/*/distance` failed to parse degrade to a
1482/// numerically-adjacent ordering via the distance closure (`10` for
1483/// same-node, `20` for cross-node).
1484pub fn acquire_llc_plan(
1485 topo: &HostTopology,
1486 test_topo: &crate::topology::TestTopology,
1487 cpu_cap: Option<CpuCap>,
1488) -> Result<LlcPlan> {
1489 if crate::cargo_test_mode::cargo_test_mode_active() {
1490 // Bare `cargo test` mode: no peer-coordination contract.
1491 // Synthesise a degenerate plan that names every LLC and
1492 // every allowed CPU but holds no flocks. The vmm caller
1493 // strips `locks` after build (see `KtstrVmBuilder::build`)
1494 // and re-acquires via `acquire_resource_locks` at run time
1495 // — also short-circuited above. `cpus` is the calling
1496 // process's allowed cpuset so the `sched_setaffinity`
1497 // sites inside the vmm have a valid mask to apply
1498 // (allowed cpuset = whatever the OS schedules us onto).
1499 let allowed = host_allowed_cpus();
1500 if allowed.is_empty() {
1501 return Err(ResourceContention {
1502 reason: "could not determine allowed CPU set \
1503 (sched_getaffinity and /proc/self/status both failed)"
1504 .into(),
1505 }
1506 .into());
1507 }
1508 let _ = test_topo;
1509 let _ = cpu_cap;
1510 let allowed_set: std::collections::BTreeSet<usize> = allowed.iter().copied().collect();
1511 let locked_llcs: Vec<usize> = topo
1512 .llc_groups
1513 .iter()
1514 .enumerate()
1515 .filter_map(|(idx, group)| {
1516 if group.cpus.iter().any(|c| allowed_set.contains(c)) {
1517 Some(idx)
1518 } else {
1519 None
1520 }
1521 })
1522 .collect();
1523 let mems: std::collections::BTreeSet<usize> = locked_llcs
1524 .iter()
1525 .filter_map(|&idx| {
1526 topo.llc_groups
1527 .get(idx)
1528 .and_then(|g| g.cpus.first().copied())
1529 .and_then(|c| topo.cpu_to_node.get(&c).copied())
1530 })
1531 .collect();
1532 return Ok(LlcPlan {
1533 locked_llcs,
1534 cpus: allowed,
1535 mems,
1536 snapshot: Vec::new(),
1537 locks: Vec::new(),
1538 });
1539 }
1540 acquire_llc_plan_with_acquire_fn(topo, test_topo, cpu_cap, try_acquire_llc_plan_locks)
1541}
1542
1543/// Parameterized form of [`acquire_llc_plan`] that takes the
1544/// ACQUIRE closure as a seam. Production calls this with
1545/// [`try_acquire_llc_plan_locks`] (non-blocking `LOCK_SH` per LLC);
1546/// tests can pass a closure that returns `Ok(None)` on attempt 0 and
1547/// forwards on attempt 1 to simulate a peer winning the first race,
1548/// or an attempt-counting closure that always fails to exercise the
1549/// retry-exhausted error path.
1550///
1551/// `acquire_fn` receives `(selected, snapshots)` and returns
1552/// `Ok(Some(locks))` on success, `Ok(None)` to trigger a retry, or
1553/// propagates hard errors unchanged. Production closure is the
1554/// free-standing [`try_acquire_llc_plan_locks`]; the test closure
1555/// can track its own attempt counter via interior mutability
1556/// ([`std::cell::Cell`], `Mutex`, atomic int).
1557///
1558/// The outer loop body — DISCOVER, PLAN, retry budget, final
1559/// holder diagnostics — is shared between both entry points so the
1560/// test seam exercises the exact retry-and-diagnose sequence
1561/// production uses, not a parallel implementation.
1562fn acquire_llc_plan_with_acquire_fn<F>(
1563 topo: &HostTopology,
1564 test_topo: &crate::topology::TestTopology,
1565 cpu_cap: Option<CpuCap>,
1566 mut acquire_fn: F,
1567) -> Result<LlcPlan>
1568where
1569 F: FnMut(&[usize], &[LlcSnapshot]) -> Result<Option<Vec<std::os::fd::OwnedFd>>>,
1570{
1571 // Resolve the calling process's allowed cpuset. Plans must fit
1572 // inside this set — sched_setaffinity against a mask outside the
1573 // process's cgroup cpuset either fails outright or produces an
1574 // empty effective set (the vCPU thread then cannot run). Reading
1575 // the syscall ONCE here and threading it through means every
1576 // TOCTOU retry sees the same baseline; a cgroup change mid-plan
1577 // is a host-reconfiguration event the retry budget does not
1578 // attempt to absorb.
1579 let allowed_vec = host_allowed_cpus();
1580 if allowed_vec.is_empty() {
1581 return Err(ResourceContention {
1582 reason: "could not determine allowed CPU set \
1583 (sched_getaffinity and /proc/self/status both failed)"
1584 .into(),
1585 }
1586 .into());
1587 }
1588 let allowed: std::collections::BTreeSet<usize> = allowed_vec.iter().copied().collect();
1589 let allowed_cpus = allowed.len();
1590
1591 let target_cpus = match cpu_cap {
1592 Some(cap) => cap.effective_count(allowed_cpus)?,
1593 None => default_cpu_budget(allowed_cpus),
1594 };
1595 if target_cpus == 0 {
1596 // Defense in depth. `default_cpu_budget` has a `.max(1)`
1597 // floor and `effective_count` on a `NonZeroUsize` cap can
1598 // never return 0, but surfacing this as an explicit bail
1599 // catches future regressions (e.g. someone wires a signed
1600 // integer into the budget math) instead of silently
1601 // producing a plan with no locks.
1602 return Err(ResourceContention {
1603 reason: "CPU budget resolved to zero".into(),
1604 }
1605 .into());
1606 }
1607
1608 // Read /proc/self/mountinfo ONCE per acquire_llc_plan invocation.
1609 // Every DISCOVER pass re-uses this text to derive per-LLC
1610 // /proc/locks needles (major:minor:inode). Without this cache, a
1611 // host with N LLCs would re-read mountinfo N× per DISCOVER pass,
1612 // and DISCOVER itself runs up to ACQUIRE_MAX_TOCTOU_RETRIES+1
1613 // times in the retry loop, plus once on the retry-exhausted
1614 // diagnostic path (up to 5 total). Mount points are
1615 // effectively static during a plan acquisition — a bind mount
1616 // changing under us mid-acquire is a host-reconfiguration event
1617 // that invalidates every parallel acquirer anyway, not something
1618 // we need to re-read to observe.
1619 let mountinfo = crate::flock::read_mountinfo().map_err(|e| ResourceContention {
1620 reason: format!("read /proc/self/mountinfo: {e}"),
1621 })?;
1622
1623 let mut attempt: u32 = 0;
1624 loop {
1625 let snapshots =
1626 discover_llc_snapshots(topo, &allowed, &mountinfo).map_err(|e| ResourceContention {
1627 reason: format!("discover LLC snapshots: {e}"),
1628 })?;
1629 let selected = plan_from_snapshots(&snapshots, target_cpus, topo, &allowed, |from, to| {
1630 test_topo.numa_distance(from, to)
1631 });
1632 if selected.is_empty() {
1633 // Every LLC's CPU set lies outside the allowed cpuset —
1634 // sysfs disagrees with sched_getaffinity. This is a host
1635 // misconfiguration (stale sysfs after hotplug, cgroup
1636 // pinned to a CPU range the kernel no longer reports in
1637 // llc_groups, etc.). Bail with actionable text rather
1638 // than looping through retries that cannot change the
1639 // outcome.
1640 return Err(ResourceContention {
1641 reason: format!(
1642 "no host LLC overlaps the process's \
1643 {allowed_cpus}-CPU allowed set — sysfs LLC groups \
1644 and sched_getaffinity disagree"
1645 ),
1646 }
1647 .into());
1648 }
1649 match acquire_fn(&selected, &snapshots).map_err(|e| ResourceContention {
1650 reason: format!("acquire LLC locks: {e}"),
1651 })? {
1652 Some(locks) => {
1653 // Success — materialize cpus + mems from the selected
1654 // indices, intersecting each LLC's CPU list with
1655 // `allowed` so `plan.cpus` never contains a CPU the
1656 // process cannot run on, and TRUNCATING at exactly
1657 // `target_cpus` so the last-LLC overshoot
1658 // contributes only the prefix the budget needs. The
1659 // full LLC is still flocked (the coordination unit
1660 // is per-LLC), but the CPUs beyond `target_cpus`
1661 // never appear in `plan.cpus` — sched_setaffinity
1662 // masks and cgroup cpuset.cpus writes reflect the
1663 // exact budget. `mems` collects the NUMA nodes of
1664 // CPUs that actually appear in `plan.cpus`; an LLC
1665 // that contributes a partial slice on a cross-node
1666 // split only registers the nodes of its
1667 // actually-used CPUs.
1668 let mut cpus: Vec<usize> = Vec::new();
1669 let mut mems: std::collections::BTreeSet<usize> = std::collections::BTreeSet::new();
1670 'outer: for &idx in &selected {
1671 let group = &topo.llc_groups[idx];
1672 for &cpu in &group.cpus {
1673 if !allowed.contains(&cpu) {
1674 continue;
1675 }
1676 if cpus.len() >= target_cpus {
1677 break 'outer;
1678 }
1679 cpus.push(cpu);
1680 let node = topo.cpu_to_node.get(&cpu).copied().unwrap_or(0);
1681 mems.insert(node);
1682 }
1683 }
1684 return Ok(LlcPlan {
1685 locked_llcs: selected,
1686 cpus,
1687 mems,
1688 snapshot: snapshots,
1689 locks,
1690 });
1691 }
1692 None => {
1693 if attempt >= ACQUIRE_MAX_TOCTOU_RETRIES {
1694 // Rebuild holder diagnostics from a FRESH read so
1695 // the error points at the peer that actually won.
1696 let final_snapshots = discover_llc_snapshots(topo, &allowed, &mountinfo)?;
1697 let holders: Vec<String> = final_snapshots
1698 .iter()
1699 .filter(|s| !s.holders.is_empty())
1700 .map(|s| {
1701 format!(
1702 "LLC {}: {}",
1703 s.llc_idx,
1704 crate::flock::format_holder_list(&s.holders)
1705 )
1706 })
1707 .collect();
1708 let holder_text = if holders.is_empty() {
1709 "<none recorded>".to_string()
1710 } else {
1711 holders.join("; ")
1712 };
1713 return Err(anyhow::Error::new(ResourceContention {
1714 reason: format!(
1715 "acquire_llc_plan: could not reserve {target_cpus} \
1716 CPU(s) after {attempts} attempts; holders: \
1717 {holder_text}. Run `ktstr locks --json` to see \
1718 every ktstr lock on this host.",
1719 attempts = ACQUIRE_MAX_TOCTOU_RETRIES + 1,
1720 ),
1721 }));
1722 }
1723 // Sleep between attempts so a racing peer has time
1724 // to drop its fds before the next DISCOVER. Indexed
1725 // by `attempt` (0..RETRIES) — see TOCTOU_RETRY_DELAYS.
1726 std::thread::sleep(TOCTOU_RETRY_DELAYS[attempt as usize]);
1727 attempt += 1;
1728 }
1729 }
1730 }
1731}
1732
1733/// Parallelism hint for `make -j{N}` when running under an
1734/// [`LlcPlan`] reservation. Returns the flattened host-CPU count
1735/// (`plan.cpus.len()`), clamped to at least 1 so a pathological empty
1736/// plan still produces a runnable command.
1737///
1738/// Rationale: without this hint, `make -j$(nproc)` fans gcc
1739/// children across every online CPU, defeating the --cpu-cap
1740/// reservation — the build escapes the cgroup cpuset in scheduling
1741/// terms even though the kernel enforces CPU membership. Passing
1742/// `plan.cpus.len()` to make keeps gcc's parallel width aligned with
1743/// the reserved capacity.
1744pub fn make_jobs_for_plan(plan: &LlcPlan) -> usize {
1745 plan.cpus.len().max(1)
1746}
1747
1748/// Render selected LLC indices for user-facing warning text.
1749///
1750/// Format is compact and stable: `[0 (node 0), 2 (node 1)]` when the
1751/// host exposes NUMA information, `[0, 2]` on degraded hosts whose
1752/// `cpu_to_node` map is empty. Used by
1753/// [`warn_if_cross_node_spill`] to render the `ktstr: reserving LLCs
1754/// …` message when an `--cpu-cap` plan spills across nodes.
1755pub fn format_llc_list(locked: &[usize], topo: &HostTopology) -> String {
1756 let parts: Vec<String> = locked
1757 .iter()
1758 .map(|&idx| {
1759 if topo.cpu_to_node.is_empty() {
1760 idx.to_string()
1761 } else {
1762 let node = topo.llc_numa_node(idx);
1763 format!("{idx} (node {node})")
1764 }
1765 })
1766 .collect();
1767 format!("[{}]", parts.join(", "))
1768}
1769
1770/// Emit the cross-node spill warning when an `--cpu-cap` plan's
1771/// `mems` set spans more than one NUMA node. No-op for single-node
1772/// plans.
1773///
1774/// `eprintln!`, not `tracing::warn!`: this is user-visible
1775/// UX feedback (the operator picked a cap that couldn't fit in one
1776/// NUMA node), not operational instrumentation. Fires at most once
1777/// per plan — there is nothing in the plan lifecycle that causes a
1778/// re-trigger. Single-node plans (including single-socket hosts and
1779/// caps that fit within a single node) never emit.
1780///
1781/// Placement: called by `kernel_build_pipeline` and friends right
1782/// after [`acquire_llc_plan`] returns, before the sandbox mount.
1783/// Extracting this into a helper rather than inlining at the call
1784/// site lets the message body be unit-tested via
1785/// [`cross_node_spill_warning`] without capturing stderr.
1786pub fn warn_if_cross_node_spill(plan: &LlcPlan, topo: &HostTopology) {
1787 if let Some(msg) = cross_node_spill_warning(plan, topo) {
1788 eprintln!("{msg}");
1789 }
1790}
1791
1792/// Build the cross-node spill warning string for `plan`, or `None`
1793/// when the plan fits within a single NUMA node (the suppression
1794/// case). [`warn_if_cross_node_spill`] is a thin wrapper that
1795/// `eprintln!`s the `Some` value; this function holds the actual
1796/// gate-and-format logic so a test can pin both halves — the
1797/// predicate gate AND the rendered message — without a stderr
1798/// capture seam. The returned string is exactly the bytes
1799/// `warn_if_cross_node_spill` would emit (sans the trailing newline
1800/// that `eprintln!` appends).
1801fn cross_node_spill_warning(plan: &LlcPlan, topo: &HostTopology) -> Option<String> {
1802 if !should_warn_cross_node(&plan.mems) {
1803 return None;
1804 }
1805 Some(format!(
1806 "ktstr: reserving LLCs {list} across {n} NUMA nodes \
1807 (preferred single-node contiguous unavailable). Build \
1808 will run; memory-access latency may be higher.",
1809 list = format_llc_list(&plan.locked_llcs, topo),
1810 n = plan.mems.len(),
1811 ))
1812}
1813
/// Gate for the cross-node spill warning: `true` exactly when `mems`
/// holds two or more NUMA nodes.
///
/// Empty and single-node sets return `false`, which is what suppresses
/// the warning for plans that stay on one node. Kept as a standalone
/// pure predicate so a test can pin the polarity of the comparison
/// directly; a flipped or off-by-one condition would either warn on
/// every plan or never warn.
fn should_warn_cross_node(mems: &std::collections::BTreeSet<usize>) -> bool {
    !matches!(mems.len(), 0 | 1)
}
1828
/// Warning text when the effective host-CPU budget is below the guest's
/// vCPU count, else `None`.
///
/// Under `effective_host_cpus < vcpus` the host time-slices the vCPU
/// threads, so absolute work scales roughly with `1/oversub` and
/// guest-scheduler timing metrics (run_delay, off-CPU, wake latency,
/// gaps) become host-contention artifacts — the silent-wrong-answer
/// class the no-perf budget sizing guards against (see
/// [`no_perf_cpu_budget`]).
///
/// # Parameters
///
/// * `effective_host_cpus` — host CPUs actually available to the VM. A
///   value of 0 is treated as 1 when computing the ratio, so the ratio
///   stays finite.
/// * `vcpus` — guest vCPU count.
/// * `explicit` — selects severity. `true` means a per-test
///   `cpu_budget` / `--cpu-cap` deliberately asked for fewer CPUs (an
///   opt-in note). `false` means the budget collapsed to a process
///   cpuset smaller than the guest, which nothing opted into, and
///   produces the louder `WARNING:` text.
/// * `watchdog_secs` — when `Some(w)` with `w <= 5`, a caveat about a
///   tight scheduler watchdog is appended.
///
/// # Returns
///
/// `None` when `effective_host_cpus >= vcpus`; otherwise the message.
/// The function is pure (it only builds the text) so a test can pin the
/// message and the `None` polarity without capturing stderr.
///
/// NOTE(review): the `~1/{oversub:.0}` fragment rounds the ratio to a
/// whole number, so a 1.3x oversubscription renders as `~1/1` next to
/// the `1.3x` figure — confirm whether that is intended.
pub(crate) fn overcommit_warning(
    effective_host_cpus: usize,
    vcpus: usize,
    explicit: bool,
    watchdog_secs: Option<u64>,
) -> Option<String> {
    if effective_host_cpus >= vcpus {
        return None;
    }
    // `.max(1)` avoids a division by zero when the budget is 0 CPUs.
    let oversub = vcpus as f64 / effective_host_cpus.max(1) as f64;
    let mut msg = if explicit {
        format!(
            "ktstr: cpu_budget {effective_host_cpus} host CPUs < {vcpus} vCPUs \
             ({oversub:.1}x oversubscription, opt-in): the host time-slices the \
             vCPU threads, so absolute iterations scale ~1/{oversub:.0} and \
             guest-scheduler timing metrics (run_delay, off-CPU, wake latency, \
             gaps) are host-contention artifacts. Use worst_iterations_per_cpu_sec \
             for an overcommit-invariant per-cgroup rate."
        )
    } else {
        format!(
            "ktstr: WARNING: only {effective_host_cpus} host CPUs available for \
             {vcpus} vCPUs ({oversub:.1}x oversubscription) — the process cpuset \
             is smaller than the guest, so the auto-sized CPU budget collapsed \
             to it. NOTHING opted into this. The host time-slices the vCPU \
             threads, confounding guest-scheduler measurement (absolute work \
             scales ~1/{oversub:.0}; timing metrics are host artifacts). Widen \
             the process cpuset, or shrink the guest topology."
        )
    };
    // Bind the watchdog value and test it in one step instead of
    // checking `is_some_and` and then unwrapping separately.
    if let Some(w) = watchdog_secs.filter(|&w| w <= 5) {
        msg.push_str(&format!(
            " Also: watchdog_timeout_s={w} is tight under oversubscription — a \
             host-descheduled vCPU can trip the scheduler watchdog (false stall)."
        ));
    }
    Some(msg)
}
1887
/// Decide whether [`mbind_to_nodes`] has nothing to do and must return
/// before touching `addr` or issuing the `mbind(2)` syscall.
///
/// Returns `true` for an empty node set (no policy target) or a
/// zero-length region, and `false` only when both a non-zero length and
/// at least one node are present. It is the exact guard
/// [`mbind_to_nodes`] consults, exposed as a pure predicate so the
/// short-circuit can be asserted directly rather than inferred from a
/// call that merely did not crash.
fn mbind_should_skip(len: usize, nodes: &[usize]) -> bool {
    let has_work = len != 0 && !nodes.is_empty();
    !has_work
}
1898
1899/// Bind a memory region to specific NUMA nodes using `mbind(MPOL_BIND)`.
1900/// `nodes` is the set of NUMA node IDs. Logs a warning on error
1901/// (single-node systems, missing capabilities).
1902///
1903/// # Safety
1904///
1905/// The caller must ensure that `addr` points to a valid mmap'd region
1906/// of at least `len` bytes. The kernel will read this range via the
1907/// `mbind(2)` syscall to set its NUMA memory policy; passing a stale,
1908/// unmapped, or out-of-bounds pointer is undefined behavior from the
1909/// process's perspective (the syscall itself returns EFAULT, but the
1910/// surrounding Rust contract is violated).
1911///
1912/// When `nodes.is_empty()` or `len == 0`, the function short-circuits
1913/// without dereferencing `addr`, so a null or dangling pointer is
1914/// permitted in those cases.
1915pub unsafe fn mbind_to_nodes(addr: *mut u8, len: usize, nodes: &[usize]) {
1916 if mbind_should_skip(len, nodes) {
1917 return;
1918 }
1919 let node_set: std::collections::BTreeSet<usize> = nodes.iter().copied().collect();
1920 let (nodemask, maxnode) = crate::workload::build_nodemask(&node_set);
1921
1922 let rc = unsafe {
1923 libc::syscall(
1924 libc::SYS_mbind,
1925 addr as *mut libc::c_void,
1926 len,
1927 libc::MPOL_BIND,
1928 nodemask.as_ptr(),
1929 maxnode,
1930 0u32,
1931 )
1932 };
1933 if rc == 0 {
1934 eprintln!(
1935 "performance_mode: mbind {} MB to NUMA node(s) {:?}",
1936 len >> 20,
1937 nodes,
1938 );
1939 } else {
1940 let err = std::io::Error::last_os_error();
1941 eprintln!(
1942 "performance_mode: WARNING: mbind to node(s) {:?} failed: {err}",
1943 nodes,
1944 );
1945 }
1946}
1947
1948use crate::topology::parse_cpu_list_lenient;
1949
1950/// Number of free 2MB hugepages on the host.
1951pub fn hugepages_free() -> u64 {
1952 hugepages_free_from(std::path::Path::new(
1953 "/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages",
1954 ))
1955}
1956
/// Path-parameterized core of [`hugepages_free`].
///
/// Reads the file at `path`, trims surrounding whitespace, and parses
/// the remainder as a `u64`. Returns 0 when the file cannot be read or
/// its content does not parse. The path parameter is a test seam: the
/// parse and the 0-fallback can be exercised against fixture files
/// without depending on the host's hugetlbfs configuration.
fn hugepages_free_from(path: &std::path::Path) -> u64 {
    match std::fs::read_to_string(path) {
        Ok(text) => text.trim().parse::<u64>().unwrap_or(0),
        Err(_) => 0,
    }
}
1969
/// Number of 2 MiB hugepages required to back `memory_mib` MiB of
/// memory, rounding up so an odd MiB count still gets a full page.
pub fn hugepages_needed(memory_mib: u32) -> u64 {
    const MIB_PER_HUGEPAGE: u64 = 2;
    u64::from(memory_mib).div_ceil(MIB_PER_HUGEPAGE)
}
1975
1976/// Estimate current host CPU load by checking /proc/stat.
1977/// Returns (busy_cpus, total_cpus) as a rough estimate.
1978pub fn host_load_estimate() -> Option<(usize, usize)> {
1979 // Count processes in R state from /proc/stat.
1980 let stat = std::fs::read_to_string("/proc/stat").ok()?;
1981 let procs_running = stat
1982 .lines()
1983 .find(|l| l.starts_with("procs_running "))?
1984 .split_whitespace()
1985 .nth(1)?
1986 .parse::<usize>()
1987 .ok()?;
1988 let online = std::fs::read_to_string("/sys/devices/system/cpu/online").ok()?;
1989 let total = parse_cpu_list_lenient(&online).len();
1990 Some((procs_running, total))
1991}
1992
1993#[cfg(test)]
1994mod tests;