ktstr/vmm/host_topology/
mod.rs

1//! Host CPU topology discovery for performance_mode.
2//!
3//! Wraps [`TestTopology`](crate::topology::TestTopology) for LLC-aware
4//! vCPU pinning and host resource validation.
5
6use anyhow::{Context, Result};
7
8// Advisory flock primitives live in `crate::flock` so both LLC +
9// per-CPU coordination here and per-cache-entry coordination in
10// `crate::cache` share one `try_flock` implementation (with a single
11// `O_CLOEXEC` source of truth) plus one `HolderInfo` /proc/locks
12// parser. Re-importing the names keeps existing in-module call sites
13// (production + `super::*` tests) compiling unchanged.
14use crate::flock::{FlockMode, try_flock};
15
16/// Resource contention error — LLC slots or CPUs unavailable.
17/// Downcast via `anyhow::Error::downcast_ref::<ResourceContention>()`
18/// to distinguish from fatal errors.
#[derive(Debug)]
pub struct ResourceContention {
    /// Human-readable description of the shortage. The `Display` impl
    /// prints exactly this text.
    pub reason: String,
}

/// `Display` emits `reason` verbatim, with no prefix or decoration.
impl std::fmt::Display for ResourceContention {
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        out.write_str(&self.reason)
    }
}

/// Implements `std::error::Error` (no source, no overrides) so the type can
/// be wrapped in `anyhow::Error` and recovered with `downcast_ref`.
impl std::error::Error for ResourceContention {}
31
32/// The requested topology cannot be realized on this host, and no retry
33/// changes that. Surfaced as a SKIP by the x86_64 VM-creation caps (guest
34/// RAM top above the host MAXPHYADDR, vCPU count above KVM_CAP_MAX_VCPUS,
35/// or max APIC id at/above KVM_CAP_MAX_VCPU_ID): these fire for ANY VM of
36/// this shape, perf-mode or not, so the test cannot run here. Also
37/// returned by the `performance_mode` planner (`compute_pinning`) when the
38/// host has too few physical CPUs / LLC groups — but that perf-mode caller
39/// RE-MAPS it to [`PerfModeUnavailable`] (a host-insufficiency: skip by
40/// default, fail under `KTSTR_NO_SKIP_MODE`). Also raised by
41/// `resolve_cpu_budget` when an author's per-test `cpu_budget` exceeds the
42/// allowed-CPU count — the author-attribute half of a provenance split (a
43/// capability requirement a bigger host satisfies → skip), mirroring the
44/// operator-knob half [`CpuBudgetUnsatisfiable`] (a concrete `--cpu-cap`
45/// number the host cannot satisfy → hard fail). Distinct
46/// from [`ResourceContention`] (a transient slot/resource shortage a retry
47/// resolves → skip); a too-small host is permanent, so the operator must
48/// provision different hardware or narrow the topology rather than retry.
49///
50/// Downcast via `anyhow::Error::downcast_ref::<TopologyInsufficient>()`
51/// (chain-aware: the `#[ktstr_test]` dispatch and `skip_on_contention!`
52/// walk the full error chain so a `.context(...)`-wrapped instance is
53/// still recognised). This typed error replaced a fragile message
54/// string-match (`"need"` + `"LLC"`/`"CPU"`) that would misclassify any
55/// unrelated error happening to contain those words.
#[derive(Debug)]
pub struct TopologyInsufficient {
    /// Human-readable description of why the topology cannot be realized.
    /// The `Display` impl prints exactly this text.
    pub reason: String,
}

/// `Display` emits `reason` verbatim, with no prefix or decoration.
impl std::fmt::Display for TopologyInsufficient {
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        out.write_str(&self.reason)
    }
}

/// Implements `std::error::Error` (no source, no overrides) so the type can
/// be wrapped in `anyhow::Error` and recovered with `downcast_ref`.
impl std::error::Error for TopologyInsufficient {}
68
69/// The host cannot honor the `performance_mode` guarantee (an exclusive
70/// host LLC for the test's virtual LLC topology + a service CPU), and no
71/// retry changes that — a permanent host-insufficiency (e.g. a single-LLC
72/// host whose LLC spans every CPU, so LLC + 1 service never fits). Treated
73/// like [`TopologyInsufficient`] / [`ResourceContention`]: a SKIP by
74/// default (the VM never runs unisolated — it errors at build, so a
75/// visible skip informs the operator without reddening CI on a host that
76/// can never satisfy perf-mode), promoted to a hard FAIL under
77/// `KTSTR_NO_SKIP_MODE` for runs that demand perf-mode execution. The
78/// remedy is unchanged: provision a host with a spare LLC/CPU, narrow the
79/// topology, or drop `--perf-mode`.
80///
81/// Downcast via `anyhow::Error::downcast_ref::<PerfModeUnavailable>()`
82/// (chain-aware: the dispatch + macro predicates walk the full error
83/// chain, so a `.context(...)`-wrapped instance is still recognised).
#[derive(Debug)]
pub struct PerfModeUnavailable {
    /// Human-readable description of why perf-mode cannot be honored.
    /// The `Display` impl prints exactly this text.
    pub reason: String,
}

/// `Display` emits `reason` verbatim, with no prefix or decoration.
impl std::fmt::Display for PerfModeUnavailable {
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        out.write_str(&self.reason)
    }
}

/// Implements `std::error::Error` (no source, no overrides) so the type can
/// be wrapped in `anyhow::Error` and recovered with `downcast_ref`.
impl std::error::Error for PerfModeUnavailable {}
96
97/// An operator `--cpu-cap N` (or `KTSTR_CPU_CAP`) the host cannot satisfy: N
98/// exceeds the CPUs this process is allowed on. A HARD ERROR, not a skip —
99/// the operator typed a concrete number that does not exist on this host (a
100/// user-input error). This is the OPERATOR-knob half of a provenance split:
101/// an author's per-test `cpu_budget` over the allowance is instead a
102/// [`TopologyInsufficient`] SKIP (a capability request a bigger host would
103/// satisfy), raised in `resolve_cpu_budget`. Contrast [`ResourceContention`]
104/// (a transient shortage of an otherwise-satisfiable budget → skip/retry).
105///
106/// Downcast via `anyhow::Error::downcast_ref::<CpuBudgetUnsatisfiable>()`
107/// (chain-aware).
#[derive(Debug)]
pub struct CpuBudgetUnsatisfiable {
    /// Human-readable description of the unsatisfiable CPU budget.
    /// The `Display` impl prints exactly this text.
    pub reason: String,
}

/// `Display` emits `reason` verbatim, with no prefix or decoration.
impl std::fmt::Display for CpuBudgetUnsatisfiable {
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        out.write_str(&self.reason)
    }
}

/// Implements `std::error::Error` (no source, no overrides) so the type can
/// be wrapped in `anyhow::Error` and recovered with `downcast_ref`.
impl std::error::Error for CpuBudgetUnsatisfiable {}
120
121/// The requested topology cannot be represented by this VMM's static
122/// device layout, and the limit is host-INDEPENDENT, so no retry and no
123/// different host changes that. Concretely: the aarch64 vCPU count
124/// exceeds `MAX_VCPUS` (the capacity of the statically sized GICv3
125/// redistributor MMIO window) — with more vCPUs the redistributor region
126/// overruns the device MMIO window and shadows serial/virtio.
127///
128/// A HARD ERROR — distinct from [`TopologyInsufficient`], its deliberate
129/// counterpart. `TopologyInsufficient` is host-DEPENDENT (the VM cannot
130/// boot on *this* host, but a bigger host could → skip);
131/// `TopologyUnrepresentable` is a fixed VMM-layout limit no aarch64 host
132/// can satisfy under this VMM, so it is a test misconfiguration — the
133/// author must narrow the topology, not provision different hardware.
134/// Routes to `EXIT_FAIL` via a DEDICATED hard-fail arm (the
135/// `is_topology_unrepresentable` predicate) in both `result_to_exit_code`
136/// and the `#[ktstr_test]` macro body, placed ABOVE the `expect_err`
137/// inversion and the skip arms — mirroring `CpuBudgetUnsatisfiable` (the
138/// other dedicated hard-fail). That placement is what makes it fail even in
139/// an `expect_err` test (the generic `expect_err` arm would otherwise
140/// invert it to a pass) and keeps it out of the `skip_on_contention!` /
141/// `is_topology_insufficient` skip paths, so the misconfiguration can
142/// never masquerade as the expected failure or be turned into a skip.
143///
144/// Downcast via `anyhow::Error::downcast_ref::<TopologyUnrepresentable>()`
145/// (chain-aware: walks `e.chain()`, so a `.context(...)`-wrapped instance
146/// is still recognised) to identify it programmatically — e.g. tests
147/// asserting the over-`MAX_VCPUS` bail is this hard-fault and not a bare
148/// string-matched error.
149// Constructed only on aarch64 (the GICv3-layout over-MAX_VCPUS bail in
150// aarch64::kvm) and in cross-arch routing tests; a non-aarch64 lib-only
151// build sees no construction site. Keep the dead-code check live on
152// aarch64 (where the bail MUST construct it — a real regression if it
153// stops) and allow it only off-arch.
#[cfg_attr(not(target_arch = "aarch64"), allow(dead_code))]
#[derive(Debug)]
pub struct TopologyUnrepresentable {
    /// Human-readable description of the layout limit that was exceeded.
    /// The `Display` impl prints exactly this text.
    pub reason: String,
}

/// `Display` emits `reason` verbatim, with no prefix or decoration.
impl std::fmt::Display for TopologyUnrepresentable {
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        out.write_str(&self.reason)
    }
}

/// Implements `std::error::Error` (no source, no overrides) so the type can
/// be wrapped in `anyhow::Error` and recovered with `downcast_ref`.
impl std::error::Error for TopologyUnrepresentable {}
167
/// A physical LLC group on the host: the set of host CPUs that share one
/// last-level cache.
///
/// The struct carries no cache id of its own. Within this module a group is
/// addressed by its position in [`HostTopology::llc_groups`] (the
/// `llc_idx` values used by the pinning and NUMA helpers).
#[derive(Debug, Clone)]
pub struct LlcGroup {
    /// CPUs sharing this LLC.
    pub cpus: Vec<usize>,
}
174
/// Host CPU topology: LLC groups, NUMA nodes, and online CPU set.
///
/// Built by [`HostTopology::from_sysfs`] (or the process-wide
/// [`HostTopology::cached`] wrapper around it). `host_node_llcs` is derived
/// from `llc_groups` + `cpu_to_node` at construction time, so code that
/// builds or mutates a value by hand must keep the three in sync.
#[derive(Debug, Clone)]
pub struct HostTopology {
    /// LLC groups indexed by their order of discovery.
    pub llc_groups: Vec<LlcGroup>,
    /// All online CPUs.
    pub online_cpus: Vec<usize>,
    /// NUMA node ID for each online CPU, indexed by CPU ID.
    /// CPUs not in the map default to node 0.
    /// `from_sysfs` fills this by assigning every CPU of an LLC the NUMA
    /// node reported for that LLC.
    pub cpu_to_node: std::collections::HashMap<usize, usize>,
    /// LLC indices grouped by their NUMA node. Memoized at construction
    /// time from `llc_groups + cpu_to_node` so repeated NUMA-aware
    /// placement queries (perf-mode rotation, `--cpu-cap` consolidation
    /// PLAN) don't re-walk every LLC's CPU list on every call. Access
    /// via [`HostTopology::host_llcs_by_numa_node`]. `BTreeMap` (not
    /// `HashMap`) for deterministic iteration order — two ktstr
    /// invocations on the same host MUST produce identical LLC
    /// selections so their ACQUIRE phases converge on the same indices.
    /// Each value is a list of indices into `llc_groups`, in ascending
    /// order.
    pub(crate) host_node_llcs: std::collections::BTreeMap<usize, Vec<usize>>,
}
195
/// Pinning plan: maps each vCPU index to a host CPU, plus a dedicated
/// CPU for service threads (monitor, watchdog).
///
/// Produced by [`HostTopology::compute_pinning`].
#[derive(Debug)]
pub struct PinningPlan {
    /// vcpu_index -> host_cpu
    /// (one entry per vCPU; `compute_pinning` never assigns the same host
    /// CPU to two vCPUs).
    pub assignments: Vec<(u32, usize)>,
    /// Dedicated host CPU for monitor/watchdog threads. Set when
    /// `reserve_service_cpu` is true in `compute_pinning`.
    /// `None` otherwise.
    pub service_cpu: Option<usize>,
    /// Host LLC group indices used by this plan, sorted.
    /// Also de-duplicated by `compute_pinning`.
    pub llc_indices: Vec<usize>,
    /// Held flock fds for resource reservation. Dropped when the plan
    /// (and the KtstrVm holding it) is dropped, releasing all locks.
    /// `compute_pinning` itself returns this empty; presumably the
    /// lock-acquiring caller fills it in afterwards (not visible here).
    #[allow(dead_code)] // RAII: flock fds released on Drop, not read after construction.
    pub(crate) locks: Vec<std::os::fd::OwnedFd>,
}
212
/// Process-wide cache for [`HostTopology::cached`]. Only
/// populated on success — a failed sysfs probe retries on the
/// next call instead of poisoning the cache.
///
/// `OnceLock` is write-once: after the first successful store the value
/// stays for the rest of the process, and `cached()` hands out clones of
/// it without probing sysfs again.
static CACHED_HOST_TOPOLOGY: std::sync::OnceLock<HostTopology> = std::sync::OnceLock::new();
217
218impl HostTopology {
219    /// Read host topology from sysfs via [`TestTopology::from_system()`](crate::topology::TestTopology::from_system).
220    pub fn from_sysfs() -> Result<Self> {
221        let topo = crate::topology::TestTopology::from_system()
222            .context("read host topology from sysfs")?;
223        let online_cpus = topo.all_cpus().to_vec();
224        let llc_groups: Vec<LlcGroup> = topo
225            .llcs()
226            .iter()
227            .map(|llc| LlcGroup {
228                cpus: llc.cpus().to_vec(),
229            })
230            .collect();
231        let cpu_to_node: std::collections::HashMap<usize, usize> = topo
232            .llcs()
233            .iter()
234            .flat_map(|llc| llc.cpus().iter().map(|&cpu| (cpu, llc.numa_node())))
235            .collect();
236        let host_node_llcs = Self::compute_host_node_llcs(&llc_groups, &cpu_to_node);
237        Ok(Self {
238            llc_groups,
239            online_cpus,
240            cpu_to_node,
241            host_node_llcs,
242        })
243    }
244
245    /// Return a cached host topology, populating the cache on first
246    /// successful call. Failed reads retry on the next call — the
247    /// cache only stores success so a transient sysfs issue at
248    /// process start doesn't poison every subsequent build().
249    pub fn cached() -> Result<Self> {
250        if let Some(topo) = CACHED_HOST_TOPOLOGY.get() {
251            return Ok(topo.clone());
252        }
253        let topo = Self::from_sysfs()?;
254        let _ = CACHED_HOST_TOPOLOGY.set(topo.clone());
255        Ok(topo)
256    }
257
258    /// Build a synthetic `HostTopology` from `(cpu_list, node_id)`
259    /// pairs for tests. One pair per LLC group; within a pair the
260    /// `cpu_list` becomes the group's CPUs and the `node_id` is the
261    /// NUMA node every CPU in that group is assigned to.
262    /// `online_cpus` is the flattened concatenation of every group's
263    /// CPUs in input order; `cpu_to_node` is built by broadcasting
264    /// each group's node over its CPUs; `host_node_llcs` goes through
265    /// the same [`compute_host_node_llcs`] path production uses, so
266    /// tests never diverge from the sysfs-derived memoization.
267    ///
268    /// Intended for test fixtures that want a deterministic in-memory
269    /// topology without stubbing `/sys/devices/system/cpu/*`.
270    /// Previously this logic was duplicated across three helper
271    /// functions (`synthetic_topo`, `synthetic_topo_numa`,
272    /// `synth_host_topo`) — consolidated here so the
273    /// `HostTopology` invariant is maintained in one place. The
274    /// `#[cfg(test)]` gate keeps the symbol out of release builds.
275    #[cfg(test)]
276    pub(crate) fn new_for_tests(groups: &[(Vec<usize>, usize)]) -> Self {
277        let llc_groups: Vec<LlcGroup> = groups
278            .iter()
279            .map(|(cpus, _)| LlcGroup { cpus: cpus.clone() })
280            .collect();
281        let cpu_to_node: std::collections::HashMap<usize, usize> = groups
282            .iter()
283            .flat_map(|(cpus, node)| cpus.iter().map(move |&cpu| (cpu, *node)))
284            .collect();
285        let online_cpus: Vec<usize> = groups
286            .iter()
287            .flat_map(|(cpus, _)| cpus.iter().copied())
288            .collect();
289        let host_node_llcs = HostTopology::compute_host_node_llcs(&llc_groups, &cpu_to_node);
290        HostTopology {
291            llc_groups,
292            online_cpus,
293            cpu_to_node,
294            host_node_llcs,
295        }
296    }
297
298    /// Compute the memoized `host_node_llcs` map from `llc_groups` +
299    /// `cpu_to_node`. Uses the same majority-vote NUMA-assignment rule
300    /// as [`Self::llc_numa_node`], so the memoized map and the one-off query
301    /// method never disagree. Separate fn (not inlined) so
302    /// `from_sysfs` and synthetic-test constructors share one path.
303    fn compute_host_node_llcs(
304        llc_groups: &[LlcGroup],
305        cpu_to_node: &std::collections::HashMap<usize, usize>,
306    ) -> std::collections::BTreeMap<usize, Vec<usize>> {
307        let mut node_llcs: std::collections::BTreeMap<usize, Vec<usize>> =
308            std::collections::BTreeMap::new();
309        for (idx, group) in llc_groups.iter().enumerate() {
310            // Majority-vote NUMA node for this LLC — matches
311            // `llc_numa_node` exactly. We inline the logic here rather
312            // than calling the method because we don't yet have `self`.
313            let mut counts: std::collections::HashMap<usize, usize> =
314                std::collections::HashMap::new();
315            for &cpu in &group.cpus {
316                let node = cpu_to_node.get(&cpu).copied().unwrap_or(0);
317                *counts.entry(node).or_insert(0) += 1;
318            }
319            let node = counts
320                .into_iter()
321                .max_by_key(|&(_, count)| count)
322                .map(|(node, _)| node)
323                .unwrap_or(0);
324            node_llcs.entry(node).or_default().push(idx);
325        }
326        // Within-node LLC ordering: ascending llc_idx. Callers that
327        // walk `host_node_llcs[node]` rely on this for deterministic
328        // output — two ktstr invocations with identical topology see
329        // the same walk order.
330        for llcs in node_llcs.values_mut() {
331            llcs.sort_unstable();
332        }
333        node_llcs
334    }
335
336    /// Maximum cores per LLC group on the host.
337    pub fn max_cores_per_llc(&self) -> usize {
338        self.llc_groups
339            .iter()
340            .map(|g| g.cpus.len())
341            .max()
342            .unwrap_or(0)
343    }
344
345    /// Total available host CPUs.
346    pub fn total_cpus(&self) -> usize {
347        self.online_cpus.len()
348    }
349
350    // ------------------------------------------------------------------
351    // Shared NUMA-placement primitives
352    // ------------------------------------------------------------------
353    //
354    // Used by the existing perf-mode pinning path
355    // ([`numa_aware_llc_order`]) AND the `--cpu-cap` consolidation
356    // PLAN phase. Both callers implement DIFFERENT selection algorithms
357    // on top of these queries:
358    //
359    // - Perf-mode distributes virtual NUMA nodes across host NUMA
360    //   nodes with modulo rotation; uses primitive 2
361    //   (eligibility-by-capacity). No distance lookup.
362    // - Consolidation seeds from a scored LLC list then greedily
363    //   expands within the seed's node, spilling to nearest-by-distance
364    //   when needed; uses primitive 3 (plus llc_numa_node).
365    //
366    // Kept as small orthogonal queries rather than a single mega-selector
367    // — the two algorithms genuinely do different things, but they both
368    // need the same three topology lookups.
369
370    /// Memoized map of NUMA node → LLC indices on that node. Returned
371    /// by reference so callers can iterate without cloning; `BTreeMap`
372    /// gives deterministic iteration so two invocations on identical
373    /// topologies produce identical walks.
374    ///
375    /// In-tree callers currently reach the same data via
376    /// [`Self::numa_nodes_sorted_by_distance`] and [`Self::numa_nodes_with_capacity`]
377    /// — both iterate `host_node_llcs` internally — so this accessor
378    /// has no direct consumer today. Kept as a stable handle for
379    /// future callers (e.g. a planned `ktstr topo --json` NUMA
380    /// section) and downstream tooling that wants the raw map.
381    #[allow(dead_code)]
382    pub(crate) fn host_llcs_by_numa_node(&self) -> &std::collections::BTreeMap<usize, Vec<usize>> {
383        &self.host_node_llcs
384    }
385
386    /// Return every NUMA node that has `>= min_llcs` LLCs, paired with
387    /// that node's LLC-index slice. Callers filter through this when
388    /// their algorithm requires per-node capacity guarantees (perf-mode
389    /// passes `ceil(llcs/numa_nodes)` so any guest node can land on any
390    /// host node; consolidation passes 1 so every node with at least
391    /// one free LLC is a valid spill candidate). Iteration order
392    /// follows the underlying `BTreeMap` — ascending by node id.
393    pub(crate) fn numa_nodes_with_capacity(&self, min_llcs: usize) -> Vec<(usize, &Vec<usize>)> {
394        self.host_node_llcs
395            .iter()
396            .filter(|(_, llcs)| llcs.len() >= min_llcs)
397            .map(|(&node, llcs)| (node, llcs))
398            .collect()
399    }
400
401    /// Return NUMA node ids sorted by distance from `anchor` ascending,
402    /// with unreachable nodes (distance 255 per Linux convention)
403    /// demoted to the end. Caller supplies the distance lookup via
404    /// `distance_fn` so this primitive stays independent of any
405    /// specific distance source — consolidation threads
406    /// `TestTopology::numa_distance` through a closure, while callers
407    /// without a distance matrix can pass
408    /// `|from, to| if from == to { 10 } else { 20 }` for a trivial
409    /// near/far split.
410    ///
411    /// `anchor` is included in the output (distance to self = 10 on
412    /// the Linux convention, sorting first). Nodes without any LLCs
413    /// on this host are skipped — spilling to an empty node has no
414    /// value.
415    pub(crate) fn numa_nodes_sorted_by_distance(
416        &self,
417        anchor: usize,
418        distance_fn: impl Fn(usize, usize) -> u8,
419    ) -> Vec<usize> {
420        let mut nodes: Vec<(usize, u8)> = self
421            .host_node_llcs
422            .keys()
423            .map(|&node| (node, distance_fn(anchor, node)))
424            .collect();
425        // Sort: unreachable (255) last; among reachable, ascending
426        // distance; ties broken by ascending node id via the stable
427        // sort applied over a pre-sorted (BTreeMap-ordered) input.
428        nodes.sort_by(|a, b| {
429            let a_unreachable = a.1 == 255;
430            let b_unreachable = b.1 == 255;
431            match (a_unreachable, b_unreachable) {
432                (true, false) => std::cmp::Ordering::Greater,
433                (false, true) => std::cmp::Ordering::Less,
434                _ => a.1.cmp(&b.1),
435            }
436        });
437        nodes.into_iter().map(|(node, _)| node).collect()
438    }
439
440    /// NUMA node for a host LLC group, determined by majority vote of
441    /// its CPUs' NUMA assignments. Returns 0 when the map is empty
442    /// (single-node systems).
443    ///
444    /// Production callers pre-compute the node-to-LLC mapping once at
445    /// [`HostTopology::from_sysfs`] via
446    /// [`compute_host_node_llcs`](Self::compute_host_node_llcs)
447    /// (memoized in [`host_node_llcs`](Self::host_node_llcs)); use
448    /// [`Self::host_llcs_by_numa_node`](Self::host_llcs_by_numa_node) to
449    /// iterate the pre-built map. This method stays exposed for
450    /// external callers (future `ktstr locks` NUMA column + any
451    /// downstream tooling that needs a single-LLC lookup) and
452    /// synthetic-topology tests that assert per-LLC node assignment.
453    pub fn llc_numa_node(&self, llc_idx: usize) -> usize {
454        let group = &self.llc_groups[llc_idx];
455        let mut counts: std::collections::HashMap<usize, usize> = std::collections::HashMap::new();
456        for &cpu in &group.cpus {
457            let node = self.cpu_to_node.get(&cpu).copied().unwrap_or(0);
458            *counts.entry(node).or_insert(0) += 1;
459        }
460        counts
461            .into_iter()
462            .max_by_key(|&(_, count)| count)
463            .map(|(node, _)| node)
464            .unwrap_or(0)
465    }
466
467    /// Compute a pinning plan that maps virtual LLCs to physical LLC groups.
468    ///
469    /// Each virtual LLC's vCPUs are assigned to cores within a single physical LLC.
470    /// `llc_offset` rotates the starting LLC group so concurrent VMs pin to
471    /// different physical cores. When `reserve_service_cpu` is true, one
472    /// additional host CPU is reserved for service threads (monitor, watchdog).
473    ///
474    /// When `topo.numa_nodes > 1`, virtual LLCs are grouped by guest NUMA
475    /// node and each group is placed on host LLCs within the same physical
476    /// NUMA node. Falls back to sequential placement when the host lacks
477    /// enough NUMA-aligned LLCs.
478    ///
479    /// Returns an error if the host cannot satisfy the topology.
480    pub fn compute_pinning(
481        &self,
482        topo: &super::topology::Topology,
483        reserve_service_cpu: bool,
484        llc_offset: usize,
485    ) -> Result<PinningPlan> {
486        let cores = topo.cores_per_llc;
487        let threads = topo.threads_per_core;
488        let llcs = topo.llcs;
489        let vcpus_per_llc = cores * threads;
490        let total_vcpus = llcs * vcpus_per_llc;
491        let total_needed = total_vcpus as usize + if reserve_service_cpu { 1 } else { 0 };
492
493        if total_needed > self.total_cpus() {
494            return Err(anyhow::Error::new(TopologyInsufficient {
495                reason: format!(
496                    "performance_mode: need {} CPUs ({} vCPUs + {} service) \
497                     but only {} host CPUs available",
498                    total_needed,
499                    total_vcpus,
500                    if reserve_service_cpu { 1 } else { 0 },
501                    self.total_cpus(),
502                ),
503            }));
504        }
505
506        let num_llcs = self.llc_groups.len();
507        if llcs as usize > num_llcs {
508            return Err(anyhow::Error::new(TopologyInsufficient {
509                reason: format!(
510                    "performance_mode: need {} LLCs for {} virtual LLCs, \
511                     but host has {} LLC groups",
512                    llcs, llcs, num_llcs,
513                ),
514            }));
515        }
516
517        // Build the virtual-to-host LLC index mapping. When numa_nodes > 1,
518        // try to place each guest NUMA node's LLCs on host LLCs within
519        // the same physical NUMA node.
520        let llc_order = self.numa_aware_llc_order(topo.numa_nodes, llcs, llc_offset);
521
522        let mut assignments = Vec::with_capacity(total_vcpus as usize);
523        let mut used_cpus = std::collections::HashSet::new();
524
525        for llc in 0..llcs {
526            let llc_idx = llc_order[llc as usize];
527            let group = &self.llc_groups[llc_idx];
528            let available: Vec<usize> = group
529                .cpus
530                .iter()
531                .copied()
532                .filter(|c| !used_cpus.contains(c))
533                .collect();
534
535            if available.len() < vcpus_per_llc as usize {
536                return Err(anyhow::Error::new(TopologyInsufficient {
537                    reason: format!(
538                        "performance_mode: LLC group {} has {} available CPUs, \
539                         need {} for virtual LLC {}",
540                        llc_idx,
541                        available.len(),
542                        vcpus_per_llc,
543                        llc,
544                    ),
545                }));
546            }
547
548            for vcpu_in_llc in 0..vcpus_per_llc {
549                let vcpu_id = llc * vcpus_per_llc + vcpu_in_llc;
550                let host_cpu = available[vcpu_in_llc as usize];
551                used_cpus.insert(host_cpu);
552                assignments.push((vcpu_id, host_cpu));
553            }
554        }
555
556        let service_cpu = if reserve_service_cpu {
557            let cpu = self
558                .online_cpus
559                .iter()
560                .copied()
561                .find(|c| !used_cpus.contains(c));
562            // Defensive: the total-CPU check above already folds the +1
563            // service CPU into `total_needed`, so a passing host always
564            // has at least one online CPU beyond the assigned vCPUs and
565            // this never fires today. Typed as TopologyInsufficient (not
566            // plain anyhow) so that if a future refactor of that check ever
567            // lets it through, it is handled identically to its three
568            // sibling shortfall checks: the perf-mode caller
569            // (acquire_slot_with_locks) re-maps every compute_pinning
570            // TopologyInsufficient to PerfModeUnavailable (a host-insufficiency
571            // skip, fail under KTSTR_NO_SKIP_MODE), and
572            // the non-perf caller passes reserve_service_cpu=false so this
573            // site is unreachable there.
574            if cpu.is_none() {
575                return Err(anyhow::Error::new(TopologyInsufficient {
576                    reason: format!(
577                        "performance_mode: no free host CPU for service threads \
578                         after assigning {total_vcpus} vCPUs"
579                    ),
580                }));
581            }
582            cpu
583        } else {
584            None
585        };
586
587        // Deduplicate LLC indices (multiple virtual LLCs may map to the
588        // same host LLC at different offsets, but that's prevented by the
589        // used_cpus check above — each virtual LLC consumes distinct CPUs).
590        let mut llc_indices = llc_order;
591        llc_indices.sort_unstable();
592        llc_indices.dedup();
593
594        Ok(PinningPlan {
595            assignments,
596            service_cpu,
597            llc_indices,
598            locks: Vec::new(),
599        })
600    }
601
602    /// Build the virtual LLC to host LLC index mapping.
603    ///
604    /// Falls back to sequential offset mapping when any of these hold:
605    /// `numa_nodes == 0` (avoids divide-by-zero), `numa_nodes == 1`
606    /// (no NUMA-awareness needed), `cpu_to_node` is empty (no NUMA
607    /// map available), `llcs < numa_nodes` (base-per-node would be 0
608    /// and leave guest nodes empty), or the host lacks enough
609    /// NUMA-aligned LLCs.
610    ///
611    /// Otherwise, distributes `llcs` across `numa_nodes` guest nodes:
612    /// the first `llcs % numa_nodes` guest nodes receive
613    /// `base + 1 = ceil(llcs / numa_nodes)` LLCs each; the rest
614    /// receive `base = floor(llcs / numa_nodes)` LLCs. This preserves
615    /// the remainder that floor-only division would silently drop
616    /// (e.g. `llcs=5, numa_nodes=2` yields counts 3+2 = 5).
617    /// Eligibility requires each host NUMA node to supply at least
618    /// `ceil(llcs / numa_nodes)` (the max any single guest node will
619    /// claim) — stricter than the prior floor-based check, so the
620    /// "+1" guest nodes always land on a node with capacity.
621    ///
622    /// Implementation composes [`Self::numa_nodes_with_capacity`],
623    /// which iterates the memoized `host_node_llcs` map. The
624    /// `--cpu-cap` consolidation PLAN phase instead composes
625    /// [`Self::numa_nodes_sorted_by_distance`] plus
626    /// [`Self::llc_numa_node`], so the two callers share the memoized
627    /// `host_node_llcs` map rather than the same accessor calls. The
628    /// two callers' SELECTION algorithms also differ: perf-mode does
629    /// modulo rotation of guest onto host nodes; consolidation does
630    /// score-driven greedy expansion.
631    pub(crate) fn numa_aware_llc_order(
632        &self,
633        numa_nodes: u32,
634        llcs: u32,
635        llc_offset: usize,
636    ) -> Vec<usize> {
637        let num_host_llcs = self.llc_groups.len();
638
639        // Sequential fallback used by the degenerate cases below.
640        let sequential_fallback = || -> Vec<usize> {
641            (0..llcs as usize)
642                .map(|i| (i + llc_offset) % num_host_llcs)
643                .collect()
644        };
645
646        // Defensive: zero NUMA nodes would divide-by-zero below. Also
647        // handles the single-node case (no NUMA-awareness needed) and
648        // the "cpu_to_node map unavailable" case.
649        if numa_nodes == 0 || numa_nodes == 1 || self.cpu_to_node.is_empty() {
650            return sequential_fallback();
651        }
652
653        // If the guest has fewer LLCs than NUMA nodes, a per-node base
654        // of 0 would leave some guest nodes empty. Fall back rather
655        // than silently dropping those nodes' LLCs.
656        if llcs < numa_nodes {
657            return sequential_fallback();
658        }
659
660        // Distribute LLCs across guest NUMA nodes. Integer division
661        // alone drops the remainder (e.g. llcs=5, numa_nodes=2 gave
662        // 2 per node = 4 LLCs assigned, 5th dropped). Fix: the first
663        // `remainder` nodes get `base + 1`, the rest get `base`.
664        let base_per_node = (llcs / numa_nodes) as usize;
665        let remainder = (llcs % numa_nodes) as usize;
666        // Ceiling-per-node — the largest count any single guest node
667        // will claim. Host NUMA nodes must supply at least this many
668        // to remain eligible.
669        let max_per_node = base_per_node + if remainder > 0 { 1 } else { 0 };
670
671        // Collect host NUMA nodes that can supply the ceiling (max)
672        // per-node count — so any guest node can land there regardless
673        // of whether it's one of the `remainder` "+1" nodes. Shared
674        // primitive: `numa_nodes_with_capacity` filters the memoized
675        // group-by-node map.
676        let eligible_nodes = self.numa_nodes_with_capacity(max_per_node);
677
678        // Need at least numa_nodes distinct host NUMA nodes with enough
679        // LLCs each.
680        if eligible_nodes.len() < numa_nodes as usize {
681            return sequential_fallback();
682        }
683
684        // Assign guest NUMA nodes to host NUMA nodes, rotating by
685        // llc_offset to spread concurrent VMs.
686        let mut order = Vec::with_capacity(llcs as usize);
687        let node_offset = llc_offset / max_per_node.max(1);
688        for guest_node in 0..numa_nodes as usize {
689            let host_idx = (guest_node + node_offset) % eligible_nodes.len();
690            let (_, host_llcs) = &eligible_nodes[host_idx];
691            let within_offset = llc_offset % host_llcs.len();
692            // First `remainder` guest nodes get `base + 1` LLCs; rest
693            // get `base`. Total assigned == llcs (remainder preserved).
694            let count = if guest_node < remainder {
695                base_per_node + 1
696            } else {
697                base_per_node
698            };
699            for i in 0..count {
700                let llc_idx = host_llcs[(i + within_offset) % host_llcs.len()];
701                order.push(llc_idx);
702            }
703        }
704
705        order
706    }
707}
708
/// Lock mode for LLC reservation.
///
/// Selects the flock mode taken on each LLC lockfile: `Exclusive`
/// maps to `FlockMode::Exclusive` and `Shared` to `FlockMode::Shared`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlcLockMode {
    /// Exclusive access to the entire LLC (performance_mode tests).
    /// Returns unavailable when any shared or exclusive holder exists.
    /// No per-CPU locks are taken in this mode: the LLC lock already
    /// covers every CPU in the group.
    Exclusive,
    /// Shared access to the LLC (non-perf pinned tests).
    /// Multiple shared holders coexist; returns unavailable when an
    /// exclusive holder exists. In this mode every host CPU in the
    /// pinning plan (and the service CPU, when set) is additionally
    /// locked exclusively.
    // NOTE(review): presumably no non-test caller constructs this
    // variant yet — confirm before dropping the allow.
    #[allow(dead_code)]
    Shared,
}
721
/// Resource lock acquisition outcome.
///
/// Returned by `acquire_resource_locks`; contention is reported as
/// the `Unavailable` variant rather than as an `Err`.
#[derive(Debug)]
pub enum LockOutcome {
    /// All locks acquired successfully.
    Acquired {
        /// LLC offset consumed; read only by the locking test fixtures.
        /// Set to the first requested LLC index, or 0 when the request
        /// list is empty.
        #[allow(dead_code)]
        llc_offset: usize,
        /// Lock file descriptors held for this reservation. Empty when
        /// cargo-test mode short-circuits the acquisition.
        locks: Vec<std::os::fd::OwnedFd>,
    },
    /// Resources busy. The inner string carries the diagnostic reason
    /// surfaced to test fixtures; production callers only match the
    /// variant tag.
    Unavailable(#[allow(dead_code)] String),
}
737
738/// Acquire resource locks for a pinning plan (non-blocking).
739///
740/// **LLC locks** (`{lock_dir}/ktstr-llc-{N}.lock`):
741/// - `Exclusive`: `flock(LOCK_EX | LOCK_NB)` — sole access to the LLC.
742/// - `Shared`: `flock(LOCK_SH | LOCK_NB)` — multiple holders coexist.
743///
744/// **CPU locks** (`{lock_dir}/ktstr-cpu-{C}.lock`):
745/// - Always `flock(LOCK_EX | LOCK_NB)` — exclusive per CPU.
746/// - Skipped for `Exclusive` LLC mode (the LLC lock already provides
747///   exclusivity over all CPUs in the group).
748///
749/// Single non-blocking attempt. Returns `LockOutcome::Unavailable`
750/// immediately when any resource is busy. Callers rely on nextest
751/// retry backoff for contention resolution.
752///
753/// `KTSTR_CARGO_TEST_MODE` short-circuits the entire flock dance and
754/// returns `Acquired` with an empty fd list — bare `cargo test`
755/// invocations don't share the cross-process LLC reservation
756/// contract that nextest / `cargo ktstr test` peers rely on. Tests
757/// run on whatever CPUs the OS schedules them onto.
758pub fn acquire_resource_locks(
759    plan: &PinningPlan,
760    llc_indices: &[usize],
761    llc_mode: LlcLockMode,
762) -> Result<LockOutcome> {
763    if crate::cargo_test_mode::cargo_test_mode_active() {
764        return Ok(LockOutcome::Acquired {
765            llc_offset: llc_indices.first().copied().unwrap_or(0),
766            locks: Vec::new(),
767        });
768    }
769    match try_acquire_all(plan, llc_indices, llc_mode) {
770        Ok(locks) => Ok(LockOutcome::Acquired {
771            llc_offset: llc_indices.first().copied().unwrap_or(0),
772            locks,
773        }),
774        Err(reason) => Ok(LockOutcome::Unavailable(reason)),
775    }
776}
777
778/// Compose the LLC lockfile prefix from the resolved lock directory.
779/// Returns `{lock_dir}/ktstr-llc-`.
780fn llc_lock_prefix() -> String {
781    format!("{}/ktstr-llc-", crate::cache::resolve_lock_dir().display())
782}
783
784/// Compose the per-CPU lockfile prefix from the resolved lock directory.
785/// Returns `{lock_dir}/ktstr-cpu-`.
786fn cpu_lock_prefix() -> String {
787    format!("{}/ktstr-cpu-", crate::cache::resolve_lock_dir().display())
788}
789
#[cfg(test)]
thread_local! {
    /// Thread-local override for the LLC lock prefix. Tests set this
    /// to a per-test tempdir so the acquire path operates on its
    /// own lockfile pool instead of padding the `LlcGroup` vector
    /// to 90,000+ entries just to avoid collision with production
    /// indices at 0..<host-llcs>. See tests `acquire_llc_plan_*`
    /// that build a small synth topo and point the prefix at a
    /// `TempDir`.
    ///
    /// Consulted by `llc_lock_path`: when set, the path is the
    /// override string followed by `{llc_idx}.lock`, so the value
    /// must include everything up to (not including) the index.
    static LLC_LOCK_PREFIX_OVERRIDE: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };

    /// Thread-local override for the per-CPU lock prefix. Symmetric
    /// with `LLC_LOCK_PREFIX_OVERRIDE`; consulted by `cpu_lock_path`.
    static CPU_LOCK_PREFIX_OVERRIDE: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };
}
807
808/// Compose the LLC lockfile path for `llc_idx`. Production resolves
809/// via `KTSTR_LOCK_DIR` (fallback `/tmp`); tests can override the
810/// prefix via `LLC_LOCK_PREFIX_OVERRIDE` to keep their lockfile
811/// pool isolated.
812fn llc_lock_path(llc_idx: usize) -> String {
813    #[cfg(test)]
814    {
815        if let Some(p) = LLC_LOCK_PREFIX_OVERRIDE.with(|p| p.borrow().clone()) {
816            return format!("{p}{llc_idx}.lock");
817        }
818    }
819    format!("{}{llc_idx}.lock", llc_lock_prefix())
820}
821
822/// Compose the per-CPU lockfile path for `cpu`. Symmetric with
823/// [`llc_lock_path`] — production resolves via `KTSTR_LOCK_DIR`;
824/// tests can override via `CPU_LOCK_PREFIX_OVERRIDE`.
825fn cpu_lock_path(cpu: usize) -> String {
826    #[cfg(test)]
827    {
828        if let Some(p) = CPU_LOCK_PREFIX_OVERRIDE.with(|p| p.borrow().clone()) {
829            return format!("{p}{cpu}.lock");
830        }
831    }
832    format!("{}{cpu}.lock", cpu_lock_prefix())
833}
834
835/// Try to acquire all resource locks (all-or-nothing).
836/// Returns the held fds on success, or an error string describing
837/// which resource was busy.
838fn try_acquire_all(
839    plan: &PinningPlan,
840    llc_indices: &[usize],
841    llc_mode: LlcLockMode,
842) -> std::result::Result<Vec<std::os::fd::OwnedFd>, String> {
843    let flock_mode = match llc_mode {
844        LlcLockMode::Exclusive => FlockMode::Exclusive,
845        LlcLockMode::Shared => FlockMode::Shared,
846    };
847    let mut locks = Vec::new();
848
849    // Lock LLC files.
850    for &llc_idx in llc_indices {
851        let path = llc_lock_path(llc_idx);
852        match try_flock(&path, flock_mode) {
853            Ok(Some(fd)) => locks.push(fd),
854            Ok(None) => return Err(format!("LLC {llc_idx} busy")),
855            Err(e) => return Err(format!("LLC {llc_idx}: {e}")),
856        }
857    }
858
859    // Per-CPU locks: skip for exclusive LLC mode (the LLC lock covers
860    // all CPUs in the group).
861    if llc_mode != LlcLockMode::Exclusive {
862        for &(_vcpu, host_cpu) in &plan.assignments {
863            let path = cpu_lock_path(host_cpu);
864            match try_flock(&path, FlockMode::Exclusive) {
865                Ok(Some(fd)) => locks.push(fd),
866                Ok(None) => return Err(format!("CPU {host_cpu} busy")),
867                Err(e) => return Err(format!("CPU {host_cpu}: {e}")),
868            }
869        }
870        if let Some(cpu) = plan.service_cpu {
871            let path = cpu_lock_path(cpu);
872            match try_flock(&path, FlockMode::Exclusive) {
873                Ok(Some(fd)) => locks.push(fd),
874                Ok(None) => return Err(format!("service CPU {cpu} busy")),
875                Err(e) => return Err(format!("service CPU {cpu}: {e}")),
876            }
877        }
878    }
879
880    Ok(locks)
881}
882
883/// Diffuse a pid across `[0, max_start)` so adjacent pids do not
884/// land on adjacent offsets. Used by the default-else run-lock path
885/// (`KtstrVm::acquire_default_run_locks`) to pick a starting LLC slot so
886/// two ktstr invocations launching simultaneously don't both probe slot 0
887/// first.
888///
889/// Bare `pid % max_start` collapses adjacent pids onto adjacent
890/// offsets (Linux's pid allocator walks `pid_max` sequentially),
891/// which is the worst spread shape for the common batch-spawn
892/// case: nextest forks N test processes back-to-back, every pid
893/// lands within a small contiguous range, every `pid % max_start`
894/// lands within an equally small contiguous slice of the offset
895/// space, and they all probe overlapping slots on the first
896/// pass. AHasher avalanche on the pid bytes diffuses adjacent
897/// pids across the whole `[0, max_start)` range, so the
898/// slot-rotation loop has a fair chance of finding a free slot
899/// without burning the entire lockfile pool.
900///
901/// The hasher is `ahash::AHasher` keyed with fixed zero seeds
902/// (`RandomState::with_seeds(0, 0, 0, 0)`); a per-run random
903/// seed would defeat reproducibility for unit-test fixtures and
904/// for any future debug logging that wants to confirm "pid X
905/// picks offset Y for slot N".
906///
907/// Caller invariant: `max_start >= 1`. Panics on `max_start == 0`
908/// (modulo-by-zero); callers must enforce this upstream (the
909/// run-lock path floors `max_slots` at 1).
910pub(crate) fn pid_window_offset(pid: u32, max_start: usize) -> usize {
911    use std::hash::{BuildHasher, Hasher};
912    let mut hasher = ahash::RandomState::with_seeds(0, 0, 0, 0).build_hasher();
913    hasher.write(&pid.to_le_bytes());
914    (hasher.finish() as usize) % max_start
915}
916
917// ===========================================================================
918// --cpu-cap PLAN pipeline — CpuCap / LlcSnapshot / LlcPlan + discover/plan/acquire
919// ===========================================================================
920//
921// Entry point [`acquire_llc_plan`] is the single non-perf-mode
922// reservation path: kernel builds and no-perf-mode VMs both call it
923// with or without `--cpu-cap N`. `--cpu-cap` is a CPU-count budget:
924// the planner reserves exactly N host CPUs by walking whole LLCs in
925// contention- / NUMA-aware order and partial-taking the last LLC
926// so `plan.cpus.len() == N`. The flock is per-LLC even when the
927// last LLC is only partially used — coordination with concurrent
928// ktstr peers is unchanged at LLC granularity. When `--cpu-cap`
929// is absent the planner defaults to 30% of the calling process's
930// sched_getaffinity cpuset (see [`default_cpu_budget`] and
931// [`host_allowed_cpus`]) — not 30% of the host's online CPU count,
932// because a CI runner whose parent cgroup pins ktstr to a 4-CPU
933// subset must plan within THAT subset or sched_setaffinity on the
934// resulting mask produces an empty effective set.
935// Perf-mode never reaches this path; it stays on
936// [`acquire_resource_locks`] for its `LOCK_EX` reservation contract.
937//
938// The pipeline has three phases: discover (snapshot holders per
939// LLC, filtered to the process's allowed cpuset), plan (NUMA-aware,
940// consolidation-aware selection), acquire (non-blocking `LOCK_SH`
941// on each selected LLC). Up to ACQUIRE_MAX_TOCTOU_RETRIES retries
942// absorb the window between the discover snapshot and the
943// non-blocking acquire; between retries the loop sleeps for an
944// ascending micro-budget (TOCTOU_RETRY_DELAYS) so a peer that
945// raced us has time to drop its fds before the next snapshot.
946// If every retry fails, the contention is persistent and the
947// caller falls back to nextest-retry / operator-wait.
948
949/// Return the CPUs the calling process is allowed to run on, per
950/// `sched_getaffinity(2)` with a `/proc/self/status` Cpus_allowed_list
951/// fallback. Every consumer of the `--cpu-cap` pipeline plans against
952/// this set instead of `HostTopology::online_cpus` so
953/// `sched_setaffinity` on the plan's CPU list never produces an empty
954/// effective mask under a cgroup-restricted runner (CI hosts, systemd
955/// slices, sudo -u under a limited cpuset).
956///
957/// Returns an empty vec only when BOTH the syscall AND procfs fail —
958/// a pathological host that can't enumerate its own affinity. Callers
959/// treat that as a bail reason, not a fallback "every CPU" permission:
960/// guessing on a misconfigured host is worse than failing visibly.
961///
962/// Tests override the return value via `ALLOWED_CPUS_OVERRIDE` so
963/// the 30% default and allowed-cpu filtering are deterministic in
964/// unit tests regardless of the CI runner's real cpuset.
965pub(crate) fn host_allowed_cpus() -> Vec<usize> {
966    #[cfg(test)]
967    {
968        if let Some(override_set) = ALLOWED_CPUS_OVERRIDE.with(|p| p.borrow().clone()) {
969            return override_set;
970        }
971    }
972    if let Some(cpus) = crate::cpu_util::read_affinity(0) {
973        return cpus.into_iter().map(|c| c as usize).collect();
974    }
975    if let Ok(raw) = std::fs::read_to_string("/proc/self/status") {
976        for line in raw.lines() {
977            if let Some(v) = line.strip_prefix("Cpus_allowed_list:")
978                && let Some(parsed) = crate::cpu_util::parse_cpu_list(v.trim())
979            {
980                return parsed.into_iter().map(|c| c as usize).collect();
981            }
982        }
983    }
984    Vec::new()
985}
986
#[cfg(test)]
thread_local! {
    /// Test-only override for [`host_allowed_cpus`]. Set via
    /// [`AllowedCpusGuard`] to make 30%-of-allowed calculations and
    /// plan filtering deterministic in unit tests. Mirrors the
    /// `LLC_LOCK_PREFIX_OVERRIDE` pattern.
    ///
    /// `None` (the initial value) means "no override": the real
    /// affinity lookup runs. `Some(cpus)` is returned verbatim.
    pub(crate) static ALLOWED_CPUS_OVERRIDE: std::cell::RefCell<Option<Vec<usize>>> =
        const { std::cell::RefCell::new(None) };
}
996
/// CPU budget used when `--cpu-cap` is not given: 30% of
/// `allowed_cpus`, rounded up, and never less than 1.
///
/// 30% leaves headroom for concurrent peers (tests, builds) while
/// still reserving a non-trivial slice. Rounding up already yields at
/// least 1 for any non-zero input, so the floor only changes the
/// result for `allowed_cpus == 0`; it is kept as defense in depth for
/// future ratio tweaks. The multiplication saturates instead of
/// overflowing.
fn default_cpu_budget(allowed_cpus: usize) -> usize {
    let thirty_percent = allowed_cpus.saturating_mul(30).div_ceil(100);
    std::cmp::max(thirty_percent, 1)
}
1007
1008/// No-perf CPU budget when no explicit `--cpu-cap` (or `cpu_budget` knob) is
1009/// set: at least the VM's own vCPU count, clamped to the allowed cpuset.
1010///
1011/// The rationale is TEST VALIDITY, not boot speed — do not "optimize" this
1012/// back to a flat 30%. A scheduler test measures how the GUEST scheduler
1013/// places tasks across the guest's CPUs. If the VM's vCPU threads are
1014/// oversubscribed on the host (256 vCPUs sharing the 30% default mask is
1015/// ~95 pCPUs = 2.7x), the HOST scheduler time-slices them, so guest vCPUs
1016/// stall for reasons unrelated to the workload — a host-contention confound
1017/// that invalidates the guest-scheduler measurement (the silent-wrong-answer
1018/// class the project guards against). Sizing the budget to `>= vcpus` gives
1019/// the guest's CPUs real host CPUs, so its scheduler view tracks real
1020/// concurrency. (A wide boot also drops ~0.7s as the kernel's parallel AP
1021/// bring-up runs unthrottled, but that is incidental.)
1022///
1023/// Floored at the 30% `default_cpu_budget` so small VMs (vcpus < 30%) keep
1024/// the cross-test concurrency headroom; clamped to `allowed_cpus` so it never
1025/// exceeds the process cpuset. An explicit cap LOWER than vcpus is the
1026/// deliberate opt-in to oversubscribe for contention testing.
1027pub(crate) fn no_perf_cpu_budget(allowed_cpus: usize, vm_vcpus: usize) -> usize {
1028    default_cpu_budget(allowed_cpus).max(vm_vcpus.min(allowed_cpus))
1029}
1030
/// Parsed `--cpu-cap N` value. N is a CPU count: the planner reserves
/// exactly N host CPUs by walking whole LLCs in contention- /
/// NUMA-aware order (filtered to the calling process's allowed
/// cpuset) and partial-taking the last LLC so `plan.cpus.len() == N`.
/// The flock set is still per-LLC (the last LLC is flocked whole
/// even when only a prefix of its CPUs enters `plan.cpus`).
///
/// Bounded to `1..=usize::MAX` at the constructor. A cap of 0 is
/// nonsensical (reserving zero CPUs is just "don't run") and is
/// rejected upstream by the CLI layer, but the bound is also enforced
/// in the type system via `NonZeroUsize`, so callers can
/// `CpuCap::new(...)?` without a follow-up bounds check.
///
/// The runtime upper bound — "don't exceed the process's allowed
/// CPU count" — is enforced at acquire time via
/// [`CpuCap::effective_count`] because the allowed set is not known
/// until `host_allowed_cpus` reads `sched_getaffinity`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CpuCap {
    // Requested CPU count; non-zero by construction (see `CpuCap::new`).
    n: std::num::NonZeroUsize,
}
1051
1052impl CpuCap {
1053    /// Construct from a raw `usize` CPU count. Returns `Err` on `0`;
1054    /// `usize::MAX` is accepted here and clamped later by
1055    /// `effective_count`.
1056    pub fn new(n: usize) -> Result<Self> {
1057        std::num::NonZeroUsize::new(n)
1058            .map(|n| CpuCap { n })
1059            .ok_or_else(|| anyhow::anyhow!("--cpu-cap must be ≥ 1 CPU (got 0)"))
1060    }
1061
1062    /// Three-tier resolution: explicit CLI flag wins over env var,
1063    /// which wins over "not set". Returns `None` when neither is present,
1064    /// meaning "use the caller's auto-sized default": the
1065    /// kernel-build/planner path expands `None` to `default_cpu_budget`
1066    /// (30% of the allowed set); the no-perf VM-builder path expands it to
1067    /// `no_perf_cpu_budget` (max(30%, min(vcpus, allowed)), usually the
1068    /// vCPU count).
1069    ///
1070    /// Env var is `KTSTR_CPU_CAP` (integer ≥ 1, CPU count). An empty
1071    /// or unset env var is treated as absent; a non-numeric value
1072    /// OR the numeric value `0` is an error — `KTSTR_CPU_CAP=0`
1073    /// flows through `CpuCap::new(0)` which rejects with "--cpu-cap
1074    /// must be ≥ 1 CPU (got 0)". Zero is not a silent fallback to
1075    /// "no cap"; it surfaces as a parse-time error so typos and
1076    /// scripting mistakes don't accidentally disable the resource
1077    /// contract.
1078    pub fn resolve(cli_flag: Option<usize>) -> Result<Option<CpuCap>> {
1079        if let Some(n) = cli_flag {
1080            return Ok(Some(CpuCap::new(n)?));
1081        }
1082        match std::env::var(crate::KTSTR_CPU_CAP_ENV) {
1083            Ok(s) if s.is_empty() => Ok(None),
1084            Ok(s) => {
1085                let n: usize = s
1086                    .parse()
1087                    .with_context(|| format!("KTSTR_CPU_CAP is not a valid integer: {s:?}"))?;
1088                Ok(Some(CpuCap::new(n)?))
1089            }
1090            Err(std::env::VarError::NotPresent) => Ok(None),
1091            Err(std::env::VarError::NotUnicode(raw)) => {
1092                anyhow::bail!(
1093                    "KTSTR_CPU_CAP contains non-UTF-8 bytes ({} bytes): {raw:?}. \
1094                     Set an integer value or unset.",
1095                    raw.len(),
1096                )
1097            }
1098        }
1099    }
1100
1101    /// Runtime-bounded cap: returns the inner count unless it exceeds
1102    /// `allowed_cpus` (the calling process's sched_getaffinity cpuset
1103    /// count), in which case a `CpuBudgetUnsatisfiable` hard error (an
1104    /// explicit cap the host cannot satisfy is a FAIL, not a transient
1105    /// skip) steers the caller toward an actionable message. This check
1106    /// lives at acquire time — not at construction — because the allowed
1107    /// set is not known until `host_allowed_cpus` reads the syscall.
1108    pub fn effective_count(&self, allowed_cpus: usize) -> Result<usize> {
1109        let n = self.n.get();
1110        if n > allowed_cpus {
1111            // An explicit --cpu-cap the host cannot satisfy is a hard ERROR
1112            // (the author typed a concrete number that does not exist here),
1113            // not transient contention: CpuBudgetUnsatisfiable, not
1114            // ResourceContention, so it fails rather than skips.
1115            return Err(anyhow::Error::new(CpuBudgetUnsatisfiable {
1116                reason: format!(
1117                    "--cpu-cap N = {n} exceeds the {allowed_cpus} CPUs this \
1118                     process is allowed on (from sched_getaffinity / \
1119                     Cpus_allowed_list). Pick a value ≤ {allowed_cpus}, \
1120                     release the cgroup/taskset constraint restricting this \
1121                     process, or omit --cpu-cap to use the auto-sized default \
1122                     (30% of the allowed set for kernel builds; the vCPU \
1123                     count, floored at 30%, for VMs)."
1124                ),
1125            }));
1126        }
1127        Ok(n)
1128    }
1129}
1130
/// Per-LLC discover snapshot: identity + current holder set.
/// Constructed by [`discover_llc_snapshots`] before the PLAN phase.
/// `pub(crate)` so the in-crate PLAN pipeline and this module's tests
/// can construct and inspect it; the `ktstr locks` observational
/// command shares only [`crate::flock::HolderInfo`], not this
/// structure. External callers have no reason to construct one.
#[derive(Debug, Clone)]
pub(crate) struct LlcSnapshot {
    /// Host LLC index — matches [`HostTopology::llc_groups`] ordering.
    pub(crate) llc_idx: usize,
    /// Canonical `{lock_dir}/ktstr-llc-{N}.lock` path, built from
    /// `llc_lock_path(llc_idx)` during DISCOVER. Stored so the
    /// ACQUIRE phase doesn't re-format the string per LLC.
    pub(crate) lockfile_path: std::path::PathBuf,
    /// Processes currently holding this LLC's flock (any mode). Empty
    /// when no peer holds the lock, and also when the holder lookup
    /// itself fails (DISCOVER substitutes the empty default). Derived
    /// from a single `/proc/locks` read shared across every LLC in
    /// the discover phase.
    pub(crate) holders: Vec<crate::flock::HolderInfo>,
    /// `holders.len()`, cached so the PLAN sort can access it without
    /// re-traversing the holder list per candidate.
    pub(crate) holder_count: usize,
}
1152
/// Output of [`acquire_llc_plan`]: the concrete LLC reservation plus
/// every piece of diagnostic context a downstream consumer could
/// want.
///
/// `mems` is the union of NUMA nodes containing the selected CPUs —
/// `BuildSandbox::try_create` writes this to the child cgroup's
/// `cpuset.mems` so memory allocations respect the same NUMA locality
/// the CPU reservation already implies.
///
/// `locks` holds the RAII file descriptors whose `OwnedFd::drop`
/// releases the kernel-side flock; the field is `pub(crate)` because
/// direct manipulation from outside the crate would defeat the drop
/// guarantee.
#[derive(Debug)]
pub struct LlcPlan {
    /// Selected host LLC indices, sorted ASCENDING. Acquire order
    /// matches this slice, so two callers competing for the same LLCs
    /// attempt them in the same order: one wins and the others retry,
    /// which rules out livelock.
    pub locked_llcs: Vec<usize>,
    /// Flattened host CPU list, sized exactly `target_cpus`. The last
    /// locked LLC may contribute only a prefix of its allowed CPUs.
    /// Preserves LLC ordering: CPUs from `locked_llcs[0]` come
    /// before CPUs from `locked_llcs[1]`, etc.
    pub cpus: Vec<usize>,
    /// Union of NUMA nodes hosting the locked LLCs. When the plan
    /// spans more than one node (cross-node spill: the seed node was
    /// exhausted and the plan spilled to nearest-by-distance
    /// neighbors), `mems` contains every such node, not just the
    /// seed node.
    pub mems: std::collections::BTreeSet<usize>,
    /// Per-LLC discovery trail. Preserved through the lifetime of the
    /// plan so error-formatting (via `acquire_llc_plan`'s final
    /// fresh snapshot) and future `ktstr locks` rendering don't
    /// re-probe `/proc/locks`. In-tree consumers currently re-read
    /// the snapshot only on the TOCTOU failure path; the field is
    /// kept populated so downstream tooling can inspect the
    /// plan-at-acquire holder set without a second pass.
    #[allow(dead_code)]
    pub(crate) snapshot: Vec<LlcSnapshot>,
    /// RAII flock holders. Dropped when the plan goes out of scope,
    /// releasing each LLC's `LOCK_SH` in declared order.
    #[allow(dead_code)] // RAII only — Drop releases flocks, no reads.
    pub(crate) locks: Vec<std::os::fd::OwnedFd>,
}
1197
/// Maximum TOCTOU retry budget for the DISCOVER → PLAN → ACQUIRE
/// pipeline. Production sees up to `RETRIES + 1 = 4` attempts: one
/// initial DISCOVER and three retries. Between retries the caller
/// sleeps for an ascending micro-budget (10ms, 50ms, 200ms — see
/// [`TOCTOU_RETRY_DELAYS`]) so two peers that initially raced on the
/// same LLC have time to drop their fds before the next snapshot.
/// Without the sleep the second DISCOVER often sees the same holder
/// state and bails on a transient race; the in-process micro-sleep
/// absorbs that without paying the nextest-retry cost.
///
/// Also fixes the length of [`TOCTOU_RETRY_DELAYS`]: changing this
/// value requires adding or removing a delay entry to match.
const ACQUIRE_MAX_TOCTOU_RETRIES: u32 = 3;
1208
/// Per-retry sleep durations between DISCOVER attempts. Indexed by
/// the retry index: after attempt 0 fails the loop sleeps
/// `TOCTOU_RETRY_DELAYS[0]`, after attempt 1 fails it sleeps
/// `TOCTOU_RETRY_DELAYS[1]`, etc. There are exactly
/// [`ACQUIRE_MAX_TOCTOU_RETRIES`] sleeps before the final attempt
/// that can still bail; the array type pins the length to that
/// constant, so a mismatch is a compile error rather than a runtime
/// out-of-bounds index.
const TOCTOU_RETRY_DELAYS: [std::time::Duration; ACQUIRE_MAX_TOCTOU_RETRIES as usize] = [
    std::time::Duration::from_millis(10),
    std::time::Duration::from_millis(50),
    std::time::Duration::from_millis(200),
];
1220
1221/// DISCOVER phase — read-only LLC snapshot.
1222///
1223/// Walks ONLY the LLCs whose CPUs overlap `allowed` (the calling
1224/// process's `sched_getaffinity` cpuset). LLCs entirely outside the
1225/// cpuset are skipped — locking one would never contribute a
1226/// schedulable CPU to `plan.cpus`, and on a heavily-pinned runner
1227/// (CI cgroup with N out of M CPUs allowed) skipping them avoids
1228/// O(host_llcs - allowed_llcs) lockfile materializations and
1229/// /proc/locks lookups per attempt. The PLAN phase still receives a
1230/// snapshot vector indexed by `LlcSnapshot.llc_idx`, not by
1231/// position, so a sparse snapshot set works without any further
1232/// adjustment downstream.
1233///
1234/// For every selected LLC: stat the canonical lockfile (materializing
1235/// it with `O_CREAT | O_CLOEXEC | 0o666` if absent so subsequent
1236/// ACQUIRE has a stable inode), then parse one `/proc/locks` read to
1237/// populate every snapshot's holder list in a single pass. No flock
1238/// acquires — DISCOVER never contends.
1239///
1240/// `mountinfo` is the `/proc/self/mountinfo` text read once per
1241/// `acquire_llc_plan` invocation at [`acquire_llc_plan_with_acquire_fn`]
1242/// and threaded through here so a host with N LLCs pays for exactly
1243/// one mountinfo read per DISCOVER pass (DISCOVER runs once per retry
1244/// attempt — up to ACQUIRE_MAX_TOCTOU_RETRIES+1 — plus once on the
1245/// retry-exhausted diagnostic path, up to 5 passes, hence caching at
1246/// the plan level rather than per snapshot walk).
1247///
1248/// Returns `Ok(snapshots)` on success. Propagates opening + stat
1249/// errors so a missing `/tmp` or permission failure surfaces
1250/// actionably.
1251fn discover_llc_snapshots(
1252    topo: &HostTopology,
1253    allowed: &std::collections::BTreeSet<usize>,
1254    mountinfo: &str,
1255) -> Result<Vec<LlcSnapshot>> {
1256    let mut snapshots: Vec<LlcSnapshot> = Vec::with_capacity(topo.llc_groups.len());
1257    for llc_idx in 0..topo.llc_groups.len() {
1258        // Skip LLCs whose CPUs are entirely outside the calling
1259        // process's allowed cpuset — they cannot contribute a
1260        // schedulable CPU to `plan.cpus`, and locking one would just
1261        // pay for a lockfile + /proc/locks pass without coordination
1262        // value. The sparse snapshot vector keeps llc_idx as the
1263        // identity key, so PLAN's index-based iteration is
1264        // unaffected.
1265        if !topo.llc_groups[llc_idx]
1266            .cpus
1267            .iter()
1268            .any(|c| allowed.contains(c))
1269        {
1270            continue;
1271        }
1272        let path = std::path::PathBuf::from(llc_lock_path(llc_idx));
1273        // Ensure the lockfile inode exists so `read_holders_with_mountinfo`
1274        // can key /proc/locks lookups on it. Deliberately takes no
1275        // flock — DISCOVER is observational. Also runs the NFS/FUSE
1276        // reject check inside `materialize`, so a misconfigured
1277        // `/tmp` mount surfaces here instead of silently at ACQUIRE
1278        // time.
1279        crate::flock::materialize(&path)?;
1280        let holders =
1281            crate::flock::read_holders_with_mountinfo(&path, mountinfo).unwrap_or_default();
1282        let holder_count = holders.len();
1283        snapshots.push(LlcSnapshot {
1284            llc_idx,
1285            lockfile_path: path,
1286            holders,
1287            holder_count,
1288        });
1289    }
1290    Ok(snapshots)
1291}
1292
1293/// PLAN phase — NUMA-aware placement over discover snapshots.
1294///
1295/// Composite sort driven by three ordered keys:
1296///   1. Consolidation — prefer LLCs already holding peers.
1297///   2. NUMA locality — after seeding on the highest-scored LLC's
1298///      node, greedily fill the seed node before spilling.
1299///   3. LLC index ASC — tiebreak + final ACQUIRE ordering for livelock
1300///      safety.
1301///
1302/// `target_cpus` is the exact number of allowed CPUs the plan
1303/// reserves. The walk selects whole LLCs (filtered to their
1304/// allowed-CPU overlap) until the accumulated contribution meets
1305/// the budget. The LAST selected LLC may contribute more allowed
1306/// CPUs than the remaining budget needs; the materialization layer
1307/// at [`acquire_llc_plan_with_acquire_fn`] takes only the needed
1308/// prefix of that LLC's allowed CPUs into `plan.cpus`. The flock
1309/// is always held at LLC granularity — coordination with concurrent
1310/// ktstr peers happens per-LLC, regardless of how many of the LLC's
1311/// CPUs are consumed here. LLCs whose CPUs are all outside
1312/// `allowed` are skipped entirely — locking one would never
1313/// contribute a schedulable CPU to `plan.cpus`.
1314///
1315/// Distance fallback: callers without a distance matrix pass a closure
1316/// that returns `10` for equal nodes and `20` otherwise — primitive 3
1317/// keeps the spill order reasonable even on hosts whose
1318/// `/sys/devices/system/node/*/distance` is unavailable.
1319fn plan_from_snapshots(
1320    snapshots: &[LlcSnapshot],
1321    target_cpus: usize,
1322    topo: &HostTopology,
1323    allowed: &std::collections::BTreeSet<usize>,
1324    distance_fn: impl Fn(usize, usize) -> u8,
1325) -> Vec<usize> {
1326    if target_cpus == 0 {
1327        return Vec::new();
1328    }
1329
1330    // Allowed-CPU count contributed by each LLC. An LLC with zero
1331    // overlap contributes no schedulable CPUs to `plan.cpus`, so
1332    // reserving it adds a useless flock and no planning value — drop
1333    // those up front so every subsequent walk only considers
1334    // candidates that can actually carry budget.
1335    let llc_allowed_cpus = |idx: usize| -> usize {
1336        topo.llc_groups[idx]
1337            .cpus
1338            .iter()
1339            .filter(|c| allowed.contains(c))
1340            .count()
1341    };
1342    let total_allowed_in_llcs: usize = (0..snapshots.len()).map(llc_allowed_cpus).sum();
1343    if target_cpus >= total_allowed_in_llcs {
1344        // Budget ≥ sum of per-LLC contributions: select every LLC
1345        // that has at least one allowed CPU, in ascending order.
1346        // Short-circuits the scoring walk when the cap degenerates
1347        // to "reserve everything we can schedule on."
1348        let mut all: Vec<usize> = (0..snapshots.len())
1349            .filter(|&idx| llc_allowed_cpus(idx) > 0)
1350            .collect();
1351        all.sort_unstable();
1352        return all;
1353    }
1354
1355    // Step a: partition + sort. Only LLCs with at least one allowed
1356    // CPU are eligible — locking an out-of-cpuset LLC is useless.
1357    // Consolidation candidates first (holder_count DESC, llc_idx ASC);
1358    // fresh candidates after, sorted by llc_idx ASC. A single
1359    // composite sort would do the same work but the two-partition
1360    // form is easier to read and lets future "prefer consolidation
1361    // only if score ≥ threshold" tweaks slot in.
1362    let eligible = |s: &&LlcSnapshot| -> bool { llc_allowed_cpus(s.llc_idx) > 0 };
1363    let mut consolidation: Vec<&LlcSnapshot> = snapshots
1364        .iter()
1365        .filter(|s| s.holder_count > 0)
1366        .filter(eligible)
1367        .collect();
1368    let mut fresh: Vec<&LlcSnapshot> = snapshots
1369        .iter()
1370        .filter(|s| s.holder_count == 0)
1371        .filter(eligible)
1372        .collect();
1373    consolidation.sort_by(|a, b| {
1374        b.holder_count
1375            .cmp(&a.holder_count)
1376            .then(a.llc_idx.cmp(&b.llc_idx))
1377    });
1378    fresh.sort_by_key(|s| s.llc_idx);
1379    let ranked: Vec<&LlcSnapshot> = consolidation.into_iter().chain(fresh).collect();
1380    if ranked.is_empty() {
1381        // No LLC on this host overlaps the caller's allowed cpuset.
1382        // Bail upstream handles this as ResourceContention; here we
1383        // just return empty so the caller can surface the diagnostic.
1384        return Vec::new();
1385    }
1386
1387    // Step b: seed. Highest-scored eligible LLC; its NUMA node
1388    // anchors the greedy expansion.
1389    let seed = ranked[0];
1390    let seed_node = topo.llc_numa_node(seed.llc_idx);
1391
1392    // Step c–d: walk seed-node LLCs first, then spill to
1393    // nearest-by-distance nodes. Primitives 1 + 3 drive the node
1394    // ordering; the per-node LLC lists come from primitive 1. Within
1395    // each node, we still honour the composite score by walking
1396    // `ranked` and skipping LLCs not on the current target node.
1397    // Accumulation is by allowed-CPU contribution — an LLC with 4
1398    // CPUs of which 2 are in `allowed` counts as 2 toward the
1399    // budget and the other 2 never appear in `plan.cpus`.
1400    let node_order = topo.numa_nodes_sorted_by_distance(seed_node, distance_fn);
1401    let mut selected: Vec<usize> = Vec::new();
1402    let mut picked: std::collections::HashSet<usize> = std::collections::HashSet::new();
1403    let mut accumulated: usize = 0;
1404    for node in node_order {
1405        if accumulated >= target_cpus {
1406            break;
1407        }
1408        // Ranked walk, taking every candidate on this node in
1409        // score-order until we've filled `target_cpus` or exhausted
1410        // the node.
1411        for snap in &ranked {
1412            if accumulated >= target_cpus {
1413                break;
1414            }
1415            if picked.contains(&snap.llc_idx) {
1416                continue;
1417            }
1418            if topo.llc_numa_node(snap.llc_idx) != node {
1419                continue;
1420            }
1421            selected.push(snap.llc_idx);
1422            picked.insert(snap.llc_idx);
1423            accumulated += llc_allowed_cpus(snap.llc_idx);
1424        }
1425    }
1426
1427    // Step e: livelock-proof acquire order — ascending index.
1428    selected.sort_unstable();
1429    selected
1430}
1431
1432/// ACQUIRE phase — non-blocking `LOCK_SH` on every selected LLC.
1433///
1434/// All-or-nothing. A single `EWOULDBLOCK` releases every held fd (via
1435/// `drop(locks)`) and returns `Ok(None)` so the caller re-runs
1436/// discover + plan with a fresh snapshot. Non-retryable errors
1437/// (unexpected errno, path open failures) propagate unchanged.
1438fn try_acquire_llc_plan_locks(
1439    selected: &[usize],
1440    snapshots: &[LlcSnapshot],
1441) -> Result<Option<Vec<std::os::fd::OwnedFd>>> {
1442    let mut locks: Vec<std::os::fd::OwnedFd> = Vec::with_capacity(selected.len());
1443    for &idx in selected {
1444        let snap = snapshots
1445            .iter()
1446            .find(|s| s.llc_idx == idx)
1447            .expect("selected index must come from snapshots — plan invariant");
1448        match crate::flock::try_flock(&snap.lockfile_path, FlockMode::Shared)? {
1449            Some(fd) => locks.push(fd),
1450            None => {
1451                // Drop previously-held fds so the peer racing us sees
1452                // a consistent post-bail state, then signal "retry".
1453                drop(locks);
1454                return Ok(None);
1455            }
1456        }
1457    }
1458    Ok(Some(locks))
1459}
1460
1461/// Entry point for the `--cpu-cap` PLAN pipeline.
1462///
1463/// Runs DISCOVER → PLAN → ACQUIRE with up to
1464/// [`ACQUIRE_MAX_TOCTOU_RETRIES`] retries (each separated by a
1465/// per-retry sleep from [`TOCTOU_RETRY_DELAYS`]). On
1466/// success returns an [`LlcPlan`] holding the selected LLCs, their
1467/// flattened CPUs (intersected with the calling process's allowed
1468/// cpuset), the derived `mems` set, the diagnostic snapshot, and the
1469/// RAII flock handles.
1470///
1471/// `cpu_cap == None` means "reserve 30% of the allowed-CPU set" (see
1472/// [`default_cpu_budget`]). `cpu_cap == Some(cap)` where
1473/// `cap > allowed_cpus` errors at acquire time via
1474/// [`CpuCap::effective_count`]. The allowed-CPU set comes from
1475/// [`host_allowed_cpus`] — `sched_getaffinity(0)` with a procfs
1476/// fallback — so plans are always schedulable under cgroup-restricted
1477/// runners (CI hosts, systemd slices, sudo under a limited cpuset).
1478///
1479/// Consolidation uses the host distance matrix from [`crate::topology::TestTopology`]
1480/// so spill order matches actual NUMA cost. Hosts whose
1481/// `/sys/devices/system/node/*/distance` failed to parse degrade to a
1482/// numerically-adjacent ordering via the distance closure (`10` for
1483/// same-node, `20` for cross-node).
1484pub fn acquire_llc_plan(
1485    topo: &HostTopology,
1486    test_topo: &crate::topology::TestTopology,
1487    cpu_cap: Option<CpuCap>,
1488) -> Result<LlcPlan> {
1489    if crate::cargo_test_mode::cargo_test_mode_active() {
1490        // Bare `cargo test` mode: no peer-coordination contract.
1491        // Synthesise a degenerate plan that names every LLC and
1492        // every allowed CPU but holds no flocks. The vmm caller
1493        // strips `locks` after build (see `KtstrVmBuilder::build`)
1494        // and re-acquires via `acquire_resource_locks` at run time
1495        // — also short-circuited above. `cpus` is the calling
1496        // process's allowed cpuset so the `sched_setaffinity`
1497        // sites inside the vmm have a valid mask to apply
1498        // (allowed cpuset = whatever the OS schedules us onto).
1499        let allowed = host_allowed_cpus();
1500        if allowed.is_empty() {
1501            return Err(ResourceContention {
1502                reason: "could not determine allowed CPU set \
1503                         (sched_getaffinity and /proc/self/status both failed)"
1504                    .into(),
1505            }
1506            .into());
1507        }
1508        let _ = test_topo;
1509        let _ = cpu_cap;
1510        let allowed_set: std::collections::BTreeSet<usize> = allowed.iter().copied().collect();
1511        let locked_llcs: Vec<usize> = topo
1512            .llc_groups
1513            .iter()
1514            .enumerate()
1515            .filter_map(|(idx, group)| {
1516                if group.cpus.iter().any(|c| allowed_set.contains(c)) {
1517                    Some(idx)
1518                } else {
1519                    None
1520                }
1521            })
1522            .collect();
1523        let mems: std::collections::BTreeSet<usize> = locked_llcs
1524            .iter()
1525            .filter_map(|&idx| {
1526                topo.llc_groups
1527                    .get(idx)
1528                    .and_then(|g| g.cpus.first().copied())
1529                    .and_then(|c| topo.cpu_to_node.get(&c).copied())
1530            })
1531            .collect();
1532        return Ok(LlcPlan {
1533            locked_llcs,
1534            cpus: allowed,
1535            mems,
1536            snapshot: Vec::new(),
1537            locks: Vec::new(),
1538        });
1539    }
1540    acquire_llc_plan_with_acquire_fn(topo, test_topo, cpu_cap, try_acquire_llc_plan_locks)
1541}
1542
1543/// Parameterized form of [`acquire_llc_plan`] that takes the
1544/// ACQUIRE closure as a seam. Production calls this with
1545/// [`try_acquire_llc_plan_locks`] (non-blocking `LOCK_SH` per LLC);
1546/// tests can pass a closure that returns `Ok(None)` on attempt 0 and
1547/// forwards on attempt 1 to simulate a peer winning the first race,
1548/// or an attempt-counting closure that always fails to exercise the
1549/// retry-exhausted error path.
1550///
1551/// `acquire_fn` receives `(selected, snapshots)` and returns
1552/// `Ok(Some(locks))` on success, `Ok(None)` to trigger a retry, or
1553/// propagates hard errors unchanged. Production closure is the
1554/// free-standing [`try_acquire_llc_plan_locks`]; the test closure
1555/// can track its own attempt counter via interior mutability
1556/// ([`std::cell::Cell`], `Mutex`, atomic int).
1557///
1558/// The outer loop body — DISCOVER, PLAN, retry budget, final
1559/// holder diagnostics — is shared between both entry points so the
1560/// test seam exercises the exact retry-and-diagnose sequence
1561/// production uses, not a parallel implementation.
1562fn acquire_llc_plan_with_acquire_fn<F>(
1563    topo: &HostTopology,
1564    test_topo: &crate::topology::TestTopology,
1565    cpu_cap: Option<CpuCap>,
1566    mut acquire_fn: F,
1567) -> Result<LlcPlan>
1568where
1569    F: FnMut(&[usize], &[LlcSnapshot]) -> Result<Option<Vec<std::os::fd::OwnedFd>>>,
1570{
1571    // Resolve the calling process's allowed cpuset. Plans must fit
1572    // inside this set — sched_setaffinity against a mask outside the
1573    // process's cgroup cpuset either fails outright or produces an
1574    // empty effective set (the vCPU thread then cannot run). Reading
1575    // the syscall ONCE here and threading it through means every
1576    // TOCTOU retry sees the same baseline; a cgroup change mid-plan
1577    // is a host-reconfiguration event the retry budget does not
1578    // attempt to absorb.
1579    let allowed_vec = host_allowed_cpus();
1580    if allowed_vec.is_empty() {
1581        return Err(ResourceContention {
1582            reason: "could not determine allowed CPU set \
1583                     (sched_getaffinity and /proc/self/status both failed)"
1584                .into(),
1585        }
1586        .into());
1587    }
1588    let allowed: std::collections::BTreeSet<usize> = allowed_vec.iter().copied().collect();
1589    let allowed_cpus = allowed.len();
1590
1591    let target_cpus = match cpu_cap {
1592        Some(cap) => cap.effective_count(allowed_cpus)?,
1593        None => default_cpu_budget(allowed_cpus),
1594    };
1595    if target_cpus == 0 {
1596        // Defense in depth. `default_cpu_budget` has a `.max(1)`
1597        // floor and `effective_count` on a `NonZeroUsize` cap can
1598        // never return 0, but surfacing this as an explicit bail
1599        // catches future regressions (e.g. someone wires a signed
1600        // integer into the budget math) instead of silently
1601        // producing a plan with no locks.
1602        return Err(ResourceContention {
1603            reason: "CPU budget resolved to zero".into(),
1604        }
1605        .into());
1606    }
1607
1608    // Read /proc/self/mountinfo ONCE per acquire_llc_plan invocation.
1609    // Every DISCOVER pass re-uses this text to derive per-LLC
1610    // /proc/locks needles (major:minor:inode). Without this cache, a
1611    // host with N LLCs would re-read mountinfo N× per DISCOVER pass,
1612    // and DISCOVER itself runs up to ACQUIRE_MAX_TOCTOU_RETRIES+1
1613    // times in the retry loop, plus once on the retry-exhausted
1614    // diagnostic path (up to 5 total). Mount points are
1615    // effectively static during a plan acquisition — a bind mount
1616    // changing under us mid-acquire is a host-reconfiguration event
1617    // that invalidates every parallel acquirer anyway, not something
1618    // we need to re-read to observe.
1619    let mountinfo = crate::flock::read_mountinfo().map_err(|e| ResourceContention {
1620        reason: format!("read /proc/self/mountinfo: {e}"),
1621    })?;
1622
1623    let mut attempt: u32 = 0;
1624    loop {
1625        let snapshots =
1626            discover_llc_snapshots(topo, &allowed, &mountinfo).map_err(|e| ResourceContention {
1627                reason: format!("discover LLC snapshots: {e}"),
1628            })?;
1629        let selected = plan_from_snapshots(&snapshots, target_cpus, topo, &allowed, |from, to| {
1630            test_topo.numa_distance(from, to)
1631        });
1632        if selected.is_empty() {
1633            // Every LLC's CPU set lies outside the allowed cpuset —
1634            // sysfs disagrees with sched_getaffinity. This is a host
1635            // misconfiguration (stale sysfs after hotplug, cgroup
1636            // pinned to a CPU range the kernel no longer reports in
1637            // llc_groups, etc.). Bail with actionable text rather
1638            // than looping through retries that cannot change the
1639            // outcome.
1640            return Err(ResourceContention {
1641                reason: format!(
1642                    "no host LLC overlaps the process's \
1643                     {allowed_cpus}-CPU allowed set — sysfs LLC groups \
1644                     and sched_getaffinity disagree"
1645                ),
1646            }
1647            .into());
1648        }
1649        match acquire_fn(&selected, &snapshots).map_err(|e| ResourceContention {
1650            reason: format!("acquire LLC locks: {e}"),
1651        })? {
1652            Some(locks) => {
1653                // Success — materialize cpus + mems from the selected
1654                // indices, intersecting each LLC's CPU list with
1655                // `allowed` so `plan.cpus` never contains a CPU the
1656                // process cannot run on, and TRUNCATING at exactly
1657                // `target_cpus` so the last-LLC overshoot
1658                // contributes only the prefix the budget needs. The
1659                // full LLC is still flocked (the coordination unit
1660                // is per-LLC), but the CPUs beyond `target_cpus`
1661                // never appear in `plan.cpus` — sched_setaffinity
1662                // masks and cgroup cpuset.cpus writes reflect the
1663                // exact budget. `mems` collects the NUMA nodes of
1664                // CPUs that actually appear in `plan.cpus`; an LLC
1665                // that contributes a partial slice on a cross-node
1666                // split only registers the nodes of its
1667                // actually-used CPUs.
1668                let mut cpus: Vec<usize> = Vec::new();
1669                let mut mems: std::collections::BTreeSet<usize> = std::collections::BTreeSet::new();
1670                'outer: for &idx in &selected {
1671                    let group = &topo.llc_groups[idx];
1672                    for &cpu in &group.cpus {
1673                        if !allowed.contains(&cpu) {
1674                            continue;
1675                        }
1676                        if cpus.len() >= target_cpus {
1677                            break 'outer;
1678                        }
1679                        cpus.push(cpu);
1680                        let node = topo.cpu_to_node.get(&cpu).copied().unwrap_or(0);
1681                        mems.insert(node);
1682                    }
1683                }
1684                return Ok(LlcPlan {
1685                    locked_llcs: selected,
1686                    cpus,
1687                    mems,
1688                    snapshot: snapshots,
1689                    locks,
1690                });
1691            }
1692            None => {
1693                if attempt >= ACQUIRE_MAX_TOCTOU_RETRIES {
1694                    // Rebuild holder diagnostics from a FRESH read so
1695                    // the error points at the peer that actually won.
1696                    let final_snapshots = discover_llc_snapshots(topo, &allowed, &mountinfo)?;
1697                    let holders: Vec<String> = final_snapshots
1698                        .iter()
1699                        .filter(|s| !s.holders.is_empty())
1700                        .map(|s| {
1701                            format!(
1702                                "LLC {}: {}",
1703                                s.llc_idx,
1704                                crate::flock::format_holder_list(&s.holders)
1705                            )
1706                        })
1707                        .collect();
1708                    let holder_text = if holders.is_empty() {
1709                        "<none recorded>".to_string()
1710                    } else {
1711                        holders.join("; ")
1712                    };
1713                    return Err(anyhow::Error::new(ResourceContention {
1714                        reason: format!(
1715                            "acquire_llc_plan: could not reserve {target_cpus} \
1716                             CPU(s) after {attempts} attempts; holders: \
1717                             {holder_text}. Run `ktstr locks --json` to see \
1718                             every ktstr lock on this host.",
1719                            attempts = ACQUIRE_MAX_TOCTOU_RETRIES + 1,
1720                        ),
1721                    }));
1722                }
1723                // Sleep between attempts so a racing peer has time
1724                // to drop its fds before the next DISCOVER. Indexed
1725                // by `attempt` (0..RETRIES) — see TOCTOU_RETRY_DELAYS.
1726                std::thread::sleep(TOCTOU_RETRY_DELAYS[attempt as usize]);
1727                attempt += 1;
1728            }
1729        }
1730    }
1731}
1732
1733/// Parallelism hint for `make -j{N}` when running under an
1734/// [`LlcPlan`] reservation. Returns the flattened host-CPU count
1735/// (`plan.cpus.len()`), clamped to at least 1 so a pathological empty
1736/// plan still produces a runnable command.
1737///
1738/// Rationale: without this hint, `make -j$(nproc)` fans gcc
1739/// children across every online CPU, defeating the --cpu-cap
1740/// reservation — the build escapes the cgroup cpuset in scheduling
1741/// terms even though the kernel enforces CPU membership. Passing
1742/// `plan.cpus.len()` to make keeps gcc's parallel width aligned with
1743/// the reserved capacity.
1744pub fn make_jobs_for_plan(plan: &LlcPlan) -> usize {
1745    plan.cpus.len().max(1)
1746}
1747
1748/// Render selected LLC indices for user-facing warning text.
1749///
1750/// Format is compact and stable: `[0 (node 0), 2 (node 1)]` when the
1751/// host exposes NUMA information, `[0, 2]` on degraded hosts whose
1752/// `cpu_to_node` map is empty. Used by
1753/// [`warn_if_cross_node_spill`] to render the `ktstr: reserving LLCs
1754/// …` message when an `--cpu-cap` plan spills across nodes.
1755pub fn format_llc_list(locked: &[usize], topo: &HostTopology) -> String {
1756    let parts: Vec<String> = locked
1757        .iter()
1758        .map(|&idx| {
1759            if topo.cpu_to_node.is_empty() {
1760                idx.to_string()
1761            } else {
1762                let node = topo.llc_numa_node(idx);
1763                format!("{idx} (node {node})")
1764            }
1765        })
1766        .collect();
1767    format!("[{}]", parts.join(", "))
1768}
1769
1770/// Emit the cross-node spill warning when an `--cpu-cap` plan's
1771/// `mems` set spans more than one NUMA node. No-op for single-node
1772/// plans.
1773///
1774/// `eprintln!`, not `tracing::warn!`: this is user-visible
1775/// UX feedback (the operator picked a cap that couldn't fit in one
1776/// NUMA node), not operational instrumentation. Fires at most once
1777/// per plan — there is nothing in the plan lifecycle that causes a
1778/// re-trigger. Single-node plans (including single-socket hosts and
1779/// caps that fit within a single node) never emit.
1780///
1781/// Placement: called by `kernel_build_pipeline` and friends right
1782/// after [`acquire_llc_plan`] returns, before the sandbox mount.
1783/// Extracting this into a helper rather than inlining at the call
1784/// site lets the message body be unit-tested via
1785/// [`cross_node_spill_warning`] without capturing stderr.
1786pub fn warn_if_cross_node_spill(plan: &LlcPlan, topo: &HostTopology) {
1787    if let Some(msg) = cross_node_spill_warning(plan, topo) {
1788        eprintln!("{msg}");
1789    }
1790}
1791
1792/// Build the cross-node spill warning string for `plan`, or `None`
1793/// when the plan fits within a single NUMA node (the suppression
1794/// case). [`warn_if_cross_node_spill`] is a thin wrapper that
1795/// `eprintln!`s the `Some` value; this function holds the actual
1796/// gate-and-format logic so a test can pin both halves — the
1797/// predicate gate AND the rendered message — without a stderr
1798/// capture seam. The returned string is exactly the bytes
1799/// `warn_if_cross_node_spill` would emit (sans the trailing newline
1800/// that `eprintln!` appends).
1801fn cross_node_spill_warning(plan: &LlcPlan, topo: &HostTopology) -> Option<String> {
1802    if !should_warn_cross_node(&plan.mems) {
1803        return None;
1804    }
1805    Some(format!(
1806        "ktstr: reserving LLCs {list} across {n} NUMA nodes \
1807         (preferred single-node contiguous unavailable). Build \
1808         will run; memory-access latency may be higher.",
1809        list = format_llc_list(&plan.locked_llcs, topo),
1810        n = plan.mems.len(),
1811    ))
1812}
1813
/// Pure predicate behind [`warn_if_cross_node_spill`]: `true` exactly
/// when `mems` names more than one NUMA node. Empty and single-node
/// sets yield `false`, which is what suppresses the warning for
/// single-node plans.
///
/// Kept separate so tests can pin the polarity of the single-node /
/// multi-node decision without capturing stderr. A refactor that
/// accidentally flipped the comparison would either warn on every
/// plan (noise) or never warn (silent cost).
fn should_warn_cross_node(mems: &std::collections::BTreeSet<usize>) -> bool {
    match mems.len() {
        0 | 1 => false,
        _ => true,
    }
}
1828
/// Warning text when the effective host-CPU budget is below the guest's
/// vCPU count, else `None`. Under `effective_host_cpus < vcpus` the host
/// time-slices the vCPU threads, so absolute work scales ~`1/oversub` and
/// guest-scheduler timing metrics (run_delay, off-CPU, wake latency, gaps)
/// become host-contention artifacts — the silent-wrong-answer class the
/// no-perf budget sizing guards against (see [`no_perf_cpu_budget`]).
///
/// `explicit` splits severity: a per-test `cpu_budget` / `--cpu-cap` below
/// the vCPU count is a deliberate opt-in (the test asked to oversubscribe —
/// an INFO note), whereas an auto-collapse (the calling process's cpuset is
/// smaller than the vCPU count, so [`no_perf_cpu_budget`]'s
/// `vcpus.min(allowed)` floored the budget to the allowed set) is the
/// truly-silent case nothing opted into — a louder WARNING.
///
/// `watchdog_secs` appends the tight-watchdog false-eject caveat when the
/// timeout is present and at most 5 seconds.
///
/// The oversubscription ratio is `vcpus / max(effective_host_cpus, 1)`,
/// so a zero budget does not divide by zero.
///
/// Pure (returns the text) so a test pins the message + the
/// `None`-when-not-oversubscribed polarity without capturing stderr; the
/// caller eprintln's it once at build time, mirroring
/// [`warn_if_cross_node_spill`].
pub(crate) fn overcommit_warning(
    effective_host_cpus: usize,
    vcpus: usize,
    explicit: bool,
    watchdog_secs: Option<u64>,
) -> Option<String> {
    /// Watchdog timeouts at or below this many seconds get the caveat.
    const TIGHT_WATCHDOG_SECS: u64 = 5;

    if effective_host_cpus >= vcpus {
        return None;
    }
    let oversub = vcpus as f64 / effective_host_cpus.max(1) as f64;
    let mut msg = if explicit {
        format!(
            "ktstr: cpu_budget {effective_host_cpus} host CPUs < {vcpus} vCPUs \
             ({oversub:.1}x oversubscription, opt-in): the host time-slices the \
             vCPU threads, so absolute iterations scale ~1/{oversub:.0} and \
             guest-scheduler timing metrics (run_delay, off-CPU, wake latency, \
             gaps) are host-contention artifacts. Use worst_iterations_per_cpu_sec \
             for an overcommit-invariant per-cgroup rate."
        )
    } else {
        format!(
            "ktstr: WARNING: only {effective_host_cpus} host CPUs available for \
             {vcpus} vCPUs ({oversub:.1}x oversubscription) — the process cpuset \
             is smaller than the guest, so the auto-sized CPU budget collapsed \
             to it. NOTHING opted into this. The host time-slices the vCPU \
             threads, confounding guest-scheduler measurement (absolute work \
             scales ~1/{oversub:.0}; timing metrics are host artifacts). Widen \
             the process cpuset, or shrink the guest topology."
        )
    };
    // Bind the timeout and test it in one step instead of checking the
    // Option and then unwrapping it.
    if let Some(w) = watchdog_secs.filter(|&secs| secs <= TIGHT_WATCHDOG_SECS) {
        msg.push_str(&format!(
            " Also: watchdog_timeout_s={w} is tight under oversubscription — a \
             host-descheduled vCPU can trip the scheduler watchdog (false stall)."
        ));
    }
    Some(msg)
}
1887
/// Decide whether [`mbind_to_nodes`] has nothing to do and must return
/// before touching `addr` or issuing the `mbind(2)` syscall.
///
/// Returns `true` when the node set is empty (no policy target) or the
/// region is zero-length, and `false` only when there is both a
/// non-empty region and at least one target node. This is the exact
/// guard [`mbind_to_nodes`] consults; keeping it a pure predicate lets
/// the short-circuit decision be asserted directly instead of inferred
/// from a call that merely did not crash.
fn mbind_should_skip(len: usize, nodes: &[usize]) -> bool {
    let has_region = len != 0;
    let has_targets = !nodes.is_empty();
    !(has_region && has_targets)
}
1898
1899/// Bind a memory region to specific NUMA nodes using `mbind(MPOL_BIND)`.
1900/// `nodes` is the set of NUMA node IDs. Logs a warning on error
1901/// (single-node systems, missing capabilities).
1902///
1903/// # Safety
1904///
1905/// The caller must ensure that `addr` points to a valid mmap'd region
1906/// of at least `len` bytes. The kernel will read this range via the
1907/// `mbind(2)` syscall to set its NUMA memory policy; passing a stale,
1908/// unmapped, or out-of-bounds pointer is undefined behavior from the
1909/// process's perspective (the syscall itself returns EFAULT, but the
1910/// surrounding Rust contract is violated).
1911///
1912/// When `nodes.is_empty()` or `len == 0`, the function short-circuits
1913/// without dereferencing `addr`, so a null or dangling pointer is
1914/// permitted in those cases.
1915pub unsafe fn mbind_to_nodes(addr: *mut u8, len: usize, nodes: &[usize]) {
1916    if mbind_should_skip(len, nodes) {
1917        return;
1918    }
1919    let node_set: std::collections::BTreeSet<usize> = nodes.iter().copied().collect();
1920    let (nodemask, maxnode) = crate::workload::build_nodemask(&node_set);
1921
1922    let rc = unsafe {
1923        libc::syscall(
1924            libc::SYS_mbind,
1925            addr as *mut libc::c_void,
1926            len,
1927            libc::MPOL_BIND,
1928            nodemask.as_ptr(),
1929            maxnode,
1930            0u32,
1931        )
1932    };
1933    if rc == 0 {
1934        eprintln!(
1935            "performance_mode: mbind {} MB to NUMA node(s) {:?}",
1936            len >> 20,
1937            nodes,
1938        );
1939    } else {
1940        let err = std::io::Error::last_os_error();
1941        eprintln!(
1942            "performance_mode: WARNING: mbind to node(s) {:?} failed: {err}",
1943            nodes,
1944        );
1945    }
1946}
1947
1948use crate::topology::parse_cpu_list_lenient;
1949
1950/// Number of free 2MB hugepages on the host.
1951pub fn hugepages_free() -> u64 {
1952    hugepages_free_from(std::path::Path::new(
1953        "/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages",
1954    ))
1955}
1956
/// Path-parameterized core of [`hugepages_free`].
///
/// Reads the `free_hugepages` file at `path`, trims surrounding
/// whitespace, and parses the remainder as a `u64`. Returns 0 when
/// the file cannot be read (absent, unreadable) or its contents do
/// not parse as a `u64`. The path seam lets the parse and the
/// 0-fallback be tested against fixture files without depending on
/// the host's hugetlbfs configuration.
fn hugepages_free_from(path: &std::path::Path) -> u64 {
    let Ok(contents) = std::fs::read_to_string(path) else {
        return 0;
    };
    match contents.trim().parse::<u64>() {
        Ok(free_pages) => free_pages,
        Err(_) => 0,
    }
}
1969
/// Number of 2 MiB hugepages required to back `memory_mib` MiB of
/// memory: `memory_mib / 2`, rounded up so an odd size still gets a
/// final page.
pub fn hugepages_needed(memory_mib: u32) -> u64 {
    let mib = u64::from(memory_mib);
    // Whole 2 MiB pages, plus one more for a leftover MiB.
    mib / 2 + mib % 2
}
1975
1976/// Estimate current host CPU load by checking /proc/stat.
1977/// Returns (busy_cpus, total_cpus) as a rough estimate.
1978pub fn host_load_estimate() -> Option<(usize, usize)> {
1979    // Count processes in R state from /proc/stat.
1980    let stat = std::fs::read_to_string("/proc/stat").ok()?;
1981    let procs_running = stat
1982        .lines()
1983        .find(|l| l.starts_with("procs_running "))?
1984        .split_whitespace()
1985        .nth(1)?
1986        .parse::<usize>()
1987        .ok()?;
1988    let online = std::fs::read_to_string("/sys/devices/system/cpu/online").ok()?;
1989    let total = parse_cpu_list_lenient(&online).len();
1990    Some((procs_running, total))
1991}
1992
1993#[cfg(test)]
1994mod tests;