ktstr/test_support/
host_class.rs

1//! Shared classification of test-body host-insufficiency errors.
2//!
3//! Single source of truth for the guard ORDER and the per-class
4//! skip/fail policy applied to the typed host-resource errors a test
5//! build/run can surface. Both consumers call [`classify_host_error`]
6//! and only choose the rendering:
7//! - `err_to_exit_code` (in `super::dispatch`) maps [`HostClass`] to a
8//!   process exit code (skip → `EXIT_PASS`, fail → `EXIT_FAIL`).
9//! - the `#[ktstr_test]` macro body maps it to libtest control flow
10//!   (skip → `eprintln!` + `return`, fail → `panic!`).
11//!
12//! Keeping the classification here — not duplicated in each consumer —
13//! means a reorder or a new host-class type is a one-function edit both
14//! sites inherit, eliminating the dispatch-vs-codegen guard-order
15//! divergence (the two sites previously ordered the same guards
16//! differently, correct only by the types' mutual exclusivity).
17//!
18//! Scope: the SIX host-insufficiency types BOTH sites classify —
19//! [`KernelUnavailable`] (no kernel image resolved — the harness cannot
20//! boot a VM here), [`PerfModeUnavailable`], [`CpuBudgetUnsatisfiable`],
21//! [`TopologyUnrepresentable`], [`ResourceContention`], and
22//! [`TopologyInsufficient`]. A `KernelUnavailable` reaches this classifier
23//! on every nextest invocation: nextest suppresses the plain `#[test]`
24//! wrapper, so the entry runs as `ktstr/{name}` via the `--exact` dispatch
25//! → `run_named_test` → `err_to_exit_code`, NOT the macro body. It is a SKIP
26//! by default — a developer running `cargo nextest run`, or `cargo ktstr
27//! test` without `--kernel`, on a kernel-less host gets a clean skip rather
28//! than a hard fail on every entry — promoted to a FAIL under
29//! `KTSTR_NO_SKIP_MODE`. This cannot mask a CI kernel-build failure: a
30//! `--kernel` the orchestrator FAILS to build bails in cargo-ktstr
31//! (`resolve_kernel_set`) before nextest is spawned, so `KernelUnavailable`
32//! here only ever means "no kernel was requested", never "a requested
33//! kernel failed to build".
34//!
35//! [`PerfModeUnavailable`]: crate::vmm::host_topology::PerfModeUnavailable
36//! [`CpuBudgetUnsatisfiable`]: crate::vmm::host_topology::CpuBudgetUnsatisfiable
37//! [`TopologyUnrepresentable`]: crate::vmm::host_topology::TopologyUnrepresentable
38//! [`ResourceContention`]: crate::vmm::host_topology::ResourceContention
39//! [`TopologyInsufficient`]: crate::vmm::host_topology::TopologyInsufficient
40//! [`KernelUnavailable`]: crate::test_support::eval::KernelUnavailable
41
42use super::{
43    is_cpu_budget_unsatisfiable, is_kernel_unavailable, is_perf_mode_unavailable,
44    is_resource_contention, is_topology_insufficient, is_topology_unrepresentable,
45};
46use crate::test_support::eval::KernelUnavailable;
47use crate::vmm::host_topology::{
48    CpuBudgetUnsatisfiable, PerfModeUnavailable, ResourceContention, TopologyInsufficient,
49    TopologyUnrepresentable,
50};
51
/// Verdict produced by [`classify_host_error`] for a test-body error.
///
/// Every `reason` carried here is a BARE message: it has neither a
/// `ktstr: SKIP:` nor a `ktstr: FAIL:` prefix. The prefix belongs to the
/// consumer's output channel — dispatch hands a
/// [`Skip`](HostClass::Skip) to `report::test_skip` (which prepends
/// `ktstr: SKIP:`) and `eprintln!`s a [`Fail`](HostClass::Fail) as
/// `ktstr: FAIL: {reason}`; the macro body `eprintln!`s the skip and
/// `panic!`s the fail using the same two prefixes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HostClass {
    /// The error is none of the six host-insufficiency types, so the
    /// classifier has no opinion and the consumer falls through to its
    /// own per-site handling (dispatch: the `PostVmAssertionFailure` /
    /// `ExpectAutoReproSatisfied` / `expect_err` / catch-all arms;
    /// macro: the `expect_err` swallow or the `expect_ok` panic).
    NotHostClass,
    /// A skip-class host condition was found while `KTSTR_NO_SKIP_MODE`
    /// is unset: the test is reported as a visible, non-failing skip.
    Skip { reason: String },
    /// A hard failure. Produced either by a type that always fails
    /// (`CpuBudgetUnsatisfiable` / `TopologyUnrepresentable`) or by a
    /// skip-class type that `KTSTR_NO_SKIP_MODE` promoted to a failure.
    Fail { reason: String },
}
78
79/// Walk the error chain for a `T` cause and clone its reason string.
80///
81/// Chain-aware (mirrors the `is_*` predicates): a typed error wrapped in
82/// `.context(...)` (e.g. the eval-layer `"build ktstr_test VM"` /
83/// `"run ktstr_test VM"` wrappers) is still found. Falls back to
84/// `"<unknown>"` if the cause is somehow absent — only reachable if an
85/// `is_*` predicate matched a `T` this extractor then missed, which the
86/// shared chain walk makes impossible in practice.
87fn extract_reason<T, F>(e: &anyhow::Error, reason: F) -> String
88where
89    T: std::error::Error + Send + Sync + 'static,
90    F: Fn(&T) -> String,
91{
92    e.chain()
93        .find_map(|cause| cause.downcast_ref::<T>().map(&reason))
94        .unwrap_or_else(|| "<unknown>".to_string())
95}
96
97/// Classify a test-body error against the host-insufficiency taxonomy.
98///
99/// `no_skip` is `KTSTR_NO_SKIP_MODE` — passed in (not read from the
100/// environment here) so the function stays pure and unit-testable
101/// without env mutation. Each caller reads the env once
102/// (`result_to_exit_code` for dispatch; the generated body for the
103/// macro) and threads it in.
104///
105/// The guard ORDER and the per-class skip/fail policy below are the
106/// single source of truth shared by both consumers. `expect_err` is
107/// deliberately NOT a parameter: a host-class outcome is invariant under
108/// it (a skip stays a skip, a hard fail stays a hard fail) — `expect_err`
109/// is a test-outcome concern each consumer handles after a
110/// [`HostClass::NotHostClass`] result. The `reason` strings reconstruct
111/// the exact banners the two sites emitted before this was extracted
112/// (minus the prefix, which the consumer adds).
113pub fn classify_host_error(e: &anyhow::Error, no_skip: bool) -> HostClass {
114    if is_kernel_unavailable(e) {
115        // No kernel image resolved: the harness cannot boot a VM here (the
116        // binary was run outside `cargo ktstr test`, or `cargo ktstr test`
117        // was run without `--kernel` on a host with no cached/discoverable
118        // kernel). A skip by default — a missing kernel on the runner is a
119        // "not configured here" condition, not a test failure — promoted to
120        // a FAIL under KTSTR_NO_SKIP_MODE for runs that demand execution. A
121        // requested-but-unbuildable `--kernel` bails in cargo-ktstr before
122        // nextest spawns, so this never masks a CI kernel-build failure.
123        let reason = extract_reason::<KernelUnavailable, _>(e, |k| k.diagnostic.clone());
124        return if no_skip {
125            HostClass::Fail {
126                reason: format!(
127                    "harness not configured under --no-skip-mode: {reason}. \
128                     Provide a kernel via --kernel or KTSTR_TEST_KERNEL, or drop \
129                     --no-skip-mode."
130                ),
131            }
132        } else {
133            HostClass::Skip {
134                reason: format!("harness not configured: {reason}"),
135            }
136        };
137    }
138    if is_perf_mode_unavailable(e) {
139        let reason = extract_reason::<PerfModeUnavailable, _>(e, |p| p.reason.clone());
140        return if no_skip {
141            HostClass::Fail {
142                reason: format!(
143                    "performance mode unavailable under --no-skip-mode: {reason}. \
144                     Provision a host with the required CPU / LLC count, narrow the \
145                     test topology, or drop --perf-mode / --no-skip-mode."
146                ),
147            }
148        } else {
149            HostClass::Skip {
150                reason: format!("performance mode unavailable: {reason}"),
151            }
152        };
153    }
154    if is_cpu_budget_unsatisfiable(e) {
155        let reason = extract_reason::<CpuBudgetUnsatisfiable, _>(e, |b| b.reason.clone());
156        return HostClass::Fail {
157            reason: format!("cpu budget unsatisfiable: {reason}"),
158        };
159    }
160    if is_topology_unrepresentable(e) {
161        let reason = extract_reason::<TopologyUnrepresentable, _>(e, |t| t.reason.clone());
162        return HostClass::Fail {
163            reason: format!("topology unrepresentable: {reason}"),
164        };
165    }
166    if is_resource_contention(e) {
167        let reason = extract_reason::<ResourceContention, _>(e, |rc| rc.reason.clone());
168        return if no_skip {
169            HostClass::Fail {
170                reason: format!(
171                    "resource contention under --no-skip-mode: {reason}. \
172                     Either provision hardware that satisfies the test's topology \
173                     requirement, or drop --no-skip-mode / KTSTR_NO_SKIP_MODE to \
174                     accept the skip."
175                ),
176            }
177        } else {
178            HostClass::Skip {
179                reason: format!("resource contention: {reason}"),
180            }
181        };
182    }
183    if is_topology_insufficient(e) {
184        let reason = extract_reason::<TopologyInsufficient, _>(e, |ti| ti.reason.clone());
185        return if no_skip {
186            HostClass::Fail {
187                reason: format!(
188                    "host topology insufficient under --no-skip-mode: {reason}. \
189                     Either provision a host with the required CPU / LLC count, or drop \
190                     --no-skip-mode / KTSTR_NO_SKIP_MODE to accept the skip."
191                ),
192            }
193        } else {
194            HostClass::Skip {
195                reason: format!("host topology insufficient: {reason}"),
196            }
197        };
198    }
199    HostClass::NotHostClass
200}
201
#[cfg(test)]
mod tests {
    use super::*;

    /// Unwrap a `Skip` into its bare reason; any other class fails the
    /// calling test.
    fn skip_reason(class: HostClass) -> String {
        match class {
            HostClass::Skip { reason } => reason,
            other => panic!("expected Skip, got {other:?}"),
        }
    }

    /// Unwrap a `Fail` into its bare reason; any other class fails the
    /// calling test.
    fn fail_reason(class: HostClass) -> String {
        match class {
            HostClass::Fail { reason } => reason,
            other => panic!("expected Fail, got {other:?}"),
        }
    }

    /// `KernelUnavailable` is skip-class: with `no_skip` unset it is a
    /// skip whose bare reason embeds the extracted diagnostic, and with
    /// `no_skip` set it becomes a hard fail that still carries that
    /// diagnostic.
    #[test]
    fn kernel_unavailable_skip_then_fail() {
        let err = anyhow::Error::new(KernelUnavailable {
            diagnostic: "no kernel image resolved".into(),
        });

        let skipped = skip_reason(classify_host_error(&err, false));
        assert_eq!(skipped, "harness not configured: no kernel image resolved");

        let failed = fail_reason(classify_host_error(&err, true));
        assert!(failed.starts_with("harness not configured under --no-skip-mode:"));
        assert!(failed.contains("no kernel image resolved"));
    }

    /// `PerfModeUnavailable` is skip-class: a skip by default, a hard
    /// fail only under `no_skip`. Both reasons are the bare, prefix-free
    /// text the consumers render.
    #[test]
    fn perf_mode_unavailable_skip_then_fail() {
        let err = anyhow::Error::new(PerfModeUnavailable {
            reason: "host too small for perf topology".into(),
        });

        let skipped = skip_reason(classify_host_error(&err, false));
        assert_eq!(
            skipped,
            "performance mode unavailable: host too small for perf topology"
        );

        let failed = fail_reason(classify_host_error(&err, true));
        assert!(failed.starts_with("performance mode unavailable under --no-skip-mode:"));
        assert!(failed.contains("host too small for perf topology"));
    }

    /// `ResourceContention` is skip-class: skip by default, fail under
    /// `no_skip`.
    #[test]
    fn resource_contention_skip_then_fail() {
        let err = anyhow::Error::new(ResourceContention {
            reason: "all 3 LLC slots busy".into(),
        });

        let expected_skip = HostClass::Skip {
            reason: "resource contention: all 3 LLC slots busy".into(),
        };
        assert_eq!(classify_host_error(&err, false), expected_skip);

        let failed = fail_reason(classify_host_error(&err, true));
        assert!(failed.starts_with("resource contention under --no-skip-mode:"));
    }

    /// `TopologyInsufficient` is skip-class: skip by default, fail under
    /// `no_skip`.
    #[test]
    fn topology_insufficient_skip_then_fail() {
        let err = anyhow::Error::new(TopologyInsufficient {
            reason: "host has too few CPUs".into(),
        });

        let expected_skip = HostClass::Skip {
            reason: "host topology insufficient: host has too few CPUs".into(),
        };
        assert_eq!(classify_host_error(&err, false), expected_skip);

        let failed = fail_reason(classify_host_error(&err, true));
        assert!(failed.starts_with("host topology insufficient under --no-skip-mode:"));
    }

    /// `CpuBudgetUnsatisfiable` fails UNCONDITIONALLY: the verdict and
    /// the reason are the same for both values of `no_skip`.
    #[test]
    fn cpu_budget_unsatisfiable_always_fails() {
        let err = anyhow::Error::new(CpuBudgetUnsatisfiable {
            reason: "--cpu-cap exceeds allowed CPUs".into(),
        });

        for no_skip in [false, true] {
            let class = classify_host_error(&err, no_skip);
            if let HostClass::Fail { reason } = class {
                assert_eq!(
                    reason,
                    "cpu budget unsatisfiable: --cpu-cap exceeds allowed CPUs"
                );
            } else {
                panic!("expected Fail (no_skip={no_skip}), got {class:?}");
            }
        }
    }

    /// `TopologyUnrepresentable` fails UNCONDITIONALLY, whatever
    /// `no_skip` is.
    #[test]
    fn topology_unrepresentable_always_fails() {
        let err = anyhow::Error::new(TopologyUnrepresentable {
            reason: "aarch64 vcpus exceed GICv3 redistributor capacity".into(),
        });

        for no_skip in [false, true] {
            let class = classify_host_error(&err, no_skip);
            if let HostClass::Fail { reason } = class {
                assert!(reason.starts_with("topology unrepresentable:"));
            } else {
                panic!("expected Fail (no_skip={no_skip}), got {class:?}");
            }
        }
    }

    /// A plain (non-typed) error is NOT host-class: the classifier
    /// answers `NotHostClass` for either `no_skip` value, leaving the
    /// consumer's own marker / expect_err / catch-all handling to run. A
    /// classifier that swallowed such errors would erase real failures.
    #[test]
    fn non_host_error_is_not_host_class() {
        let plain = anyhow::anyhow!("scheduler regression: workload did not get the CPU it needs");

        for no_skip in [false, true] {
            assert_eq!(classify_host_error(&plain, no_skip), HostClass::NotHostClass);
        }
    }

    /// Classification is chain-aware: a typed error buried under
    /// `.context(...)` layers (the production shape — the eval layer
    /// wraps every build/run error) is still recognised, and the reason
    /// comes from the inner typed error, NOT from a context layer.
    #[test]
    fn classifies_through_context_wrap() {
        let inner = anyhow::Error::new(ResourceContention {
            reason: "all 3 LLC slots busy".into(),
        });
        let wrapped = inner
            .context("build ktstr_test VM")
            .context("run ktstr_test VM");

        let expected = HostClass::Skip {
            reason: "resource contention: all 3 LLC slots busy".into(),
        };
        assert_eq!(classify_host_error(&wrapped, false), expected);
    }
}