ktstr/test_support/host_class.rs
1//! Shared classification of test-body host-insufficiency errors.
2//!
3//! Single source of truth for the guard ORDER and the per-class
4//! skip/fail policy applied to the typed host-resource errors a test
5//! build/run can surface. Both consumers call [`classify_host_error`]
6//! and only choose the rendering:
7//! - `err_to_exit_code` (in `super::dispatch`) maps [`HostClass`] to a
8//! process exit code (skip → `EXIT_PASS`, fail → `EXIT_FAIL`).
9//! - the `#[ktstr_test]` macro body maps it to libtest control flow
10//! (skip → `eprintln!` + `return`, fail → `panic!`).
11//!
12//! Keeping the classification here — not duplicated in each consumer —
13//! means a reorder or a new host-class type is a one-function edit both
14//! sites inherit, eliminating the dispatch-vs-codegen guard-order
15//! divergence (the two sites previously ordered the same guards
16//! differently, correct only by the types' mutual exclusivity).
17//!
18//! Scope: the SIX host-insufficiency types BOTH sites classify —
19//! [`KernelUnavailable`] (no kernel image resolved — the harness cannot
20//! boot a VM here), [`PerfModeUnavailable`], [`CpuBudgetUnsatisfiable`],
21//! [`TopologyUnrepresentable`], [`ResourceContention`], and
22//! [`TopologyInsufficient`]. A `KernelUnavailable` reaches this classifier
23//! on every nextest invocation: nextest suppresses the plain `#[test]`
24//! wrapper, so the entry runs as `ktstr/{name}` via the `--exact` dispatch
25//! → `run_named_test` → `err_to_exit_code`, NOT the macro body. It is a SKIP
26//! by default — a developer running `cargo nextest run`, or `cargo ktstr
27//! test` without `--kernel`, on a kernel-less host gets a clean skip rather
28//! than a hard fail on every entry — promoted to a FAIL under
29//! `KTSTR_NO_SKIP_MODE`. This cannot mask a CI kernel-build failure: a
30//! `--kernel` the orchestrator FAILS to build bails in cargo-ktstr
31//! (`resolve_kernel_set`) before nextest is spawned, so `KernelUnavailable`
32//! here only ever means "no kernel was requested", never "a requested
33//! kernel failed to build".
34//!
35//! [`PerfModeUnavailable`]: crate::vmm::host_topology::PerfModeUnavailable
36//! [`CpuBudgetUnsatisfiable`]: crate::vmm::host_topology::CpuBudgetUnsatisfiable
37//! [`TopologyUnrepresentable`]: crate::vmm::host_topology::TopologyUnrepresentable
38//! [`ResourceContention`]: crate::vmm::host_topology::ResourceContention
39//! [`TopologyInsufficient`]: crate::vmm::host_topology::TopologyInsufficient
40//! [`KernelUnavailable`]: crate::test_support::eval::KernelUnavailable
41
42use super::{
43 is_cpu_budget_unsatisfiable, is_kernel_unavailable, is_perf_mode_unavailable,
44 is_resource_contention, is_topology_insufficient, is_topology_unrepresentable,
45};
46use crate::test_support::eval::KernelUnavailable;
47use crate::vmm::host_topology::{
48 CpuBudgetUnsatisfiable, PerfModeUnavailable, ResourceContention, TopologyInsufficient,
49 TopologyUnrepresentable,
50};
51
/// Verdict produced by [`classify_host_error`] for a test-body error.
///
/// Every `reason` is a BARE message: it never begins with
/// `ktstr: SKIP:` or `ktstr: FAIL:`. Adding the prefix is the
/// consumer's job, done in whichever channel that consumer reports
/// through — dispatch hands [`Skip`](HostClass::Skip) to
/// `report::test_skip` (which prepends `ktstr: SKIP:`) and
/// `eprintln!`s [`Fail`](HostClass::Fail) as `ktstr: FAIL: {reason}`,
/// while the macro `eprintln!`s the skip and `panic!`s the fail using
/// the same two prefixes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum HostClass {
    /// The error is none of the six host-insufficiency types, so the
    /// consumer falls through to its own per-site handling (dispatch:
    /// the `PostVmAssertionFailure` / `ExpectAutoReproSatisfied` /
    /// `expect_err` / catch-all arms; macro: the `expect_err` swallow
    /// or the `expect_ok` panic).
    NotHostClass,
    /// A visible, non-failing skip: a skip-class host condition was
    /// found and `KTSTR_NO_SKIP_MODE` is unset.
    Skip {
        /// Bare, prefix-free explanation of why the test was skipped.
        reason: String,
    },
    /// A hard failure. Produced either by a type that always fails
    /// (`CpuBudgetUnsatisfiable` / `TopologyUnrepresentable`) or by a
    /// skip-class type promoted to a failure under
    /// `KTSTR_NO_SKIP_MODE`.
    Fail {
        /// Bare, prefix-free explanation of why the test failed.
        reason: String,
    },
}
78
79/// Walk the error chain for a `T` cause and clone its reason string.
80///
81/// Chain-aware (mirrors the `is_*` predicates): a typed error wrapped in
82/// `.context(...)` (e.g. the eval-layer `"build ktstr_test VM"` /
83/// `"run ktstr_test VM"` wrappers) is still found. Falls back to
84/// `"<unknown>"` if the cause is somehow absent — only reachable if an
85/// `is_*` predicate matched a `T` this extractor then missed, which the
86/// shared chain walk makes impossible in practice.
87fn extract_reason<T, F>(e: &anyhow::Error, reason: F) -> String
88where
89 T: std::error::Error + Send + Sync + 'static,
90 F: Fn(&T) -> String,
91{
92 e.chain()
93 .find_map(|cause| cause.downcast_ref::<T>().map(&reason))
94 .unwrap_or_else(|| "<unknown>".to_string())
95}
96
97/// Classify a test-body error against the host-insufficiency taxonomy.
98///
99/// `no_skip` is `KTSTR_NO_SKIP_MODE` — passed in (not read from the
100/// environment here) so the function stays pure and unit-testable
101/// without env mutation. Each caller reads the env once
102/// (`result_to_exit_code` for dispatch; the generated body for the
103/// macro) and threads it in.
104///
105/// The guard ORDER and the per-class skip/fail policy below are the
106/// single source of truth shared by both consumers. `expect_err` is
107/// deliberately NOT a parameter: a host-class outcome is invariant under
108/// it (a skip stays a skip, a hard fail stays a hard fail) — `expect_err`
109/// is a test-outcome concern each consumer handles after a
110/// [`HostClass::NotHostClass`] result. The `reason` strings reconstruct
111/// the exact banners the two sites emitted before this was extracted
112/// (minus the prefix, which the consumer adds).
113pub fn classify_host_error(e: &anyhow::Error, no_skip: bool) -> HostClass {
114 if is_kernel_unavailable(e) {
115 // No kernel image resolved: the harness cannot boot a VM here (the
116 // binary was run outside `cargo ktstr test`, or `cargo ktstr test`
117 // was run without `--kernel` on a host with no cached/discoverable
118 // kernel). A skip by default — a missing kernel on the runner is a
119 // "not configured here" condition, not a test failure — promoted to
120 // a FAIL under KTSTR_NO_SKIP_MODE for runs that demand execution. A
121 // requested-but-unbuildable `--kernel` bails in cargo-ktstr before
122 // nextest spawns, so this never masks a CI kernel-build failure.
123 let reason = extract_reason::<KernelUnavailable, _>(e, |k| k.diagnostic.clone());
124 return if no_skip {
125 HostClass::Fail {
126 reason: format!(
127 "harness not configured under --no-skip-mode: {reason}. \
128 Provide a kernel via --kernel or KTSTR_TEST_KERNEL, or drop \
129 --no-skip-mode."
130 ),
131 }
132 } else {
133 HostClass::Skip {
134 reason: format!("harness not configured: {reason}"),
135 }
136 };
137 }
138 if is_perf_mode_unavailable(e) {
139 let reason = extract_reason::<PerfModeUnavailable, _>(e, |p| p.reason.clone());
140 return if no_skip {
141 HostClass::Fail {
142 reason: format!(
143 "performance mode unavailable under --no-skip-mode: {reason}. \
144 Provision a host with the required CPU / LLC count, narrow the \
145 test topology, or drop --perf-mode / --no-skip-mode."
146 ),
147 }
148 } else {
149 HostClass::Skip {
150 reason: format!("performance mode unavailable: {reason}"),
151 }
152 };
153 }
154 if is_cpu_budget_unsatisfiable(e) {
155 let reason = extract_reason::<CpuBudgetUnsatisfiable, _>(e, |b| b.reason.clone());
156 return HostClass::Fail {
157 reason: format!("cpu budget unsatisfiable: {reason}"),
158 };
159 }
160 if is_topology_unrepresentable(e) {
161 let reason = extract_reason::<TopologyUnrepresentable, _>(e, |t| t.reason.clone());
162 return HostClass::Fail {
163 reason: format!("topology unrepresentable: {reason}"),
164 };
165 }
166 if is_resource_contention(e) {
167 let reason = extract_reason::<ResourceContention, _>(e, |rc| rc.reason.clone());
168 return if no_skip {
169 HostClass::Fail {
170 reason: format!(
171 "resource contention under --no-skip-mode: {reason}. \
172 Either provision hardware that satisfies the test's topology \
173 requirement, or drop --no-skip-mode / KTSTR_NO_SKIP_MODE to \
174 accept the skip."
175 ),
176 }
177 } else {
178 HostClass::Skip {
179 reason: format!("resource contention: {reason}"),
180 }
181 };
182 }
183 if is_topology_insufficient(e) {
184 let reason = extract_reason::<TopologyInsufficient, _>(e, |ti| ti.reason.clone());
185 return if no_skip {
186 HostClass::Fail {
187 reason: format!(
188 "host topology insufficient under --no-skip-mode: {reason}. \
189 Either provision a host with the required CPU / LLC count, or drop \
190 --no-skip-mode / KTSTR_NO_SKIP_MODE to accept the skip."
191 ),
192 }
193 } else {
194 HostClass::Skip {
195 reason: format!("host topology insufficient: {reason}"),
196 }
197 };
198 }
199 HostClass::NotHostClass
200}
201
#[cfg(test)]
mod tests {
    use super::*;

    /// A no-kernel host (KernelUnavailable) skips by default — a missing
    /// kernel on the runner is "not configured here", not a test failure —
    /// and is promoted to a hard fail under `no_skip`. The bare reason is
    /// the extracted diagnostic.
    #[test]
    fn kernel_unavailable_skip_then_fail() {
        let mk = || {
            anyhow::Error::new(KernelUnavailable {
                diagnostic: "no kernel image resolved".into(),
            })
        };
        match classify_host_error(&mk(), false) {
            HostClass::Skip { reason } => {
                assert_eq!(reason, "harness not configured: no kernel image resolved");
            }
            other => panic!("expected Skip, got {other:?}"),
        }
        match classify_host_error(&mk(), true) {
            HostClass::Fail { reason } => {
                assert!(reason.starts_with("harness not configured under --no-skip-mode:"));
                assert!(reason.contains("no kernel image resolved"));
            }
            other => panic!("expected Fail, got {other:?}"),
        }
    }

    /// A perf-mode-too-small error skips by default and is promoted to a
    /// hard fail only under `no_skip`. The reason text is the bare,
    /// prefix-free form each consumer renders.
    #[test]
    fn perf_mode_unavailable_skip_then_fail() {
        let mk = || {
            anyhow::Error::new(PerfModeUnavailable {
                reason: "host too small for perf topology".into(),
            })
        };
        match classify_host_error(&mk(), false) {
            HostClass::Skip { reason } => {
                assert_eq!(
                    reason,
                    "performance mode unavailable: host too small for perf topology"
                );
            }
            other => panic!("expected Skip, got {other:?}"),
        }
        match classify_host_error(&mk(), true) {
            HostClass::Fail { reason } => {
                assert!(reason.starts_with("performance mode unavailable under --no-skip-mode:"));
                assert!(reason.contains("host too small for perf topology"));
            }
            other => panic!("expected Fail, got {other:?}"),
        }
    }

    /// Resource contention: skip default, fail under `no_skip`. The
    /// promoted failure must still carry the extracted reason — a
    /// prefix-only check would let a `"<unknown>"` regression through.
    #[test]
    fn resource_contention_skip_then_fail() {
        let mk = || {
            anyhow::Error::new(ResourceContention {
                reason: "all 3 LLC slots busy".into(),
            })
        };
        assert_eq!(
            classify_host_error(&mk(), false),
            HostClass::Skip {
                reason: "resource contention: all 3 LLC slots busy".into()
            }
        );
        match classify_host_error(&mk(), true) {
            HostClass::Fail { reason } => {
                assert!(reason.starts_with("resource contention under --no-skip-mode:"));
                assert!(reason.contains("all 3 LLC slots busy"));
            }
            other => panic!("expected Fail, got {other:?}"),
        }
    }

    /// Topology insufficient: skip default, fail under `no_skip`. The
    /// promoted failure must still carry the extracted reason.
    #[test]
    fn topology_insufficient_skip_then_fail() {
        let mk = || {
            anyhow::Error::new(TopologyInsufficient {
                reason: "host has too few CPUs".into(),
            })
        };
        assert_eq!(
            classify_host_error(&mk(), false),
            HostClass::Skip {
                reason: "host topology insufficient: host has too few CPUs".into()
            }
        );
        match classify_host_error(&mk(), true) {
            HostClass::Fail { reason } => {
                assert!(reason.starts_with("host topology insufficient under --no-skip-mode:"));
                assert!(reason.contains("host has too few CPUs"));
            }
            other => panic!("expected Fail, got {other:?}"),
        }
    }

    /// Cpu-budget-unsatisfiable is an UNCONDITIONAL hard fail — `no_skip`
    /// does not change it (it is already a failure).
    #[test]
    fn cpu_budget_unsatisfiable_always_fails() {
        let mk = || {
            anyhow::Error::new(CpuBudgetUnsatisfiable {
                reason: "--cpu-cap exceeds allowed CPUs".into(),
            })
        };
        for no_skip in [false, true] {
            match classify_host_error(&mk(), no_skip) {
                HostClass::Fail { reason } => {
                    assert_eq!(
                        reason,
                        "cpu budget unsatisfiable: --cpu-cap exceeds allowed CPUs"
                    );
                }
                other => panic!("expected Fail (no_skip={no_skip}), got {other:?}"),
            }
        }
    }

    /// Topology-unrepresentable is an UNCONDITIONAL hard fail. The full
    /// banner is pinned (as for cpu-budget) so the extracted reason
    /// cannot silently go missing.
    #[test]
    fn topology_unrepresentable_always_fails() {
        let mk = || {
            anyhow::Error::new(TopologyUnrepresentable {
                reason: "aarch64 vcpus exceed GICv3 redistributor capacity".into(),
            })
        };
        for no_skip in [false, true] {
            match classify_host_error(&mk(), no_skip) {
                HostClass::Fail { reason } => {
                    assert_eq!(
                        reason,
                        "topology unrepresentable: \
                         aarch64 vcpus exceed GICv3 redistributor capacity"
                    );
                }
                other => panic!("expected Fail (no_skip={no_skip}), got {other:?}"),
            }
        }
    }

    /// A plain (non-typed) error is NOT host-class — the classifier
    /// returns `NotHostClass` for either `no_skip` value so each
    /// consumer's own marker / expect_err / catch-all handling runs. A
    /// classifier that swallowed these would erase real failures. The
    /// typed test-outcome markers are not constructed here; only the
    /// untyped case is exercised.
    #[test]
    fn non_host_error_is_not_host_class() {
        let plain = anyhow::anyhow!("scheduler regression: workload did not get the CPU it needs");
        assert_eq!(classify_host_error(&plain, false), HostClass::NotHostClass);
        assert_eq!(classify_host_error(&plain, true), HostClass::NotHostClass);
    }

    /// Chain-aware: a typed error wrapped in `.context(...)` (the
    /// production shape — the eval layer wraps every build/run error)
    /// still classifies, and the extracted reason is the inner typed
    /// reason, NOT the wrapping context layer. Both the skip and the
    /// promoted-fail paths are checked.
    #[test]
    fn classifies_through_context_wrap() {
        let mk = || {
            anyhow::Error::new(ResourceContention {
                reason: "all 3 LLC slots busy".into(),
            })
            .context("build ktstr_test VM")
            .context("run ktstr_test VM")
        };
        assert_eq!(
            classify_host_error(&mk(), false),
            HostClass::Skip {
                reason: "resource contention: all 3 LLC slots busy".into()
            }
        );
        match classify_host_error(&mk(), true) {
            HostClass::Fail { reason } => {
                assert!(reason.starts_with("resource contention under --no-skip-mode:"));
                assert!(reason.contains("all 3 LLC slots busy"));
                // The context layers must not leak into the bare reason.
                assert!(!reason.contains("ktstr_test VM"));
            }
            other => panic!("expected Fail, got {other:?}"),
        }
    }
}