ktstr/test_support/eval/post_vm.rs
//! Host-side post_vm plumbing: the error marker types defined here
//! (ScxBpfErrorMatcherMismatch, SchedulerBuildRefused,
//! PostVmAssertionFailure, HostSkipRequest, ExpectAutoReproSatisfied,
//! SurvivesStormViolated), the conditional/unconditional callback
//! combiner + dispatch, the post_vm_skip helper, and skip-sidecar
//! recording. Split out of eval/mod.rs to keep the module under the
//! size ceiling.
7
8use super::*;
9
/// Zero-sized marker carried as `anyhow::Context` on the failure `Err`
/// raised when an scx_bpf_error matcher
/// ([`crate::assert::Assert::expect_scx_bpf_error_contains`] or
/// [`crate::assert::Assert::expect_scx_bpf_error_matches`]) does not
/// match the captured scheduler log / sched_ext dump corpus.
///
/// In its `expect_err = true` branch, dispatch
/// (`crate::test_support::dispatch::result_to_exit_code`) downcasts the
/// error chain for this marker and, when it is present, declines to
/// flip the verdict to a pass: a reproducer that tripped the WRONG bug
/// has to fail loudly rather than be quietly inverted into "test
/// passed" by `expect_err`. If the marker were absent, the matcher's
/// diagnostic would still reach stderr, but the exit code would take
/// the ordinary expect_err inversion path.
#[derive(Debug, Clone, Copy)]
pub(crate) struct ScxBpfErrorMatcherMismatch;

impl std::fmt::Display for ScxBpfErrorMatcherMismatch {
    /// Writes the fixed diagnostic text for this marker.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(
            "scx_bpf_error matcher mismatch — the reproducer matcher rejected \
             this failure mode; expect_err inversion bypassed",
        )
    }
}

// Empty impl: the marker carries no source error of its own.
impl std::error::Error for ScxBpfErrorMatcherMismatch {}
37
/// Zero-sized marker carried as `anyhow::Context` on the `Err` that
/// `resolve_scheduler` returns when an orchestrated `cargo build -p
/// <sched>` (which is expected to succeed in the non-cargo-test
/// `Discover` path) FAILED and the operator has not set
/// `KTSTR_SCHEDULER_ALLOW_STALE_FALLBACK`. In that situation the
/// resolver refuses to validate the test against a possibly-stale
/// pre-built binary.
///
/// Dispatch (`crate::test_support::dispatch`) downcasts the error chain
/// for this marker and forces a hard FAIL EVEN under
/// `expect_err = true`. The semantic boundary is the same one
/// [`PostVmAssertionFailure`] draws: `expect_err` inverts a GUEST-side
/// expected failure, whereas a build-infra failure is a HOST-side fault
/// and must never pass itself off as the expected guest failure. Absent
/// the marker, an `expect_err` test whose scheduler build broke would
/// silently invert to PASS and re-create the stale-validation hazard
/// the refusal exists to eliminate. Attachment (`anyhow::Context`) and
/// lookup (`downcast_ref` chain-walk) match the sibling markers; the
/// dispatch guard sits with the other host-side hard-fail markers,
/// ahead of the `expect_err` inversion.
#[derive(Debug, Clone, Copy)]
pub(crate) struct SchedulerBuildRefused;

impl std::fmt::Display for SchedulerBuildRefused {
    /// Writes the fixed diagnostic text for this marker.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(
            "scheduler build refused — an orchestrated build expected to \
             succeed failed; refusing to validate against a possibly-stale \
             pre-built binary (expect_err inversion bypassed)",
        )
    }
}

// Empty impl: the marker carries no source error of its own.
impl std::error::Error for SchedulerBuildRefused {}
71
/// Zero-sized marker carried as `anyhow::Context` on the failure `Err`
/// that `run_ktstr_test_inner_impl` produces when a host-side
/// `post_vm` / `post_vm_unconditional` callback returned `Err`. By
/// that point `evaluate_vm_result` has already folded the callback
/// error into the verdict — as an `Other` detail in the parse-success
/// arm, as a message prefix in the parse-fail arms.
///
/// Dispatch (`crate::test_support::dispatch::result_to_exit_code`)
/// downcasts the error chain for this marker and refuses to invert the
/// verdict to a pass, even under `expect_err = true`. The semantic
/// boundary: `expect_err` inverts a GUEST-side expected failure (the
/// scheduler stalled, the workload bailed), while a HOST-side `post_vm`
/// assertion is always honored. Consider a failure-dump render test
/// that provokes an expected stall in order to PRODUCE the dump and
/// then asserts on the dump's contents in `post_vm`: when the dump
/// renders wrong it must fail loudly, not silently invert to "passed"
/// because the stall it leaned on was "expected". Absent the marker,
/// the post_vm diagnostic would reach stderr but the exit code would
/// follow the normal expect_err inversion path (a false PASS).
///
/// Same mechanics as [`ScxBpfErrorMatcherMismatch`]: `anyhow::Context`
/// attachment and a `downcast_ref` chain-walk at the dispatch arm. That
/// arm is placed AFTER the resource-contention / topology skip arms (a
/// skip means the test never ran) and BEFORE the
/// [`ExpectAutoReproSatisfied`] and `expect_err` inversion arms, so a
/// real host-side regression wins over any inversion.
#[derive(Debug, Clone, Copy)]
pub(crate) struct PostVmAssertionFailure;

impl std::fmt::Display for PostVmAssertionFailure {
    /// Writes the fixed diagnostic text for this marker.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(
            "host-side post_vm assertion failed — expect_err inversion bypassed \
             (a host-side check is honored even when the accompanying guest-side \
             failure is expected)",
        )
    }
}

// Empty impl: the marker carries no source error of its own.
impl std::error::Error for PostVmAssertionFailure {}
113
/// Zero-sized marker carried as `anyhow::Context` on a `post_vm` /
/// `post_vm_unconditional` `Err` to ask for a test SKIP rather than a
/// failure. The host-side callback uses it when it has concluded the
/// run is INCONCLUSIVE — the VM could not produce the artifact the
/// assertion needs (e.g. a load-starved VM whose BPF probe never
/// attached, so the failure dump is a placeholder) — as opposed to a
/// real regression. The eval fn detects the marker (context-aware
/// `downcast_ref`, at the `HostSkipRequest` gate) and returns
/// [`crate::assert::AssertResult::skip`] instead of folding the `Err`
/// into the verdict.
///
/// A real [`PostVmAssertionFailure`] in a sibling callback DOMINATES:
/// [`combine_post_vm_errs`] keeps the skip marker only when BOTH
/// callbacks request skip, or when only one callback produced an `Err`.
/// A genuine failure next to a skip request collapses to a failure, so
/// a skip request can never mask a regression.
#[derive(Debug, Clone, Copy)]
pub(crate) struct HostSkipRequest;

impl std::fmt::Display for HostSkipRequest {
    /// Writes the fixed diagnostic text for this marker.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(
            "host-side post_vm requested skip — the run is inconclusive \
             (the VM could not produce the artifact the assertion needs)",
        )
    }
}

// Empty impl: the marker carries no source error of its own.
impl std::error::Error for HostSkipRequest {}
144
/// Zero-sized marker carried as `anyhow::Context` on the failure `Err`
/// that `evaluate_vm_result` produces once
/// [`apply_expect_auto_repro_inversion`] has set
/// `result.expect_auto_repro_satisfied = true` — that is, the primary
/// VM produced a Fail AND a shape-valid `.repro.wprof.pb` artifact
/// from the auto-repro VM landed on disk.
///
/// Dispatch (`crate::test_support::dispatch::result_to_exit_code`)
/// downcasts the error chain for this marker and routes the verdict to
/// `EXIT_PASS`. The underlying `AssertResult` is NOT mutated: the
/// original failure detail keeps surfacing in stderr/dump rendering, so
/// an operator chasing why `expect_auto_repro` fired sees the original
/// failure trail alongside the inversion notice.
#[derive(Debug, Clone, Copy)]
pub(crate) struct ExpectAutoReproSatisfied;

impl std::fmt::Display for ExpectAutoReproSatisfied {
    /// Writes the fixed inversion notice for this marker.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(
            "expect_auto_repro satisfied — the primary test failed and the \
             auto-repro VM produced a shape-valid .repro.wprof.pb artifact; \
             verdict inverted to PASS",
        )
    }
}

// Empty impl: the marker carries no source error of its own.
impl std::error::Error for ExpectAutoReproSatisfied {}
173
/// Zero-sized marker carried as `anyhow::Context` on the failure `Err`
/// that [`render_failure_verdict_message`] builds when
/// `entry.survives_storm` is set AND the failing [`AssertResult`]
/// carries a scheduler-death `DetailKind` (`SchedulerCrashed` /
/// `SchedulerExitedCleanly` / `SchedulerDiedUnknownReason`).
///
/// `err_to_exit_code` downcasts it and forces `EXIT_FAIL` with a
/// survival-specific explainer. The arm sits BEFORE the
/// [`ExpectAutoReproSatisfied`] / `expect_err` inversion arms so that a
/// survival violation can never be inverted to PASS. The validate-time
/// `survives_storm`/`expect_err` mutex already forbids that
/// combination; the ordering is defense-in-depth. Mirrors
/// [`ScxBpfErrorMatcherMismatch`].
#[derive(Debug, Clone, Copy)]
pub(crate) struct SurvivesStormViolated;

impl std::fmt::Display for SurvivesStormViolated {
    /// Writes the fixed diagnostic text for this marker.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(
            "survives_storm asserted but the scx scheduler did not survive \
             the run — it died or was ejected during a hold",
        )
    }
}

// Empty impl: the marker carries no source error of its own.
impl std::error::Error for SurvivesStormViolated {}
199
200/// Combine the conditional and unconditional `post_vm` failure
201/// signals. When both callbacks fail in the same run, surface
202/// BOTH errors in a single chained message so a debugging
203/// operator sees both regressions on the first pass — a `.or()`
204/// would silently drop the unconditional signal whenever the
205/// conditional also fired, defeating the whole point of the
206/// unconditional callback.
207pub(crate) fn combine_post_vm_errs(
208 conditional: Option<anyhow::Error>,
209 unconditional: Option<anyhow::Error>,
210) -> Option<anyhow::Error> {
211 match (conditional, unconditional) {
212 (Some(c), Some(u)) => {
213 // A genuine failure dominates a skip request: collapse to a
214 // skip only when BOTH callbacks requested skip (both
215 // inconclusive). Otherwise a real PostVmAssertionFailure
216 // must surface, so the chained message wins and the
217 // HostSkipRequest marker is intentionally dropped.
218 let both_skip = c.downcast_ref::<HostSkipRequest>().is_some()
219 && u.downcast_ref::<HostSkipRequest>().is_some();
220 let combined = anyhow::anyhow!("post_vm: {c:#}; post_vm_unconditional: {u:#}");
221 Some(if both_skip {
222 combined.context(HostSkipRequest)
223 } else {
224 combined
225 })
226 }
227 (Some(c), None) => Some(c),
228 (None, Some(u)) => Some(u),
229 (None, None) => None,
230 }
231}
232
233/// Request a test SKIP from a `post_vm` / `post_vm_unconditional`
234/// callback: `return Err(post_vm_skip(reason))` when the run is
235/// INCONCLUSIVE — the VM could not produce the artifact the assertion
236/// needs (e.g. a load-starved VM whose BPF probe never attached,
237/// leaving a placeholder failure dump), as distinct from a real
238/// regression. The framework detects the attached `HostSkipRequest`
239/// marker and converts the run to
240/// [`crate::assert::AssertResult::skip`] instead of a failure.
241///
242/// A genuine `Err` from a sibling callback dominates (see
243/// `combine_post_vm_errs`): a skip request never masks a regression.
244pub fn post_vm_skip(reason: impl Into<String>) -> anyhow::Error {
245 anyhow::anyhow!("{}", reason.into()).context(HostSkipRequest)
246}
247
248/// Dispatch the entry's `post_vm` + `post_vm_unconditional`
249/// callbacks and combine their failure signals.
250///
251/// - `post_vm` runs only when the guest reported a non-Fail
252/// `AssertResult` (Skip / Inconclusive / Pass) — the
253/// `guest_already_failed` parameter folds the
254/// `parse_assert_result_from_drain` lookup the call site does.
255/// The skip mirrors the suppression contract documented on
256/// `KtstrTestEntry::post_vm`.
257///
258/// - `post_vm_unconditional` ALWAYS runs — bypasses the
259/// guest-fail suppression that gates `post_vm`. The callback
260/// owns its own skip-on-crash logic (or doesn't, when the
261/// intent is "assert on host-side artifact regardless of
262/// guest-side outcome").
263///
264/// Both callbacks route through [`invoke_post_vm_callback`] so a
265/// panic in either body becomes an `anyhow::Error` rather than
266/// unwinding past the call site (which would leak VM resources;
267/// see the helper doc).
268///
269/// Returns the combined `Option<anyhow::Error>` via
270/// [`combine_post_vm_errs`]: when both callbacks fail, the
271/// chained message names both errors so the operator sees both
272/// regressions on the first pass instead of a two-pass debug
273/// cycle. `.or()` would silently drop the unconditional fail
274/// when the conditional also fired.
275pub(crate) fn run_post_vm_callbacks(
276 entry: &KtstrTestEntry,
277 result: &crate::vmm::VmResult,
278 guest_already_failed: bool,
279) -> Option<anyhow::Error> {
280 let conditional = if guest_already_failed {
281 None
282 } else {
283 entry
284 .post_vm
285 .and_then(|cb| invoke_post_vm_callback(cb, result, "post_vm"))
286 };
287 let unconditional = entry
288 .post_vm_unconditional
289 .and_then(|cb| invoke_post_vm_callback(cb, result, "post_vm_unconditional"));
290 combine_post_vm_errs(conditional, unconditional)
291}
292
293/// Invoke a `post_vm` / `post_vm_unconditional` callback with panic
294/// catch. Converts a panic to `anyhow::Error` so the panic message
295/// surfaces in the test failure output AND the rest of the
296/// post-VM teardown (`write_placeholder_failure_dump_if_missing`,
297/// `drop(vm)` releasing CPU/LLC flocks + guest memory + kernel-cache
298/// reader flock) still runs.
299///
300/// Without the catch, a panicking callback would unwind past the
301/// placeholder-dump emission and past `drop(vm)`, leaking VM
302/// resources (flocks, guest memory) until process exit or the next
303/// test's drop reclaims them. Same hazard for `Ok` returns from
304/// callbacks that subsequently panic in their inner state — both
305/// paths fold into this single guard.
306///
307/// `label` is woven into the error message so the operator sees
308/// which callback panicked (`post_vm` vs `post_vm_unconditional`)
309/// when both are wired and both fire.
310///
311/// Returns `Some(err)` when the callback returns `Err` OR panics;
312/// returns `None` when the callback returns `Ok(())`. Mirrors the
313/// shape `.err()` produces from `Result` so the caller's
314/// `.and_then(|cb| ...)` flows unchanged.
315///
316/// Under `panic = "abort"` (release builds — see `Cargo.toml
317/// [profile.release]`), `catch_unwind` is a no-op: a panic aborts
318/// the process before this function returns. The wrap is still
319/// safe — `catch_unwind` is always defined, just inert — and the
320/// debug builds get the leak protection that exposes regressions
321/// before they ship.
322pub(crate) fn invoke_post_vm_callback(
323 cb: super::super::PostVmCallback,
324 result: &crate::vmm::VmResult,
325 label: &'static str,
326) -> Option<anyhow::Error> {
327 match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| cb(result))) {
328 Ok(Ok(())) => None,
329 Ok(Err(e)) => Some(e),
330 Err(payload) => {
331 let msg = if let Some(s) = payload.downcast_ref::<&'static str>() {
332 (*s).to_string()
333 } else if let Some(s) = payload.downcast_ref::<String>() {
334 s.clone()
335 } else {
336 "<non-string panic payload>".to_string()
337 };
338 Some(anyhow::anyhow!("{label} callback panicked: {msg}"))
339 }
340 }
341}
342
343/// Write a skip sidecar for `entry`, logging to stderr on failure
344/// without propagating the error. Called wherever a run is skipped
345/// before producing a real result: the skip-class catch-all and the
346/// VM build / VM run arms in [`run_ktstr_test_inner`] (each fires on a
347/// `ResourceContention` or `TopologyInsufficient`), and the
348/// performance-mode / coverage gates at the plain-run entry points in
349/// the crate `dispatch` module. All must record the skip for stats
350/// tooling but cannot meaningfully handle a sidecar-write failure
351/// beyond logging it — the skip itself is still valid; only post-run
352/// stats tooling loses visibility.
353pub(crate) fn record_skip_sidecar(
354 entry: &KtstrTestEntry,
355 topo: Option<&crate::test_support::topo::TopoOverride>,
356) {
357 // Resolve the topology the run of this (entry, override) WOULD boot,
358 // via the same resolve_vm_topology the run path uses, so a preset's
359 // skip and run record the identical topology -> identical
360 // variant_hash -> the retry overwrites instead of coexisting. For a
361 // plain test (topo = None) this is entry.topology.
362 let (resolved_topology, _memory_mib) =
363 crate::test_support::runtime::resolve_vm_topology(entry, topo);
364 if let Err(e) = write_skip_sidecar(entry, &resolved_topology) {
365 // Dual-emit at warn level: an unwritten skip sidecar costs
366 // the run no correctness — the test still skipped — but
367 // silently drops post-run stats tooling's visibility into
368 // the skip, so operators debugging a missing row in a
369 // gauntlet report need a loud-enough log to notice. The
370 // eprintln surfaces under direct nextest / cargo-ktstr
371 // invocations where no tracing subscriber is installed;
372 // the tracing::warn lands in every structured-log consumer
373 // (cargo-ktstr, downstream pipelines) at warn level rather
374 // than the previous implicit debug visibility.
375 let entry_name = entry.name;
376 let rendered = format!("{e:#}");
377 eprintln!("ktstr_test: warn: skip-sidecar write failed for {entry_name}: {rendered}");
378 tracing::warn!(
379 test = %entry_name,
380 err = %rendered,
381 "skip-sidecar write failed — stats tooling will not see this skip",
382 );
383 }
384}
385
#[cfg(test)]
mod post_vm_skip_tests {
    //! Pins the post_vm→skip mechanism. `post_vm_skip` must attach the
    //! [`HostSkipRequest`] marker (discoverable through the
    //! context-aware `downcast_ref` the eval gate uses), and
    //! `combine_post_vm_errs` must keep a lone skip request while
    //! letting a genuine sibling failure DOMINATE — a skip request must
    //! never mask a real regression. Reverting either the marker attach
    //! or the both-skip gate flips a cell here.
    use super::{HostSkipRequest, PostVmAssertionFailure, combine_post_vm_errs, post_vm_skip};

    /// A non-skip host-side failure, tagged the way a real one would be.
    fn real_fail() -> anyhow::Error {
        anyhow::anyhow!("real host-side regression").context(PostVmAssertionFailure)
    }

    /// True when `err` carries the `HostSkipRequest` marker.
    fn requests_skip(err: &anyhow::Error) -> bool {
        err.downcast_ref::<HostSkipRequest>().is_some()
    }

    #[test]
    fn post_vm_skip_carries_marker() {
        let skip = post_vm_skip("inconclusive: placeholder dump");
        assert!(requests_skip(&skip));
    }

    #[test]
    fn combine_lone_unconditional_skip_preserved() {
        let merged = combine_post_vm_errs(None, Some(post_vm_skip("ph"))).unwrap();
        assert!(requests_skip(&merged));
    }

    #[test]
    fn combine_lone_conditional_skip_preserved() {
        let merged = combine_post_vm_errs(Some(post_vm_skip("ph")), None).unwrap();
        assert!(requests_skip(&merged));
    }

    #[test]
    fn combine_both_skip_yields_skip() {
        let merged =
            combine_post_vm_errs(Some(post_vm_skip("a")), Some(post_vm_skip("b"))).unwrap();
        assert!(requests_skip(&merged));
    }

    #[test]
    fn combine_skip_plus_real_fail_does_not_skip() {
        // A skip request paired with a genuine failure must come out as
        // a failure: the merged Err must NOT carry HostSkipRequest, so
        // the eval gate folds it as a failure (re-attaching
        // PostVmAssertionFailure) rather than skipping — a regression
        // is never masked.
        let merged = combine_post_vm_errs(Some(post_vm_skip("ph")), Some(real_fail())).unwrap();
        assert!(!requests_skip(&merged));
    }

    #[test]
    fn combine_real_fail_plus_skip_does_not_skip() {
        // Same rule with the roles of the two callbacks swapped.
        let merged = combine_post_vm_errs(Some(real_fail()), Some(post_vm_skip("ph"))).unwrap();
        assert!(!requests_skip(&merged));
    }
}
443}