ktstr/scenario/snapshot/error.rs
//! [`SnapshotError`] (every fallible accessor's structured error) plus
//! its [`std::fmt::Display`] impl and [`SnapshotResult`] alias, together
//! with the supporting payload types [`MissingStatsReason`],
//! [`ExcludedMap`], and [`DrainedSnapshotEntry`]. Lives in its own
//! file so the variant catalogue is easy to scan when adding a new
//! accessor — `cargo doc` surfaces the same single-page view as the
//! source.
6
7use super::HEX_KEY_PREFIX;
8
9// ---------------------------------------------------------------------------
10// Missing-stats reason
11// ---------------------------------------------------------------------------
12
/// Why a sample's `stats` slot is unavailable — carried on
/// [`SnapshotError::MissingStats`] so operator diagnostics name
/// the specific failure mode rather than the generic "stats
/// absent". Built by `From<&crate::vmm::sched_stats::SchedStatsError>`
/// for the relay-failure path, plus dedicated variants for the
/// pre-client gates that the `crate::vmm::SchedStatsError` enum doesn't
/// cover (no scheduler binary configured).
///
/// The [`std::fmt::Display`] impl renders each variant as a short
/// operator-facing phrase; [`SnapshotError::MissingStats`] embeds that
/// phrase as `stats absent (<reason>)`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub enum MissingStatsReason {
    /// No `scheduler_binary` was configured on the run, so the
    /// freeze coordinator never wired a `crate::vmm::SchedStatsClient`.
    /// Every periodic sample bypasses the stats request entirely
    /// and lands here.
    NoSchedulerBinary,
    /// The guest relay never connected to the scheduler's Unix
    /// socket (no scheduler running, or the scheduler refused the
    /// connection). `reason` is the free-form explanation.
    ///
    /// Also the fallback produced by `From<&anyhow::Error>` when no
    /// typed `SchedStatsError` can be downcast from the error; in
    /// that case `reason` holds the rendered error text.
    NoScheduler { reason: String },
    /// The host-side coordinator marked the run as freezing while
    /// this stats request was in flight (or about to start);
    /// scx_stats responses are undefined while the scheduler's
    /// userspace thread is paused.
    DuringFreeze,
    /// The run-wide cancel flag was set (watchdog fired or the
    /// run is shutting down) while this stats request was in
    /// flight or about to start.
    Cancelled,
    /// The scheduler returned a non-zero `errno` in the typed
    /// `crate::vmm::StatsResponse` envelope. The `args` payload is preserved
    /// so operators can render the scheduler-side message.
    SchedulerError { errno: i32, args: serde_json::Value },
    /// The typed envelope was decoded but the inner `args` map
    /// did not contain the expected `"resp"` key — protocol
    /// mismatch with the scheduler. `args` is the envelope payload
    /// that lacked the key.
    MissingResp { args: serde_json::Value },
    /// The caller passed a stats request larger than the client's
    /// `crate::vmm::sched_stats::MAX_REQUEST_BYTES` cap. `size` is
    /// the request's byte count and `max` the cap it exceeded.
    RequestTooLarge { size: usize, max: usize },
    /// The scheduler's response grew past
    /// `crate::vmm::sched_stats::MAX_RESPONSE_BYTES` without ever emitting a newline.
    /// `size` is the response's byte count and `max` the cap it
    /// exceeded.
    ResponseTooLarge { size: usize, max: usize },
    /// The shared response mutex was poisoned by a previous
    /// panic; the stats client cannot recover for this sample.
    MutexPoisoned,
}
59
60impl std::fmt::Display for MissingStatsReason {
61 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
62 match self {
63 Self::NoSchedulerBinary => {
64 write!(f, "no scheduler_binary configured for this run")
65 }
66 Self::NoScheduler { reason } => {
67 write!(f, "guest relay reports no scheduler: {reason}")
68 }
69 Self::DuringFreeze => {
70 write!(
71 f,
72 "stats request cancelled — freeze coordinator paused the scheduler"
73 )
74 }
75 Self::Cancelled => {
76 write!(
77 f,
78 "stats request cancelled — run-wide cancel flag was set (watchdog or shutdown)"
79 )
80 }
81 Self::SchedulerError { errno, args } => {
82 write!(f, "scheduler returned errno={errno} (args={args})")
83 }
84 Self::MissingResp { args } => {
85 write!(f, "scheduler envelope missing 'resp' key (args={args})")
86 }
87 Self::RequestTooLarge { size, max } => {
88 write!(f, "stats request {size} bytes exceeds {max}-byte cap")
89 }
90 Self::ResponseTooLarge { size, max } => {
91 write!(f, "stats response {size} bytes exceeds {max}-byte cap")
92 }
93 Self::MutexPoisoned => {
94 write!(f, "stats client response mutex was poisoned")
95 }
96 }
97 }
98}
99
100impl From<&anyhow::Error> for MissingStatsReason {
101 /// Downcast the anyhow chain to a typed
102 /// `crate::vmm::SchedStatsError`
103 /// when one is present (every `SchedStatsClient` failure path
104 /// boxes a typed variant via `anyhow::anyhow!(SchedStatsError::…)`,
105 /// so the downcast succeeds on every well-formed sched_stats
106 /// error). Falls back to [`MissingStatsReason::NoScheduler`]
107 /// carrying the rendered display when the downcast fails — that
108 /// covers serde / IO / other errors that didn't originate inside
109 /// `crate::vmm::SchedStatsClient` but still surface through the same
110 /// `Result<_, anyhow::Error>` return.
111 fn from(e: &anyhow::Error) -> Self {
112 if let Some(typed) = e.downcast_ref::<crate::vmm::sched_stats::SchedStatsError>() {
113 return Self::from(typed);
114 }
115 Self::NoScheduler {
116 reason: e.to_string(),
117 }
118 }
119}
120
121impl From<&crate::vmm::sched_stats::SchedStatsError> for MissingStatsReason {
122 fn from(e: &crate::vmm::sched_stats::SchedStatsError) -> Self {
123 use crate::vmm::sched_stats::SchedStatsError as S;
124 match e {
125 S::Poisoned => Self::MutexPoisoned,
126 S::RequestTooLarge { size, max } => Self::RequestTooLarge {
127 size: *size,
128 max: *max,
129 },
130 S::ResponseTooLarge { size, max } => Self::ResponseTooLarge {
131 size: *size,
132 max: *max,
133 },
134 S::DuringFreeze => Self::DuringFreeze,
135 S::Cancelled => Self::Cancelled,
136 S::NoScheduler { reason } => Self::NoScheduler {
137 reason: reason.clone(),
138 },
139 S::SchedulerError { errno, args } => Self::SchedulerError {
140 errno: *errno,
141 args: args.clone(),
142 },
143 S::MissingResp { args } => Self::MissingResp { args: args.clone() },
144 }
145 }
146}
147
148// ---------------------------------------------------------------------------
149// Excluded map payload
150// ---------------------------------------------------------------------------
151
/// One captured map that the KVA-whitelist filter rejected.
/// Payload for [`SnapshotError::ActiveFilterExcludedMaps::excluded_maps`].
/// The `map_kva` field name matches
/// [`crate::monitor::dump::FailureDumpMap::map_kva`] (the
/// source-of-truth field), and a `map_kva == 0` here flags a
/// capture where the per-map KVA was not recorded (synthetic
/// fixture or capture-path bug — production captures filter zero
/// KVAs out at the walker level).
///
/// The error message renders each entry as `name@kva` with the KVA
/// in `0x`-prefixed hex.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct ExcludedMap {
    /// Name of the captured map that was rejected.
    pub name: String,
    /// Kernel virtual address recorded for the map; `0` means no
    /// KVA was recorded for it.
    pub map_kva: u64,
}
166
167// ---------------------------------------------------------------------------
168// Error type
169// ---------------------------------------------------------------------------
170
/// Reason a snapshot accessor or terminal read could not resolve.
///
/// Returned by every fallible accessor (`Snapshot::map`,
/// `SnapshotEntry::get`, `SnapshotField::as_u64`, …) so a missing
/// field, type mismatch, or absent map surfaces as a structured
/// error the test author can `?`-propagate. Each variant carries
/// the path / alternatives needed to fix the call site without
/// re-running the test.
///
/// The [`std::fmt::Display`] impl turns each variant into the
/// operator-facing message; the fields documented below are the
/// values interpolated into that message.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub enum SnapshotError {
    /// No map matched the requested name. `available` enumerates
    /// the captured map names so a typo surfaces in test output.
    MapNotFound {
        requested: String,
        available: Vec<String>,
    },
    /// No top-level global variable matched the requested name in
    /// any `*.bss` / `*.data` / `*.rodata` global-section map.
    /// `available` lists the union of every section's top-level
    /// member names.
    VarNotFound {
        requested: String,
        available: Vec<String>,
    },
    /// More than one global-section map exposes a top-level member
    /// with the requested name, so [`super::Snapshot::var`] cannot pick a
    /// deterministic answer. `found_in` lists every map (in capture
    /// order) where the name was seen — the caller should disambiguate
    /// via [`super::Snapshot::map`] and walk into the named map directly
    /// (e.g. `snap.map("scx_obj.bss")?.at(0).get("nr_cpus")`).
    AmbiguousVar {
        requested: String,
        found_in: Vec<String>,
    },
    /// A path component did not match any
    /// `crate::monitor::btf_render::RenderedValue::Struct` member at that depth. `requested`
    /// is the user-supplied lookup string; `walked` is the prefix
    /// that resolved successfully; `component` is the failing
    /// segment; `available` lists the struct's actual member names.
    FieldNotFound {
        requested: String,
        walked: String,
        component: String,
        available: Vec<String>,
    },
    /// A path component reached a non-Struct value where a struct
    /// was expected (e.g. descending into a `Uint` leaf).
    /// `requested` is the user-supplied lookup string; `kind` names
    /// the actual variant for diagnostics. `walked` and `component`
    /// are rendered the same way as on [`Self::FieldNotFound`]:
    /// the prefix walked so far and the segment that failed.
    NotAStruct {
        requested: String,
        walked: String,
        component: String,
        kind: String,
    },
    /// A typed accessor (`as_u64` etc.) was called on a rendered
    /// shape it cannot decode (e.g. `as_str` on a `Struct`).
    /// `expected` names the scalar type the accessor requires;
    /// `actual` names the rendered variant; `requested` is the
    /// user-supplied lookup string (empty when the accessor was
    /// invoked on a leaf without a path walk).
    TypeMismatch {
        expected: String,
        actual: String,
        requested: String,
    },
    /// A map index was out of range for the underlying entry list.
    /// `map` names the map, `index` is the position that was asked
    /// for, and `len` is the length reported in the message.
    IndexOutOfRange {
        map: String,
        index: usize,
        len: usize,
    },
    /// A per-CPU slot was out of range or unmapped.
    /// `unmapped` selects which of the two cases the message
    /// reports: `true` renders "per-CPU slot is unmapped (None)"
    /// (and `len` is not shown); `false` renders `cpu` as out of
    /// range against `len` per-CPU slots.
    PerCpuSlot {
        map: String,
        cpu: u32,
        len: usize,
        unmapped: bool,
    },
    /// A predicate-based lookup (`find`, `max_by`) found no match.
    /// `len` is the number of entries the lookup traversed before
    /// giving up; `available_keys` is a small sample (up to
    /// `NO_MATCH_KEY_SAMPLE` entries) of rendered keys seen during
    /// the traversal so an operator can distinguish "empty map"
    /// (`len == 0`) from "populated map with no predicate hit"
    /// (`len > 0`) and inspect the sample to debug the predicate.
    /// Keys are rendered via `crate::monitor::btf_render::RenderedValue`'s `Display` impl and
    /// each is capped at `NO_MATCH_KEY_CHAR_CAP` chars with an
    /// ellipsis to keep the failure message readable for wide struct
    /// keys.
    ///
    /// Aggregation methods (`max_by`, `cpu_max_u64` / `cpu_min_u64`
    /// / `cpu_max_f64` / `cpu_min_f64`) produce this variant for
    /// empty / all-None inputs; their NoMatch always carries
    /// `len == 0` and empty `available_keys`. Only `find` can
    /// produce `len > 0` here.
    ///
    /// `map` names the map and `op` names the lookup operation; both
    /// are interpolated verbatim into the message.
    NoMatch {
        map: String,
        op: String,
        len: usize,
        available_keys: Vec<String>,
    },
    /// A path string contained an empty component (e.g. `"a..b"`).
    /// `requested` is the user-supplied lookup string.
    EmptyPathComponent { requested: String },
    /// [`super::SnapshotEntry::get`] was called on a per-CPU entry
    /// without narrowing to a CPU first via [`super::SnapshotMap::cpu`].
    PerCpuNotNarrowed { map: String },
    /// Hash entry has no rendered key/value side (BTF type id was
    /// missing at capture time, leaving the hex bytes only).
    /// `side` is the label of the side that lacked a rendering; it
    /// is interpolated verbatim into the message.
    NoRendered { map: String, side: String },
    /// The sample's underlying `crate::monitor::dump::FailureDumpReport`
    /// is a placeholder produced by
    /// `crate::monitor::dump::FailureDumpReport::placeholder` —
    /// the freeze-rendezvous path could not collect real data
    /// (typical cause: vCPU rendezvous timed out). Temporal
    /// patterns in [`crate::assert::temporal`] route this variant
    /// through their per-sample skip handling so a placeholder
    /// sample never falsely registers as zero progress against a
    /// monotonicity / rate / steady / ratio band. The `reason`
    /// string mirrors `FailureDumpReport::scx_walker_unavailable`
    /// when present (set by `placeholder()` to the constructor
    /// argument), giving the operator the cause without re-walking
    /// the report.
    PlaceholderSample { tag: String, reason: String },
    /// A [`SampleSeries::stats`](crate::scenario::sample::SampleSeries::stats)
    /// projection ran on a sample whose `stats` field carries an
    /// `Err` — the stats client was not wired (no
    /// `scheduler_binary`) or the per-sample stats request failed.
    /// The carried [`MissingStatsReason`] identifies the *why* so
    /// operator diagnostics distinguish "no scheduler configured"
    /// from "scheduler refused the request" from "watchdog
    /// cancelled the request" without re-walking the source error.
    /// Distinguishes a per-sample stats coverage gap from an
    /// in-stats-JSON path miss (`TypeMismatch` /
    /// `FieldNotFound`) so the temporal-assertion site can
    /// branch on the cause without re-walking the source.
    MissingStats {
        tag: String,
        reason: MissingStatsReason,
    },
    /// A [`SampleSeries::host`](crate::scenario::sample::SampleSeries::host)
    /// projection ran on a sample whose `per_cpu_time` slice did
    /// not include `cpu` — placeholder report (freeze rendezvous
    /// timed out), or a kernel that didn't surface per-CPU
    /// `kernel_stat`/`tick_cpu_sched`/`kernel_cpustat` resolution
    /// for the requested CPU. Distinguishes a per-sample host-data
    /// coverage gap from a kernel-walker failure (`Unavailable` on
    /// the broader Snapshot accessor) so the temporal-assertion
    /// site can decide whether to fail strict or skip with a
    /// rendered Note.
    HostFieldUnavailable { tag: String, cpu: u32 },
    /// [`super::Snapshot::var`] / [`super::Snapshot::live_var`] /
    /// [`super::Snapshot::map`] was called on a snapshot whose
    /// underlying `crate::monitor::dump::FailureDumpReport` is a
    /// placeholder (the freeze-rendezvous path could not collect
    /// real data — typical cause: vCPU rendezvous timed out). The
    /// captured `report.maps` is empty by construction so the
    /// var/map lookup has nothing to walk. Distinct from
    /// [`Self::VarNotFound`] (which means "the captured report did
    /// not contain a global by this name") so the assertion site
    /// can distinguish "freeze failed" from "typo in field name".
    /// `tag` carries the capture tag (if any).
    PlaceholderSnapshot { tag: Option<String> },
    /// [`super::Snapshot::active`] / [`super::Snapshot::live_var`]
    /// could not identify a currently-active scheduler from the
    /// snapshot's `*scx_root` + `prog_runtime_stats`. Typical
    /// causes: snapshot taken in the dead window between
    /// [`crate::scenario::ops::Op::DetachScheduler`] +
    /// [`crate::scenario::ops::Op::AttachScheduler`]; snapshot
    /// taken in the post-swap settle window before the new
    /// scheduler's progs have advanced their run counter; snapshot
    /// captured before any scheduler attached. Distinct from
    /// [`Self::AmbiguousVar`] (which means "the snapshot has
    /// multiple scheduler bss copies and the call did not opt
    /// into active-only filtering") so the assertion site can
    /// distinguish "no scheduler is running right now" from
    /// "multiple are running, pick one".
    NoActiveScheduler { reason: String },
    /// [`super::Snapshot::var`] / [`super::Snapshot::map`] (or one
    /// of the `live_*` shortcuts) ran against an active-filtered
    /// view where the KVA whitelist excluded EVERY captured map
    /// that shared the active obj prefix (i.e. the admitted set
    /// for this obj was empty). Distinct from [`Self::VarNotFound`]
    /// — `VarNotFound` means "the active filter admitted maps but
    /// none carry the requested name"; this variant means "the
    /// active filter admitted zero maps for this obj, so the
    /// lookup never got the chance to walk anything."
    ///
    /// The variant never fires when at least one captured
    /// `<active_obj>.*` map passes the KVA whitelist — in that
    /// case the lookup miss is a real typo or absent symbol and
    /// the standard `VarNotFound` / `MapNotFound` carries the
    /// admitted list. This narrow firing scope prevents
    /// false-positives that would otherwise mask genuine typos
    /// in same-binary post-swap captures.
    ///
    /// Typical causes when this DOES fire: stale walker capture
    /// (captured KVAs predate the most recent struct_ops swap),
    /// same-binary post-swap window where the report still
    /// carries the old instance's maps, or a walker bug that
    /// resolved `*scx_root` against a different binary's map set.
    ActiveFilterExcludedMaps {
        /// User-supplied lookup string (the `var` / `map`
        /// argument). For [`super::Snapshot::live_vars_via`] this
        /// carries the joined name list `"[a, b, c]"`.
        requested: String,
        /// Obj name the active filter pinned to
        /// (`*scx_root → struct_ops map → obj prefix` resolution).
        active_obj: String,
        /// Maps captured under the active obj prefix that the KVA
        /// whitelist rejected.
        excluded_maps: Vec<ExcludedMap>,
        /// KVA whitelist the walker populated for the active obj.
        /// A non-empty set whose every entry mismatched the
        /// captured `map_kva` values points at stale capture or
        /// KVA aliasing; an empty set is unreachable through this
        /// variant (no filter means no exclusion).
        whitelist_kvas: Vec<u64>,
    },
    /// A walker-resolved [`crate::scenario::sample::SampleSeries::bpf_live_u64`]
    /// / `bpf_live_i64` / `bpf_live_f64` projection detected that
    /// the snapshot's per-snapshot walker output
    /// ([`crate::monitor::dump::FailureDumpReport::active_map_kvas`])
    /// disagrees with an earlier same-phase snapshot's walker
    /// output for the same lookup. The framework pins the first
    /// non-empty walker output it sees per phase and surfaces this
    /// variant for every later same-phase snapshot whose walker
    /// resolved to a different KVA set — without this gate the
    /// projected series would silently switch between bss copies
    /// mid-phase (typical cause: post-`Op::ReplaceScheduler` swap
    /// window where the walker re-publishes mid-phase) and
    /// downstream reducers like
    /// [`crate::assert::temporal::SeriesField::counter_delta_per_phase`]
    /// would see non-monotonic counter values. The drifted
    /// samples become per-sample `Err` slots; the temporal
    /// patterns' standard error-skip semantics apply.
    ///
    /// `phase` is the phase in which the drift was seen,
    /// `pinned_kvas` is the KVA set pinned by the earlier
    /// same-phase snapshot, `sample_kvas` is the KVA set this
    /// sample's lookup resolved against, and `requested` is the
    /// lookup string.
    WalkerDriftedWithinPhase {
        phase: crate::assert::Phase,
        pinned_kvas: Vec<u64>,
        sample_kvas: Vec<u64>,
        requested: String,
    },
    /// A user-supplied projection closure (the kind passed to
    /// [`crate::scenario::sample::SampleSeries::bpf`]) signalled
    /// failure for reasons that don't fit the structured variants
    /// above. `reason` is the closure's free-form explanation —
    /// "lookup returned None for sched_id A, B, C" — so the failure
    /// message stays diagnostic without forcing the closure to
    /// synthesize an `available: Vec<String>` it cannot populate.
    ///
    /// Closures should reach for the structured variants
    /// ([`Self::VarNotFound`], [`Self::MapNotFound`], etc.) when
    /// they can; this variant is the escape hatch for higher-level
    /// disambiguation logic (e.g. "I walked vars(name) and none of
    /// the candidates matched my active-instance fingerprint").
    /// Surfaces in temporal-assertion failure messages as
    /// `projection failed: <reason>`.
    ProjectionFailed { reason: String },
    /// A captured map's contents could not be rendered at dump time:
    /// `crate::monitor::dump::FailureDumpMap::error` is set and the
    /// map carries no entries / value to walk. Surfaced by
    /// [`super::Snapshot::var`] / [`super::SnapshotMap::at`] /
    /// [`super::SnapshotMap::find`] / [`super::SnapshotMap::max_by`]
    /// instead of [`Self::VarNotFound`] /
    /// [`Self::IndexOutOfRange`] `{ len: 0 }` /
    /// [`Self::NoMatch`] `{ len: 0 }` so a guest-memory render
    /// failure is distinguishable from a genuinely-absent variable
    /// or a legitimately-empty map. Without this distinction a map
    /// whose contents failed to read reads identically to "the
    /// symbol does not exist", masking the capture failure.
    /// `map` names the owning map; `error` mirrors
    /// `FailureDumpMap.error`.
    MapRenderIncomplete { map: String, error: String },
}
447
448impl std::fmt::Display for SnapshotError {
449 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
450 match self {
451 SnapshotError::MapNotFound {
452 requested,
453 available,
454 } => {
455 write!(
456 f,
457 "snapshot has no map '{requested}' (captured maps: {available:?})"
458 )
459 }
460 SnapshotError::VarNotFound {
461 requested,
462 available,
463 } => {
464 write!(
465 f,
466 "snapshot has no global variable '{requested}' in any \
467 *.bss/*.data/*.rodata map (available globals: {available:?})"
468 )
469 }
470 SnapshotError::AmbiguousVar {
471 requested,
472 found_in,
473 } => {
474 write!(
475 f,
476 "snapshot global '{requested}' is ambiguous (found in \
477 {found_in:?}); use Snapshot::active().var(name) (or the \
478 shorthand Snapshot::live_var(name)) to pick the active \
479 scheduler's copy automatically, or Snapshot::map(name) \
480 to address a specific scheduler's bss explicitly"
481 )
482 }
483 SnapshotError::FieldNotFound {
484 requested,
485 walked,
486 component,
487 available,
488 } => {
489 write!(
490 f,
491 "path '{requested}': component '{component}' (after walking '{walked}') \
492 not found (members at this depth: {available:?})"
493 )
494 }
495 SnapshotError::NotAStruct {
496 requested,
497 walked,
498 component,
499 kind,
500 } => {
501 write!(
502 f,
503 "path '{requested}': component '{component}' (after walking '{walked}') \
504 expected a Struct, got {kind}"
505 )
506 }
507 SnapshotError::TypeMismatch {
508 expected,
509 actual,
510 requested,
511 } => {
512 write!(
513 f,
514 "path '{requested}': cannot read as {expected} — actual rendered \
515 variant is {actual}"
516 )
517 }
518 SnapshotError::IndexOutOfRange { map, index, len } => {
519 write!(f, "map '{map}': index {index} out of range (length {len})")
520 }
521 SnapshotError::PerCpuSlot {
522 map,
523 cpu,
524 len,
525 unmapped,
526 } => {
527 if *unmapped {
528 write!(f, "map '{map}': cpu {cpu} per-CPU slot is unmapped (None)")
529 } else {
530 write!(
531 f,
532 "map '{map}': cpu {cpu} out of range (have {len} per-CPU slots)"
533 )
534 }
535 }
536 SnapshotError::NoMatch {
537 map,
538 op,
539 len,
540 available_keys,
541 } => fmt_no_match(f, map, op, *len, available_keys),
542 SnapshotError::EmptyPathComponent { requested } => {
543 write!(
544 f,
545 "path '{requested}' has an empty component (consecutive '.')"
546 )
547 }
548 SnapshotError::PerCpuNotNarrowed { map } => {
549 write!(
550 f,
551 "map '{map}': per-CPU entry without a CPU narrow — call .cpu(N) first"
552 )
553 }
554 SnapshotError::NoRendered { map, side } => {
555 write!(
556 f,
557 "map '{map}': {side} has no rendered structure (no BTF type at capture time)"
558 )
559 }
560 SnapshotError::PlaceholderSample { tag, reason } => {
561 write!(
562 f,
563 "sample '{tag}' is a placeholder report (capture pipeline did not land): \
564 {reason}"
565 )
566 }
567 SnapshotError::MissingStats { tag, reason } => {
568 write!(f, "sample '{tag}': stats absent ({reason})")
569 }
570 SnapshotError::HostFieldUnavailable { tag, cpu } => {
571 write!(
572 f,
573 "sample '{tag}': per_cpu_time has no entry for cpu {cpu} \
574 (placeholder report or kernel-walker resolution failure)"
575 )
576 }
577 SnapshotError::PlaceholderSnapshot { tag } => match tag {
578 Some(t) => write!(
579 f,
580 "snapshot '{t}' is a placeholder — the freeze-rendezvous \
581 path could not capture real data; no maps to walk"
582 ),
583 None => f.write_str(
584 "snapshot is a placeholder — the freeze-rendezvous path \
585 could not capture real data; no maps to walk",
586 ),
587 },
588 SnapshotError::NoActiveScheduler { reason } => {
589 write!(
590 f,
591 "snapshot has no currently-active scheduler ({reason}); \
592 use Snapshot::vars(name) to enumerate every observed \
593 copy explicitly, Snapshot::live_var(name) to keep the \
594 typed error path while opting into the active filter, \
595 or Snapshot::map(\"<obj>.<section>\") to address a \
596 specific scheduler's bss directly"
597 )
598 }
599 SnapshotError::ActiveFilterExcludedMaps {
600 requested,
601 active_obj,
602 excluded_maps,
603 whitelist_kvas,
604 } => {
605 fmt_active_filter_excluded(f, requested, active_obj, excluded_maps, whitelist_kvas)
606 }
607 SnapshotError::WalkerDriftedWithinPhase {
608 phase,
609 pinned_kvas,
610 sample_kvas,
611 requested,
612 } => fmt_walker_drift(f, phase, pinned_kvas, sample_kvas, requested),
613 SnapshotError::ProjectionFailed { reason } => {
614 write!(f, "projection failed: {reason}")
615 }
616 SnapshotError::MapRenderIncomplete { map, error } => {
617 write!(
618 f,
619 "map '{map}' contents unavailable (render failed at capture): {error}"
620 )
621 }
622 }
623 }
624}
625
626/// Render the `NoMatch` arm: an empty-map message, a no-sample-keys
627/// message, or the matched-none message plus the BTF-missing hint when
628/// every sample key fell back to the hex-bytes form.
629fn fmt_no_match(
630 f: &mut std::fmt::Formatter<'_>,
631 map: &str,
632 op: &str,
633 len: usize,
634 available_keys: &[String],
635) -> std::fmt::Result {
636 if len == 0 {
637 write!(f, "map '{map}': {op} matched no entries (map is empty)")
638 } else if available_keys.is_empty() {
639 write!(
640 f,
641 "map '{map}': {op} matched none of {len} entries (sample keys unavailable)"
642 )
643 } else {
644 write!(
645 f,
646 "map '{map}': {op} matched none of {len} entries (first {sampled}: {available_keys:?})",
647 sampled = available_keys.len(),
648 )?;
649 // The `hex:` prefix is only ever produced by
650 // `render_entry_key`'s fallback path when the
651 // entry's `key` field was `None` at capture time.
652 // Typed `RenderedValue::Display` does not emit
653 // this prefix for any scalar variant; `Struct`
654 // emits `TypeName{...}` inline or `TypeName:`
655 // breadcrumb, where a `hex:` collision would
656 // require a BTF struct literally named `hex` —
657 // no real kernel scheduler does that. The hint
658 // therefore fires only when BTF was uniformly
659 // absent for this map's key type at capture time,
660 // and names the kernel-side fix so the operator
661 // does not have to reverse-engineer the `hex:`
662 // discriminator.
663 if available_keys.iter().all(|k| k.starts_with(HEX_KEY_PREFIX)) {
664 write!(
665 f,
666 " (BTF missing at capture — keys shown as hex bytes; \
667 rebuild guest kernel with CONFIG_DEBUG_INFO_BTF=y for \
668 typed keys)"
669 )?;
670 }
671 Ok(())
672 }
673}
674
675/// Render the `ActiveFilterExcludedMaps` arm: classify why the walker's
676/// KVA whitelist excluded the captured maps sharing the requested obj
677/// prefix, and name the explicit-picker escape hatches.
678fn fmt_active_filter_excluded(
679 f: &mut std::fmt::Formatter<'_>,
680 requested: &str,
681 active_obj: &str,
682 excluded_maps: &[ExcludedMap],
683 whitelist_kvas: &[u64],
684) -> std::fmt::Result {
685 let excluded_rendered = excluded_maps
686 .iter()
687 .map(|m| format!("{}@{:#x}", m.name, m.map_kva))
688 .collect::<Vec<_>>()
689 .join(", ");
690 let some_zero = excluded_maps.iter().any(|m| m.map_kva == 0);
691 let some_alias = excluded_maps
692 .iter()
693 .any(|m| m.map_kva != 0 && !whitelist_kvas.contains(&m.map_kva));
694 let cause = match (some_zero, some_alias) {
695 (false, true) => {
696 "this snapshot pre-dates your most recent \
697 Op::ReplaceScheduler / Op::AttachScheduler — \
698 wait for the next periodic boundary (or re-run \
699 the test) so the walker re-publishes the live \
700 scheduler's KVAs"
701 }
702 (true, false) => {
703 "the captured maps have no recorded KVAs — \
704 the snapshot pre-dates the walker plumbing, \
705 or the capture path failed to record per-map KVAs"
706 }
707 (true, true) => {
708 "some captured maps lack KVAs and some disagree \
709 with the walker's whitelist — both \
710 pre-walker-capture state and a post-swap window \
711 can produce this; re-run the test to regenerate \
712 the snapshot"
713 }
714 (false, false) => "captured KVAs were neither absent nor in disagreement",
715 };
716 write!(
717 f,
718 "snapshot lookup '{requested}' returned no hits under the \
719 active filter (obj='{active_obj}'): the walker's KVA \
720 whitelist {whitelist_kvas:#x?} excluded {n} captured map(s) \
721 sharing the obj prefix: {excluded_rendered} — {cause}. \
722 Reach for Snapshot::vars('{requested}') to enumerate every \
723 copy across all obj prefixes, or Snapshot::map(\"<name>\") \
724 to address one of the excluded maps directly.",
725 n = excluded_maps.len(),
726 )
727}
728
729/// Render the `WalkerDriftedWithinPhase` arm: the walker re-published
730/// its KVA set mid-phase, so the drifted sample is surfaced as Err to
731/// keep per-phase reducers on a single walker decision.
732fn fmt_walker_drift(
733 f: &mut std::fmt::Formatter<'_>,
734 phase: &crate::assert::Phase,
735 pinned_kvas: &[u64],
736 sample_kvas: &[u64],
737 requested: &str,
738) -> std::fmt::Result {
739 write!(
740 f,
741 "walker drift within {phase:?}: lookup '{requested}' resolved against \
742 KVA set {sample_kvas:#x?}, but an earlier same-phase snapshot pinned \
743 {pinned_kvas:#x?}. The walker re-published mid-phase (typical cause: \
744 a post-Op::ReplaceScheduler swap window). The drifted sample is \
745 surfaced as Err so per-phase reducers (counter_delta_per_phase, \
746 ratio_across_phases) see monotonic Ok-sequences from one walker \
747 decision; address by stepping the phase past the swap settle window \
748 or by reading via the explicit picker form."
749 )
750}
751
// Empty impl: `SnapshotError` takes every `std::error::Error` method
// from the trait's defaults. The `Debug` derive and the `Display` impl
// above satisfy the trait's supertrait bounds.
impl std::error::Error for SnapshotError {}
753
/// Result alias for snapshot accessors: `Ok(T)` on success, otherwise
/// a structured [`SnapshotError`].
pub type SnapshotResult<T> = std::result::Result<T, SnapshotError>;
756
/// Typed shape of one entry drained from the snapshot bridge's
/// ordered per-tag store. Fields:
/// * `tag`: snapshot name the report was stored under.
/// * `report`: `crate::monitor::dump::FailureDumpReport` of the
///   captured guest state.
/// * `stats`: scheduler-side stats JSON, or a typed
///   [`MissingStatsReason`] naming why no stats are available (for
///   example capture without a wired stats client; see the enum's
///   variants for the other cases).
/// * `elapsed_ms`: optional wall-clock anchor (ms since run-start).
/// * `boundary_offset_ms`: optional workload-relative boundary offset
///   (ms) for periodic captures; see the field's own doc.
/// * `step_index`: scenario phase index stamped at capture time.
///   `Some(idx)` for captures stored via the step-aware entry
///   points ([`crate::scenario::snapshot::SnapshotBridge::capture_with_step`]
///   or [`crate::scenario::snapshot::SnapshotBridge::store_with_stats_and_step`]);
///   `None` for fixture-injected captures via the unstamped legacy
///   paths ([`crate::scenario::snapshot::SnapshotBridge::capture`]
///   / [`crate::scenario::snapshot::SnapshotBridge::store`]
///   / [`crate::scenario::snapshot::SnapshotBridge::store_with_stats`]).
///
/// Used by [`crate::scenario::snapshot::SnapshotBridge::drain_ordered_with_stats`]
/// and [`crate::scenario::sample::SampleSeries::from_drained_typed`].
/// `#[non_exhaustive]` so future additive fields stay
/// pattern-match-compatible via rest-pattern destructure
/// (`DrainedSnapshotEntry { tag, report, .. }`).
#[derive(Debug)]
#[non_exhaustive]
pub struct DrainedSnapshotEntry {
    pub tag: String,
    pub report: crate::monitor::dump::FailureDumpReport,
    pub stats: std::result::Result<serde_json::Value, MissingStatsReason>,
    pub elapsed_ms: Option<u64>,
    /// Workload-relative boundary offset (ms) a periodic capture was
    /// scheduled for (`boundary_ns - scenario_anchor_ns`); `None` for
    /// non-periodic / on-demand captures. Distinct from `elapsed_ms`
    /// (run_start-relative fire time). See
    /// [`crate::scenario::snapshot::SnapshotBridge`]'s store doc.
    pub boundary_offset_ms: Option<u64>,
    pub step_index: Option<u16>,
}
795
#[cfg(test)]
mod tests_api_gaps {
    use super::*;

    /// Pin: the `Display` text of
    /// `SnapshotError::ProjectionFailed { reason }` is exactly
    /// `projection failed: <reason>`, so the temporal-assertion
    /// failure path shows the closure's diagnostic without any
    /// re-wrapping. Closure call-sites synthesize this variant when
    /// the structured variants (`VarNotFound`, `MapNotFound`,
    /// `AmbiguousVar`) need an `available: Vec<String>` they cannot
    /// populate.
    #[test]
    fn projection_failed_display_carries_reason() {
        let err = SnapshotError::ProjectionFailed {
            reason: String::from("live_var_via picker rejected all 2 candidates"),
        };
        assert_eq!(
            err.to_string(),
            "projection failed: live_var_via picker rejected all 2 candidates"
        );
    }

    /// Pin: `ProjectionFailed` shares the `PartialEq` / `Hash`
    /// derive set with every other variant, so pattern-match callers
    /// can assert "yes, my projection closure failed" without
    /// falling through to a `_` arm.
    #[test]
    fn projection_failed_eq_and_hash_round_trip() {
        let original = SnapshotError::ProjectionFailed {
            reason: String::from("x"),
        };
        let copy = original.clone();
        assert_eq!(original, copy);

        // Hash the original, then look it up through the clone.
        let seen: std::collections::HashSet<SnapshotError> =
            std::iter::once(original).collect();
        assert!(seen.contains(&copy));
    }

    /// `From<&SchedStatsError>` must map EVERY scheduler-stats
    /// failure to its matching `MissingStatsReason`. A swapped arm
    /// would mislabel a stats failure (e.g. a size-cap breach
    /// reported as a poisoned mutex) and point the operator at the
    /// wrong cause, so each variant's mapping is pinned here.
    #[test]
    fn missing_stats_reason_from_sched_stats_error_maps_each_variant() {
        use crate::vmm::sched_stats::SchedStatsError as S;
        use serde_json::json;

        // Runs the conversion under test on a borrowed source error.
        let convert = |err: S| MissingStatsReason::from(&err);

        // Payload-free variants.
        assert_eq!(convert(S::Poisoned), MissingStatsReason::MutexPoisoned);
        assert_eq!(convert(S::DuringFreeze), MissingStatsReason::DuringFreeze);
        assert_eq!(convert(S::Cancelled), MissingStatsReason::Cancelled);

        // Size-cap variants carry `size` / `max` through unchanged.
        assert_eq!(
            convert(S::RequestTooLarge { size: 10, max: 5 }),
            MissingStatsReason::RequestTooLarge { size: 10, max: 5 },
        );
        assert_eq!(
            convert(S::ResponseTooLarge { size: 20, max: 8 }),
            MissingStatsReason::ResponseTooLarge { size: 20, max: 8 },
        );

        // Payload-bearing variants carry their fields through unchanged.
        assert_eq!(
            convert(S::NoScheduler {
                reason: "no sock".into(),
            }),
            MissingStatsReason::NoScheduler {
                reason: "no sock".into(),
            },
        );
        assert_eq!(
            convert(S::SchedulerError {
                errno: 13,
                args: json!({"k": 1}),
            }),
            MissingStatsReason::SchedulerError {
                errno: 13,
                args: json!({"k": 1}),
            },
        );
        assert_eq!(
            convert(S::MissingResp {
                args: json!({"a": 2}),
            }),
            MissingStatsReason::MissingResp {
                args: json!({"a": 2}),
            },
        );
    }

    /// `From<&anyhow::Error>` downcasts a typed `SchedStatsError`
    /// found in the chain to its mapped variant; an untyped error
    /// falls back to `NoScheduler` carrying the rendered display (so
    /// serde/IO errors surfacing through the same `Result` still
    /// classify).
    #[test]
    fn missing_stats_reason_from_anyhow_downcasts_typed_else_no_scheduler() {
        use crate::vmm::sched_stats::SchedStatsError as S;

        let typed = anyhow::Error::new(S::DuringFreeze);
        assert_eq!(
            MissingStatsReason::from(&typed),
            MissingStatsReason::DuringFreeze,
            "a typed SchedStatsError in the anyhow chain must downcast",
        );

        let untyped = anyhow::anyhow!("plain io failure");
        let x = MissingStatsReason::from(&untyped);
        if let MissingStatsReason::NoScheduler { reason } = &x {
            assert!(reason.contains("plain io failure"));
        } else {
            panic!("expected NoScheduler fallback, got {x:?}");
        }
    }

    /// Every `MissingStatsReason` renders a non-empty operator
    /// message, and the value-bearing variants spot-checked below
    /// surface their payload in that message.
    #[test]
    fn missing_stats_reason_display_covers_each_variant() {
        use serde_json::json;

        let cases = [
            MissingStatsReason::NoSchedulerBinary,
            MissingStatsReason::NoScheduler { reason: "r".into() },
            MissingStatsReason::DuringFreeze,
            MissingStatsReason::Cancelled,
            MissingStatsReason::SchedulerError {
                errno: 1,
                args: json!({}),
            },
            MissingStatsReason::MissingResp { args: json!({}) },
            MissingStatsReason::RequestTooLarge { size: 1, max: 2 },
            MissingStatsReason::ResponseTooLarge { size: 3, max: 4 },
            MissingStatsReason::MutexPoisoned,
        ];
        for c in &cases {
            let rendered = c.to_string();
            assert!(
                !rendered.is_empty(),
                "Display must be non-empty for {c:?}"
            );
        }

        // Spot-check that numeric payloads reach the rendered text.
        let oversized_request = MissingStatsReason::RequestTooLarge { size: 10, max: 5 };
        assert!(oversized_request.to_string().contains("10"));

        let scheduler_failure = MissingStatsReason::SchedulerError {
            errno: 13,
            args: json!({}),
        };
        assert!(scheduler_failure.to_string().contains("13"));
    }
}