ktstr/scenario/snapshot/
error.rs

1//! [`SnapshotError`] (every fallible accessor's structured error) plus
2//! its [`std::fmt::Display`] impl and [`SnapshotResult`] alias. Lives in its own
3//! file so the variant catalogue is easy to scan when adding a new
4//! accessor — `cargo doc` surfaces the same single-page view as the
5//! source.
6
7use super::HEX_KEY_PREFIX;
8
9// ---------------------------------------------------------------------------
10// Missing-stats reason
11// ---------------------------------------------------------------------------
12
/// Why a sample's `stats` slot is unavailable — carried on
/// [`SnapshotError::MissingStats`] so operator diagnostics name
/// the specific failure mode rather than the generic "stats
/// absent". Built by `From<&crate::vmm::sched_stats::SchedStatsError>`
/// for the relay-failure path, plus dedicated variants for the
/// pre-client gates that the `crate::vmm::SchedStatsError` enum doesn't
/// cover (no scheduler binary configured).
///
/// A second conversion, `From<&anyhow::Error>`, downcasts to the typed
/// error when one is present and otherwise falls back to
/// [`MissingStatsReason::NoScheduler`] carrying the error's rendered
/// `Display` text.
///
/// The `Display` impl renders each variant as a short clause;
/// [`SnapshotError::MissingStats`] embeds that clause in its own
/// message (`stats absent (<clause>)`).
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub enum MissingStatsReason {
    /// No `scheduler_binary` was configured on the run, so the
    /// freeze coordinator never wired a `crate::vmm::SchedStatsClient`.
    /// Every periodic sample bypasses the stats request entirely
    /// and lands here.
    NoSchedulerBinary,
    /// The guest relay never connected to the scheduler's Unix
    /// socket (no scheduler running, or the scheduler refused the
    /// connection).
    ///
    /// `reason` is free-form text: cloned from the typed source
    /// error's `reason`, or — on the `From<&anyhow::Error>` fallback
    /// path where no typed error could be downcast — the
    /// `to_string()` rendering of that error. It is appended verbatim
    /// to the `Display` message.
    NoScheduler { reason: String },
    /// The host-side coordinator marked the run as freezing while
    /// this stats request was in flight (or about to start);
    /// scx_stats responses are undefined while the scheduler's
    /// userspace thread is paused.
    DuringFreeze,
    /// The run-wide cancel flag was set (watchdog fired or the
    /// run is shutting down) while this stats request was in
    /// flight or about to start.
    Cancelled,
    /// The scheduler returned a non-zero `errno` in the typed
    /// `crate::vmm::StatsResponse` envelope. The `args` payload is preserved
    /// so operators can render the scheduler-side message.
    ///
    /// Both fields are copied from the source error and both appear
    /// in the `Display` message (`errno=<errno> (args=<args>)`).
    SchedulerError { errno: i32, args: serde_json::Value },
    /// The typed envelope was decoded but the inner `args` map
    /// did not contain the expected `"resp"` key — protocol
    /// mismatch with the scheduler.
    ///
    /// `args` is cloned from the source error and rendered in the
    /// `Display` message so the offending payload is visible.
    MissingResp { args: serde_json::Value },
    /// The caller passed a stats request larger than the client's
    /// `crate::vmm::sched_stats::MAX_REQUEST_BYTES` cap.
    ///
    /// `size` and `max` are both rendered as byte counts: the
    /// message reads "stats request `size` bytes exceeds
    /// `max`-byte cap".
    RequestTooLarge { size: usize, max: usize },
    /// The scheduler's response grew past
    /// `crate::vmm::sched_stats::MAX_RESPONSE_BYTES` without ever emitting a newline.
    ///
    /// `size` and `max` are both rendered as byte counts: the
    /// message reads "stats response `size` bytes exceeds
    /// `max`-byte cap".
    ResponseTooLarge { size: usize, max: usize },
    /// The shared response mutex was poisoned by a previous
    /// panic; the stats client cannot recover for this sample.
    /// Produced from the source error's `Poisoned` variant.
    MutexPoisoned,
}
59
60impl std::fmt::Display for MissingStatsReason {
61    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
62        match self {
63            Self::NoSchedulerBinary => {
64                write!(f, "no scheduler_binary configured for this run")
65            }
66            Self::NoScheduler { reason } => {
67                write!(f, "guest relay reports no scheduler: {reason}")
68            }
69            Self::DuringFreeze => {
70                write!(
71                    f,
72                    "stats request cancelled — freeze coordinator paused the scheduler"
73                )
74            }
75            Self::Cancelled => {
76                write!(
77                    f,
78                    "stats request cancelled — run-wide cancel flag was set (watchdog or shutdown)"
79                )
80            }
81            Self::SchedulerError { errno, args } => {
82                write!(f, "scheduler returned errno={errno} (args={args})")
83            }
84            Self::MissingResp { args } => {
85                write!(f, "scheduler envelope missing 'resp' key (args={args})")
86            }
87            Self::RequestTooLarge { size, max } => {
88                write!(f, "stats request {size} bytes exceeds {max}-byte cap")
89            }
90            Self::ResponseTooLarge { size, max } => {
91                write!(f, "stats response {size} bytes exceeds {max}-byte cap")
92            }
93            Self::MutexPoisoned => {
94                write!(f, "stats client response mutex was poisoned")
95            }
96        }
97    }
98}
99
100impl From<&anyhow::Error> for MissingStatsReason {
101    /// Downcast the anyhow chain to a typed
102    /// `crate::vmm::SchedStatsError`
103    /// when one is present (every `SchedStatsClient` failure path
104    /// boxes a typed variant via `anyhow::anyhow!(SchedStatsError::…)`,
105    /// so the downcast succeeds on every well-formed sched_stats
106    /// error). Falls back to [`MissingStatsReason::NoScheduler`]
107    /// carrying the rendered display when the downcast fails — that
108    /// covers serde / IO / other errors that didn't originate inside
109    /// `crate::vmm::SchedStatsClient` but still surface through the same
110    /// `Result<_, anyhow::Error>` return.
111    fn from(e: &anyhow::Error) -> Self {
112        if let Some(typed) = e.downcast_ref::<crate::vmm::sched_stats::SchedStatsError>() {
113            return Self::from(typed);
114        }
115        Self::NoScheduler {
116            reason: e.to_string(),
117        }
118    }
119}
120
121impl From<&crate::vmm::sched_stats::SchedStatsError> for MissingStatsReason {
122    fn from(e: &crate::vmm::sched_stats::SchedStatsError) -> Self {
123        use crate::vmm::sched_stats::SchedStatsError as S;
124        match e {
125            S::Poisoned => Self::MutexPoisoned,
126            S::RequestTooLarge { size, max } => Self::RequestTooLarge {
127                size: *size,
128                max: *max,
129            },
130            S::ResponseTooLarge { size, max } => Self::ResponseTooLarge {
131                size: *size,
132                max: *max,
133            },
134            S::DuringFreeze => Self::DuringFreeze,
135            S::Cancelled => Self::Cancelled,
136            S::NoScheduler { reason } => Self::NoScheduler {
137                reason: reason.clone(),
138            },
139            S::SchedulerError { errno, args } => Self::SchedulerError {
140                errno: *errno,
141                args: args.clone(),
142            },
143            S::MissingResp { args } => Self::MissingResp { args: args.clone() },
144        }
145    }
146}
147
148// ---------------------------------------------------------------------------
149// Excluded map payload
150// ---------------------------------------------------------------------------
151
/// One captured map that the KVA-whitelist filter rejected.
/// Payload for [`SnapshotError::ActiveFilterExcludedMaps::excluded_maps`].
/// The `map_kva` field name matches
/// [`crate::monitor::dump::FailureDumpMap::map_kva`] (the
/// source-of-truth field), and a `map_kva == 0` here flags a
/// capture where the per-map KVA was not recorded (synthetic
/// fixture or capture-path bug — production captures filter zero
/// KVAs out at the walker level).
///
/// In the `ActiveFilterExcludedMaps` message each entry is rendered
/// as `name@kva` with the KVA in `{:#x}` hex.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct ExcludedMap {
    /// Name of the rejected map; the left-hand side of the rendered
    /// `name@kva` pair.
    pub name: String,
    /// KVA recorded for the map; `0` means no KVA was recorded. The
    /// message formatter treats zero and non-zero values as separate
    /// causes when explaining the exclusion.
    pub map_kva: u64,
}
166
167// ---------------------------------------------------------------------------
168// Error type
169// ---------------------------------------------------------------------------
170
/// Reason a snapshot accessor or terminal read could not resolve.
///
/// Returned by every fallible accessor (`Snapshot::map`,
/// `SnapshotEntry::get`, `SnapshotField::as_u64`, …) so a missing
/// field, type mismatch, or absent map surfaces as a structured
/// error the test author can `?`-propagate. Each variant carries
/// the path / alternatives needed to fix the call site without
/// re-running the test.
///
/// The `Display` impl renders one message per variant and
/// interpolates the variant's fields; the field notes below quote
/// those messages where that clarifies what a field means.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub enum SnapshotError {
    /// No map matched the requested name. `available` enumerates
    /// the captured map names so a typo surfaces in test output.
    MapNotFound {
        requested: String,
        available: Vec<String>,
    },
    /// No top-level global variable matched the requested name in
    /// any `*.bss` / `*.data` / `*.rodata` global-section map.
    /// `available` lists the union of every section's top-level
    /// member names.
    VarNotFound {
        requested: String,
        available: Vec<String>,
    },
    /// More than one global-section map exposes a top-level member
    /// with the requested name, so [`super::Snapshot::var`] cannot pick a
    /// deterministic answer. `found_in` lists every map (in capture
    /// order) where the name was seen — the caller should disambiguate
    /// via [`super::Snapshot::map`] and walk into the named map directly
    /// (e.g. `snap.map("scx_obj.bss")?.at(0).get("nr_cpus")`).
    AmbiguousVar {
        requested: String,
        found_in: Vec<String>,
    },
    /// A path component did not match any
    /// `crate::monitor::btf_render::RenderedValue::Struct` member at that depth. `requested`
    /// is the user-supplied lookup string; `walked` is the prefix
    /// that resolved successfully; `component` is the failing
    /// segment; `available` lists the struct's actual member names.
    FieldNotFound {
        requested: String,
        walked: String,
        component: String,
        available: Vec<String>,
    },
    /// A path component reached a non-Struct value where a struct
    /// was expected (e.g. descending into a `Uint` leaf).
    /// `requested` is the user-supplied lookup string; `kind` names
    /// the actual variant for diagnostics.
    NotAStruct {
        requested: String,
        /// Rendered as "after walking '<walked>'" — the same role as
        /// [`Self::FieldNotFound`]'s `walked`.
        walked: String,
        /// The path segment named in the message as the one that
        /// "expected a Struct".
        component: String,
        kind: String,
    },
    /// A typed accessor (`as_u64` etc.) was called on a rendered
    /// shape it cannot decode (e.g. `as_str` on a `Struct`).
    /// `expected` names the scalar type the accessor requires;
    /// `actual` names the rendered variant; `requested` is the
    /// user-supplied lookup string (empty when the accessor was
    /// invoked on a leaf without a path walk).
    TypeMismatch {
        expected: String,
        actual: String,
        requested: String,
    },
    /// A map index was out of range for the underlying entry list.
    IndexOutOfRange {
        /// Name of the map, shown as `map '<map>'`.
        map: String,
        /// The index that was asked for.
        index: usize,
        /// Rendered as "length `len`" in the message.
        len: usize,
    },
    /// A per-CPU slot was out of range or unmapped.
    PerCpuSlot {
        /// Name of the map, shown as `map '<map>'`.
        map: String,
        /// The CPU index that was asked for.
        cpu: u32,
        /// Rendered as "have `len` per-CPU slots" in the
        /// out-of-range message; not rendered when `unmapped` is set.
        len: usize,
        /// Selects which of the two failure modes is reported:
        /// `true` renders "per-CPU slot is unmapped (None)", `false`
        /// renders the out-of-range message.
        unmapped: bool,
    },
    /// A predicate-based lookup (`find`, `max_by`) found no match.
    /// `len` is the number of entries the lookup traversed before
    /// giving up; `available_keys` is a small sample (up to
    /// `NO_MATCH_KEY_SAMPLE` entries) of rendered keys seen during
    /// the traversal so an operator can distinguish "empty map"
    /// (`len == 0`) from "populated map with no predicate hit"
    /// (`len > 0`) and inspect the sample to debug the predicate.
    /// Keys are rendered via `crate::monitor::btf_render::RenderedValue`'s `Display` impl and
    /// each is capped at `NO_MATCH_KEY_CHAR_CAP` chars with an
    /// ellipsis to keep the failure message readable for wide struct
    /// keys.
    ///
    /// Aggregation methods (`max_by`, `cpu_max_u64` / `cpu_min_u64`
    /// / `cpu_max_f64` / `cpu_min_f64`) produce this variant for
    /// empty / all-None inputs; their NoMatch always carries
    /// `len == 0` and empty `available_keys`. Only `find` can
    /// produce `len > 0` here.
    NoMatch {
        /// Name of the map, shown as `map '<map>'`.
        map: String,
        /// Label of the lookup, interpolated verbatim in front of
        /// "matched …" in the message.
        op: String,
        len: usize,
        available_keys: Vec<String>,
    },
    /// A path string contained an empty component (e.g. `"a..b"`).
    /// `requested` is the user-supplied lookup string.
    EmptyPathComponent { requested: String },
    /// [`super::SnapshotEntry::get`] was called on a per-CPU entry
    /// without narrowing to a CPU first via [`super::SnapshotMap::cpu`].
    /// `map` is the map name shown in the message.
    PerCpuNotNarrowed { map: String },
    /// Hash entry has no rendered key/value side (BTF type id was
    /// missing at capture time, leaving the hex bytes only).
    /// `map` is the map name; `side` is interpolated verbatim as the
    /// subject of "has no rendered structure" (presumably `key` or
    /// `value` — confirm against the constructing call sites).
    NoRendered { map: String, side: String },
    /// The sample's underlying `crate::monitor::dump::FailureDumpReport`
    /// is a placeholder produced by
    /// `crate::monitor::dump::FailureDumpReport::placeholder` —
    /// the freeze-rendezvous path could not collect real data
    /// (typical cause: vCPU rendezvous timed out). Temporal
    /// patterns in [`crate::assert::temporal`] route this variant
    /// through their per-sample skip handling so a placeholder
    /// sample never falsely registers as zero progress against a
    /// monotonicity / rate / steady / ratio band. The `reason`
    /// string mirrors `FailureDumpReport::scx_walker_unavailable`
    /// when present (set by `placeholder()` to the constructor
    /// argument), giving the operator the cause without re-walking
    /// the report. `tag` is shown as `sample '<tag>'`.
    PlaceholderSample { tag: String, reason: String },
    /// A [`SampleSeries::stats`](crate::scenario::sample::SampleSeries::stats)
    /// projection ran on a sample whose `stats` field carries an
    /// `Err` — the stats client was not wired (no
    /// `scheduler_binary`) or the per-sample stats request failed.
    /// The carried [`MissingStatsReason`] identifies the *why* so
    /// operator diagnostics distinguish "no scheduler configured"
    /// from "scheduler refused the request" from "watchdog
    /// cancelled the request" without re-walking the source error.
    /// Distinguishes a per-sample stats coverage gap from an
    /// in-stats-JSON path miss (`TypeMismatch` /
    /// `FieldNotFound`) so the temporal-assertion site can
    /// branch on the cause without re-walking the source.
    MissingStats {
        /// Shown as `sample '<tag>'` in the message.
        tag: String,
        /// Rendered via its own `Display` inside the parentheses of
        /// "stats absent (…)".
        reason: MissingStatsReason,
    },
    /// A [`SampleSeries::host`](crate::scenario::sample::SampleSeries::host)
    /// projection ran on a sample whose `per_cpu_time` slice did
    /// not include `cpu` — placeholder report (freeze rendezvous
    /// timed out), or a kernel that didn't surface per-CPU
    /// `kernel_stat`/`tick_cpu_sched`/`kernel_cpustat` resolution
    /// for the requested CPU. Distinguishes a per-sample host-data
    /// coverage gap from a kernel-walker failure (`Unavailable` on
    /// the broader Snapshot accessor) so the temporal-assertion
    /// site can decide whether to fail strict or skip with a
    /// rendered Note. `tag` is shown as `sample '<tag>'`.
    HostFieldUnavailable { tag: String, cpu: u32 },
    /// [`super::Snapshot::var`] / [`super::Snapshot::live_var`] /
    /// [`super::Snapshot::map`] was called on a snapshot whose
    /// underlying `crate::monitor::dump::FailureDumpReport` is a
    /// placeholder (the freeze-rendezvous path could not collect
    /// real data — typical cause: vCPU rendezvous timed out). The
    /// captured `report.maps` is empty by construction so the
    /// var/map lookup has nothing to walk. Distinct from
    /// [`Self::VarNotFound`] (which means "the captured report did
    /// not contain a global by this name") so the assertion site
    /// can distinguish "freeze failed" from "typo in field name".
    /// `tag` carries the capture tag (if any); when it is `None` the
    /// message omits the quoted tag.
    PlaceholderSnapshot { tag: Option<String> },
    /// [`super::Snapshot::active`] / [`super::Snapshot::live_var`]
    /// could not identify a currently-active scheduler from the
    /// snapshot's `*scx_root` + `prog_runtime_stats`. Typical
    /// causes: snapshot taken in the dead window between
    /// [`crate::scenario::ops::Op::DetachScheduler`] +
    /// [`crate::scenario::ops::Op::AttachScheduler`]; snapshot
    /// taken in the post-swap settle window before the new
    /// scheduler's progs have advanced their run counter; snapshot
    /// captured before any scheduler attached. Distinct from
    /// [`Self::AmbiguousVar`] (which means "the snapshot has
    /// multiple scheduler bss copies and the call did not opt
    /// into active-only filtering") so the assertion site can
    /// distinguish "no scheduler is running right now" from
    /// "multiple are running, pick one".
    /// `reason` is rendered in parentheses after "snapshot has no
    /// currently-active scheduler".
    NoActiveScheduler { reason: String },
    /// [`super::Snapshot::var`] / [`super::Snapshot::map`] (or one
    /// of the `live_*` shortcuts) ran against an active-filtered
    /// view where the KVA whitelist excluded EVERY captured map
    /// that shared the active obj prefix (i.e. the admitted set
    /// for this obj was empty). Distinct from [`Self::VarNotFound`]
    /// — `VarNotFound` means "the active filter admitted maps but
    /// none carry the requested name"; this variant means "the
    /// active filter admitted zero maps for this obj, so the
    /// lookup never got the chance to walk anything."
    ///
    /// The variant never fires when at least one captured
    /// `<active_obj>.*` map passes the KVA whitelist — in that
    /// case the lookup miss is a real typo or absent symbol and
    /// the standard `VarNotFound` / `MapNotFound` carries the
    /// admitted list. This narrow firing scope prevents
    /// false-positives that would otherwise mask genuine typos
    /// in same-binary post-swap captures.
    ///
    /// Typical causes when this DOES fire: stale walker capture
    /// (captured KVAs predate the most recent struct_ops swap),
    /// same-binary post-swap window where the report still
    /// carries the old instance's maps, or a walker bug that
    /// resolved `*scx_root` against a different binary's map set.
    ActiveFilterExcludedMaps {
        /// User-supplied lookup string (the `var` / `map`
        /// argument). For [`super::Snapshot::live_vars_via`] this
        /// carries the joined name list `"[a, b, c]"`.
        requested: String,
        /// Obj name the active filter pinned to
        /// (`*scx_root → struct_ops map → obj prefix` resolution).
        active_obj: String,
        /// Maps captured under the active obj prefix that the KVA
        /// whitelist rejected.
        excluded_maps: Vec<ExcludedMap>,
        /// KVA whitelist the walker populated for the active obj.
        /// A non-empty set whose every entry mismatched the
        /// captured `map_kva` values points at stale capture or
        /// KVA aliasing; an empty set is unreachable through this
        /// variant (no filter means no exclusion).
        whitelist_kvas: Vec<u64>,
    },
    /// A walker-resolved [`crate::scenario::sample::SampleSeries::bpf_live_u64`]
    /// / `bpf_live_i64` / `bpf_live_f64` projection detected that
    /// the snapshot's per-snapshot walker output
    /// ([`crate::monitor::dump::FailureDumpReport::active_map_kvas`])
    /// disagrees with an earlier same-phase snapshot's walker
    /// output for the same lookup. The framework pins the first
    /// non-empty walker output it sees per phase and surfaces this
    /// variant for every later same-phase snapshot whose walker
    /// resolved to a different KVA set — without this gate the
    /// projected series would silently switch between bss copies
    /// mid-phase (typical cause: post-`Op::ReplaceScheduler` swap
    /// window where the walker re-publishes mid-phase) and
    /// downstream reducers like
    /// [`crate::assert::temporal::SeriesField::counter_delta_per_phase`]
    /// would see non-monotonic counter values. The drifted
    /// samples become per-sample `Err` slots; the temporal
    /// patterns' standard error-skip semantics apply.
    WalkerDriftedWithinPhase {
        /// Phase the drifted sample belongs to.
        phase: crate::assert::Phase,
        /// KVA set pinned for that phase (per the description above,
        /// the first non-empty walker output seen in the phase).
        pinned_kvas: Vec<u64>,
        /// KVA set this sample's walker resolved to — the one that
        /// disagreed with `pinned_kvas`.
        sample_kvas: Vec<u64>,
        /// Lookup string of the projection that detected the drift
        /// (presumably the user-supplied name, as in the other
        /// variants' `requested` — confirm against the call site).
        requested: String,
    },
    /// A user-supplied projection closure (the kind passed to
    /// [`crate::scenario::sample::SampleSeries::bpf`]) signalled
    /// failure for reasons that don't fit the structured variants
    /// above. `reason` is the closure's free-form explanation —
    /// "lookup returned None for sched_id A, B, C" — so the failure
    /// message stays diagnostic without forcing the closure to
    /// synthesize an `available: Vec<String>` it cannot populate.
    ///
    /// Closures should reach for the structured variants
    /// ([`Self::VarNotFound`], [`Self::MapNotFound`], etc.) when
    /// they can; this variant is the escape hatch for higher-level
    /// disambiguation logic (e.g. "I walked vars(name) and none of
    /// the candidates matched my active-instance fingerprint").
    /// Surfaces in temporal-assertion failure messages as
    /// `projection failed: <reason>`.
    ProjectionFailed { reason: String },
    /// A captured map's contents could not be rendered at dump time:
    /// `crate::monitor::dump::FailureDumpMap::error` is set and the
    /// map carries no entries / value to walk. Surfaced by
    /// [`super::Snapshot::var`] / [`super::SnapshotMap::at`] /
    /// [`super::SnapshotMap::find`] / [`super::SnapshotMap::max_by`]
    /// instead of [`Self::VarNotFound`] /
    /// [`Self::IndexOutOfRange`] `{ len: 0 }` /
    /// [`Self::NoMatch`] `{ len: 0 }` so a guest-memory render
    /// failure is distinguishable from a genuinely-absent variable
    /// or a legitimately-empty map. Without this distinction a map
    /// whose contents failed to read reads identically to "the
    /// symbol does not exist", masking the capture failure.
    /// `map` names the owning map; `error` mirrors
    /// `FailureDumpMap.error`.
    MapRenderIncomplete { map: String, error: String },
}
447
448impl std::fmt::Display for SnapshotError {
449    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
450        match self {
451            SnapshotError::MapNotFound {
452                requested,
453                available,
454            } => {
455                write!(
456                    f,
457                    "snapshot has no map '{requested}' (captured maps: {available:?})"
458                )
459            }
460            SnapshotError::VarNotFound {
461                requested,
462                available,
463            } => {
464                write!(
465                    f,
466                    "snapshot has no global variable '{requested}' in any \
467                     *.bss/*.data/*.rodata map (available globals: {available:?})"
468                )
469            }
470            SnapshotError::AmbiguousVar {
471                requested,
472                found_in,
473            } => {
474                write!(
475                    f,
476                    "snapshot global '{requested}' is ambiguous (found in \
477                     {found_in:?}); use Snapshot::active().var(name) (or the \
478                     shorthand Snapshot::live_var(name)) to pick the active \
479                     scheduler's copy automatically, or Snapshot::map(name) \
480                     to address a specific scheduler's bss explicitly"
481                )
482            }
483            SnapshotError::FieldNotFound {
484                requested,
485                walked,
486                component,
487                available,
488            } => {
489                write!(
490                    f,
491                    "path '{requested}': component '{component}' (after walking '{walked}') \
492                     not found (members at this depth: {available:?})"
493                )
494            }
495            SnapshotError::NotAStruct {
496                requested,
497                walked,
498                component,
499                kind,
500            } => {
501                write!(
502                    f,
503                    "path '{requested}': component '{component}' (after walking '{walked}') \
504                     expected a Struct, got {kind}"
505                )
506            }
507            SnapshotError::TypeMismatch {
508                expected,
509                actual,
510                requested,
511            } => {
512                write!(
513                    f,
514                    "path '{requested}': cannot read as {expected} — actual rendered \
515                     variant is {actual}"
516                )
517            }
518            SnapshotError::IndexOutOfRange { map, index, len } => {
519                write!(f, "map '{map}': index {index} out of range (length {len})")
520            }
521            SnapshotError::PerCpuSlot {
522                map,
523                cpu,
524                len,
525                unmapped,
526            } => {
527                if *unmapped {
528                    write!(f, "map '{map}': cpu {cpu} per-CPU slot is unmapped (None)")
529                } else {
530                    write!(
531                        f,
532                        "map '{map}': cpu {cpu} out of range (have {len} per-CPU slots)"
533                    )
534                }
535            }
536            SnapshotError::NoMatch {
537                map,
538                op,
539                len,
540                available_keys,
541            } => fmt_no_match(f, map, op, *len, available_keys),
542            SnapshotError::EmptyPathComponent { requested } => {
543                write!(
544                    f,
545                    "path '{requested}' has an empty component (consecutive '.')"
546                )
547            }
548            SnapshotError::PerCpuNotNarrowed { map } => {
549                write!(
550                    f,
551                    "map '{map}': per-CPU entry without a CPU narrow — call .cpu(N) first"
552                )
553            }
554            SnapshotError::NoRendered { map, side } => {
555                write!(
556                    f,
557                    "map '{map}': {side} has no rendered structure (no BTF type at capture time)"
558                )
559            }
560            SnapshotError::PlaceholderSample { tag, reason } => {
561                write!(
562                    f,
563                    "sample '{tag}' is a placeholder report (capture pipeline did not land): \
564                     {reason}"
565                )
566            }
567            SnapshotError::MissingStats { tag, reason } => {
568                write!(f, "sample '{tag}': stats absent ({reason})")
569            }
570            SnapshotError::HostFieldUnavailable { tag, cpu } => {
571                write!(
572                    f,
573                    "sample '{tag}': per_cpu_time has no entry for cpu {cpu} \
574                     (placeholder report or kernel-walker resolution failure)"
575                )
576            }
577            SnapshotError::PlaceholderSnapshot { tag } => match tag {
578                Some(t) => write!(
579                    f,
580                    "snapshot '{t}' is a placeholder — the freeze-rendezvous \
581                     path could not capture real data; no maps to walk"
582                ),
583                None => f.write_str(
584                    "snapshot is a placeholder — the freeze-rendezvous path \
585                     could not capture real data; no maps to walk",
586                ),
587            },
588            SnapshotError::NoActiveScheduler { reason } => {
589                write!(
590                    f,
591                    "snapshot has no currently-active scheduler ({reason}); \
592                     use Snapshot::vars(name) to enumerate every observed \
593                     copy explicitly, Snapshot::live_var(name) to keep the \
594                     typed error path while opting into the active filter, \
595                     or Snapshot::map(\"<obj>.<section>\") to address a \
596                     specific scheduler's bss directly"
597                )
598            }
599            SnapshotError::ActiveFilterExcludedMaps {
600                requested,
601                active_obj,
602                excluded_maps,
603                whitelist_kvas,
604            } => {
605                fmt_active_filter_excluded(f, requested, active_obj, excluded_maps, whitelist_kvas)
606            }
607            SnapshotError::WalkerDriftedWithinPhase {
608                phase,
609                pinned_kvas,
610                sample_kvas,
611                requested,
612            } => fmt_walker_drift(f, phase, pinned_kvas, sample_kvas, requested),
613            SnapshotError::ProjectionFailed { reason } => {
614                write!(f, "projection failed: {reason}")
615            }
616            SnapshotError::MapRenderIncomplete { map, error } => {
617                write!(
618                    f,
619                    "map '{map}' contents unavailable (render failed at capture): {error}"
620                )
621            }
622        }
623    }
624}
625
626/// Render the `NoMatch` arm: an empty-map message, a no-sample-keys
627/// message, or the matched-none message plus the BTF-missing hint when
628/// every sample key fell back to the hex-bytes form.
629fn fmt_no_match(
630    f: &mut std::fmt::Formatter<'_>,
631    map: &str,
632    op: &str,
633    len: usize,
634    available_keys: &[String],
635) -> std::fmt::Result {
636    if len == 0 {
637        write!(f, "map '{map}': {op} matched no entries (map is empty)")
638    } else if available_keys.is_empty() {
639        write!(
640            f,
641            "map '{map}': {op} matched none of {len} entries (sample keys unavailable)"
642        )
643    } else {
644        write!(
645            f,
646            "map '{map}': {op} matched none of {len} entries (first {sampled}: {available_keys:?})",
647            sampled = available_keys.len(),
648        )?;
649        // The `hex:` prefix is only ever produced by
650        // `render_entry_key`'s fallback path when the
651        // entry's `key` field was `None` at capture time.
652        // Typed `RenderedValue::Display` does not emit
653        // this prefix for any scalar variant; `Struct`
654        // emits `TypeName{...}` inline or `TypeName:`
655        // breadcrumb, where a `hex:` collision would
656        // require a BTF struct literally named `hex` —
657        // no real kernel scheduler does that. The hint
658        // therefore fires only when BTF was uniformly
659        // absent for this map's key type at capture time,
660        // and names the kernel-side fix so the operator
661        // does not have to reverse-engineer the `hex:`
662        // discriminator.
663        if available_keys.iter().all(|k| k.starts_with(HEX_KEY_PREFIX)) {
664            write!(
665                f,
666                " (BTF missing at capture — keys shown as hex bytes; \
667                 rebuild guest kernel with CONFIG_DEBUG_INFO_BTF=y for \
668                 typed keys)"
669            )?;
670        }
671        Ok(())
672    }
673}
674
675/// Render the `ActiveFilterExcludedMaps` arm: classify why the walker's
676/// KVA whitelist excluded the captured maps sharing the requested obj
677/// prefix, and name the explicit-picker escape hatches.
678fn fmt_active_filter_excluded(
679    f: &mut std::fmt::Formatter<'_>,
680    requested: &str,
681    active_obj: &str,
682    excluded_maps: &[ExcludedMap],
683    whitelist_kvas: &[u64],
684) -> std::fmt::Result {
685    let excluded_rendered = excluded_maps
686        .iter()
687        .map(|m| format!("{}@{:#x}", m.name, m.map_kva))
688        .collect::<Vec<_>>()
689        .join(", ");
690    let some_zero = excluded_maps.iter().any(|m| m.map_kva == 0);
691    let some_alias = excluded_maps
692        .iter()
693        .any(|m| m.map_kva != 0 && !whitelist_kvas.contains(&m.map_kva));
694    let cause = match (some_zero, some_alias) {
695        (false, true) => {
696            "this snapshot pre-dates your most recent \
697             Op::ReplaceScheduler / Op::AttachScheduler — \
698             wait for the next periodic boundary (or re-run \
699             the test) so the walker re-publishes the live \
700             scheduler's KVAs"
701        }
702        (true, false) => {
703            "the captured maps have no recorded KVAs — \
704             the snapshot pre-dates the walker plumbing, \
705             or the capture path failed to record per-map KVAs"
706        }
707        (true, true) => {
708            "some captured maps lack KVAs and some disagree \
709             with the walker's whitelist — both \
710             pre-walker-capture state and a post-swap window \
711             can produce this; re-run the test to regenerate \
712             the snapshot"
713        }
714        (false, false) => "captured KVAs were neither absent nor in disagreement",
715    };
716    write!(
717        f,
718        "snapshot lookup '{requested}' returned no hits under the \
719         active filter (obj='{active_obj}'): the walker's KVA \
720         whitelist {whitelist_kvas:#x?} excluded {n} captured map(s) \
721         sharing the obj prefix: {excluded_rendered} — {cause}. \
722         Reach for Snapshot::vars('{requested}') to enumerate every \
723         copy across all obj prefixes, or Snapshot::map(\"<name>\") \
724         to address one of the excluded maps directly.",
725        n = excluded_maps.len(),
726    )
727}
728
729/// Render the `WalkerDriftedWithinPhase` arm: the walker re-published
730/// its KVA set mid-phase, so the drifted sample is surfaced as Err to
731/// keep per-phase reducers on a single walker decision.
732fn fmt_walker_drift(
733    f: &mut std::fmt::Formatter<'_>,
734    phase: &crate::assert::Phase,
735    pinned_kvas: &[u64],
736    sample_kvas: &[u64],
737    requested: &str,
738) -> std::fmt::Result {
739    write!(
740        f,
741        "walker drift within {phase:?}: lookup '{requested}' resolved against \
742         KVA set {sample_kvas:#x?}, but an earlier same-phase snapshot pinned \
743         {pinned_kvas:#x?}. The walker re-published mid-phase (typical cause: \
744         a post-Op::ReplaceScheduler swap window). The drifted sample is \
745         surfaced as Err so per-phase reducers (counter_delta_per_phase, \
746         ratio_across_phases) see monotonic Ok-sequences from one walker \
747         decision; address by stepping the phase past the swap settle window \
748         or by reading via the explicit picker form."
749    )
750}
751
/// Marker impl: no trait method is overridden, so `source()` keeps its
/// default and a `SnapshotError` reports no underlying cause.
impl std::error::Error for SnapshotError {}
753
/// Result alias for snapshot accessors: `Ok(T)` on success, or a
/// [`SnapshotError`] describing why the accessor failed.
pub type SnapshotResult<T> = std::result::Result<T, SnapshotError>;
756
/// Typed shape of one entry drained from the snapshot bridge's
/// ordered per-tag store. Each field is documented inline below.
///
/// Used by [`crate::scenario::snapshot::SnapshotBridge::drain_ordered_with_stats`]
/// and [`crate::scenario::sample::SampleSeries::from_drained_typed`].
/// `#[non_exhaustive]` so future additive fields stay
/// pattern-match-compatible via rest-pattern destructure
/// (`DrainedSnapshotEntry { tag, report, .. }`).
#[derive(Debug)]
#[non_exhaustive]
pub struct DrainedSnapshotEntry {
    /// Snapshot name the report was stored under.
    pub tag: String,
    /// `crate::monitor::dump::FailureDumpReport` of the captured guest
    /// state.
    pub report: crate::monitor::dump::FailureDumpReport,
    /// `Ok` carries the scheduler-side stats JSON; `Err` carries a
    /// typed [`MissingStatsReason`] naming why the stats slot is
    /// unavailable (for example, capture happened without a wired
    /// stats client).
    pub stats: std::result::Result<serde_json::Value, MissingStatsReason>,
    /// Optional wall-clock anchor (ms since run-start).
    pub elapsed_ms: Option<u64>,
    /// Workload-relative boundary offset (ms) a periodic capture was
    /// scheduled for (`boundary_ns - scenario_anchor_ns`); `None` for
    /// non-periodic / on-demand captures. Distinct from `elapsed_ms`
    /// (run_start-relative fire time). See
    /// [`crate::scenario::snapshot::SnapshotBridge`]'s store doc.
    pub boundary_offset_ms: Option<u64>,
    /// Scenario phase index stamped at capture time.
    /// `Some(idx)` for captures stored via the step-aware entry
    /// points ([`crate::scenario::snapshot::SnapshotBridge::capture_with_step`]
    /// or [`crate::scenario::snapshot::SnapshotBridge::store_with_stats_and_step`]);
    /// `None` for fixture-injected captures via the unstamped legacy
    /// paths ([`crate::scenario::snapshot::SnapshotBridge::capture`]
    /// / [`crate::scenario::snapshot::SnapshotBridge::store`]
    /// / [`crate::scenario::snapshot::SnapshotBridge::store_with_stats`]).
    pub step_index: Option<u16>,
}
795
#[cfg(test)]
mod tests_api_gaps {
    use super::*;

    /// Pin: `SnapshotError::ProjectionFailed { reason }` displays as
    /// `projection failed: <reason>`, so the temporal-assertion failure
    /// path shows the closure's diagnostic without re-wrapping it.
    /// Closure call-sites synthesize this variant when the structured
    /// variants (`VarNotFound`, `MapNotFound`, `AmbiguousVar`) need an
    /// `available: Vec<String>` they cannot populate.
    #[test]
    fn projection_failed_display_carries_reason() {
        let err = SnapshotError::ProjectionFailed {
            reason: String::from("live_var_via picker rejected all 2 candidates"),
        };
        assert_eq!(
            err.to_string(),
            "projection failed: live_var_via picker rejected all 2 candidates"
        );
    }

    /// Pin: `ProjectionFailed` shares the `PartialEq` / `Hash` derive
    /// set with every other variant, so pattern-match callers can
    /// assert "my projection closure failed" without falling through
    /// to a `_` arm.
    #[test]
    fn projection_failed_eq_and_hash_round_trip() {
        let original = SnapshotError::ProjectionFailed {
            reason: String::from("x"),
        };
        let twin = original.clone();
        assert_eq!(original, twin);
        let seen = std::collections::HashSet::from([original]);
        assert!(seen.contains(&twin));
    }

    /// `From<&SchedStatsError>` must map EVERY scheduler-stats failure
    /// to its matching `MissingStatsReason`. A swapped arm would
    /// mislabel the failure (e.g. a size-cap breach reported as a
    /// poisoned mutex) and send the operator after the wrong cause.
    /// Each row below pins one variant's mapping.
    #[test]
    fn missing_stats_reason_from_sched_stats_error_maps_each_variant() {
        use crate::vmm::sched_stats::SchedStatsError as S;
        use serde_json::json;
        let mappings = [
            (S::Poisoned, MissingStatsReason::MutexPoisoned),
            (S::DuringFreeze, MissingStatsReason::DuringFreeze),
            (S::Cancelled, MissingStatsReason::Cancelled),
            (
                S::RequestTooLarge { size: 10, max: 5 },
                MissingStatsReason::RequestTooLarge { size: 10, max: 5 },
            ),
            (
                S::ResponseTooLarge { size: 20, max: 8 },
                MissingStatsReason::ResponseTooLarge { size: 20, max: 8 },
            ),
            (
                S::NoScheduler {
                    reason: "no sock".into(),
                },
                MissingStatsReason::NoScheduler {
                    reason: "no sock".into(),
                },
            ),
            (
                S::SchedulerError {
                    errno: 13,
                    args: json!({"k": 1}),
                },
                MissingStatsReason::SchedulerError {
                    errno: 13,
                    args: json!({"k": 1}),
                },
            ),
            (
                S::MissingResp {
                    args: json!({"a": 2}),
                },
                MissingStatsReason::MissingResp {
                    args: json!({"a": 2}),
                },
            ),
        ];
        for (source, expected) in &mappings {
            assert_eq!(
                MissingStatsReason::from(source),
                *expected,
                "wrong mapping for {source:?}",
            );
        }
    }

    /// `From<&anyhow::Error>` downcasts a typed `SchedStatsError` found
    /// in the chain to its mapped variant; anything else falls back to
    /// `NoScheduler` carrying the rendered display (so serde/IO errors
    /// surfacing through the same `Result` still classify).
    #[test]
    fn missing_stats_reason_from_anyhow_downcasts_typed_else_no_scheduler() {
        use crate::vmm::sched_stats::SchedStatsError as S;
        let typed = anyhow::Error::new(S::DuringFreeze);
        assert_eq!(
            MissingStatsReason::from(&typed),
            MissingStatsReason::DuringFreeze,
            "a typed SchedStatsError in the anyhow chain must downcast",
        );

        let untyped = anyhow::anyhow!("plain io failure");
        let fallback = MissingStatsReason::from(&untyped);
        if let MissingStatsReason::NoScheduler { reason } = &fallback {
            assert!(reason.contains("plain io failure"));
        } else {
            panic!("expected NoScheduler fallback, got {fallback:?}");
        }
    }

    /// Every `MissingStatsReason` renders a non-empty operator message,
    /// and the value-bearing variants surface their payload.
    #[test]
    fn missing_stats_reason_display_covers_each_variant() {
        use serde_json::json;
        let every_variant = [
            MissingStatsReason::NoSchedulerBinary,
            MissingStatsReason::NoScheduler { reason: "r".into() },
            MissingStatsReason::DuringFreeze,
            MissingStatsReason::Cancelled,
            MissingStatsReason::SchedulerError {
                errno: 1,
                args: json!({}),
            },
            MissingStatsReason::MissingResp { args: json!({}) },
            MissingStatsReason::RequestTooLarge { size: 1, max: 2 },
            MissingStatsReason::ResponseTooLarge { size: 3, max: 4 },
            MissingStatsReason::MutexPoisoned,
        ];
        for reason in &every_variant {
            assert!(
                !reason.to_string().is_empty(),
                "Display must be non-empty for {reason:?}"
            );
        }

        let oversized = MissingStatsReason::RequestTooLarge { size: 10, max: 5 };
        assert!(oversized.to_string().contains("10"));

        let sched_failure = MissingStatsReason::SchedulerError {
            errno: 13,
            args: json!({}),
        };
        assert!(sched_failure.to_string().contains("13"));
    }
}