ktstr/scenario/sample/
monitor.rs

1//! Per-VM-run host monitor projection.
2//!
3//! The host-side monitor (see [`crate::monitor`]) aggregates sampling
4//! observations across the VM run and produces a [`MonitorReport`]
5//! exposing summary statistics and SCX event-counter deltas. This
6//! module wraps that report in borrowed views ([`MonitorView`] +
7//! [`ScxEventsView`]) returned from [`SampleSeries::monitor`].
8//!
9//! Orthogonal to [`super::host`]: monitor exposes the per-VM-run
10//! cross-CPU AGGREGATE; the host view exposes the per-SAMPLE per-CPU
11//! TIMELINE. The two draw from different fields on the captured
12//! reports (`MonitorReport.summary` here vs
13//! `FailureDumpReport::per_cpu_time` for the host view) and never
14//! overlap.
15
16use crate::monitor::{MonitorReport, MonitorSummary, ScxEventDeltas};
17
18use super::SampleSeries;
19
/// Borrowed view over a per-VM-run `MonitorReport`. Returned by
/// [`SampleSeries::monitor`]; provides typed access to the report's
/// summary statistics ([`Self::summary`]), the SCX event-counter
/// deltas ([`Self::scx_events`]), the raw per-tick samples
/// ([`Self::samples`]) and the stall-detection preemption threshold
/// ([`Self::preemption_threshold_ns`]).
///
/// Aggregates here refer to the monitoring window of THE SERIES
/// THIS VIEW WAS DRAWN FROM — not the entire test run, not
/// cumulative across series. A test that wants cross-series
/// aggregation must perform it explicitly.
///
/// The view holds a single shared reference, so it is `Copy`:
/// passing it by value never clones the underlying report.
#[derive(Debug, Clone, Copy)]
#[must_use = "MonitorView is a borrowed view; call .summary() or .scx_events() to project"]
#[non_exhaustive]
pub struct MonitorView<'a> {
    // The report stored on the originating `SampleSeries`; every
    // accessor hands out data borrowed for this same lifetime `'a`.
    report: &'a MonitorReport,
}
34
35impl<'a> MonitorView<'a> {
36    /// Aggregate summary statistics: imbalance ratio, nr_running
37    /// averages, local DSQ depth, stuck-CPU count, and
38    /// optional schedstat / prog-stats deltas. See
39    /// `MonitorSummary` for the full field set.
40    pub fn summary(&self) -> &'a MonitorSummary {
41        &self.report.summary
42    }
43
44    /// SCX event-counter accessor. Returns `None` when the monitor
45    /// ran but `event_deltas` were not computed (kernel without
46    /// event counters, monitoring window too short to compute
47    /// first/last deltas) — Option chain matches the source
48    /// `MonitorSummary::event_deltas: Option<ScxEventDeltas>` field.
49    /// Callers chain `if let Some(evt) = view.scx_events()` to
50    /// branch on availability without panicking.
51    pub fn scx_events(&self) -> Option<ScxEventsView<'a>> {
52        self.report
53            .summary
54            .event_deltas
55            .as_ref()
56            .map(|deltas| ScxEventsView { deltas })
57    }
58
59    /// Borrowed per-tick monitor samples. Each
60    /// `crate::monitor::MonitorSample` is one host-side
61    /// observation of the guest's per-CPU runqueue state
62    /// (`nr_running`, `local_dsq_depth`, `rq_clock`, optional
63    /// event counters). The monitor thread captures these on a
64    /// fixed cadence independent of the snapshot bridge's
65    /// freeze-rendezvous captures; samples carry their own
66    /// `elapsed_ms` timestamp for windowing.
67    ///
68    /// Empty when the monitor ran but produced no samples (very
69    /// short run, monitor thread exited early). The slot is
70    /// always present — `MonitorView` itself only exists when a
71    /// `MonitorReport` was attached at series construction.
72    ///
73    /// Live caller: [`crate::assert::build_phase_buckets`] windows
74    /// these samples per phase to compute metrics like
75    /// `avg_imbalance_ratio` that need per-CPU `rq.nr_running`
76    /// (full-class count), which the bridge-captured
77    /// [`crate::scenario::snapshot::Snapshot`] does NOT expose
78    /// (Snapshot carries only `scx_rq.nr_running`, the SCX-only
79    /// subset). The two data axes are complementary: Snapshot for
80    /// frozen BPF state at capture instants, MonitorSample for
81    /// per-tick observations across the whole window.
82    pub fn samples(&self) -> &'a [crate::monitor::MonitorSample] {
83        &self.report.samples
84    }
85
86    /// vCPU-preemption exemption window (ns) for stall detection,
87    /// derived from the guest kernel's `CONFIG_HZ` at run time. `0`
88    /// means "derive from a default" — callers folding per-phase
89    /// stall counts pass it through to
90    /// `crate::timeline::compute_metrics` so the per-phase predicate
91    /// matches the run-level `MonitorSummary::stuck_count` one.
92    pub fn preemption_threshold_ns(&self) -> u64 {
93        self.report.preemption_threshold_ns
94    }
95}
96
/// Default curated subset of [`ScxEventsView::total_pairs`] counter
/// names that signal genuine scheduler-class errors when non-zero.
/// Used to filter the full 14-entry total slice down to the entries
/// that callers conventionally bound at zero with
/// [`crate::assert::assert_scx_events_clean`].
///
/// Each entry is spelled exactly like the corresponding name
/// emitted by [`ScxEventsView::total_pairs`]; filtering is a plain
/// string comparison, so a misspelled entry would silently drop
/// that counter from the curated subset.
///
/// Membership is limited to the kernel-side `SCX_EV_*` counters
/// whose non-zero firing is exclusively pathological (skipped
/// enqueue paths, repeated re-enqueue cycles, owner-mismatched
/// inserts). The counters that legitimately fire on healthy
/// schedulers — the `bypass_*` family, `dispatch_keep_last`,
/// `refill_slice_dfl` — are deliberately excluded, and so are the
/// remaining `total_pairs` entries (`select_cpu_fallback`,
/// `select_cpu_fallback_max_burst`, `dispatch_local_dsq_offline`,
/// `sub_bypass_dispatch`). Different test scenarios may consider
/// different counters error-class; the projector exposes the full
/// slice via [`ScxEventsView::total_pairs`] so callers can override
/// this default by filtering on their own set.
pub const ERROR_CLASS_NAMES: &[&str] = &[
    "enq_skip_exiting",
    "enq_skip_migration_disabled",
    "reenq_immed",
    "reenq_local_repeat",
    "insert_not_owned",
];
120
/// Borrowed view over the `ScxEventDeltas` aggregated across the
/// monitor's first/last sample window. Returned by
/// [`MonitorView::scx_events`]; exposes the 14 i64 counter totals
/// via [`Self::total_pairs`] and the 2 f64 derived rates via
/// [`Self::rates_pairs`].
///
/// A value of this type only exists when the underlying summary
/// actually carries event deltas — absence is reported by
/// [`MonitorView::scx_events`] returning `None`, never by a view
/// full of zeroes.
#[derive(Debug, Clone, Copy)]
#[must_use = "ScxEventsView is a borrowed view; call .total_pairs() or .rates_pairs() to project"]
#[non_exhaustive]
pub struct ScxEventsView<'a> {
    // Deltas stored inside the report's summary; both projection
    // methods copy plain numeric fields out of this reference.
    deltas: &'a ScxEventDeltas,
}
132
133impl<'a> ScxEventsView<'a> {
134    /// All 14 i64 counter totals as `(name, value)` pairs in the
135    /// shape that feeds
136    /// [`crate::assert::assert_scx_events_clean`]. Order:
137    /// `select_cpu_fallback`, `select_cpu_fallback_max_burst`,
138    /// `dispatch_local_dsq_offline`, `dispatch_keep_last`,
139    /// `enq_skip_exiting`, `enq_skip_migration_disabled`,
140    /// `reenq_immed`, `reenq_local_repeat`, `refill_slice_dfl`,
141    /// `bypass_duration_ns`, `bypass_dispatch`, `bypass_activate`,
142    /// `insert_not_owned`, `sub_bypass_dispatch`.
143    ///
144    /// **STRICTNESS WARNING:** `assert_scx_events_clean(pairs,
145    /// None)` against the full 14-entry slice will spuriously
146    /// fail under normal scheduling load — several counters
147    /// (`bypass_*`, `dispatch_keep_last`, `refill_slice_dfl`)
148    /// legitimately fire on healthy schedulers. Callers either
149    /// curate the slice (`pairs.iter().filter(...).collect()`)
150    /// or pass `Some(bound)` for non-error-class events. The
151    /// projector deliberately does NOT bake "error class" judgment
152    /// in — different test scenarios consider different counters
153    /// error-class.
154    ///
155    /// Example — assert only error-class counters are zero by
156    /// curating the slice before the assertion:
157    ///
158    /// ```no_run
159    /// # use ktstr::scenario::sample::SampleSeries;
160    /// # use ktstr::scenario::sample::ERROR_CLASS_NAMES;
161    /// # use ktstr::assert::assert_scx_events_clean;
162    /// # fn example(series: &SampleSeries) {
163    /// if let Some(view) = series.monitor()
164    ///     && let Some(events) = view.scx_events()
165    /// {
166    ///     let pairs = events.total_pairs();
167    ///     let error_only: Vec<(&str, i64)> = pairs
168    ///         .into_iter()
169    ///         .filter(|(name, _)| ERROR_CLASS_NAMES.contains(name))
170    ///         .collect();
171    ///     assert!(assert_scx_events_clean(&error_only, None).is_pass());
172    /// }
173    /// # }
174    /// ```
175    pub fn total_pairs(&self) -> Vec<(&'static str, i64)> {
176        vec![
177            ("select_cpu_fallback", self.deltas.total_fallback),
178            (
179                "select_cpu_fallback_max_burst",
180                self.deltas.max_fallback_burst,
181            ),
182            (
183                "dispatch_local_dsq_offline",
184                self.deltas.total_dispatch_offline,
185            ),
186            ("dispatch_keep_last", self.deltas.total_dispatch_keep_last),
187            ("enq_skip_exiting", self.deltas.total_enq_skip_exiting),
188            (
189                "enq_skip_migration_disabled",
190                self.deltas.total_enq_skip_migration_disabled,
191            ),
192            ("reenq_immed", self.deltas.total_reenq_immed),
193            ("reenq_local_repeat", self.deltas.total_reenq_local_repeat),
194            ("refill_slice_dfl", self.deltas.total_refill_slice_dfl),
195            ("bypass_duration_ns", self.deltas.total_bypass_duration),
196            ("bypass_dispatch", self.deltas.total_bypass_dispatch),
197            ("bypass_activate", self.deltas.total_bypass_activate),
198            ("insert_not_owned", self.deltas.total_insert_not_owned),
199            ("sub_bypass_dispatch", self.deltas.total_sub_bypass_dispatch),
200        ]
201    }
202
203    /// Derived per-second rate fields as `(name, value)` pairs.
204    /// Separate from [`Self::total_pairs`] because rates have a
205    /// different semantic (rate-bounded asserts, not count-bounded)
206    /// and a different value type (f64 vs i64). Order:
207    /// `select_cpu_fallback_rate`, `dispatch_keep_last_rate`.
208    pub fn rates_pairs(&self) -> Vec<(&'static str, f64)> {
209        vec![
210            ("select_cpu_fallback_rate", self.deltas.fallback_rate),
211            ("dispatch_keep_last_rate", self.deltas.keep_last_rate),
212        ]
213    }
214}
215
216impl SampleSeries {
217    /// Borrowed view over the per-VM-run host monitor report
218    /// associated with this series. `None` when the monitor did
219    /// not run (host-only tests, early VM failure, or
220    /// [`Self::from_drained`] was called with `None` monitor).
221    ///
222    /// Monitor is per-series — aggregates inside the returned
223    /// [`MonitorView`] refer to THAT series' monitoring window
224    /// only; no cross-series merge is supported. A test that
225    /// constructs two `SampleSeries` from two VM runs gets two
226    /// independent monitors.
227    ///
228    /// The returned `MonitorView<'_>` borrows from this series,
229    /// so the series must outlive any projection chained off the
230    /// view (e.g. `series.monitor().map(|m|
231    /// m.scx_events()?.total_pairs())` — the whole chain is bound
232    /// by `series`'s lifetime).
233    pub fn monitor(&self) -> Option<MonitorView<'_>> {
234        self.monitor.as_ref().map(|m| MonitorView { report: m })
235    }
236}
237
#[cfg(test)]
mod tests {
    use super::*;

    /// `series.monitor()` returns `None` when no monitor was
    /// supplied (host-only tests, early VM failure, or
    /// `from_drained` was called with `None` monitor). Pins the
    /// Option chain — callers reaching for monitor metrics via
    /// `if let Some(view) = series.monitor()` must NOT panic and
    /// must NOT vacuously return default-empty data.
    #[test]
    fn series_monitor_none_when_unset() {
        let series = SampleSeries::from_drained(vec![], None);
        assert!(series.monitor().is_none());
    }

    /// `series.monitor()` returns `Some(view)` when monitor was
    /// supplied; the view wraps the supplied report and the inner
    /// `.summary()` accessor returns a reference to the report's
    /// summary unchanged. Pins the borrow-through-view shape.
    #[test]
    fn series_monitor_view_threads_through_supplied_report() {
        let mut report = MonitorReport::default();
        report.summary.total_samples = 42;
        report.summary.max_imbalance_ratio = 2.5;
        let series = SampleSeries::from_drained(vec![], Some(report));
        let view = series.monitor().expect("monitor must be Some");
        let summary = view.summary();
        assert_eq!(summary.total_samples, 42);
        assert_eq!(summary.max_imbalance_ratio, 2.5);
    }

    /// `view.scx_events()` returns `None` when `event_deltas` is
    /// `None` on the underlying summary (kernel without event
    /// counters, monitoring window too short). Inner-Option chain
    /// must NOT collapse to default-zero pairs — silently masking
    /// the missing-data condition would be a silent-loss path.
    #[test]
    fn series_monitor_scx_events_none_when_event_deltas_absent() {
        let report = MonitorReport::default(); // event_deltas defaults to None
        let series = SampleSeries::from_drained(vec![], Some(report));
        let view = series.monitor().expect("monitor must be Some");
        assert!(
            view.scx_events().is_none(),
            "scx_events must return None when event_deltas is absent — \
             returning Some with zero-default pairs would silently mask the missing-data condition"
        );
    }

    /// `view.scx_events()?.total_pairs()` enumerates all 14 i64
    /// counter fields in the documented order with the documented
    /// names, and `.rates_pairs()` enumerates the 2 f64 derived
    /// rates. Pins the projector's name-to-field mapping against
    /// drift — a regression that reordered fields, renamed a counter,
    /// or accidentally included a rate in total_pairs would fail
    /// here.
    #[test]
    fn series_monitor_scx_events_pairs_map_to_named_counters() {
        let mut report = MonitorReport::default();
        report.summary.event_deltas = Some(ScxEventDeltas {
            total_fallback: 1,
            fallback_rate: 0.5,
            max_fallback_burst: 2,
            total_dispatch_offline: 3,
            total_dispatch_keep_last: 4,
            keep_last_rate: 0.75,
            total_enq_skip_exiting: 5,
            total_enq_skip_migration_disabled: 6,
            total_reenq_immed: 7,
            total_reenq_local_repeat: 8,
            total_refill_slice_dfl: 9,
            total_bypass_duration: 10,
            total_bypass_dispatch: 11,
            total_bypass_activate: 12,
            total_insert_not_owned: 13,
            total_sub_bypass_dispatch: 14,
        });
        let series = SampleSeries::from_drained(vec![], Some(report));
        let view = series.monitor().expect("monitor must be Some");
        let events = view.scx_events().expect("event_deltas were set");
        let totals = events.total_pairs();
        assert_eq!(totals.len(), 14, "exactly 14 i64 counter pairs");
        assert_eq!(
            totals,
            vec![
                ("select_cpu_fallback", 1),
                ("select_cpu_fallback_max_burst", 2),
                ("dispatch_local_dsq_offline", 3),
                ("dispatch_keep_last", 4),
                ("enq_skip_exiting", 5),
                ("enq_skip_migration_disabled", 6),
                ("reenq_immed", 7),
                ("reenq_local_repeat", 8),
                ("refill_slice_dfl", 9),
                ("bypass_duration_ns", 10),
                ("bypass_dispatch", 11),
                ("bypass_activate", 12),
                ("insert_not_owned", 13),
                ("sub_bypass_dispatch", 14),
            ]
        );
        let rates = events.rates_pairs();
        assert_eq!(rates.len(), 2, "exactly 2 f64 rate pairs");
        assert_eq!(
            rates,
            vec![
                ("select_cpu_fallback_rate", 0.5),
                ("dispatch_keep_last_rate", 0.75),
            ]
        );
    }

    /// Every entry of [`ERROR_CLASS_NAMES`] must name exactly one
    /// counter emitted by [`ScxEventsView::total_pairs`]. The
    /// curated filter is a plain string comparison, so a typo or a
    /// renamed counter would otherwise silently shrink the curated
    /// subset and let an error-class counter escape the zero bound.
    #[test]
    fn error_class_names_resolve_to_total_pairs_counters() {
        let mut report = MonitorReport::default();
        report.summary.event_deltas = Some(ScxEventDeltas::default());
        let series = SampleSeries::from_drained(vec![], Some(report));
        let view = series.monitor().expect("monitor was set");
        let events = view.scx_events().expect("event_deltas were set");
        let pairs = events.total_pairs();
        for name in ERROR_CLASS_NAMES {
            let hits = pairs.iter().filter(|(n, _)| n == name).count();
            assert_eq!(
                hits, 1,
                "ERROR_CLASS_NAMES entry `{name}` must match exactly one total_pairs counter"
            );
        }
    }

    /// Pins the STRICTNESS WARNING contract on
    /// [`ScxEventsView::total_pairs`]: when a non-error-class
    /// counter (`total_bypass_dispatch` here) legitimately fires
    /// alongside an error-class counter at zero,
    /// `assert_scx_events_clean(pairs, None)` against the FULL
    /// 14-entry slice MUST FAIL (because bypass_dispatch > 0); the
    /// CURATED subset of error-class counters MUST PASS (because
    /// every error counter is zero). A future regression that
    /// silently dropped a counter from `total_pairs` or accidentally
    /// curated by the projector would break one of these two
    /// assertions.
    #[test]
    fn series_monitor_scx_events_strict_zero_misuse_pinning() {
        use crate::assert::assert_scx_events_clean;
        let mut report = MonitorReport::default();
        report.summary.event_deltas = Some(ScxEventDeltas {
            total_bypass_dispatch: 100,
            total_bypass_activate: 50,
            total_dispatch_keep_last: 7,
            ..Default::default()
        });
        let series = SampleSeries::from_drained(vec![], Some(report));
        let view = series.monitor().expect("monitor was set");
        let events = view.scx_events().expect("event_deltas were set");
        let pairs = events.total_pairs();
        // Full slice + strict zero: MUST fail because bypass_*
        // counters fired with legitimate non-zero values.
        let r_full = assert_scx_events_clean(&pairs, None);
        assert!(
            !r_full.is_pass(),
            "strict-zero against full 14-entry slice MUST fail when non-error-class counters legitimately fire — pins the STRICTNESS WARNING design contract"
        );
        // Curated error-class subset: MUST pass because every
        // error-class counter is zero (we only populated bypass_*
        // and dispatch_keep_last, neither of which is error class).
        let error_only: Vec<(&str, i64)> = pairs
            .into_iter()
            .filter(|(name, _)| ERROR_CLASS_NAMES.contains(name))
            .collect();
        // Guard against a vacuous pass: an empty (or shrunken)
        // curated slice would be trivially "clean" even if the
        // filter matched nothing.
        assert_eq!(
            error_only.len(),
            ERROR_CLASS_NAMES.len(),
            "curated subset must contain one pair per ERROR_CLASS_NAMES entry — a shorter slice means the filter silently dropped a counter"
        );
        let r_curated = assert_scx_events_clean(&error_only, None);
        assert!(
            r_curated.is_pass(),
            "curated error-class subset MUST pass when every error counter is zero — pins the curate-then-assert documented workaround"
        );
    }

    /// Counterpart to the strict-zero pinning test: when an
    /// error-class counter (`total_insert_not_owned` here) is
    /// non-zero, the curated subset MUST FAIL the strict-zero
    /// assertion. Without this, the curate-then-assert workaround
    /// is only ever exercised on its passing side.
    #[test]
    fn series_monitor_scx_events_curated_subset_flags_error_counter() {
        use crate::assert::assert_scx_events_clean;
        let mut report = MonitorReport::default();
        report.summary.event_deltas = Some(ScxEventDeltas {
            total_insert_not_owned: 1,
            ..Default::default()
        });
        let series = SampleSeries::from_drained(vec![], Some(report));
        let view = series.monitor().expect("monitor was set");
        let events = view.scx_events().expect("event_deltas were set");
        let error_only: Vec<(&str, i64)> = events
            .total_pairs()
            .into_iter()
            .filter(|(name, _)| ERROR_CLASS_NAMES.contains(name))
            .collect();
        let r_curated = assert_scx_events_clean(&error_only, None);
        assert!(
            !r_curated.is_pass(),
            "curated error-class subset MUST fail when an error-class counter is non-zero"
        );
    }
}