ktstr/scenario/sample/monitor.rs
1//! Per-VM-run host monitor projection.
2//!
3//! The host-side monitor (see [`crate::monitor`]) aggregates sampling
4//! observations across the VM run and produces a [`MonitorReport`]
5//! exposing summary statistics and SCX event-counter deltas. This
6//! module wraps that report in borrowed views ([`MonitorView`] +
7//! [`ScxEventsView`]) returned from [`SampleSeries::monitor`].
8//!
9//! Orthogonal to [`super::host`]: monitor exposes the per-VM-run
10//! cross-CPU AGGREGATE; the host view exposes the per-SAMPLE per-CPU
11//! TIMELINE. The two draw from different fields on the captured
12//! reports (`MonitorReport.summary` here vs
13//! `FailureDumpReport::per_cpu_time` for the host view) and never
14//! overlap.
15
16use crate::monitor::{MonitorReport, MonitorSummary, ScxEventDeltas};
17
18use super::SampleSeries;
19
20/// Borrowed view over a per-VM-run `MonitorReport`. Returned by
21/// [`SampleSeries::monitor`]; provides typed access to the report's
22/// summary statistics + the SCX event-counter deltas.
23///
24/// Aggregates here refer to the monitoring window of THE SERIES
25/// THIS VIEW WAS DRAWN FROM — not the entire test run, not
26/// cumulative across series. A test that wants cross-series
27/// aggregation must perform it explicitly.
28#[derive(Debug, Clone, Copy)]
29#[must_use = "MonitorView is a borrowed view; call .summary() or .scx_events() to project"]
30#[non_exhaustive]
31pub struct MonitorView<'a> {
32 report: &'a MonitorReport,
33}
34
35impl<'a> MonitorView<'a> {
36 /// Aggregate summary statistics: imbalance ratio, nr_running
37 /// averages, local DSQ depth, stuck-CPU count, and
38 /// optional schedstat / prog-stats deltas. See
39 /// `MonitorSummary` for the full field set.
40 pub fn summary(&self) -> &'a MonitorSummary {
41 &self.report.summary
42 }
43
44 /// SCX event-counter accessor. Returns `None` when the monitor
45 /// ran but `event_deltas` were not computed (kernel without
46 /// event counters, monitoring window too short to compute
47 /// first/last deltas) — Option chain matches the source
48 /// `MonitorSummary::event_deltas: Option<ScxEventDeltas>` field.
49 /// Callers chain `if let Some(evt) = view.scx_events()` to
50 /// branch on availability without panicking.
51 pub fn scx_events(&self) -> Option<ScxEventsView<'a>> {
52 self.report
53 .summary
54 .event_deltas
55 .as_ref()
56 .map(|deltas| ScxEventsView { deltas })
57 }
58
59 /// Borrowed per-tick monitor samples. Each
60 /// `crate::monitor::MonitorSample` is one host-side
61 /// observation of the guest's per-CPU runqueue state
62 /// (`nr_running`, `local_dsq_depth`, `rq_clock`, optional
63 /// event counters). The monitor thread captures these on a
64 /// fixed cadence independent of the snapshot bridge's
65 /// freeze-rendezvous captures; samples carry their own
66 /// `elapsed_ms` timestamp for windowing.
67 ///
68 /// Empty when the monitor ran but produced no samples (very
69 /// short run, monitor thread exited early). The slot is
70 /// always present — `MonitorView` itself only exists when a
71 /// `MonitorReport` was attached at series construction.
72 ///
73 /// Live caller: [`crate::assert::build_phase_buckets`] windows
74 /// these samples per phase to compute metrics like
75 /// `avg_imbalance_ratio` that need per-CPU `rq.nr_running`
76 /// (full-class count), which the bridge-captured
77 /// [`crate::scenario::snapshot::Snapshot`] does NOT expose
78 /// (Snapshot carries only `scx_rq.nr_running`, the SCX-only
79 /// subset). The two data axes are complementary: Snapshot for
80 /// frozen BPF state at capture instants, MonitorSample for
81 /// per-tick observations across the whole window.
82 pub fn samples(&self) -> &'a [crate::monitor::MonitorSample] {
83 &self.report.samples
84 }
85
86 /// vCPU-preemption exemption window (ns) for stall detection,
87 /// derived from the guest kernel's `CONFIG_HZ` at run time. `0`
88 /// means "derive from a default" — callers folding per-phase
89 /// stall counts pass it through to
90 /// `crate::timeline::compute_metrics` so the per-phase predicate
91 /// matches the run-level `MonitorSummary::stuck_count` one.
92 pub fn preemption_threshold_ns(&self) -> u64 {
93 self.report.preemption_threshold_ns
94 }
95}
96
/// Names of the [`ScxEventsView::total_pairs`] counters that, by
/// default, are treated as genuine scheduler-class errors whenever
/// they are non-zero.
///
/// Intended use: filter the full 14-entry totals slice down to
/// these names before handing it to
/// [`crate::assert::assert_scx_events_clean`], which callers
/// conventionally bound at zero.
///
/// What is in and what is out: the list holds the kernel-side
/// `SCX_EV_*` counters whose firing is exclusively pathological
/// (skipped enqueue paths, repeated re-enqueue cycles,
/// owner-mismatched inserts). The `bypass_*`,
/// `dispatch_keep_last` and `refill_slice_dfl` counters are left
/// out on purpose because they legitimately fire on healthy
/// schedulers.
///
/// This is only a default. Scenarios may disagree about which
/// counters are error-class; since
/// [`ScxEventsView::total_pairs`] exposes every counter, a caller
/// can filter on its own set instead of this one.
pub const ERROR_CLASS_NAMES: &[&str] = &[
    // Skipped enqueue paths.
    "enq_skip_exiting",
    "enq_skip_migration_disabled",
    // Repeated re-enqueue cycles.
    "reenq_immed",
    "reenq_local_repeat",
    // Owner-mismatched inserts.
    "insert_not_owned",
];
120
121/// Borrowed view over the `ScxEventDeltas` aggregated across the
122/// monitor's first/last sample window. Returned by
123/// [`MonitorView::scx_events`]; exposes the 14 i64 counter totals
124/// via [`Self::total_pairs`] and the 2 f64 derived rates via
125/// [`Self::rates_pairs`].
126#[derive(Debug, Clone, Copy)]
127#[must_use = "ScxEventsView is a borrowed view; call .total_pairs() or .rates_pairs() to project"]
128#[non_exhaustive]
129pub struct ScxEventsView<'a> {
130 deltas: &'a ScxEventDeltas,
131}
132
133impl<'a> ScxEventsView<'a> {
134 /// All 14 i64 counter totals as `(name, value)` pairs in the
135 /// shape that feeds
136 /// [`crate::assert::assert_scx_events_clean`]. Order:
137 /// `select_cpu_fallback`, `select_cpu_fallback_max_burst`,
138 /// `dispatch_local_dsq_offline`, `dispatch_keep_last`,
139 /// `enq_skip_exiting`, `enq_skip_migration_disabled`,
140 /// `reenq_immed`, `reenq_local_repeat`, `refill_slice_dfl`,
141 /// `bypass_duration_ns`, `bypass_dispatch`, `bypass_activate`,
142 /// `insert_not_owned`, `sub_bypass_dispatch`.
143 ///
144 /// **STRICTNESS WARNING:** `assert_scx_events_clean(pairs,
145 /// None)` against the full 14-entry slice will spuriously
146 /// fail under normal scheduling load — several counters
147 /// (`bypass_*`, `dispatch_keep_last`, `refill_slice_dfl`)
148 /// legitimately fire on healthy schedulers. Callers either
149 /// curate the slice (`pairs.iter().filter(...).collect()`)
150 /// or pass `Some(bound)` for non-error-class events. The
151 /// projector deliberately does NOT bake "error class" judgment
152 /// in — different test scenarios consider different counters
153 /// error-class.
154 ///
155 /// Example — assert only error-class counters are zero by
156 /// curating the slice before the assertion:
157 ///
158 /// ```no_run
159 /// # use ktstr::scenario::sample::SampleSeries;
160 /// # use ktstr::scenario::sample::ERROR_CLASS_NAMES;
161 /// # use ktstr::assert::assert_scx_events_clean;
162 /// # fn example(series: &SampleSeries) {
163 /// if let Some(view) = series.monitor()
164 /// && let Some(events) = view.scx_events()
165 /// {
166 /// let pairs = events.total_pairs();
167 /// let error_only: Vec<(&str, i64)> = pairs
168 /// .into_iter()
169 /// .filter(|(name, _)| ERROR_CLASS_NAMES.contains(name))
170 /// .collect();
171 /// assert!(assert_scx_events_clean(&error_only, None).is_pass());
172 /// }
173 /// # }
174 /// ```
175 pub fn total_pairs(&self) -> Vec<(&'static str, i64)> {
176 vec![
177 ("select_cpu_fallback", self.deltas.total_fallback),
178 (
179 "select_cpu_fallback_max_burst",
180 self.deltas.max_fallback_burst,
181 ),
182 (
183 "dispatch_local_dsq_offline",
184 self.deltas.total_dispatch_offline,
185 ),
186 ("dispatch_keep_last", self.deltas.total_dispatch_keep_last),
187 ("enq_skip_exiting", self.deltas.total_enq_skip_exiting),
188 (
189 "enq_skip_migration_disabled",
190 self.deltas.total_enq_skip_migration_disabled,
191 ),
192 ("reenq_immed", self.deltas.total_reenq_immed),
193 ("reenq_local_repeat", self.deltas.total_reenq_local_repeat),
194 ("refill_slice_dfl", self.deltas.total_refill_slice_dfl),
195 ("bypass_duration_ns", self.deltas.total_bypass_duration),
196 ("bypass_dispatch", self.deltas.total_bypass_dispatch),
197 ("bypass_activate", self.deltas.total_bypass_activate),
198 ("insert_not_owned", self.deltas.total_insert_not_owned),
199 ("sub_bypass_dispatch", self.deltas.total_sub_bypass_dispatch),
200 ]
201 }
202
203 /// Derived per-second rate fields as `(name, value)` pairs.
204 /// Separate from [`Self::total_pairs`] because rates have a
205 /// different semantic (rate-bounded asserts, not count-bounded)
206 /// and a different value type (f64 vs i64). Order:
207 /// `select_cpu_fallback_rate`, `dispatch_keep_last_rate`.
208 pub fn rates_pairs(&self) -> Vec<(&'static str, f64)> {
209 vec![
210 ("select_cpu_fallback_rate", self.deltas.fallback_rate),
211 ("dispatch_keep_last_rate", self.deltas.keep_last_rate),
212 ]
213 }
214}
215
216impl SampleSeries {
217 /// Borrowed view over the per-VM-run host monitor report
218 /// associated with this series. `None` when the monitor did
219 /// not run (host-only tests, early VM failure, or
220 /// [`Self::from_drained`] was called with `None` monitor).
221 ///
222 /// Monitor is per-series — aggregates inside the returned
223 /// [`MonitorView`] refer to THAT series' monitoring window
224 /// only; no cross-series merge is supported. A test that
225 /// constructs two `SampleSeries` from two VM runs gets two
226 /// independent monitors.
227 ///
228 /// The returned `MonitorView<'_>` borrows from this series,
229 /// so the series must outlive any projection chained off the
230 /// view (e.g. `series.monitor().map(|m|
231 /// m.scx_events()?.total_pairs())` — the whole chain is bound
232 /// by `series`'s lifetime).
233 pub fn monitor(&self) -> Option<MonitorView<'_>> {
234 self.monitor.as_ref().map(|m| MonitorView { report: m })
235 }
236}
237
#[cfg(test)]
mod tests {
    use super::*;

    /// With no monitor supplied (host-only tests, early VM
    /// failure, or `from_drained` given `None`), `series.monitor()`
    /// is `None`. Callers written as
    /// `if let Some(view) = series.monitor()` therefore neither
    /// panic nor receive vacuous default-empty data.
    #[test]
    fn series_monitor_none_when_unset() {
        let without_monitor = SampleSeries::from_drained(vec![], None);
        assert!(without_monitor.monitor().is_none());
    }

    /// With a monitor supplied, `series.monitor()` is `Some(view)`
    /// and `view.summary()` borrows the supplied report's summary
    /// unchanged. Guards the borrow-through-view shape.
    #[test]
    fn series_monitor_view_threads_through_supplied_report() {
        let mut supplied = MonitorReport::default();
        supplied.summary.total_samples = 42;
        supplied.summary.max_imbalance_ratio = 2.5;

        let series = SampleSeries::from_drained(vec![], Some(supplied));
        let view = series.monitor().expect("monitor must be Some");
        let observed = view.summary();

        assert_eq!(observed.total_samples, 42);
        assert_eq!(observed.max_imbalance_ratio, 2.5);
    }

    /// When the underlying summary has `event_deltas == None`
    /// (kernel without event counters, monitoring window too
    /// short), `view.scx_events()` must also be `None`. Collapsing
    /// the inner `Option` into default-zero pairs would hide the
    /// missing-data condition.
    #[test]
    fn series_monitor_scx_events_none_when_event_deltas_absent() {
        // A default report leaves `event_deltas` as `None`.
        let no_deltas = MonitorReport::default();
        let series = SampleSeries::from_drained(vec![], Some(no_deltas));
        let view = series.monitor().expect("monitor must be Some");
        let events = view.scx_events();
        assert!(
            events.is_none(),
            "scx_events must return None when event_deltas is absent — \
             returning Some with zero-default pairs would silently mask the missing-data condition"
        );
    }

    /// `total_pairs()` lists all 14 `i64` counters under the
    /// documented names in the documented order, and
    /// `rates_pairs()` lists the 2 `f64` derived rates. Every
    /// field gets a distinct value so a reordered field, a renamed
    /// counter, or a rate leaking into `total_pairs` shows up as a
    /// mismatch.
    #[test]
    fn series_monitor_scx_events_pairs_map_to_named_counters() {
        let deltas = ScxEventDeltas {
            // i64 totals, numbered in projection order.
            total_fallback: 1,
            max_fallback_burst: 2,
            total_dispatch_offline: 3,
            total_dispatch_keep_last: 4,
            total_enq_skip_exiting: 5,
            total_enq_skip_migration_disabled: 6,
            total_reenq_immed: 7,
            total_reenq_local_repeat: 8,
            total_refill_slice_dfl: 9,
            total_bypass_duration: 10,
            total_bypass_dispatch: 11,
            total_bypass_activate: 12,
            total_insert_not_owned: 13,
            total_sub_bypass_dispatch: 14,
            // f64 derived rates.
            fallback_rate: 0.5,
            keep_last_rate: 0.75,
        };
        let mut report = MonitorReport::default();
        report.summary.event_deltas = Some(deltas);

        let series = SampleSeries::from_drained(vec![], Some(report));
        let view = series.monitor().expect("monitor must be Some");
        let events = view.scx_events().expect("event_deltas were set");

        let expected_totals: Vec<(&str, i64)> = vec![
            ("select_cpu_fallback", 1),
            ("select_cpu_fallback_max_burst", 2),
            ("dispatch_local_dsq_offline", 3),
            ("dispatch_keep_last", 4),
            ("enq_skip_exiting", 5),
            ("enq_skip_migration_disabled", 6),
            ("reenq_immed", 7),
            ("reenq_local_repeat", 8),
            ("refill_slice_dfl", 9),
            ("bypass_duration_ns", 10),
            ("bypass_dispatch", 11),
            ("bypass_activate", 12),
            ("insert_not_owned", 13),
            ("sub_bypass_dispatch", 14),
        ];
        let totals = events.total_pairs();
        assert_eq!(totals.len(), 14, "exactly 14 i64 counter pairs");
        assert_eq!(totals, expected_totals);

        let expected_rates: Vec<(&str, f64)> = vec![
            ("select_cpu_fallback_rate", 0.5),
            ("dispatch_keep_last_rate", 0.75),
        ];
        let rates = events.rates_pairs();
        assert_eq!(rates.len(), 2, "exactly 2 f64 rate pairs");
        assert_eq!(rates, expected_rates);
    }

    /// Guards the STRICTNESS WARNING on
    /// [`ScxEventsView::total_pairs`]. Only non-error-class
    /// counters are made non-zero here, so:
    ///
    /// * `assert_scx_events_clean(pairs, None)` on the FULL
    ///   14-entry slice MUST FAIL (bypass counters are > 0);
    /// * the same call on the CURATED error-class subset MUST PASS
    ///   (every error counter is zero).
    ///
    /// A projector that silently dropped a counter, or started
    /// curating on its own, would break one of the two checks.
    #[test]
    fn series_monitor_scx_events_strict_zero_misuse_pinning() {
        use crate::assert::assert_scx_events_clean;

        let healthy_but_busy = ScxEventDeltas {
            total_dispatch_keep_last: 7,
            total_bypass_activate: 50,
            total_bypass_dispatch: 100,
            ..Default::default()
        };
        let mut report = MonitorReport::default();
        report.summary.event_deltas = Some(healthy_but_busy);

        let series = SampleSeries::from_drained(vec![], Some(report));
        let view = series.monitor().expect("monitor was set");
        let events = view.scx_events().expect("event_deltas were set");
        let pairs = events.total_pairs();

        // Strict zero over everything: the legitimately non-zero
        // bypass_* counters have to trip the assertion.
        let full_verdict = assert_scx_events_clean(&pairs, None);
        assert!(
            !full_verdict.is_pass(),
            "strict-zero against full 14-entry slice MUST fail when non-error-class counters legitimately fire — pins the STRICTNESS WARNING design contract"
        );

        // Strict zero over the curated subset: only bypass_* and
        // dispatch_keep_last were populated, none of which is
        // error class, so every remaining counter is zero.
        let error_only: Vec<(&str, i64)> = pairs
            .into_iter()
            .filter(|pair| ERROR_CLASS_NAMES.contains(&pair.0))
            .collect();
        let curated_verdict = assert_scx_events_clean(&error_only, None);
        assert!(
            curated_verdict.is_pass(),
            "curated error-class subset MUST pass when every error counter is zero — pins the curate-then-assert documented workaround"
        );
    }
}