ktstr/workload/schbench/
mod.rs

//! schbench_rs — a faithful native re-expression of schbench in ktstr.
//! No binary, no subprocess: the schbench algorithm is
//! re-expressed in ktstr's own workload / scenario / metric primitives so
//! its numbers flow natively through the metric API (phases, assertions,
//! perf-delta).
//!
//! Modules: [`plat`] (schbench's bit-exact fio log2 histogram + percentiles),
//! [`percpu_lock`] (the per-CPU mutex stressor), [`handshake`] (the futex
//! message<->worker handshake), and [`run`] (the run engine: topology, lockless
//! wait-list, the wakeup + request latency loop, per-phase histogram snapshots,
//! and schedstat run-delay capture). [`run::run`] backs the
//! [`Schbench`](crate::workload::WorkType::Schbench) workload and the per-phase
//! metric path; [`run_standalone`] drives the same engine host-side, outside a
//! VM, for the side-by-side validation against the reference schbench. The
//! RPS-injector mode (`-R`) and its auto-RPS rate control (`-A`) are part of the
//! engine; the control thread also samples the per-second RPS distribution.
17
18pub(crate) mod handshake;
19pub(crate) mod percpu_lock;
20pub(crate) mod plat;
21pub(crate) mod run;
22
23/// User-facing config for the [`Schbench`](crate::workload::WorkType::Schbench) workload.
24pub use run::SchbenchConfig;
25/// Pipe-mode (`-p`) throughput reporting used by the `ktstr-schbench-validate`
26/// driver to mirror schbench's `avg worker transfer` line; clamps the transfer
27/// size + scales bytes/sec exactly like schbench (`schbench.c:1979-1982`). Not in
28/// the prelude (validation-tool surface, like [`StandaloneReport`]).
29pub use run::{PipeTransferReport, pipe_transfer_report};
30
31/// The five latency percentiles reported by [`StandaloneReport`] and the
32/// per-phase metric path, in column order: 20.0, 50.0, 90.0, 99.0, 99.9. Matches
33/// schbench's percentile rows (`schbench.c` `show_latencies`). Callers label the
34/// [`StandaloneReport`] percentile arrays by zipping with this slice rather than
35/// hard-coding an index-to-percentile mapping.
36pub const SCHBENCH_PERCENTILES: [f64; 5] = plat::PLIST;
37
/// Flat, copyable summary of one standalone (no-VM) schbench engine run, shaped
/// for the side-by-side comparison against the reference schbench.
///
/// Every `[_; 5]` array is indexed in [`SCHBENCH_PERCENTILES`] order (20.0,
/// 50.0, 90.0, 99.0, 99.9). Latency values are in microseconds. Each
/// distribution also carries its total sample count, so a run that recorded
/// nothing is distinguishable from a genuine all-zero distribution.
#[derive(Debug, Clone, Copy)]
pub struct StandaloneReport {
    // ---- wakeup latency -------------------------------------------------
    /// Wakeup latency (µs) at each [`SCHBENCH_PERCENTILES`] point.
    pub wakeup_pcts_us: [u32; 5],
    /// Differenced per-bucket sample count for each wakeup percentile row —
    /// schbench's per-row `(N samples)` — in [`SCHBENCH_PERCENTILES`] order.
    pub wakeup_counts: [u64; 5],
    /// Smallest wakeup latency observed (µs).
    pub wakeup_min_us: u32,
    /// Largest wakeup latency observed (µs).
    pub wakeup_max_us: u32,
    /// How many wakeup-latency samples fed the percentiles.
    pub nr_wakeup_samples: u64,

    // ---- request latency ------------------------------------------------
    /// Request latency (µs) at each [`SCHBENCH_PERCENTILES`] point.
    pub request_pcts_us: [u32; 5],
    /// Differenced per-bucket sample count for each request percentile row —
    /// schbench's per-row `(N samples)` — in [`SCHBENCH_PERCENTILES`] order.
    pub request_counts: [u64; 5],
    /// Smallest request latency observed (µs).
    pub request_min_us: u32,
    /// Largest request latency observed (µs).
    pub request_max_us: u32,
    /// How many request-latency samples fed the percentiles.
    pub nr_request_samples: u64,

    // ---- per-second RPS -------------------------------------------------
    /// Achieved requests/sec at each [`SCHBENCH_PERCENTILES`] point. This is
    /// schbench's `rps_stats` table, which the control thread samples once per
    /// second (`schbench.c:1777`). The values are a rate, not a latency, hence
    /// no `_us` suffix.
    pub rps_pcts: [u32; 5],
    /// Differenced per-bucket sample count for each RPS percentile row, in
    /// [`SCHBENCH_PERCENTILES`] order.
    pub rps_counts: [u64; 5],
    /// Smallest per-second RPS sample observed.
    pub rps_min: u32,
    /// Largest per-second RPS sample observed.
    pub rps_max: u32,
    /// How many per-second RPS samples fed the percentiles.
    pub nr_rps_samples: u64,

    // ---- whole-run scalars ----------------------------------------------
    /// TOTAL target rate at run exit (per-thread live rate * message_threads),
    /// i.e. schbench's `final rps goal` (`schbench.c:1995`). Matches the seeded
    /// total in fixed `-R`/default mode and only drifts from it under auto-RPS.
    pub final_rps_goal: usize,
    /// Completed work cycles per second measured over the TRUE elapsed run
    /// window (`loop_count / elapsed`). This is NOT schbench's `average rps`
    /// summary line: that one divides by the integer `-r` runtime, and
    /// `schbench_validate` prints it separately as `loop_count / runtime_secs`.
    pub achieved_rps: f64,
    /// Mean run-queue wait of the message threads (ns), schedstat
    /// mean-of-means.
    pub sched_delay_msg_ns: u64,
    /// Mean run-queue wait of the worker threads (ns), schedstat
    /// mean-of-means.
    pub sched_delay_worker_ns: u64,
    /// Work-loop iterations summed over all worker threads.
    pub loop_count: u64,
    /// Resolved total worker count (`message_threads * worker_threads`). Used
    /// as the divisor for the PER-WORKER pipe-mode `avg worker transfer` rate —
    /// see [`pipe_transfer_report`].
    pub nr_workers: usize,
}
102
103/// Run the schbench engine standalone — host-side, no VM, no phases — for
104/// `run_secs` seconds and project the whole-run result into a
105/// [`StandaloneReport`] for the side-by-side validation against the reference
106/// schbench.
107///
108/// The `run_secs` window mirrors schbench's `-r <secs>`: it is the benchmark's
109/// own defined runtime — the workload behavior, like the per-request think-sleep
110/// in `run` — not a harness poll or synchronization wait. The engine itself
111/// stays stop-gated and event-driven; this wrapper is the only place a
112/// wall-clock timer drives it, bounding the benchmark window the way `-r` does
113/// upstream.
114///
115/// Non-phasic: `phase_epoch` is `None`, so the engine produces a single
116/// whole-run aggregate — the reference schbench has no phases, and the
117/// comparison is whole-run to whole-run.
118pub fn run_standalone(config: &SchbenchConfig, run_secs: u64) -> StandaloneReport {
119    use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
120
121    let stop = AtomicBool::new(false);
122    let progress = AtomicU64::new(0);
123    let outcome = std::thread::scope(|s| {
124        let runner = s.spawn(|| run::run(config, &stop, &progress, None));
125        // The `-r` benchmark window: the workload's defined runtime, not a
126        // synchronization sleep. The engine runs until `stop` is set here.
127        std::thread::sleep(std::time::Duration::from_secs(run_secs));
128        stop.store(true, Ordering::Release);
129        runner.join().expect("schbench standalone runner panicked")
130    });
131
132    let w = &outcome.whole_run;
133    StandaloneReport {
134        wakeup_pcts_us: w.wakeup.values,
135        wakeup_counts: w.wakeup.counts,
136        wakeup_min_us: w.wakeup.min,
137        wakeup_max_us: w.wakeup.max,
138        nr_wakeup_samples: w.wakeup.nr_samples,
139        request_pcts_us: w.request.values,
140        request_counts: w.request.counts,
141        request_min_us: w.request.min,
142        request_max_us: w.request.max,
143        nr_request_samples: w.request.nr_samples,
144        rps_pcts: w.rps.values,
145        rps_counts: w.rps.counts,
146        rps_min: w.rps.min,
147        rps_max: w.rps.max,
148        nr_rps_samples: w.rps.nr_samples,
149        final_rps_goal: w.final_rps_goal,
150        achieved_rps: w.achieved_rps,
151        sched_delay_msg_ns: w.sched_delay_msg_ns,
152        sched_delay_worker_ns: w.sched_delay_worker_ns,
153        loop_count: w.loop_count,
154        nr_workers: w.nr_workers,
155    }
156}
157
#[cfg(test)]
mod tests {
    use super::*;

    /// True when each entry is `<=` the one after it.
    fn non_decreasing(pcts: &[u32; 5]) -> bool {
        pcts.windows(2).all(|pair| pair[0] <= pair[1])
    }

    /// End-to-end check of the pub entry point the `ktstr-schbench-validate`
    /// bin relies on: `run_standalone` actually runs the host engine for the
    /// benchmark window, and the projection populates every
    /// [`StandaloneReport`] field from the whole-run aggregate.
    ///
    /// The topology is minimal (1 message thread, 1 worker, no think-sleep)
    /// and the window is 2 seconds: the wakeup/request loop records latency
    /// samples, and the control thread's 1-second tick lands before stop so at
    /// least one per-second RPS sample exists (a 1s window would race that
    /// single tick).
    #[test]
    fn run_standalone_fills_report_from_a_real_run() {
        let config = SchbenchConfig::default()
            .message_threads(1)
            .worker_threads(1)
            .sleep_usec(0);
        let report = run_standalone(&config, 2);

        // Work was done and requests were paced across the window.
        assert!(report.loop_count > 0, "loop_count: {}", report.loop_count);
        assert!(
            report.achieved_rps > 0.0,
            "achieved_rps: {}",
            report.achieved_rps
        );

        // Neither latency distribution is empty.
        assert!(report.nr_wakeup_samples > 0, "wakeup samples");
        assert!(report.nr_request_samples > 0, "request samples");

        // The per-second RPS distribution was sampled by the control thread
        // (default mode samples every second; the 1s tick falls inside the 2s
        // window).
        assert!(
            report.nr_rps_samples > 0,
            "rps samples: {}",
            report.nr_rps_samples
        );

        // Percentile arrays arrive in order: a distribution never decreases
        // from p20 through p99.9 (a higher percentile sits at a higher
        // histogram bucket). A transposed or garbled values array trips this.
        assert!(
            non_decreasing(&report.wakeup_pcts_us),
            "wakeup percentiles monotonic: {:?}",
            report.wakeup_pcts_us
        );
        assert!(
            non_decreasing(&report.request_pcts_us),
            "request percentiles monotonic: {:?}",
            report.request_pcts_us
        );
        assert!(
            non_decreasing(&report.rps_pcts),
            "rps percentiles monotonic: {:?}",
            report.rps_pcts
        );

        // The per-row counts survived the projection (not zeroed). They are
        // schbench's DIFFERENCED per-band counts, so their sum is the
        // cumulative count at p99.9 -- strictly positive and at most the
        // total, not necessarily equal to it.
        let wc = report.wakeup_counts.iter().copied().sum::<u64>();
        assert!(
            (1..=report.nr_wakeup_samples).contains(&wc),
            "wakeup counts {wc} in (0, {}]",
            report.nr_wakeup_samples
        );
        let rc = report.request_counts.iter().copied().sum::<u64>();
        assert!(
            (1..=report.nr_request_samples).contains(&rc),
            "request counts {rc} in (0, {}]",
            report.nr_request_samples
        );
        let rpsc = report.rps_counts.iter().copied().sum::<u64>();
        assert!(
            (1..=report.nr_rps_samples).contains(&rpsc),
            "rps counts {rpsc} in (0, {}]",
            report.nr_rps_samples
        );

        // min/max were carried through un-swapped; `min <= max` is the only
        // invariant asserted. Deliberately NOT `min <= pcts[0]` or
        // `pcts[4] <= max`: min/max are exact sample values, whereas a tail
        // percentile is the log-bucket MIDPOINT (plat_idx_to_val for
        // idx >= 512). A >=512µs sample in the p99.9 bucket can therefore put
        // pcts[4] above the exact max (and min above pcts[0]) -- a real
        // value, not a bug, so a bracketing assertion would flake.
        assert!(report.wakeup_min_us <= report.wakeup_max_us);
        assert!(report.request_min_us <= report.request_max_us);
        assert!(report.rps_min <= report.rps_max);
    }
}