ktstr/workload/schbench/mod.rs
1//! schbench_rs — a faithful native re-expression of schbench in ktstr.
2//! No binary, no subprocess: the schbench algorithm is
3//! re-expressed in ktstr's own workload / scenario / metric primitives so
4//! its numbers flow natively through the metric API (phases, assertions,
5//! perf-delta).
6//!
7//! Modules: [`plat`] (schbench's bit-exact fio log2 histogram + percentiles),
8//! [`percpu_lock`] (the per-CPU mutex stressor), [`handshake`] (the futex
9//! message<->worker handshake), and [`run`] (the run engine: topology, lockless
10//! wait-list, the wakeup + request latency loop, per-phase histogram snapshots,
11//! and schedstat run-delay capture). [`run::run`] backs the
12//! [`Schbench`](crate::workload::WorkType::Schbench) workload and the per-phase
13//! metric path; [`run_standalone`] drives the same engine host-side, outside a
14//! VM, for the side-by-side validation against the reference schbench. The
15//! RPS-injector mode (`-R`) and its auto-RPS rate control (`-A`) are part of the
16//! engine; the control thread also samples the per-second RPS distribution.
17
18pub(crate) mod handshake;
19pub(crate) mod percpu_lock;
20pub(crate) mod plat;
21pub(crate) mod run;
22
23/// User-facing config for the [`Schbench`](crate::workload::WorkType::Schbench) workload.
24pub use run::SchbenchConfig;
25/// Pipe-mode (`-p`) throughput reporting used by the `ktstr-schbench-validate`
26/// driver to mirror schbench's `avg worker transfer` line; clamps the transfer
27/// size + scales bytes/sec exactly like schbench (`schbench.c:1979-1982`). Not in
28/// the prelude (validation-tool surface, like [`StandaloneReport`]).
29pub use run::{PipeTransferReport, pipe_transfer_report};
30
/// The five percentiles reported by [`StandaloneReport`] and the per-phase
/// metric path, in column order: 20.0, 50.0, 90.0, 99.0, 99.9. Matches
/// schbench's percentile rows (`schbench.c` `show_latencies`).
///
/// Defined as `plat::PLIST`, so the public name and the histogram code share a
/// single list. Every percentile/count array in [`StandaloneReport`] — wakeup
/// latency, request latency, and per-second RPS alike — is indexed in this
/// order; callers label those arrays by zipping with this array rather than
/// hard-coding an index-to-percentile mapping.
pub const SCHBENCH_PERCENTILES: [f64; 5] = plat::PLIST;
37
/// Whole-run result of a standalone (no-VM) schbench engine run, projected for
/// the side-by-side comparison against the reference schbench.
///
/// Returned by [`run_standalone`]. Every `*_pcts*` / `*_counts` array has one
/// slot per entry of [`SCHBENCH_PERCENTILES`], in that order (20.0, 50.0, 90.0,
/// 99.0, 99.9). Latency values are in microseconds; RPS values are
/// requests/sec. The sample counts are carried so a zero-sample run is visible
/// rather than silently reported as an all-zero distribution.
///
/// The `*_min*` / `*_max*` fields are exact sample values, whereas a percentile
/// value is a histogram-bucket value (the log-bucket midpoint in the tail), so
/// a percentile may legitimately fall outside `min..=max`. The only invariant
/// to rely on between them is `min <= max`.
#[derive(Debug, Clone, Copy)]
pub struct StandaloneReport {
    /// Wakeup-latency percentiles (µs), in [`SCHBENCH_PERCENTILES`] order.
    pub wakeup_pcts_us: [u32; 5],
    /// Differenced per-bucket sample count at each percentile (schbench's
    /// per-row `(N samples)`), in [`SCHBENCH_PERCENTILES`] order. Being
    /// differenced, the entries sum to the cumulative count at p99.9, which is
    /// at most [`nr_wakeup_samples`](Self::nr_wakeup_samples).
    pub wakeup_counts: [u64; 5],
    /// Minimum observed wakeup latency (µs).
    pub wakeup_min_us: u32,
    /// Maximum observed wakeup latency (µs).
    pub wakeup_max_us: u32,
    /// Number of wakeup-latency samples folded into the percentiles.
    pub nr_wakeup_samples: u64,
    /// Request-latency percentiles (µs), in [`SCHBENCH_PERCENTILES`] order.
    pub request_pcts_us: [u32; 5],
    /// Differenced per-bucket sample count at each percentile (schbench's
    /// per-row `(N samples)`), in [`SCHBENCH_PERCENTILES`] order. Being
    /// differenced, the entries sum to the cumulative count at p99.9, which is
    /// at most [`nr_request_samples`](Self::nr_request_samples).
    pub request_counts: [u64; 5],
    /// Minimum observed request latency (µs).
    pub request_min_us: u32,
    /// Maximum observed request latency (µs).
    pub request_max_us: u32,
    /// Number of request-latency samples folded into the percentiles.
    pub nr_request_samples: u64,
    /// Per-second achieved-RPS percentiles (requests/sec), in
    /// [`SCHBENCH_PERCENTILES`] order — schbench's `rps_stats` table sampled once
    /// per second by the control thread (`schbench.c:1777`). Unitless rate, not
    /// µs, so no `_us` suffix.
    pub rps_pcts: [u32; 5],
    /// Differenced per-bucket sample count at each RPS percentile, in
    /// [`SCHBENCH_PERCENTILES`] order. Being differenced, the entries sum to
    /// the cumulative count at p99.9, which is at most
    /// [`nr_rps_samples`](Self::nr_rps_samples).
    pub rps_counts: [u64; 5],
    /// Minimum observed per-second RPS sample.
    pub rps_min: u32,
    /// Maximum observed per-second RPS sample.
    pub rps_max: u32,
    /// Number of per-second RPS samples folded into the percentiles.
    pub nr_rps_samples: u64,
    /// Auto-RPS final TOTAL target rate at run exit (per-thread live rate *
    /// message_threads), schbench's `final rps goal` (`schbench.c:1995`). Equal to
    /// the seeded total for fixed `-R`/default mode; diverges only under auto-RPS.
    pub final_rps_goal: usize,
    /// Completed work cycles per second over the TRUE elapsed run window
    /// (`loop_count / elapsed`). NOT schbench's `average rps` summary line, which
    /// divides by the integer `-r` runtime — `schbench_validate` prints that
    /// (`loop_count / runtime_secs`) separately; this field is the measured
    /// elapsed-window rate.
    pub achieved_rps: f64,
    /// Mean message-thread run-queue wait (ns), schedstat mean-of-means.
    pub sched_delay_msg_ns: u64,
    /// Mean worker-thread run-queue wait (ns), schedstat mean-of-means.
    pub sched_delay_worker_ns: u64,
    /// Total work-loop iterations across all worker threads.
    pub loop_count: u64,
    /// Resolved total worker count (`message_threads * worker_threads`). Divisor
    /// for the PER-WORKER pipe-mode `avg worker transfer` rate — see
    /// [`pipe_transfer_report`].
    pub nr_workers: usize,
}
102
103/// Run the schbench engine standalone — host-side, no VM, no phases — for
104/// `run_secs` seconds and project the whole-run result into a
105/// [`StandaloneReport`] for the side-by-side validation against the reference
106/// schbench.
107///
108/// The `run_secs` window mirrors schbench's `-r <secs>`: it is the benchmark's
109/// own defined runtime — the workload behavior, like the per-request think-sleep
110/// in `run` — not a harness poll or synchronization wait. The engine itself
111/// stays stop-gated and event-driven; this wrapper is the only place a
112/// wall-clock timer drives it, bounding the benchmark window the way `-r` does
113/// upstream.
114///
115/// Non-phasic: `phase_epoch` is `None`, so the engine produces a single
116/// whole-run aggregate — the reference schbench has no phases, and the
117/// comparison is whole-run to whole-run.
118pub fn run_standalone(config: &SchbenchConfig, run_secs: u64) -> StandaloneReport {
119 use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
120
121 let stop = AtomicBool::new(false);
122 let progress = AtomicU64::new(0);
123 let outcome = std::thread::scope(|s| {
124 let runner = s.spawn(|| run::run(config, &stop, &progress, None));
125 // The `-r` benchmark window: the workload's defined runtime, not a
126 // synchronization sleep. The engine runs until `stop` is set here.
127 std::thread::sleep(std::time::Duration::from_secs(run_secs));
128 stop.store(true, Ordering::Release);
129 runner.join().expect("schbench standalone runner panicked")
130 });
131
132 let w = &outcome.whole_run;
133 StandaloneReport {
134 wakeup_pcts_us: w.wakeup.values,
135 wakeup_counts: w.wakeup.counts,
136 wakeup_min_us: w.wakeup.min,
137 wakeup_max_us: w.wakeup.max,
138 nr_wakeup_samples: w.wakeup.nr_samples,
139 request_pcts_us: w.request.values,
140 request_counts: w.request.counts,
141 request_min_us: w.request.min,
142 request_max_us: w.request.max,
143 nr_request_samples: w.request.nr_samples,
144 rps_pcts: w.rps.values,
145 rps_counts: w.rps.counts,
146 rps_min: w.rps.min,
147 rps_max: w.rps.max,
148 nr_rps_samples: w.rps.nr_samples,
149 final_rps_goal: w.final_rps_goal,
150 achieved_rps: w.achieved_rps,
151 sched_delay_msg_ns: w.sched_delay_msg_ns,
152 sched_delay_worker_ns: w.sched_delay_worker_ns,
153 loop_count: w.loop_count,
154 nr_workers: w.nr_workers,
155 }
156}
157
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end check of the pub entry the `ktstr-schbench-validate` bin
    /// depends on: `run_standalone` runs the host engine for the benchmark
    /// window and the projection fills every [`StandaloneReport`] field from
    /// the whole-run aggregate.
    ///
    /// Uses the minimal topology (1 message thread, 1 worker, no think-sleep)
    /// over a 2-second window. The wakeup/request loop records latency samples,
    /// and the control thread fires at the 1-second tick, so at least one
    /// per-second RPS sample lands before stop (a 1s window would race the
    /// single tick).
    #[test]
    fn run_standalone_fills_report_from_a_real_run() {
        let config = SchbenchConfig::default()
            .message_threads(1)
            .worker_threads(1)
            .sleep_usec(0);
        let report = run_standalone(&config, 2);

        // Work was done and requests were paced across the window.
        assert!(report.loop_count > 0, "loop_count: {}", report.loop_count);
        assert!(
            report.achieved_rps > 0.0,
            "achieved_rps: {}",
            report.achieved_rps
        );

        // Each latency distribution picked up samples.
        assert!(report.nr_wakeup_samples > 0, "wakeup samples");
        assert!(report.nr_request_samples > 0, "request samples");

        // The per-second RPS distribution was sampled by the control thread
        // (default mode samples every second; the 1s tick falls inside the 2s
        // window).
        assert!(
            report.nr_rps_samples > 0,
            "rps samples: {}",
            report.nr_rps_samples
        );

        // Percentiles come out in order: across p20..p99.9 a distribution is
        // monotonic non-decreasing (a higher percentile sits at a higher
        // histogram bucket). This catches a transposed/garbled values array.
        assert!(
            report.wakeup_pcts_us.windows(2).all(|pair| pair[0] <= pair[1]),
            "wakeup percentiles monotonic: {:?}",
            report.wakeup_pcts_us
        );
        assert!(
            report.request_pcts_us.windows(2).all(|pair| pair[0] <= pair[1]),
            "request percentiles monotonic: {:?}",
            report.request_pcts_us
        );
        assert!(
            report.rps_pcts.windows(2).all(|pair| pair[0] <= pair[1]),
            "rps percentiles monotonic: {:?}",
            report.rps_pcts
        );

        // The per-row counts survive the projection (not zeroed). They are
        // schbench's DIFFERENCED per-band counts, so their sum is the
        // cumulative count at p99.9 -- positive and bounded above by the
        // total, not equal to it.
        let wc: u64 = report.wakeup_counts.iter().sum();
        assert!(
            (1..=report.nr_wakeup_samples).contains(&wc),
            "wakeup counts {wc} in (0, {}]",
            report.nr_wakeup_samples
        );
        let rc: u64 = report.request_counts.iter().sum();
        assert!(
            (1..=report.nr_request_samples).contains(&rc),
            "request counts {rc} in (0, {}]",
            report.nr_request_samples
        );
        let rpsc: u64 = report.rps_counts.iter().sum();
        assert!(
            (1..=report.nr_rps_samples).contains(&rpsc),
            "rps counts {rpsc} in (0, {}]",
            report.nr_rps_samples
        );

        // min/max are carried through un-swapped; min <= max is the invariant.
        // Deliberately NOT `min <= pcts[0]` / `pcts[4] <= max`: min/max are
        // exact sample values, while a percentile value is the log-bucket
        // MIDPOINT for tails (plat_idx_to_val for idx >= 512). A >=512µs sample
        // in the p99.9 bucket can therefore put pcts[4] above the exact max
        // (and min above pcts[0]) -- a real value, not a bug, so bracketing
        // would flake.
        assert!(report.wakeup_min_us <= report.wakeup_max_us);
        assert!(report.request_min_us <= report.request_max_us);
        assert!(report.rps_min <= report.rps_max);
    }
}
255}