ktstr/vmm/wire.rs
1//! Shared wire-format types for the host/guest virtio-console port-1
2//! TLV stream and the multiport control protocol.
3//!
4//! Both [`super::guest_comms`] (guest-only senders) and
5//! [`super::host_comms`] (host-only consumers) reference this module.
6//! Splitting the wire format out of the transport modules keeps the
7//! frame layout authoritative — a producer change here lands in both
8//! the guest writer and the host parser without a hand-sync step.
9//!
10//! # Postcard wire-format pin inventory (contributor guide)
11//!
12//! Every type that crosses the in-VM postcard TLV channel MUST be
13//! externally-tagged (postcard cannot decode `#[serde(untagged)]`
14//! or `#[serde(tag, content)]` enums — encode raises `WontImplement`
15//! at runtime and the host surfaces it as
16//! `ERR_NO_TEST_FUNCTION_OUTPUT`). The compile-time
17//! `#[derive(serde::Serialize, serde::Deserialize)]` itself does
18//! NOT catch the shape mismatch; the contract is verified by the
19//! per-type roundtrip pin tests listed below. A contributor adding
20//! a new postcard payload MUST add a roundtrip pin in the
21//! corresponding location and update this inventory.
22//!
23//! Inventory (type → test name → file):
24//! - `AssertResult` → `assert_result_postcard_roundtrip`
25//! → `src/assert/tests_serde.rs`
26//! - `KernAddrs` → `kern_addrs_roundtrip_all_present` (+ 4 sibling
27//! boundary pins) → `src/vmm/wire.rs` (this file)
28//! - `KernelOpRequestPayload` / `KernelOpReplyPayload`
29//! → 4 tests → `src/vmm/wire.rs` (this file)
30//! - `PayloadMetrics`
31//! → `payload_metrics_postcard_roundtrip`
32//! → `src/test_support/payload.rs`
33//! - `WorkloadConfig` → `payload_roundtrip`
34//! → `src/test_support/payload.rs`
35//! - `WorkerReport` → `worker_report_postcard_roundtrip` (+ 3 sibling
36//! pins covering `Vec<WorkerReport>` + all `ExitInfo` variants)
37//! → `src/workload/spawn/tests_integration.rs`
38//! - `PersistedCastAnalysis` → see `src/vmm/cast_analysis_load`
39//! module's tests
40//!
41//! # Frame layout
42//!
43//! Each guest→host bulk message is a 16-byte [`ShmMessage`] header
44//! followed by `length` payload bytes. The host's
45//! [`super::host_comms::parse_tlv_stream`] consumes this format. CRC32
46//! covers payload bytes only, not the header.
47//!
48//! ```text
49//! offset size field
50//! ------ ---- ----------------------------------------------
51//! 0 4 msg_type (u32 LE) — see [`MsgType`]
52//! 4 4 length (u32 LE) — payload bytes following
53//! 8 4 crc32 (u32 LE) — crc32fast over payload
54//! 12 4 _pad (u32 LE) — reserved, MUST be zero
55//! 16 N payload (N=length bytes)
56//! ```
57//!
58//! # Control protocol
59//!
60//! [`VirtioConsoleControl`] mirrors the kernel uapi `struct
61//! virtio_console_control` for multiport handshake messages on the
62//! c_ivq / c_ovq queues (8 bytes: id u32, event u16, value u16).
63//! [`ControlEvent`] enumerates the event discriminants the kernel and
64//! the host VMM exchange during port enumeration.
65//!
66//! Many of the typed wrappers and constants in this module are part
67//! of the public bulk API surface; the lib build does not yet read
68//! every variant from internal call sites (the typed `MsgType` enum,
69//! `ControlEvent`, `VirtioConsoleControl`, `NUM_PORTS`, `PORT1_NAME`,
70//! and the `from_wire` reverse mappings are reachable via the public
71//! crate path for downstream test code and wire-format tests). The
//! module-level `#[allow(dead_code)]` matches the `VmResult` field
//! pattern in `result.rs` — public surface that in-tree readers do
//! not exercise, which would otherwise trip the unused-item lints.
75
76#![allow(dead_code)]
77
78use zerocopy::{FromBytes, IntoBytes};
79
80// ---------------------------------------------------------------------------
81// MsgType — typed message-type discriminant
82// ---------------------------------------------------------------------------
83
/// Typed discriminant identifying each frame on the bulk TLV stream.
///
/// [`Self::wire_value`] yields the 32-bit on-wire value for a variant.
/// Every value is a 4-character ASCII tag packed so that the integer,
/// written in hex, spells the tag: `0x4558_4954` reads as `"EXIT"`
/// (`45`='E', `58`='X', `49`='I', `54`='T'). The wire format is
/// little-endian, so a raw byte-level hex dump of a captured frame
/// shows those bytes reversed — `54 49 58 45`, i.e. `"TIXE"`, for the
/// `Exit` tag. In short: the integer spells the tag, the wire bytes
/// spell it backwards.
///
/// On-wire values are stable across host/guest builds. Introducing a
/// variant means choosing a fresh ASCII tag and teaching
/// [`Self::from_wire`] to recognise it; an existing tag must never be
/// repurposed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MsgType {
    /// StepStart frame: the stimulus event the guest step executor
    /// emits at the START of each step.
    Stimulus,
    /// StepEnd frame: emitted by the guest step executor at the end of
    /// a step's hold, while that step's workers are still alive.
    ///
    /// The body is the same 24-byte [`StimulusPayload`], carrying the
    /// step's coincident end-of-hold (elapsed_ms, total_iterations) and
    /// the SAME 1-indexed `step_index` as the matching StepStart. The
    /// host pairs `StepStart[k]` -> `StepEnd[k]` to compute each step's
    /// OWN throughput (step-local iteration_rate). Unlike the cross-step
    /// `StepStart[k]` -> `StepStart[k+1]` delta, that pairing does not
    /// read ~0 for workers respawned per step.
    StepEnd,
    /// Marks the start of a scenario and sets a fresh watchdog
    /// deadline.
    ScenarioStart,
    /// Marks the end of a scenario. Payload: two 8-byte LE u64s —
    /// elapsed_ms followed by the final cumulative `total_iterations`
    /// (see [`SCENARIO_END_PAYLOAD_SIZE`] / [`parse_scenario_end`]).
    ScenarioEnd,
    /// Pauses the watchdog clock; wall time spent paused does not count
    /// against the workload budget.
    ScenarioPause,
    /// Resumes the watchdog clock after a pause. The deadline is
    /// extended by the pause duration, handing the paused time back.
    ScenarioResume,
    /// Guest exit code. Payload: 4-byte LE i32.
    Exit,
    /// Test result. Payload: postcard-encoded `AssertResult`.
    TestResult,
    /// Exit of the scheduler process. Payload: 4-byte LE i32 exit code.
    SchedExit,
    /// Guest crash diagnostic (UTF-8 panic message + backtrace).
    ///
    /// This tag is reserved and never travels on the bulk port. Panic
    /// diagnostics go straight to COM2 (`/dev/ttyS1`) instead:
    /// `virtio_console` TX can block on host backpressure, and blocking
    /// inside a fault handler would deadlock the guest before the
    /// diagnostic reached the host.
    Crash,
    /// Metrics for a single payload invocation. Payload:
    /// postcard-encoded `PayloadMetrics`.
    PayloadMetrics,
    /// Coverage profraw blob.
    Profraw,
    /// Guest→host chunk of stdout. Payload: opaque UTF-8 bytes.
    ///
    /// One frame carries one chunk read from the guest's stdout pipe;
    /// the host rebuilds the stream by concatenating chunks in arrival
    /// order. Supersedes the earlier COM2 stdout redirect.
    Stdout,
    /// Guest→host chunk of stderr. Payload: opaque UTF-8 bytes, chunked
    /// exactly like [`Self::Stdout`].
    Stderr,
    /// Guest→host chunk of the scheduler log. Payload: opaque UTF-8
    /// bytes captured from the scheduler child process.
    ///
    /// Supersedes the earlier COM2 SCHED_OUTPUT_START/END dump path.
    /// The host concatenates chunks in arrival order. The existing
    /// `SCHED_OUTPUT_START` / `SCHED_OUTPUT_END` delimiters and the
    /// embedded BPF verifier section are carried verbatim inside the
    /// chunk bytes.
    SchedLog,
    /// Guest→host lifecycle phase event.
    ///
    /// Payload: a 1-byte [`LifecyclePhase`] discriminant, optionally
    /// followed by a UTF-8 reason buffer (used for the suffix detail of
    /// `SchedulerNotAttached`). Supersedes the `KTSTR_INIT_STARTED` /
    /// `KTSTR_PAYLOAD_STARTING` / `SCHEDULER_DIED` /
    /// `SCHEDULER_NOT_ATTACHED` sentinel strings formerly written to
    /// COM2.
    Lifecycle,
    /// Guest→host exit of a shell-exec command. Payload: the 4-byte LE
    /// i32 exit code of `cargo ktstr shell --exec <cmd>`. Supersedes
    /// the COM2 `KTSTR_EXEC_EXIT=N` sentinel line.
    ExecExit,
    /// Guest→host dump of the kernel ring buffer. Payload: opaque UTF-8
    /// bytes from `rmesg::logs_raw`. Sent on the initramfs-extraction
    /// failure path so the host sees the kernel OOM messages without
    /// scraping COM2.
    Dmesg,
    /// Guest→host JSON output of the probe pipeline. Payload: opaque
    /// UTF-8 bytes from the probe output stream. Supersedes the COM2
    /// ProbeDrain path, so probe JSON no longer interleaves with
    /// sched-log dumps on the same serial port.
    ProbeOutput,
    /// Guest→host request for an on-demand snapshot (payload:
    /// [`SnapshotRequestPayload`]). The freeze coordinator's bulk-drain
    /// path intercepts the frame, runs the CAPTURE / WATCH dispatch,
    /// and answers with [`MsgType::SnapshotReply`] on port 1 RX.
    SnapshotRequest,
    /// Host→guest answer to a snapshot request (payload:
    /// [`SnapshotReplyPayload`]).
    ///
    /// Delivered on port 1 RX so the guest's blocking read on
    /// `/dev/vport0p1` wakes within microseconds. The payload carries
    /// the matching request_id, the status, and a UTF-8 reason buffer
    /// for the failure path.
    SnapshotReply,
    /// Guest→host request for a kernel-memory write/read op (payload:
    /// postcard-encoded [`KernelOpRequestPayload`]).
    ///
    /// Carries the `Op::WriteKernel{Hot,Cold}` /
    /// `Op::ReadKernel{Hot,Cold}` invocation issued by the guest's step
    /// executor. Because the payload is variable-length it rides this
    /// distinct MSG_TYPE rather than extending the fixed-72-byte
    /// [`SnapshotRequestPayload`].
    KernelOpRequest,
    /// Host→guest answer to [`MsgType::KernelOpRequest`] (payload:
    /// postcard-encoded [`KernelOpReplyPayload`]).
    KernelOpReply,
    /// Guest→host wprof trace in Perfetto format (payload: the raw
    /// `.pb` bytes produced by `wprof -T trace.pb`). The freeze
    /// coordinator writes the payload as `wprof.pb` next to the
    /// failure-dump JSON, so the operator picks it up together with the
    /// other per-test debugging artefacts.
    WprofTrace,
    /// Guest→host CHUNK of a wprof trace (payload: a
    /// ≤`MAX_BULK_FRAME_PAYLOAD` slice of the `.pb`).
    ///
    /// A trace larger than the single-frame bulk-port ceiling is split
    /// into ordered `WprofTraceChunk` frames and terminated by a final
    /// [`Self::WprofTrace`] frame holding the last slice. The host
    /// concatenates the chunk payloads in arrival order and appends the
    /// terminal `WprofTrace` payload to rebuild the `.pb`. A trace that
    /// fits in one frame ships as a lone `WprofTrace` with no chunks;
    /// reassembly is a no-op on that fast path.
    ///
    /// As with the `Stdout`/`Stderr`/`SchedLog` transports, a large
    /// blob is spread across frames and concatenated on the host. The
    /// DIFFERENCE is the distinct terminal frame type, used in place of
    /// uniform same-type frames. A truncated Perfetto `.pb` is useless:
    /// a partial protobuf still passes the leading-tag/size shape check
    /// yet decodes to garbage, whereas partial stdout remains useful.
    /// The terminal frame lets the host tell a complete trace (terminal
    /// present → write the `.pb`) from a transport that tore mid-ship
    /// (chunks but no terminal → write nothing, so the post_vm
    /// `.pb`-landed assert fails loudly rather than shipping a
    /// plausible-but-corrupt artifact). The text transports need no
    /// such marker: the in-band `SCHED_OUTPUT_START/END` content
    /// delimiters, not the framing, bound their payloads.
    WprofTraceChunk,
    /// Guest→host system-ready signal (payload: empty).
    ///
    /// The guest's `ktstr_guest_init` emits it once
    /// `mount_filesystems()` has completed, so when the host observes
    /// the frame the guest's `setup_per_cpu_areas` and KASLR
    /// randomization (both kernel-boot prerequisites) are already done.
    /// The freeze coordinator's bulk-drain dispatch promotes a
    /// CRC-valid SYS_RDY frame into the monitor's boot-complete
    /// eventfd; the monitor's pre-sample `epoll_wait` then returns
    /// within microseconds instead of waiting out the 5 s fallback
    /// timeout. This supersedes an earlier trigger that fired on the
    /// first port-0 TX byte (kernel printk via `/dev/hvc0`) and thus
    /// depended on incidental console traffic rather than an explicit
    /// readiness signal.
    SysRdy,
    /// Guest→host scheduler-swap notification (payload: empty).
    ///
    /// The guest's `kill_current_scheduler` (`Op::DetachScheduler` /
    /// `RestartScheduler` / `ReplaceScheduler`) emits it AFTER
    /// `wait_for_scx_disabled` returns. By the time the host observes
    /// the frame the kernel has therefore already NULLed `*scx_root`
    /// (`RCU_INIT_POINTER(scx_root, NULL)` precedes
    /// `scx_set_enable_state(SCX_DISABLED)` in kernel/sched/ext.c), so
    /// the prior scx_sched object is unlinked and its slab is subject
    /// to RCU-grace-period reuse.
    ///
    /// On a CRC-valid frame the freeze coordinator SYNCHRONOUSLY
    /// invalidates the periodic-capture accessor (mirroring the
    /// watchpoint poll's Detached teardown) instead of waiting up to
    /// one SCAN_INTERVAL for the poll to notice the rebind, which
    /// collapses the post-swap periodic-capture defer window.
    /// Coordinator-internal: carries no test verdict.
    SchedSwapNotify,
}
263
264impl MsgType {
265 /// 32-bit on-wire discriminant for this message type. The value is
266 /// the big-endian ASCII representation of a 4-character tag.
267 pub const fn wire_value(self) -> u32 {
268 match self {
269 MsgType::Stimulus => MSG_TYPE_STIMULUS,
270 MsgType::StepEnd => MSG_TYPE_STEP_END,
271 MsgType::ScenarioStart => MSG_TYPE_SCENARIO_START,
272 MsgType::ScenarioEnd => MSG_TYPE_SCENARIO_END,
273 MsgType::ScenarioPause => MSG_TYPE_SCENARIO_PAUSE,
274 MsgType::ScenarioResume => MSG_TYPE_SCENARIO_RESUME,
275 MsgType::Exit => MSG_TYPE_EXIT,
276 MsgType::TestResult => MSG_TYPE_TEST_RESULT,
277 MsgType::SchedExit => MSG_TYPE_SCHED_EXIT,
278 MsgType::Crash => MSG_TYPE_CRASH,
279 MsgType::PayloadMetrics => MSG_TYPE_PAYLOAD_METRICS,
280 MsgType::Profraw => MSG_TYPE_PROFRAW,
281 MsgType::WprofTrace => MSG_TYPE_WPROF_TRACE,
282 MsgType::WprofTraceChunk => MSG_TYPE_WPROF_TRACE_CHUNK,
283 MsgType::SnapshotRequest => MSG_TYPE_SNAPSHOT_REQUEST,
284 MsgType::SnapshotReply => MSG_TYPE_SNAPSHOT_REPLY,
285 MsgType::KernelOpRequest => MSG_TYPE_KERNEL_OP_REQUEST,
286 MsgType::KernelOpReply => MSG_TYPE_KERNEL_OP_REPLY,
287 MsgType::SysRdy => MSG_TYPE_SYS_RDY,
288 MsgType::SchedSwapNotify => MSG_TYPE_SCHED_SWAP_NOTIFY,
289 MsgType::Stdout => MSG_TYPE_STDOUT,
290 MsgType::Stderr => MSG_TYPE_STDERR,
291 MsgType::SchedLog => MSG_TYPE_SCHED_LOG,
292 MsgType::Lifecycle => MSG_TYPE_LIFECYCLE,
293 MsgType::ExecExit => MSG_TYPE_EXEC_EXIT,
294 MsgType::Dmesg => MSG_TYPE_DMESG,
295 MsgType::ProbeOutput => MSG_TYPE_PROBE_OUTPUT,
296 }
297 }
298
299 /// Reverse the wire mapping. Returns `None` when `value` is not a
300 /// recognised discriminant — callers can either skip the frame or
301 /// surface the unknown tag for diagnostics.
302 pub const fn from_wire(value: u32) -> Option<Self> {
303 match value {
304 MSG_TYPE_STIMULUS => Some(MsgType::Stimulus),
305 MSG_TYPE_STEP_END => Some(MsgType::StepEnd),
306 MSG_TYPE_SCENARIO_START => Some(MsgType::ScenarioStart),
307 MSG_TYPE_SCENARIO_END => Some(MsgType::ScenarioEnd),
308 MSG_TYPE_SCENARIO_PAUSE => Some(MsgType::ScenarioPause),
309 MSG_TYPE_SCENARIO_RESUME => Some(MsgType::ScenarioResume),
310 MSG_TYPE_EXIT => Some(MsgType::Exit),
311 MSG_TYPE_TEST_RESULT => Some(MsgType::TestResult),
312 MSG_TYPE_SCHED_EXIT => Some(MsgType::SchedExit),
313 MSG_TYPE_CRASH => Some(MsgType::Crash),
314 MSG_TYPE_PAYLOAD_METRICS => Some(MsgType::PayloadMetrics),
315 MSG_TYPE_PROFRAW => Some(MsgType::Profraw),
316 MSG_TYPE_WPROF_TRACE => Some(MsgType::WprofTrace),
317 MSG_TYPE_WPROF_TRACE_CHUNK => Some(MsgType::WprofTraceChunk),
318 MSG_TYPE_SNAPSHOT_REQUEST => Some(MsgType::SnapshotRequest),
319 MSG_TYPE_SNAPSHOT_REPLY => Some(MsgType::SnapshotReply),
320 MSG_TYPE_KERNEL_OP_REQUEST => Some(MsgType::KernelOpRequest),
321 MSG_TYPE_KERNEL_OP_REPLY => Some(MsgType::KernelOpReply),
322 MSG_TYPE_SYS_RDY => Some(MsgType::SysRdy),
323 MSG_TYPE_SCHED_SWAP_NOTIFY => Some(MsgType::SchedSwapNotify),
324 MSG_TYPE_STDOUT => Some(MsgType::Stdout),
325 MSG_TYPE_STDERR => Some(MsgType::Stderr),
326 MSG_TYPE_SCHED_LOG => Some(MsgType::SchedLog),
327 MSG_TYPE_LIFECYCLE => Some(MsgType::Lifecycle),
328 MSG_TYPE_EXEC_EXIT => Some(MsgType::ExecExit),
329 MSG_TYPE_DMESG => Some(MsgType::Dmesg),
330 MSG_TYPE_PROBE_OUTPUT => Some(MsgType::ProbeOutput),
331 _ => None,
332 }
333 }
334
335 /// `true` for control frames the freeze coordinator interprets
336 /// internally and that must NOT surface as test verdict entries
337 /// in [`super::host_comms::BulkDrainResult`]. Both the
338 /// coordinator's mid-run `bulk_messages_for_closure` filter and
339 /// `collect_results`'s post-run drain key on this single
340 /// classifier so the gate stays in lockstep — adding a new
341 /// internal control frame is a one-line update here.
342 ///
343 /// The current internal set:
344 /// - [`MsgType::SnapshotRequest`] — has its matching
345 /// [`MsgType::SnapshotReply`] delivered over port-1 RX; the
346 /// request itself carries no test verdict.
347 /// - [`MsgType::SnapshotReply`] — host→guest only on port-1 RX.
348 /// A guest TX frame stamped with this tag is illegitimate
349 /// (only the host coordinator emits replies); drop it instead
350 /// of bucketing it as a phantom verdict entry. Including the
351 /// tag in the internal set keeps the dispatch and the
352 /// `collect_results` post-run drain in lockstep — both filter
353 /// the same way.
354 /// - [`MsgType::KernelOpRequest`] — paired with its
355 /// [`MsgType::KernelOpReply`] over port-1 RX (the cold-path
356 /// kernel-op roundtrip); the request carries no test verdict.
357 /// - [`MsgType::KernelOpReply`] — host→guest only on port-1 RX,
358 /// same illegitimate-guest-TX reasoning as `SnapshotReply`.
359 /// - [`MsgType::SysRdy`] — its only semantic is the eventfd
360 /// promotion that releases the monitor's pre-sample
361 /// `epoll_wait`.
362 /// - [`MsgType::SchedSwapNotify`] — its only semantic is the
363 /// synchronous periodic-capture accessor teardown the freeze
364 /// coordinator performs on a CRC-valid frame; carries no test
365 /// verdict.
366 pub const fn is_coordinator_internal(self) -> bool {
367 matches!(
368 self,
369 MsgType::SnapshotRequest
370 | MsgType::SnapshotReply
371 | MsgType::KernelOpRequest
372 | MsgType::KernelOpReply
373 | MsgType::SysRdy
374 | MsgType::SchedSwapNotify
375 )
376 }
377}
378
/// Phase identifier stored in the 1-byte header of a
/// [`MsgType::Lifecycle`] payload. It takes over from the
/// `KTSTR_INIT_STARTED` / `KTSTR_PAYLOAD_STARTING` /
/// `SCHEDULER_DIED` / `SCHEDULER_NOT_ATTACHED` sentinels previously
/// written to COM2.
///
/// Only `SchedulerNotAttached` may carry a UTF-8 reason suffix (the
/// bytes that follow the 1-byte phase header in the TLV payload);
/// the suffix is empty for every other variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LifecyclePhase {
    /// Init has started: devtmpfs is mounted and the initramfs is
    /// verified. Corresponds to the legacy `KTSTR_INIT_STARTED`
    /// sentinel.
    InitStarted,
    /// The payload is about to start: guest dispatch is on the point
    /// of invoking the `#[ktstr_test]` body. Corresponds to the legacy
    /// `KTSTR_PAYLOAD_STARTING` sentinel.
    PayloadStarting,
    /// The scheduler process exited during startup. Corresponds to the
    /// legacy `SCHEDULER_DIED` sentinel.
    SchedulerDied,
    /// The scheduler stayed alive yet never attached to sched_ext (BPF
    /// verifier reject, ops mismatch, sysfs absent). Corresponds to
    /// the legacy `SCHEDULER_NOT_ATTACHED:<reason>` sentinel; the
    /// reason suffix occupies the bytes after the 1-byte phase header.
    SchedulerNotAttached,
    /// The injected verifier workload was dispatched: after attach, at
    /// least one worker of the `--ktstr-verifier-workload` run made
    /// forward progress on-CPU (a positive, scheduler-agnostic
    /// dispatch proof).
    ///
    /// `ktstr_guest_init` Phase 5 emits it only when a
    /// verifier-workload run recorded a worker with non-zero
    /// `iterations` under a confirmed SCHED_EXT policy, so a
    /// fair-class fallback cannot false-confirm. Given a
    /// `PayloadStarting` frame, the ABSENCE of this frame means the
    /// scheduler attached (sched_ext `enabled`) but never dispatched
    /// the workload — a distinct, worse failure than never attaching.
    /// The suffix is empty, and there is no legacy COM2 sentinel
    /// equivalent.
    WorkloadDispatched,
}

impl LifecyclePhase {
    /// Returns the 1-byte on-wire discriminant.
    ///
    /// `0` is deliberately unassigned: it is reserved as the
    /// "unknown / invalid" sentinel, which host parsers reject rather
    /// than silently mapping to a known phase.
    pub const fn wire_value(self) -> u8 {
        match self {
            Self::InitStarted => 1,
            Self::PayloadStarting => 2,
            Self::SchedulerDied => 3,
            Self::SchedulerNotAttached => 4,
            Self::WorkloadDispatched => 5,
        }
    }

    /// Inverse of [`Self::wire_value`].
    ///
    /// Yields `None` for `0` (the reserved sentinel) and for any value
    /// without a corresponding variant; host parsers skip and log such
    /// unknown phases rather than panicking.
    pub const fn from_wire(value: u8) -> Option<Self> {
        Some(match value {
            1 => Self::InitStarted,
            2 => Self::PayloadStarting,
            3 => Self::SchedulerDied,
            4 => Self::SchedulerNotAttached,
            5 => Self::WorkloadDispatched,
            // 0 (reserved) and every unassigned value.
            _ => return None,
        })
    }
}
447
448// ---------------------------------------------------------------------------
449// On-wire u32 discriminants
450// ---------------------------------------------------------------------------
451//
452// Kept as `pub const` for callers that compare a parsed frame's
453// `msg_type` field directly (e.g. the freeze coordinator's stream
454// filter). [`MsgType::wire_value`] is the typed entry point; the
455// constants are the same values exposed for raw-byte comparisons.
456
/// StepStart frame: stimulus event from the guest step executor.
pub const MSG_TYPE_STIMULUS: u32 = u32::from_be_bytes(*b"STIM"); // 0x5354_494D

/// StepEnd frame from the guest step executor. The body reuses
/// [`StimulusPayload`]; see [`MsgType::StepEnd`].
pub const MSG_TYPE_STEP_END: u32 = u32::from_be_bytes(*b"STEN"); // 0x5354_454E

/// Marks the start of a scenario.
pub const MSG_TYPE_SCENARIO_START: u32 = u32::from_be_bytes(*b"SCST"); // 0x5343_5354

/// Pauses the watchdog clock.
pub const MSG_TYPE_SCENARIO_PAUSE: u32 = u32::from_be_bytes(*b"SCPP"); // 0x5343_5050

/// Resumes the watchdog clock after a pause.
pub const MSG_TYPE_SCENARIO_RESUME: u32 = u32::from_be_bytes(*b"SCRR"); // 0x5343_5252

/// Marks the end of a scenario.
pub const MSG_TYPE_SCENARIO_END: u32 = u32::from_be_bytes(*b"SCEN"); // 0x5343_454E

/// Guest exit code (payload: 4-byte i32).
pub const MSG_TYPE_EXIT: u32 = u32::from_be_bytes(*b"EXIT"); // 0x4558_4954

/// Test result (payload: postcard-encoded AssertResult).
pub const MSG_TYPE_TEST_RESULT: u32 = u32::from_be_bytes(*b"TEST"); // 0x5445_5354

/// Exit of the scheduler process (payload: 4-byte i32 exit code).
pub const MSG_TYPE_SCHED_EXIT: u32 = u32::from_be_bytes(*b"SCDX"); // 0x5343_4458

/// Guest crash diagnostic (payload: UTF-8 panic + backtrace).
pub const MSG_TYPE_CRASH: u32 = u32::from_be_bytes(*b"CRSH"); // 0x4352_5348

/// Metrics for a single payload invocation
/// (payload: postcard-encoded `crate::test_support::PayloadMetrics`).
pub const MSG_TYPE_PAYLOAD_METRICS: u32 = u32::from_be_bytes(*b"PMET"); // 0x504d_4554

/// Coverage profraw blob (payload: the raw `.profraw` bytes serialized
/// by `__llvm_profile_write_buffer`).
pub const MSG_TYPE_PROFRAW: u32 = u32::from_be_bytes(*b"PRAW"); // 0x5052_4157

/// wprof trace in Perfetto format (payload: the raw `.pb` bytes that
/// `/bin/wprof -T trace.pb` produces during auto-repro). The host's
/// freeze coordinator writes the payload to a sibling of the
/// failure-dump file, so the operator finds it under
/// [`crate::test_support::sidecar_dir`] next to the JSON dump.
pub const MSG_TYPE_WPROF_TRACE: u32 = u32::from_be_bytes(*b"WPRF"); // 0x5750_5246

/// Guest→host CHUNK of a wprof trace: one ordered slice of a `.pb` too
/// large for a single bulk frame. The chunk stream ends with a
/// [`MSG_TYPE_WPROF_TRACE`] frame that carries the final slice. See
/// [`MsgType::WprofTraceChunk`].
pub const MSG_TYPE_WPROF_TRACE_CHUNK: u32 = u32::from_be_bytes(*b"WPRC"); // 0x5750_5243

/// Guest→host request for an on-demand snapshot
/// (payload: [`SnapshotRequestPayload`]).
pub const MSG_TYPE_SNAPSHOT_REQUEST: u32 = u32::from_be_bytes(*b"SNRQ"); // 0x534e_5251

/// Host→guest reply to an on-demand snapshot request
/// (payload: [`SnapshotReplyPayload`]).
pub const MSG_TYPE_SNAPSHOT_REPLY: u32 = u32::from_be_bytes(*b"SNRP"); // 0x534e_5250

/// Guest→host system-ready signal (payload: empty).
///
/// The hex digits spell `"SRDY"`; the on-wire (LE) bytes are
/// `0x59 0x44 0x52 0x53`, i.e. `"YDRS"` read byte-by-byte. The freeze
/// coordinator's bulk-drain dispatch promotes a CRC-valid
/// `MSG_TYPE_SYS_RDY` frame into the monitor's boot-complete eventfd.
/// The protocol contract is described on [`MsgType::SysRdy`].
pub const MSG_TYPE_SYS_RDY: u32 = u32::from_be_bytes(*b"SRDY"); // 0x5352_4459

/// Guest→host scheduler-swap notification (payload: empty).
///
/// The hex digits spell `"SCSW"` (SCheduler SWap); the on-wire (LE)
/// bytes are `0x57 0x53 0x43 0x53`, i.e. `"WSCS"` read byte-by-byte.
/// The guest's `kill_current_scheduler` emits the frame after
/// `wait_for_scx_disabled` returns (so `*scx_root` is already NULL),
/// and the freeze coordinator synchronously invalidates the
/// periodic-capture accessor on a CRC-valid frame. The protocol
/// contract is described on [`MsgType::SchedSwapNotify`].
pub const MSG_TYPE_SCHED_SWAP_NOTIFY: u32 = u32::from_be_bytes(*b"SCSW"); // 0x5343_5357

/// Guest→host chunk of stdout (payload: opaque UTF-8 bytes).
///
/// Supersedes the earlier COM2 stdout redirect: the guest dups fd 1
/// onto the write-end of an internal pipe, and a forwarder thread cuts
/// the pipe's read-end into TLV frames no larger than
/// [`super::bulk::MAX_BULK_FRAME_PAYLOAD`].
pub const MSG_TYPE_STDOUT: u32 = u32::from_be_bytes(*b"SOUT"); // 0x534f_5554

/// Guest→host chunk of stderr (payload: opaque UTF-8 bytes).
///
/// The chunked redirect works as for [`MSG_TYPE_STDOUT`], applied to
/// fd 2.
pub const MSG_TYPE_STDERR: u32 = u32::from_be_bytes(*b"SERR"); // 0x5345_5252

/// Guest→host chunk of the scheduler log (payload: opaque UTF-8
/// bytes).
///
/// Supersedes the earlier COM2 SCHED_OUTPUT_START/END dump in
/// `dump_sched_output`. The host concatenates chunks in arrival order;
/// the embedded `SCHED_OUTPUT_START` / `SCHED_OUTPUT_END` markers and
/// the BPF verifier section travel verbatim inside the chunk bytes.
pub const MSG_TYPE_SCHED_LOG: u32 = u32::from_be_bytes(*b"SCLG"); // 0x5343_4c47

/// Guest→host lifecycle phase event.
///
/// Payload layout: a 1-byte [`LifecyclePhase`] discriminant, then an
/// optional UTF-8 reason buffer (the suffix detail of
/// `SchedulerNotAttached`; empty for every other phase). Supersedes
/// the COM2 `KTSTR_INIT_STARTED` / `KTSTR_PAYLOAD_STARTING` /
/// `SCHEDULER_DIED` / `SCHEDULER_NOT_ATTACHED` sentinel strings.
pub const MSG_TYPE_LIFECYCLE: u32 = u32::from_be_bytes(*b"LIFE"); // 0x4c49_4645

/// Guest→host kernel address parameters (payload: 24 bytes LE).
///
/// The frame is sent BEFORE `MSG_TYPE_SYS_RDY`, so the monitor holds
/// `phys_base` and `page_offset_base` ahead of its first sample
/// iteration. The 24 payload bytes are produced by
/// [`KernAddrs::to_payload`]:
/// `[phys_base + 1 : u64 LE, page_offset_base : u64 LE, kernel_text_runtime_kva + 1 : u64 LE]`
/// The guest reads the values from `/proc/iomem` and `/proc/kallsyms`
/// after `mount_filesystems`. By then `__startup_64`,
/// `kernel_randomize_memory`, `cpu_init → syscall_init`, and the
/// post-relocation kallsyms table population have all run, which makes
/// the values final regardless of KASLR configuration.
pub const MSG_TYPE_KERN_ADDRS: u32 = u32::from_be_bytes(*b"KADR"); // 0x4b41_4452
582
/// Decoded form of a [`MSG_TYPE_KERN_ADDRS`] payload.
///
/// The guest publishes three u64 slots at boot. With them the host can
/// translate kernel virtual addresses without walking guest page
/// tables, and can recover the virt-KASLR slide without a separate
/// in-VMM derivation.
///
/// On the wire the `phys_base` and `kernel_text_runtime_kva` slots are
/// biased by 1, which keeps an encoded 0 free to mean "not yet
/// received / could not derive". The `page_offset_base` slot is
/// unbiased; a 0 there sends the host to its page-table-walk fallback
/// (see the field documentation).
///
/// NOTE(review): an earlier revision of this comment said the guest
/// always sends 0 in the `page_offset_base` slot, whereas the field
/// documentation describes a value read from `/proc/kallsyms` —
/// confirm which is current.
///
/// Constructors:
/// - [`Self::new`]: takes the bare fields and applies no sentinel
///   logic. The guest side uses it from
///   [`crate::vmm::guest_comms::send_kern_addrs`].
/// - [`Self::from_payload`]: validates the length of a 24-byte
///   payload, decodes it, and strips the +1 bias from the biased
///   slots. The host dispatch arm in
///   `crate::vmm::freeze_coord::dispatch::dispatch_bulk_message` uses
///   it.
///
/// Field semantics:
/// - `phys_base = 0` is a legitimate KASLR-off value: it is encoded
///   biased as `1` and the decoder strips it back to `0`.
///   [`Self::has_phys_present_bit`] reports whether the guest sent a
///   non-zero biased phys_base, i.e. whether the payload carries
///   phys_base data at all.
/// - `kernel_text_runtime_kva` is an `Option<u64>` so that the decoder
///   can tell "guest could not read kallsyms" (`None`) apart from
///   "guest read kallsyms and KASLR is off" (`Some(link_kva)`). The
///   bias-by-1 encoding covers the former (biased 0 → `None`); any
///   non-zero biased value decodes to `Some(raw)`.
#[derive(Clone, Copy, Debug)]
pub struct KernAddrs {
    /// `phys_base` as derived by the guest (the KASLR-physical slide).
    /// It is 0 when KASLR-physical is off (`__startup_64` left
    /// `phys_base = 0`). Comparing it with the host's expected
    /// load-address recovers the physical KASLR offset.
    pub phys_base: u64,
    /// Symbol KVA of the guest's `page_offset_base` global — NOT the
    /// runtime value the symbol points at. Once `phys_base` is
    /// resolved the host dereferences it via
    /// `monitor::symbols::text_kva_to_pa_with_base` + `read_u64`.
    ///
    /// The guest obtains it from `/proc/kallsyms` through
    /// `crate::vmm::guest_comms::read_kernel_page_offset_base_from_kallsyms`
    /// (called from `vmm::rust_init::init`). Storage class:
    /// `.data..ro_after_init` per `arch/x86/kernel/head64.c:63` — the
    /// global is written during `kernel_randomize_memory()` in
    /// `start_kernel` and frozen after `mark_rodata_ro`.
    ///
    /// A value of `0` means one of:
    /// (a) arm64, which has no `page_offset_base` global
    ///     (`PAGE_OFFSET` is compile-time per
    ///     `arch/arm64/include/asm/memory.h:43-45`);
    /// (b) CONFIG_RANDOMIZE_MEMORY=n (symbol absent);
    /// (c) kallsyms unreadable (kptr_restrict elevated).
    /// In every 0 case the host falls back to
    /// `resolve_page_offset_with_tcr` (the page-table walk).
    pub page_offset_base: u64,
    /// Runtime KVA of `_text` (the kernel image start symbol) taken
    /// from the guest's `/proc/kallsyms`, when that was readable.
    ///
    /// The host computes `virt_kaslr = runtime - link_text_kva`, where
    /// the link-time KVA is extracted from vmlinux at coordinator
    /// init. `None` means the guest could not read kallsyms
    /// (kptr_restrict masked, /proc not mountable, symbol absent).
    pub kernel_text_runtime_kva: Option<u64>,
}
648
649impl KernAddrs {
650 /// Wire-format byte length. Exact-match check on the receive
651 /// side so a future payload extension trips a decoder
652 /// rejection rather than silently dropping the new bytes.
653 pub const WIRE_LEN: usize = 24;
654
655 /// Construct from bare field values. Caller owns the
656 /// "did I read kallsyms?" decision via the `Option` on
657 /// `kernel_text_runtime_kva`.
658 pub fn new(
659 phys_base: u64,
660 page_offset_base: u64,
661 kernel_text_runtime_kva: Option<u64>,
662 ) -> Self {
663 Self {
664 phys_base,
665 page_offset_base,
666 kernel_text_runtime_kva,
667 }
668 }
669
670 /// Encode to a 24-byte LE payload with `+1` bias on the
671 /// biased slots. Caller transmits this on the wire. Takes
672 /// `self` by value since [`Self`] is `Copy` and the encoder
673 /// reads each field at most once.
674 pub fn to_payload(self) -> [u8; Self::WIRE_LEN] {
675 let mut buf = [0u8; Self::WIRE_LEN];
676 buf[..8].copy_from_slice(&(self.phys_base.wrapping_add(1)).to_le_bytes());
677 buf[8..16].copy_from_slice(&self.page_offset_base.to_le_bytes());
678 // bias 0 → encodes as 0 (sentinel: guest could not derive)
679 let runtime_biased = match self.kernel_text_runtime_kva {
680 Some(kva) => kva.wrapping_add(1),
681 None => 0,
682 };
683 buf[16..24].copy_from_slice(&runtime_biased.to_le_bytes());
684 buf
685 }
686
687 /// Decode from a wire payload. Returns `None` on length
688 /// mismatch (exact match required — short payloads never
689 /// publish either slot to avoid a partial-init race; longer
690 /// payloads indicate a protocol extension the decoder
691 /// doesn't understand).
692 pub fn from_payload(payload: &[u8]) -> Option<Self> {
693 if payload.len() != Self::WIRE_LEN {
694 return None;
695 }
696 let phys_biased = u64::from_le_bytes(payload[..8].try_into().ok()?);
697 let page_offset_base = u64::from_le_bytes(payload[8..16].try_into().ok()?);
698 let runtime_biased = u64::from_le_bytes(payload[16..24].try_into().ok()?);
699 Some(Self {
700 // biased 0 means "guest didn't send" — but the
701 // unbiased phys_base = 0 is legitimate (KASLR off).
702 // `has_phys_present_bit` distinguishes the two on the
703 // host side.
704 phys_base: phys_biased.wrapping_sub(1),
705 page_offset_base,
706 kernel_text_runtime_kva: if runtime_biased == 0 {
707 None
708 } else {
709 Some(runtime_biased.wrapping_sub(1))
710 },
711 })
712 }
713
714 /// True iff the encoded payload had a non-zero biased
715 /// `phys_base` slot (i.e. the guest sent phys_base data).
716 /// Distinguishes "guest sent phys_base = 0" (KASLR off, valid)
717 /// from "guest didn't send phys_base at all" (truncated wire
718 /// path, treat as absent). Computed from the post-decode
719 /// `phys_base` field: encoded `phys_biased = phys_base + 1`
720 /// is non-zero iff `phys_base != u64::MAX`. Wrap-around case
721 /// (`phys_base = u64::MAX` encodes to biased 0) is impossible
722 /// in practice — kernel `phys_base` is a low physical address,
723 /// never the all-ones sentinel.
724 pub fn has_phys_present_bit(&self) -> bool {
725 self.phys_base != u64::MAX
726 }
727}
728
/// Guest→host exit code of a shell-exec command (payload: 4-byte LE
/// i32).
///
/// Supersedes the earlier COM2 `KTSTR_EXEC_EXIT=N` sentinel line
/// emitted by `cargo ktstr shell --exec <cmd>`.
pub const MSG_TYPE_EXEC_EXIT: u32 = u32::from_be_bytes(*b"EXCX"); // 0x4558_4358

/// Guest→host dump of the kernel ring buffer (payload: opaque UTF-8
/// bytes).
///
/// Emitted on the initramfs-extraction failure path so the host can
/// see the kernel OOM messages without scraping COM2.
pub const MSG_TYPE_DMESG: u32 = u32::from_be_bytes(*b"DMSG"); // 0x444d_5347

/// Guest→host JSON output of the probe pipeline (payload: opaque
/// UTF-8 bytes).
///
/// Supersedes the earlier COM2 ProbeDrain path so that probe output
/// and scheduler-log dumps no longer interleave on one serial port.
pub const MSG_TYPE_PROBE_OUTPUT: u32 = u32::from_be_bytes(*b"PROB"); // 0x5052_4f42

/// Guest→host request for a kernel-memory write/read op (payload:
/// postcard-encoded [`KernelOpRequestPayload`]).
///
/// Transports an
/// [`Op::WriteKernelHot`](crate::scenario::ops::Op::WriteKernelHot)
/// / [`Op::WriteKernelCold`](crate::scenario::ops::Op::WriteKernelCold)
/// / [`Op::ReadKernelHot`](crate::scenario::ops::Op::ReadKernelHot)
/// / [`Op::ReadKernelCold`](crate::scenario::ops::Op::ReadKernelCold)
/// request from the guest's step executor to the host coordinator.
/// The payload is variable-length (target + value bytes do not fit
/// in the 72-byte [`SnapshotRequestPayload`]), which is why it rides
/// its own MSG_TYPE_* with a postcard-encoded body instead of
/// extending the fixed-size snapshot envelope.
pub const MSG_TYPE_KERNEL_OP_REQUEST: u32 = u32::from_be_bytes(*b"KORQ"); // 0x4b4f_5251

/// Host→guest answer to a [`MSG_TYPE_KERNEL_OP_REQUEST`] (payload:
/// postcard-encoded [`KernelOpReplyPayload`]). It echoes the request
/// id the guest stamped and carries the status + reason + (for
/// reads) the value bytes the host coordinator read.
pub const MSG_TYPE_KERNEL_OP_REPLY: u32 = u32::from_be_bytes(*b"KORP"); // 0x4b4f_5250
767
768// ---------------------------------------------------------------------------
769// ShmMessage — TLV header
770// ---------------------------------------------------------------------------
771
/// 16-byte TLV header preceding each payload on the wire.
///
/// Used as the framing header for the bulk virtio-console port-1
/// channel; the type name `ShmMessage` is retained from the
/// predecessor SHM ring transport (now removed in favour of the
/// virtio-console port). CRC32 covers payload bytes only (not the
/// header).
///
/// SAFETY: `repr(C)` with four `u32` fields produces a 16-byte struct
/// with no padding (every field is 4-aligned). `_pad` is reserved for
/// future schema use; current writers MUST set it to 0 and current
/// readers ignore it. zerocopy derives produce no panics — every bit
/// pattern is valid for `u32`.
#[repr(C)]
#[derive(
    Clone, Copy, Default, Debug, FromBytes, IntoBytes, zerocopy::Immutable, zerocopy::KnownLayout,
)]
pub struct ShmMessage {
    /// Message type tag (the "T" of the TLV); the `MSG_TYPE_*`
    /// constants in this module are the values carried here.
    pub msg_type: u32,
    /// Length field (the "L" of the TLV). Presumably the payload
    /// size in bytes, excluding this header — confirm against the
    /// frame reader/writer.
    pub length: u32,
    /// CRC32 of the payload bytes only; the header is not covered.
    pub crc32: u32,
    /// Reserved for future schema use. Writers MUST set it to 0;
    /// current readers ignore it.
    pub _pad: u32,
}

// Compile-time guard: the header layout must stay exactly 16 bytes.
const _SHM_MESSAGE_SIZE: () = assert!(std::mem::size_of::<ShmMessage>() == 16);

/// Size in bytes of the on-wire [`ShmMessage`] header.
pub const FRAME_HEADER_SIZE: usize = std::mem::size_of::<ShmMessage>();
800
801// ---------------------------------------------------------------------------
802// ShmEntry — parsed TLV entry
803// ---------------------------------------------------------------------------
804
/// A single parsed message extracted from the bulk byte stream.
///
/// `crc_ok` is `true` when the recomputed payload CRC matched the
/// guest's stored value. CRC mismatches do not stop the walk — the
/// parser yields the entry with `crc_ok=false` and continues with the
/// next frame. Downstream consumers may filter on `crc_ok` to drop
/// corrupted entries.
#[derive(Debug, Clone)]
pub struct ShmEntry {
    /// Message type tag copied from the frame's [`ShmMessage`] header.
    pub msg_type: u32,
    /// Owned copy of the frame's payload bytes (header not included).
    pub payload: Vec<u8>,
    /// `true` when the recomputed payload CRC matched the on-wire CRC.
    pub crc_ok: bool,
}
819
820// ---------------------------------------------------------------------------
821// Stimulus payload — guest step executor → host
822// ---------------------------------------------------------------------------
823
/// Payload for stimulus events written by the guest step executor.
///
/// Compact 24-byte struct describing the state after each step's ops
/// are applied. The host correlates these with monitor samples to map
/// scheduler telemetry to scenario phases.
///
/// The struct derives `IntoBytes` but not `FromBytes`, so it is
/// encode-only here; [`StimulusEvent::from_payload`] reads the same
/// fields back by byte offset. With `repr(C)` and naturally aligned
/// fields the offsets are: `elapsed_ms` 0, `step_index` 4,
/// `op_count` 6, `op_kinds` 8, `cgroup_count` 12, `worker_count` 14,
/// `total_iterations` 16.
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, IntoBytes, zerocopy::Immutable, zerocopy::KnownLayout)]
pub struct StimulusPayload {
    /// Milliseconds since scenario start.
    pub elapsed_ms: u32,
    /// Index of the step that was just applied.
    pub step_index: u16,
    /// Number of ops applied in this step.
    pub op_count: u16,
    /// Bitmask of Op variant discriminants present in this step.
    pub op_kinds: u32,
    /// Number of live cgroups after this step: sum of step-local
    /// cgroups (from the current Step's `CgroupDef`s + `Op`s) and
    /// Backdrop-owned cgroups that persist across every Step.
    pub cgroup_count: u16,
    /// Total worker handles after this step: sum of step-local
    /// workers and Backdrop-spawned workers that persist across
    /// every Step.
    pub worker_count: u16,
    /// Sum of all workers' iteration counts at this step boundary.
    /// Read from shared MAP_SHARED counters in the step executor.
    pub total_iterations: u64,
}

// Compile-time guard: the decoder's fixed byte offsets rely on this
// exact 24-byte size.
const _STIMULUS_SIZE: () = assert!(std::mem::size_of::<StimulusPayload>() == 24);
854
855/// Deserialized stimulus event.
856#[derive(Debug, Clone)]
857pub struct StimulusEvent {
858 pub elapsed_ms: u32,
859 pub step_index: u16,
860 pub op_count: u16,
861 pub op_kinds: u32,
862 pub cgroup_count: u16,
863 pub worker_count: u16,
864 pub total_iterations: u64,
865}
866
867impl StimulusEvent {
868 /// Deserialize from raw payload bytes. Requires EXACTLY a
869 /// [`StimulusPayload`]-sized (24-byte) buffer — a shorter buffer would
870 /// truncate and a longer one would carry trailing bytes the guest
871 /// never frames (`send_stimulus`/`send_step_end` always write exactly
872 /// 24 bytes), so both are rejected (matching [`KernAddrs::from_payload`]'s
873 /// exact-length gate). A torn or hostile oversized frame is dropped
874 /// rather than promoted by reading a 24-byte prefix.
875 pub fn from_payload(data: &[u8]) -> Option<Self> {
876 if data.len() != std::mem::size_of::<StimulusPayload>() {
877 return None;
878 }
879 Some(StimulusEvent {
880 elapsed_ms: u32::from_ne_bytes(data[0..4].try_into().ok()?),
881 step_index: u16::from_ne_bytes(data[4..6].try_into().ok()?),
882 op_count: u16::from_ne_bytes(data[6..8].try_into().ok()?),
883 op_kinds: u32::from_ne_bytes(data[8..12].try_into().ok()?),
884 cgroup_count: u16::from_ne_bytes(data[12..14].try_into().ok()?),
885 worker_count: u16::from_ne_bytes(data[14..16].try_into().ok()?),
886 total_iterations: u64::from_ne_bytes(data[16..24].try_into().ok()?),
887 })
888 }
889}
890
/// Byte size of the [`MsgType::ScenarioEnd`] payload: two
/// little-endian `u64`s — the scenario-relative elapsed milliseconds
/// followed by the final cumulative worker iteration count.
pub const SCENARIO_END_PAYLOAD_SIZE: usize = 16;

/// Decode the [`MsgType::ScenarioEnd`] payload produced by
/// [`crate::vmm::guest_comms::send_scenario_end`].
///
/// The layout is `elapsed_ms` (LE `u64`, scenario-relative) followed
/// by `total_iterations` (LE `u64`, the cumulative worker iteration
/// count summed across every live handle at the LAST step's end).
/// The iteration count is the right boundary the final step's
/// `iteration_rate` delta needs — the host folds it into a synthetic
/// terminal [`crate::timeline::StimulusEvent`] (see
/// [`crate::timeline::StimulusEvent::terminal`]).
///
/// Returns `Some((elapsed_ms, total_iterations))`, or `None` for a
/// short/torn payload so a CRC-bad or truncated frame is skipped
/// rather than misread. Only the first
/// [`SCENARIO_END_PAYLOAD_SIZE`] bytes are read; any bytes beyond
/// them are ignored.
pub fn parse_scenario_end(payload: &[u8]) -> Option<(u64, u64)> {
    // `get` yields `None` when fewer than the required bytes exist.
    let body = payload.get(..SCENARIO_END_PAYLOAD_SIZE)?;
    let (elapsed_raw, iterations_raw) = body.split_at(8);

    let mut word = [0u8; 8];
    word.copy_from_slice(elapsed_raw);
    let elapsed_ms = u64::from_le_bytes(word);
    word.copy_from_slice(iterations_raw);
    let total_iterations = u64::from_le_bytes(word);

    Some((elapsed_ms, total_iterations))
}
915
916// ---------------------------------------------------------------------------
917// Snapshot request/reply TLV payloads
918// ---------------------------------------------------------------------------
919
/// Maximum length, in bytes, of a snapshot tag (capture name or
/// watchpoint symbol path) carried inside the
/// [`SnapshotRequestPayload`]; it is also the size of that payload's
/// `tag` array. Tags longer than this bound are truncated by the
/// guest before publishing; the host treats the first NUL as the
/// boundary, or stops at this size if no NUL is present.
pub const SNAPSHOT_TAG_MAX: usize = 64;

/// Maximum length, in bytes, of a host-supplied reason string carried
/// inside the [`SnapshotReplyPayload`]; it is also the size of that
/// payload's `reason` array. Same semantics as the tag buffer
/// (NUL-terminated when shorter, truncated when longer). Sized
/// to hold typed-Err diagnostics that name the failing condition
/// (e.g. `kaslr_offset == 0`, `kern_virt_kaslr` Arc state) PLUS the
/// failing symbol + KVA PLUS the actionable remediation tip (e.g.
/// `set #[ktstr_test(kaslr = false)]`). The longest such diagnostic
/// today — Fix C's high-half/zero-offset rejection at
/// `crate::vmm::freeze_coord::snapshot::arm_user_watchpoint` — is
/// ~343 bytes when rendered with a typical symbol + KVA; 512 gives
/// ~170 bytes of headroom for future diagnostics. The original
/// 64-byte buffer and an intermediate 256-byte size both truncated
/// this message before the remediation tail.
pub const SNAPSHOT_REASON_MAX: usize = 512;
942
/// Snapshot request kind: no request pending. Used as the sentinel
/// value for an uninitialised request slot (this discriminant must
/// not appear on the wire — the framing of a TLV with
/// `MSG_TYPE_SNAPSHOT_REQUEST` already implies a request).
pub const SNAPSHOT_KIND_NONE: u32 = 0;

/// Snapshot request kind: capture-now. The host runs
/// `freeze_and_dispatch(FreezeMode::Capture { gate_on_exit_kind: false })` and stores the resulting
/// `FailureDumpReport` on the bridge keyed by the request tag.
pub const SNAPSHOT_KIND_CAPTURE: u32 = 1;

/// Snapshot request kind: hardware-watchpoint registration. The host
/// resolves the symbol path through the vmlinux ELF symtab,
/// allocates a free user watchpoint slot, programs the hardware
/// watchpoint via `KVM_SET_GUEST_DEBUG`, and replies. A future
/// guest write to the resolved KVA fires the corresponding debug
/// exit and synthesises a snapshot tagged by the symbol.
pub const SNAPSHOT_KIND_WATCH: u32 = 2;

/// Reply status: success — the host completed the requested action
/// (capture stored, or watchpoint armed).
///
/// Note that the status values start at 1: no status constant in
/// this group is assigned the value 0.
pub const SNAPSHOT_STATUS_OK: u32 = 1;

/// Reply status: failure — the host rejected or could not complete
/// the request. The reason buffer carries a UTF-8 diagnostic.
pub const SNAPSHOT_STATUS_ERR: u32 = 2;
969
/// Outcome of a guest-driven snapshot request: ok, error with reason,
/// or transport failure (port unavailable / not in guest / timeout).
///
/// `HostError` and `TransportError` both carry a `reason` string but
/// mean different things: the former is the host's own verdict on a
/// request it received, the latter means no usable reply was
/// obtained at all.
#[derive(Debug)]
pub enum SnapshotRequestResult {
    /// Host completed the request. For
    /// [`SNAPSHOT_KIND_CAPTURE`] this means the report
    /// was stored on the bridge under the supplied tag; for
    /// [`SNAPSHOT_KIND_WATCH`] this means the hardware
    /// watchpoint was armed.
    Ok,
    /// Host accepted the request but completed it as a failure. The
    /// reason carries the host-supplied diagnostic text (truncated to
    /// [`SNAPSHOT_REASON_MAX`] bytes).
    HostError { reason: String },
    /// Transport failed (called from host context, port not yet open,
    /// host did not reply within `timeout`, malformed reply frame).
    /// The supplied diagnostic names the underlying cause.
    TransportError { reason: String },
}
989
/// Snapshot request payload (72 bytes: `u32 request_id + u32 kind +
/// [u8; 64] tag`).
///
/// Sent guest→host as the payload of a [`MsgType::SnapshotRequest`]
/// frame on virtio-console port 1 TX. The guest fills every field
/// before publishing; the trailing zeros in `tag` form the NUL
/// terminator when the supplied tag is shorter than
/// [`SNAPSHOT_TAG_MAX`].
///
/// SAFETY: `repr(C)` with `u32 + u32 + [u8; 64]` produces a 72-byte
/// struct with no padding (every field is naturally aligned;
/// trailing array of `u8` requires no end-of-struct padding).
/// Every bit pattern is valid for `u32` and `u8`. zerocopy derives
/// produce no panics.
#[repr(C)]
#[derive(Copy, Clone, Debug, FromBytes, IntoBytes, zerocopy::Immutable, zerocopy::KnownLayout)]
pub struct SnapshotRequestPayload {
    /// Monotonic request id the guest stamped before publishing.
    /// The host echoes this value into the matching
    /// [`SnapshotReplyPayload::request_id`] so the guest's blocking
    /// reader can pair against the original request.
    pub request_id: u32,
    /// Request kind: one of [`SNAPSHOT_KIND_CAPTURE`] /
    /// [`SNAPSHOT_KIND_WATCH`]. [`SNAPSHOT_KIND_NONE`] is invalid on
    /// the wire — the host rejects it with [`SNAPSHOT_STATUS_ERR`].
    pub kind: u32,
    /// Tag — UTF-8, NUL-terminated when shorter than the buffer;
    /// truncated to [`SNAPSHOT_TAG_MAX`] when longer. For
    /// [`SNAPSHOT_KIND_CAPTURE`] the tag is the snapshot name (key
    /// the bridge stores the report under); for
    /// [`SNAPSHOT_KIND_WATCH`] the tag is the symbol path the host
    /// resolves through vmlinux ELF.
    pub tag: [u8; SNAPSHOT_TAG_MAX],
}

// Compile-time guard: two `u32` header fields plus the tag buffer,
// with no padding.
const _SNAPSHOT_REQUEST_PAYLOAD_SIZE: () =
    assert!(std::mem::size_of::<SnapshotRequestPayload>() == 8 + SNAPSHOT_TAG_MAX);
1026
/// Snapshot reply payload (520 bytes: `u32 request_id + u32 status + [u8; 512] reason`).
///
/// Sent host→guest as the payload of a [`MsgType::SnapshotReply`]
/// frame on virtio-console port 1 RX. Mirrors the request layout —
/// the guest matches `request_id` against its outstanding request
/// and reads `status`/`reason` to surface the host's verdict.
///
/// SAFETY: identical layout reasoning as [`SnapshotRequestPayload`]:
/// `repr(C)` with two `u32` fields followed by a `u8` array has no
/// padding, and every bit pattern is valid for `u32` and `u8`.
#[repr(C)]
#[derive(Copy, Clone, Debug, FromBytes, IntoBytes, zerocopy::Immutable, zerocopy::KnownLayout)]
pub struct SnapshotReplyPayload {
    /// Echo of the request's `request_id`. The guest's blocking
    /// reader spins until it observes this value match its
    /// outstanding request.
    pub request_id: u32,
    /// Reply status: [`SNAPSHOT_STATUS_OK`] when the host completed
    /// the request, [`SNAPSHOT_STATUS_ERR`] otherwise.
    pub status: u32,
    /// Reason — UTF-8, NUL-terminated when shorter than the buffer;
    /// truncated to [`SNAPSHOT_REASON_MAX`] when longer. Empty
    /// (all-zero) on the success path.
    pub reason: [u8; SNAPSHOT_REASON_MAX],
}

// Compile-time guard: two `u32` header fields plus the reason buffer,
// with no padding.
const _SNAPSHOT_REPLY_PAYLOAD_SIZE: () =
    assert!(std::mem::size_of::<SnapshotReplyPayload>() == 8 + SNAPSHOT_REASON_MAX);
1053
1054// ---------------------------------------------------------------------------
1055// KernelOp request/reply payloads (postcard-encoded, variable-length)
1056// ---------------------------------------------------------------------------
1057
/// Hot/cold orchestration discriminant for kernel-memory ops on the
/// wire. Encoded inside [`KernelOpRequestPayload`] (its `mode`
/// field).
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum KernelOpMode {
    /// Hot: dispatched on a host worker thread without freeze
    /// rendezvous. Mirrors `Op::WriteKernelHot` / `Op::ReadKernelHot`
    /// orchestration. Caller is responsible for guest-side sync.
    Hot,
    /// Cold: dispatched inside a freeze rendezvous with every vCPU
    /// parked. Mirrors `Op::WriteKernelCold` / `Op::ReadKernelCold`
    /// orchestration. Coherent with respect to guest state.
    Cold,
}
1071
/// Direction discriminant: write vs read. Inside
/// [`KernelOpRequestPayload`] the kind picks WHICH `GuestKernel::*`
/// method family the host dispatcher invokes.
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum KernelOpDirection {
    /// Write: each entry's [`KernelOpEntry::value`] carries the
    /// bytes to write; the reply reports success or failure (with a
    /// reason) and leaves `KernelOpReplyPayload::read_values` empty.
    Write,
    /// Read: each entry's [`KernelOpEntry::value`] is only a
    /// value-width hint (its contents are ignored); the reply
    /// carries the bytes read in
    /// `KernelOpReplyPayload::read_values`.
    Read,
}
1084
/// Wire-encoded form of [`crate::scenario::ops::KernelTarget`].
/// Mirrors the `KernelTarget` enum variants 1:1; postcard encodes
/// the variant tag followed by that variant's payload.
///
/// NOTE(review): postcard identifies enum variants by declaration
/// index, so reordering or inserting variants here presumably changes
/// the wire format — confirm against the roundtrip pins before
/// touching the variant list.
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum KernelOpTarget {
    /// Kernel symbol (text/data/bss), resolved at dispatch via
    /// runtime kernel image base + KASLR.
    Symbol(String),
    /// Direct-mapped KVA; translated via `kva - PAGE_OFFSET`.
    Direct(u64),
    /// Vmalloc'd KVA; translated via page-table walk through CR3.
    Kva(u64),
    /// Per-CPU field of a kernel struct. Resolved at dispatch via
    /// `symbol_kva + __per_cpu_offset[cpu] + BTF byte offset of field`.
    PerCpuField {
        /// Symbol naming the per-CPU template (e.g. `"runqueues"`).
        symbol: String,
        /// Field within the symbol's struct (e.g. `"clock"`).
        field: String,
        /// CPU index whose per-CPU instance to address.
        cpu: u32,
    },
    /// Per-task field of a `struct task_struct` — SCX-managed tasks
    /// only. Resolved at dispatch time by walking `init_task.tasks`
    /// plus each leader's `signal->thread_head` to locate the
    /// `task_struct *` whose `pid` matches AND whose `start_time`
    /// matches `expected_start_time_ns` (anti-PID-reuse identity
    /// guard; "matches" means within the tolerance window described
    /// in validation layer 2 below), then adding the BTF-resolved
    /// nested-path byte offset of `field` within `struct task_struct`.
    ///
    /// `pid` is the GUEST-side `pid_t` (positive). Both
    /// thread-group leaders AND non-leader threads are addressable:
    /// the walker iterates leaders via `for_each_process` semantics
    /// (`include/linux/sched/signal.h:639`), and for each leader
    /// also walks `leader->signal->thread_head` via
    /// `for_each_thread` semantics (same header, lines 654-659).
    ///
    /// `expected_start_time_ns` is the value `task->start_time` had
    /// at WorkSpec spawn time. The kernel sets `start_time` once via
    /// `ktime_get_ns()` in `kernel/fork.c::copy_process`
    /// (`include/linux/sched.h:1127`); the value never changes
    /// after that. Caller records it at spawn time (e.g. via
    /// `/proc/<pid>/stat` field 22 + sysconf-to-ns conversion).
    /// The dispatcher rejects writes when the observed
    /// `task->start_time` falls outside the layer-2 window — catches
    /// the PID-reuse hazard where the original worker exited and the
    /// kernel recycled the PID for an unrelated task.
    ///
    /// `field` is a dot-separated nested-member path. **SCX-only**:
    /// the dispatcher's class gate accepts ONLY tasks whose
    /// `sched_class` is `ext_sched_class`. Recommended fields:
    /// - `"scx.dsq_vtime"` — SCX DSQ priority-queue ordering key;
    ///   preserved across dequeue/enqueue cycles
    ///   (`kernel/sched/ext.c`).
    /// - `"start_boottime"` — task fork timestamp; observable in
    ///   `/proc/<pid>/stat` field 22.
    ///
    /// **DO NOT** write `"se.vruntime"` — EEVDF's `place_entity`
    /// (`kernel/sched/fair.c:5329-5414`, since 6.6) overwrites
    /// `se->vruntime` on every enqueue via `avg_vruntime(cfs_rq) -
    /// se->vlag`. Direct vruntime writes are silently discarded for
    /// sleeping tasks (which is our validation gate). TaskField
    /// rejects non-SCX tasks before reaching this field anyway.
    ///
    /// Eight-layer task validation before any write/read lands
    /// (seven active gates; number 7 is retired but keeps its slot):
    /// 1. `task->pid == requested_pid` (anti-mismatch),
    /// 2. `task->start_time` within
    ///    `[expected_start_time_ns, expected_start_time_ns + 10ms)`
    ///    (anti-PID-reuse identity; the 10ms window absorbs the
    ///    `/proc/<pid>/stat` CLK_TCK quantization since the kernel's
    ///    `start_time` carries sub-tick ns precision while the
    ///    caller's value is rounded down to a tick boundary),
    /// 3. `task->__state & TASK_DEAD == 0` (lifetime),
    /// 4. `task->on_rq == 0` (rb-tree / DSQ ordering safety per
    ///    `task_on_rq_queued` at `kernel/sched/sched.h:2399`),
    /// 5. `task->scx.dsq == NULL` AND `task->scx.runnable_node` is
    ///    list-empty (SCX maintains `runnable_node` linkage
    ///    independent of dsq pointer per
    ///    `include/linux/sched/ext.h:227`),
    /// 6. `task->sched_class == &ext_sched_class` (the canonical
    ///    SCX-managed gate),
    /// 7. (REMOVED) a former `task->policy == SCHED_EXT` gate: SCX
    ///    claims fair-policy tasks via `sched_class` without changing
    ///    `task->policy`, so a policy check would wrongly reject
    ///    SCX-managed tasks that forked under `SCHED_NORMAL`. The
    ///    number is kept so the surviving gates retain their labels.
    /// 8. `task->start_boottime != 0` (anti-slab-recycle: a
    ///    freshly-zeroed slab page reads zero; live tasks have this
    ///    set to non-zero `ktime_get_boottime_ns()` at fork).
    TaskField {
        /// Guest-side PID of the target task. Both leaders and
        /// non-leader threads are addressable via the dispatcher's
        /// per-thread walker.
        pid: u32,
        /// `task->start_time` (`u64`, nanoseconds) recorded at
        /// WorkSpec spawn time. Used by the layer-2 anti-PID-reuse
        /// identity check.
        expected_start_time_ns: u64,
        /// Nested member path within `struct task_struct`. Dot-
        /// separated; first segment is a direct member of
        /// `task_struct`, subsequent segments descend through named
        /// composite members.
        field: String,
    },
}
1190
/// Wire-encoded form of [`crate::scenario::ops::KernelValue`].
/// Mirrors the four `KernelValue` enum variants 1:1. The same type
/// is used for values to write (in [`KernelOpEntry`]) and for values
/// read back (in `KernelOpReplyPayload::read_values`).
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum KernelOpValue {
    /// 32-bit unsigned, little-endian on the wire and at the
    /// resolved PA. Atomic when the resolved host PA is 4-byte
    /// aligned (see `GuestKernel::write_*_u32` doc).
    U32(u32),
    /// 64-bit unsigned, little-endian. Atomic at 8-byte alignment.
    U64(u64),
    /// Variable-length byte payload. Written non-atomically; the
    /// dispatcher emits a Release fence after the copy.
    Bytes(Vec<u8>),
    /// 32-bit unsigned read-modify-write OR mask. The cold-path
    /// dispatcher reads the live u32 at the resolved host PA,
    /// ORs the carried mask into it, and writes the new value
    /// back as two separate `read_u32` / `write_u32` calls —
    /// atomic by quiesce because the freeze rendezvous parks
    /// every guest vCPU before the RMW runs (no concurrent
    /// kernel writer can interleave). No `compare_exchange` loop
    /// in the cold path. Mirrors
    /// [`crate::scenario::ops::KernelValue::OrU32`] — see that
    /// variant's doc for the full atomicity, ordering, and
    /// width-correctness contract (the canonical
    /// `SCX_RQ_CLK_VALID` use case + the
    /// `kernel/sched/sched.h:802` u32-width citation for the
    /// `struct scx_rq.flags` field that motivated keeping the
    /// variant u32 rather than u64). Hot-path support is a
    /// future variant — it would require `AtomicU32::from_ptr`
    /// + cmpxchg + strict alignment rejection.
    OrU32(u32),
}
1223
/// One target/value pair inside a [`KernelOpRequestPayload`] batch.
/// `value` is the bytes to write for a [`KernelOpDirection::Write`]
/// request. For a [`KernelOpDirection::Read`] request its contents
/// are a placeholder ignored by the dispatcher, but the value-width
/// discriminant (which [`KernelOpValue`] variant is used) IS still
/// load-bearing — it picks the read method family.
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct KernelOpEntry {
    /// Address to write or read.
    pub target: KernelOpTarget,
    /// Value to write (or value-width hint for a read).
    pub value: KernelOpValue,
}
1237
/// Postcard-encoded payload for [`MsgType::KernelOpRequest`].
///
/// Carries an entire `Op::WriteKernel{Hot,Cold}` /
/// `Op::ReadKernel{Hot,Cold}` invocation including the full
/// `Vec<(KernelTarget, KernelValue)>` batch — variable-length, hence
/// the postcard encoding rather than a zerocopy fixed-size struct.
///
/// `mode` and `direction` apply to the whole payload; individual
/// entries carry no direction of their own.
///
/// For write-direction payloads the executor's adjacent-cold-op
/// auto-merge pre-pass folds N adjacent `Op::WriteKernelCold`
/// singletons into one payload with N entries — multi-CPU seeds
/// (e.g. `with_uptime` writing per-CPU `rq.clock` on every CPU)
/// land in ONE freeze rendezvous with no inter-CPU skew. Reads
/// remain one-per-rendezvous until a follow-up batch adds
/// per-entry direction + tag.
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct KernelOpRequestPayload {
    /// Monotonic request id; the host echoes it into the matching
    /// [`KernelOpReplyPayload::request_id`].
    pub request_id: u32,
    /// Hot vs cold orchestration.
    pub mode: KernelOpMode,
    /// Write vs read direction.
    pub direction: KernelOpDirection,
    /// Bridge-keyed tag for the response. For reads the tag becomes
    /// the bridge entry key; for writes the tag is informational
    /// only (the executor surfaces it in the success record).
    pub tag: String,
    /// Ordered batch entries. For [`KernelOpDirection::Write`] all
    /// entries' `value` carries the bytes to write; for
    /// [`KernelOpDirection::Read`] only `target` + the value-width
    /// discriminant are load-bearing.
    pub entries: Vec<KernelOpEntry>,
}
1271
/// Postcard-encoded payload for `MsgType::KernelOpReply`.
///
/// Mirrors the request id so the guest's blocking reader can pair
/// against the original request. `success` carries the overall
/// success/failure verdict; on failure `reason` describes the
/// host-side error. For `KernelOpDirection::Read` requests
/// `read_values` carries the per-entry bytes the host coordinator
/// read; empty for writes.
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct KernelOpReplyPayload {
    /// Echo of the request's `request_id`.
    pub request_id: u32,
    /// `true` when the host completed every entry in the batch;
    /// `false` when any entry failed (reason describes the first
    /// failure).
    pub success: bool,
    /// Human-readable diagnostic on the failure path; empty on
    /// success.
    pub reason: String,
    /// For a `KernelOpDirection::Read` request: one
    /// [`KernelOpValue`] per request entry in iteration order. Empty
    /// for writes.
    pub read_values: Vec<KernelOpValue>,
}
1295
/// Upper bound on the on-wire size of a postcard-encoded
/// [`KernelOpReplyPayload`] frame the guest accepts on port-1 RX.
///
/// 1 MiB covers every realistic batch shape:
///   * `with_uptime` writing per-CPU `rq.clock` on 1024 CPUs:
///     ~9 KiB (well under cap).
///   * Bulk `read_*_bytes` of a struct page (4 KiB) per CPU on a
///     128-CPU host: ~520 KiB (within cap).
///   * Per-CPU 1 KiB `Bytes` read on 1024 CPUs: ~1 MiB (right at cap).
///
/// NOTE(review): the last example's raw data alone is exactly 1 MiB
/// (1024 × 1 KiB), so with any per-entry encoding overhead the frame
/// would presumably land slightly OVER this cap rather than at it —
/// confirm against the frame reader's comparison before relying on
/// that shape.
///
/// **Per-op entry budget**: callers that need replies larger than
/// 1 MiB must split the request across multiple ops; the cap
/// rejects forged or accidentally-huge lengths BEFORE the
/// `vec![0u8; length]` allocation in
/// [`crate::vmm::guest_comms`]'s frame reader, so a hostile or
/// buggy host cannot OOM the guest's PID 1 init.
pub const KERNEL_OP_REPLY_MAX: usize = 1024 * 1024;

/// Upper bound on [`KernelOpRequestPayload::tag`] bytes.
/// `req.tag` is a `String` and downstream formatters (the reply
/// `reason` field, tracing emits) embed it inline. Without a
/// bound, a framework bug or test-author misuse producing a
/// multi-megabyte tag would inflate the postcard-encoded reply
/// past [`KERNEL_OP_REPLY_MAX`] and the reply would silently
/// drop at the guest's RX cap, surfacing only as a 30-second
/// transport timeout. Tags longer than this cap are truncated at
/// decode time in `src/vmm/freeze_coord/dispatch.rs`'s
/// `MsgType::KernelOpRequest` arm with a UTF-8 char-boundary
/// walk-down to avoid the `String::truncate` mid-codepoint panic.
/// 256 bytes fits operator-readable test-name and scenario-phase
/// labels with headroom; framework code that benignly produces
/// longer tags loses suffix bytes from the diagnostic but the op
/// itself continues normally.
pub const KERNEL_OP_TAG_MAX: usize = 256;

/// Upper bound on [`KernelOpReplyPayload::reason`] bytes. Pairs
/// with [`KERNEL_OP_TAG_MAX`] for the reply-side bound:
/// coordinator-generated reasons embed the request tag inline and
/// otherwise format diagnostic text from typed-error payloads.
/// 256 bytes fits diagnostic messages like
/// "PA validation rejected: pa=0x... reason=wrong-half" plus the
/// request_id and the truncated tag.
///
/// NOTE(review): a tag of the full [`KERNEL_OP_TAG_MAX`] length
/// would by itself fill this budget; where and how the reason is
/// clamped is not visible here — confirm at the site that builds the
/// reply.
pub const KERNEL_OP_REASON_MAX: usize = 256;
1338
/// Outcome of a guest-driven kernel-memory op request: either the
/// host returned a reply (the caller inspects
/// [`KernelOpReplyPayload::success`]) or the transport failed (port
/// not open, timeout, malformed frame).
///
/// There is deliberately no separate `host_error`-style variant of
/// the kind [`SnapshotRequestResult`] distinguishes — kernel-op
/// replies are postcard-encoded with arbitrary structure, so the
/// "host completed but op failed" carrier is the reply payload's
/// `success: false` + `reason`. The `TransportError` arm covers
/// cases where the guest never receives a usable reply at all.
#[derive(Debug)]
pub enum KernelOpRequestResult {
    /// Host returned a postcard-decoded reply. The caller inspects
    /// `reply.success` to distinguish op success from host-side op
    /// failure; `reply.reason` carries the failure diagnostic when
    /// `success == false`.
    Ok(KernelOpReplyPayload),
    /// Transport failed (called from host context, port not yet open,
    /// host did not reply within `timeout`, malformed reply frame).
    /// The supplied diagnostic names the underlying cause.
    TransportError { reason: String },
}
1361
1362// ---------------------------------------------------------------------------
1363// ControlEvent — multiport control protocol discriminants
1364// ---------------------------------------------------------------------------
1365
/// Control-event discriminant for the virtio-console multiport
/// protocol. The variants correspond one-to-one to the
/// `VIRTIO_CONSOLE_*` control-event constants in the kernel uapi
/// header `include/uapi/linux/virtio_console.h`.
///
/// On the wire an event is a u16: [`Self::wire_value`] yields the
/// number the kernel and the host VMM exchange on the c_ivq / c_ovq
/// queues, and [`Self::from_wire`] is the inverse mapping for a
/// host-side parser.
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
pub enum ControlEvent {
    /// Guest → host: the driver finished probing, so the host may
    /// start enumerating ports.
    DeviceReady,
    /// Host → guest: a new port is being announced.
    PortAdd,
    /// Host → guest: a port is being torn down.
    PortRemove,
    /// Guest → host: the per-port driver finished its setup.
    PortReady,
    /// Host → guest: the port is the system console.
    ConsolePort,
    /// Host → guest: the terminal was resized.
    Resize,
    /// Either direction: a port was opened or closed.
    PortOpen,
    /// Host → guest: PORT_NAME header followed by the name bytes.
    PortName,
}

impl ControlEvent {
    /// The 16-bit number this event is sent as. Each value equals the
    /// matching kernel uapi `VIRTIO_CONSOLE_*` constant.
    pub const fn wire_value(self) -> u16 {
        match self {
            Self::DeviceReady => 0,
            Self::PortAdd => 1,
            Self::PortRemove => 2,
            Self::PortReady => 3,
            Self::ConsolePort => 4,
            Self::Resize => 5,
            Self::PortOpen => 6,
            Self::PortName => 7,
        }
    }

    /// Decode a wire discriminant back into an event. An unknown
    /// number yields `None`; the host parser is expected to log and
    /// skip such a frame instead of panicking.
    pub const fn from_wire(value: u16) -> Option<Self> {
        let event = match value {
            0 => Self::DeviceReady,
            1 => Self::PortAdd,
            2 => Self::PortRemove,
            3 => Self::PortReady,
            4 => Self::ConsolePort,
            5 => Self::Resize,
            6 => Self::PortOpen,
            7 => Self::PortName,
            _ => return None,
        };
        Some(event)
    }
}
1426
1427// ---------------------------------------------------------------------------
1428// VirtioConsoleControl — wire-format control message
1429// ---------------------------------------------------------------------------
1430
/// Wire-format control message exchanged on c_ivq / c_ovq.
///
/// Mirrors `struct virtio_console_control` in
/// `include/uapi/linux/virtio_console.h`: id (u32), event (u16),
/// value (u16). The kernel's wire format is little-endian; ktstr
/// only targets little-endian hosts (x86_64, aarch64), so the
/// native-endian bytes zerocopy `IntoBytes` / `FromBytes` expose for
/// this `repr(C)` struct already have the correct byte order.
///
/// SAFETY: `repr(C)` produces an 8-byte struct with no padding when
/// every field is naturally aligned (u32 at offset 0, u16 at offset
/// 4, u16 at offset 6). The `packed` qualifier is unnecessary because
/// the natural alignment matches the kernel's expected wire layout
/// and is checked by [`std::mem::size_of`] below. Every bit pattern
/// is valid for u32/u16. zerocopy derives produce no panics.
#[repr(C)]
#[derive(Copy, Clone, Debug, FromBytes, IntoBytes, zerocopy::Immutable, zerocopy::KnownLayout)]
pub struct VirtioConsoleControl {
    /// Port the message refers to (the uapi struct's `id` field).
    pub id: u32,
    /// Control-event discriminant; see [`ControlEvent::wire_value`].
    pub event: u16,
    /// Event-specific argument (the uapi struct's `value` field).
    pub value: u16,
}

// Compile-time layout pin: the build fails if the struct ever stops
// being exactly 8 bytes.
const _VIRTIO_CONSOLE_CONTROL_SIZE: () = assert!(std::mem::size_of::<VirtioConsoleControl>() == 8);
1454
1455// ---------------------------------------------------------------------------
1456// Multiport device constants
1457// ---------------------------------------------------------------------------
1458
/// Number of multiport ports the device exposes.
///
/// * Port 0 is the kernel console (`/dev/hvc0`).
/// * Port 1 is the host-bound bulk TLV stream (`/dev/vport0p1`).
/// * Port 2 is the scheduler stats bridge (`/dev/vport0p2`) carrying
///   raw byte passthrough between the host's
///   [`super::sched_stats::SchedStatsClient`] and the guest's
///   `scx_stats` Unix-socket relay.
///
/// Three ports → eight queues per virtio-v1.2 §5.3.5
/// (`2 + 2 * num_ports`).
pub const NUM_PORTS: u32 = 3;

/// Port-1 device-name advertised to the guest. The kernel exposes
/// this as `/sys/class/virtio-ports/vport0p1/name`; the guest init
/// reads from this path to discover the bulk channel device node.
pub const PORT1_NAME: &str = "ktstr-bulk";

/// Port-2 device-name advertised to the guest. The kernel exposes
/// this as `/sys/class/virtio-ports/vport0p2/name`; the guest init
/// reads from this path to discover the scheduler-stats relay
/// device node and connects it to the scheduler's
/// `/var/run/scx/root/stats` Unix socket.
pub const PORT2_NAME: &str = "ktstr-stats";
1480
1481#[cfg(test)]
1482mod tests {
1483 use super::*;
1484
1485 /// `parse_scenario_end` round-trips the two LE u64s the guest
1486 /// writes, and rejects a short/torn payload (returns None rather
1487 /// than misreading) — the host folds the parsed iteration count
1488 /// into the terminal StimulusEvent for the last step's rate.
1489 #[test]
1490 fn parse_scenario_end_round_trip_and_short_payload() {
1491 let mut payload = [0u8; SCENARIO_END_PAYLOAD_SIZE];
1492 payload[0..8].copy_from_slice(&12_345u64.to_le_bytes());
1493 payload[8..16].copy_from_slice(&987_654u64.to_le_bytes());
1494 assert_eq!(parse_scenario_end(&payload), Some((12_345, 987_654)));
1495 // A short payload (e.g. only the elapsed field) is rejected.
1496 assert_eq!(parse_scenario_end(&payload[..8]), None);
1497 assert_eq!(parse_scenario_end(&[]), None);
1498 }
1499
1500 /// `ShmMessage` round-trips through bytes — guards against an
1501 /// accidental field reorder or a stray padding byte that would
1502 /// shift the on-wire layout for both guest writer and host
1503 /// reader.
1504 #[test]
1505 fn shm_message_round_trip_through_bytes() {
1506 let f = ShmMessage {
1507 msg_type: MSG_TYPE_EXIT,
1508 length: 4,
1509 crc32: 0xDEAD_BEEF,
1510 _pad: 0,
1511 };
1512 let bytes = f.as_bytes();
1513 assert_eq!(bytes.len(), FRAME_HEADER_SIZE);
1514 let back = ShmMessage::read_from_bytes(bytes).expect("16-byte slice deserializes");
1515 let msg_type = back.msg_type;
1516 let length = back.length;
1517 let crc32 = back.crc32;
1518 let pad = back._pad;
1519 assert_eq!(msg_type, MSG_TYPE_EXIT);
1520 assert_eq!(length, 4);
1521 assert_eq!(crc32, 0xDEAD_BEEF);
1522 assert_eq!(pad, 0);
1523 }
1524
1525 /// Every msg_type constant is distinct — a copy/paste error
1526 /// that aliased two ids would silently misroute messages.
1527 #[test]
1528 fn msg_type_constants_are_unique() {
1529 let ids = [
1530 MSG_TYPE_STIMULUS,
1531 MSG_TYPE_STEP_END,
1532 MSG_TYPE_SCENARIO_START,
1533 MSG_TYPE_SCENARIO_END,
1534 MSG_TYPE_SCENARIO_PAUSE,
1535 MSG_TYPE_SCENARIO_RESUME,
1536 MSG_TYPE_EXIT,
1537 MSG_TYPE_TEST_RESULT,
1538 MSG_TYPE_SCHED_EXIT,
1539 MSG_TYPE_CRASH,
1540 MSG_TYPE_PAYLOAD_METRICS,
1541 MSG_TYPE_PROFRAW,
1542 MSG_TYPE_WPROF_TRACE,
1543 MSG_TYPE_WPROF_TRACE_CHUNK,
1544 MSG_TYPE_SNAPSHOT_REQUEST,
1545 MSG_TYPE_SNAPSHOT_REPLY,
1546 MSG_TYPE_KERNEL_OP_REQUEST,
1547 MSG_TYPE_KERNEL_OP_REPLY,
1548 MSG_TYPE_SYS_RDY,
1549 MSG_TYPE_SCHED_SWAP_NOTIFY,
1550 MSG_TYPE_STDOUT,
1551 MSG_TYPE_STDERR,
1552 MSG_TYPE_SCHED_LOG,
1553 MSG_TYPE_LIFECYCLE,
1554 MSG_TYPE_EXEC_EXIT,
1555 MSG_TYPE_DMESG,
1556 MSG_TYPE_PROBE_OUTPUT,
1557 ];
1558 for (i, a) in ids.iter().enumerate() {
1559 for b in &ids[i + 1..] {
1560 assert_ne!(a, b, "duplicate MSG_TYPE id 0x{a:08x}");
1561 }
1562 }
1563 }
1564
1565 /// Pin the on-wire byte order of `msg_type` to little-endian.
1566 /// The integer literal `0x4558_4954` spells `"EXIT"` in hex digits
1567 /// (`45`='E', `58`='X', `49`='I', `54`='T'), but the LE encoding
1568 /// places the least-significant byte first — so a raw byte dump
1569 /// of a serialized `ShmMessage` shows `[0x54, 0x49, 0x58, 0x45]`,
1570 /// which spells `"TIXE"` byte-by-byte. A future change that
1571 /// flipped the host to big-endian or switched zerocopy's
1572 /// serialization order would silently break the wire contract
1573 /// with the kernel virtio_console driver and every existing
1574 /// guest writer; this test fails loudly instead.
1575 #[test]
1576 fn msg_type_exit_wire_bytes_are_le() {
1577 let f = ShmMessage {
1578 msg_type: MSG_TYPE_EXIT,
1579 length: 0,
1580 crc32: 0,
1581 _pad: 0,
1582 };
1583 let bytes = f.as_bytes();
1584 // First 4 bytes of the header are msg_type as a u32 LE.
1585 assert_eq!(&bytes[..4], &MSG_TYPE_EXIT.to_le_bytes());
1586 // Spell-out check: the LE byte sequence is "TIXE", not "EXIT".
1587 // If the wire ever flips to BE, this assertion fails before the
1588 // guest driver sees the malformed frame.
1589 assert_eq!(&bytes[..4], b"TIXE");
1590 }
1591
1592 /// `ShmMessage` header is exactly 16 bytes with no padding.
1593 #[test]
1594 fn shm_message_size_is_16() {
1595 assert_eq!(FRAME_HEADER_SIZE, 16);
1596 assert_eq!(std::mem::size_of::<ShmMessage>(), 16);
1597 }
1598
1599 /// Every [`MsgType`] variant round-trips through
1600 /// `wire_value` → `from_wire`.
1601 #[test]
1602 fn msg_type_round_trips() {
1603 let all = [
1604 MsgType::Stimulus,
1605 MsgType::StepEnd,
1606 MsgType::ScenarioStart,
1607 MsgType::ScenarioEnd,
1608 MsgType::ScenarioPause,
1609 MsgType::ScenarioResume,
1610 MsgType::Exit,
1611 MsgType::TestResult,
1612 MsgType::SchedExit,
1613 MsgType::Crash,
1614 MsgType::PayloadMetrics,
1615 MsgType::Profraw,
1616 MsgType::WprofTrace,
1617 MsgType::WprofTraceChunk,
1618 MsgType::SnapshotRequest,
1619 MsgType::SnapshotReply,
1620 MsgType::KernelOpRequest,
1621 MsgType::KernelOpReply,
1622 MsgType::SysRdy,
1623 MsgType::SchedSwapNotify,
1624 MsgType::Stdout,
1625 MsgType::Stderr,
1626 MsgType::SchedLog,
1627 MsgType::Lifecycle,
1628 MsgType::ExecExit,
1629 MsgType::Dmesg,
1630 MsgType::ProbeOutput,
1631 ];
1632 for variant in all {
1633 let v = variant.wire_value();
1634 assert_eq!(MsgType::from_wire(v), Some(variant));
1635 }
1636 }
1637
1638 /// `MsgType::from_wire` returns `None` for an unrecognised
1639 /// discriminant — the bulk parser must surface unknown tags as
1640 /// errors rather than treat them as a known variant.
1641 #[test]
1642 fn msg_type_from_wire_unknown_returns_none() {
1643 assert_eq!(MsgType::from_wire(0xDEAD_BEEF), None);
1644 assert_eq!(MsgType::from_wire(0), None);
1645 }
1646
1647 /// `MsgType::wire_value` matches the corresponding `MSG_TYPE_*`
1648 /// constant — guards against a typo that would diverge the typed
1649 /// API from the on-wire constant.
1650 #[test]
1651 fn msg_type_wire_value_matches_constants() {
1652 assert_eq!(MsgType::Stimulus.wire_value(), MSG_TYPE_STIMULUS);
1653 assert_eq!(MsgType::StepEnd.wire_value(), MSG_TYPE_STEP_END);
1654 assert_eq!(MsgType::ScenarioStart.wire_value(), MSG_TYPE_SCENARIO_START);
1655 assert_eq!(MsgType::ScenarioPause.wire_value(), MSG_TYPE_SCENARIO_PAUSE);
1656 assert_eq!(
1657 MsgType::ScenarioResume.wire_value(),
1658 MSG_TYPE_SCENARIO_RESUME
1659 );
1660 assert_eq!(MsgType::ScenarioEnd.wire_value(), MSG_TYPE_SCENARIO_END);
1661 assert_eq!(MsgType::Exit.wire_value(), MSG_TYPE_EXIT);
1662 assert_eq!(MsgType::TestResult.wire_value(), MSG_TYPE_TEST_RESULT);
1663 assert_eq!(MsgType::SchedExit.wire_value(), MSG_TYPE_SCHED_EXIT);
1664 assert_eq!(MsgType::Crash.wire_value(), MSG_TYPE_CRASH);
1665 assert_eq!(
1666 MsgType::PayloadMetrics.wire_value(),
1667 MSG_TYPE_PAYLOAD_METRICS
1668 );
1669 assert_eq!(MsgType::Profraw.wire_value(), MSG_TYPE_PROFRAW);
1670 assert_eq!(MsgType::WprofTrace.wire_value(), MSG_TYPE_WPROF_TRACE);
1671 assert_eq!(
1672 MsgType::WprofTraceChunk.wire_value(),
1673 MSG_TYPE_WPROF_TRACE_CHUNK
1674 );
1675 assert_eq!(
1676 MsgType::SnapshotRequest.wire_value(),
1677 MSG_TYPE_SNAPSHOT_REQUEST
1678 );
1679 assert_eq!(MsgType::SnapshotReply.wire_value(), MSG_TYPE_SNAPSHOT_REPLY);
1680 assert_eq!(
1681 MsgType::KernelOpRequest.wire_value(),
1682 MSG_TYPE_KERNEL_OP_REQUEST
1683 );
1684 assert_eq!(
1685 MsgType::KernelOpReply.wire_value(),
1686 MSG_TYPE_KERNEL_OP_REPLY
1687 );
1688 assert_eq!(MsgType::SysRdy.wire_value(), MSG_TYPE_SYS_RDY);
1689 assert_eq!(MsgType::Stdout.wire_value(), MSG_TYPE_STDOUT);
1690 assert_eq!(MsgType::Stderr.wire_value(), MSG_TYPE_STDERR);
1691 assert_eq!(MsgType::SchedLog.wire_value(), MSG_TYPE_SCHED_LOG);
1692 assert_eq!(MsgType::Lifecycle.wire_value(), MSG_TYPE_LIFECYCLE);
1693 assert_eq!(MsgType::ExecExit.wire_value(), MSG_TYPE_EXEC_EXIT);
1694 assert_eq!(MsgType::Dmesg.wire_value(), MSG_TYPE_DMESG);
1695 assert_eq!(MsgType::ProbeOutput.wire_value(), MSG_TYPE_PROBE_OUTPUT);
1696 assert_eq!(
1697 MsgType::SchedSwapNotify.wire_value(),
1698 MSG_TYPE_SCHED_SWAP_NOTIFY
1699 );
1700 }
1701
1702 /// `is_coordinator_internal` flips on for SnapshotRequest,
1703 /// SnapshotReply, KernelOpRequest, KernelOpReply, SysRdy, and
1704 /// SchedSwapNotify and stays off for every test-verdict-bearing
1705 /// variant. The
1706 /// Reply variants are host→guest only on port-1 RX; a guest TX
1707 /// frame stamped with one of those tags is illegitimate and
1708 /// must be dropped rather than bucketed as a phantom verdict
1709 /// entry. Pinning this matrix here means a future contributor
1710 /// adding a new control frame must explicitly opt into the
1711 /// gate (or explicitly opt out by adding a "verdict-bearing"
1712 /// entry to the test) — the freeze coord's mid-run filter and
1713 /// `collect_results`'s post-run drain both key on this single
1714 /// classifier (search for `is_coordinator_internal` in
1715 /// `crate::vmm::freeze_coord`).
1716 #[test]
1717 fn is_coordinator_internal_matches_filter_set() {
1718 let internal = [
1719 MsgType::SnapshotRequest,
1720 MsgType::SnapshotReply,
1721 MsgType::KernelOpRequest,
1722 MsgType::KernelOpReply,
1723 MsgType::SysRdy,
1724 MsgType::SchedSwapNotify,
1725 ];
1726 let verdict = [
1727 MsgType::Stimulus,
1728 MsgType::StepEnd,
1729 MsgType::ScenarioStart,
1730 MsgType::ScenarioEnd,
1731 MsgType::ScenarioPause,
1732 MsgType::ScenarioResume,
1733 MsgType::Exit,
1734 MsgType::TestResult,
1735 MsgType::SchedExit,
1736 MsgType::Crash,
1737 MsgType::PayloadMetrics,
1738 MsgType::Profraw,
1739 MsgType::WprofTrace,
1740 MsgType::WprofTraceChunk,
1741 MsgType::Stdout,
1742 MsgType::Stderr,
1743 MsgType::SchedLog,
1744 MsgType::Lifecycle,
1745 MsgType::ExecExit,
1746 MsgType::Dmesg,
1747 MsgType::ProbeOutput,
1748 ];
1749 for v in internal {
1750 assert!(
1751 v.is_coordinator_internal(),
1752 "{v:?} must be classified as coordinator-internal"
1753 );
1754 }
1755 for v in verdict {
1756 assert!(
1757 !v.is_coordinator_internal(),
1758 "{v:?} carries test verdict data and must NOT be filtered out"
1759 );
1760 }
1761 }
1762
1763 /// `MsgType::SchedSwapNotify` round-trips `wire_value` →
1764 /// `from_wire`, carries the stable `"SCSW"` (0x5343_5357)
1765 /// discriminant, and is classified coordinator-internal so the
1766 /// freeze coord's mid-run filter and `collect_results`'s post-run
1767 /// drain both drop it rather than bucketing a phantom verdict.
1768 #[test]
1769 fn sched_swap_notify_round_trips() {
1770 assert_eq!(
1771 MsgType::from_wire(MsgType::SchedSwapNotify.wire_value()),
1772 Some(MsgType::SchedSwapNotify)
1773 );
1774 assert_eq!(MsgType::SchedSwapNotify.wire_value(), 0x5343_5357);
1775 assert!(MsgType::SchedSwapNotify.is_coordinator_internal());
1776 }
1777
1778 /// `LifecyclePhase` round-trips through `wire_value` →
1779 /// `from_wire`. Phase values are byte-stable across builds so
1780 /// the host never silently misclassifies a future guest's
1781 /// phase signal.
1782 #[test]
1783 fn lifecycle_phase_round_trips() {
1784 let all = [
1785 LifecyclePhase::InitStarted,
1786 LifecyclePhase::PayloadStarting,
1787 LifecyclePhase::SchedulerDied,
1788 LifecyclePhase::SchedulerNotAttached,
1789 LifecyclePhase::WorkloadDispatched,
1790 ];
1791 for p in all {
1792 let v = p.wire_value();
1793 assert_eq!(LifecyclePhase::from_wire(v), Some(p));
1794 }
1795 }
1796
1797 /// `LifecyclePhase::from_wire(0)` returns `None` — `0` is
1798 /// reserved as the unknown / invalid sentinel so a
1799 /// zero-initialised payload byte never silently maps to
1800 /// `InitStarted`.
1801 #[test]
1802 fn lifecycle_phase_zero_is_reserved() {
1803 assert_eq!(LifecyclePhase::from_wire(0), None);
1804 assert_eq!(LifecyclePhase::from_wire(0xFF), None);
1805 }
1806
1807 /// Pin the `LifecyclePhase` discriminants. Wire values are part
1808 /// of the protocol contract — a future change that reorders
1809 /// the enum variants would silently shift this mapping unless
1810 /// pinned by an explicit assertion here.
1811 #[test]
1812 fn lifecycle_phase_wire_values_are_stable() {
1813 assert_eq!(LifecyclePhase::InitStarted.wire_value(), 1);
1814 assert_eq!(LifecyclePhase::PayloadStarting.wire_value(), 2);
1815 assert_eq!(LifecyclePhase::SchedulerDied.wire_value(), 3);
1816 assert_eq!(LifecyclePhase::SchedulerNotAttached.wire_value(), 4);
1817 assert_eq!(LifecyclePhase::WorkloadDispatched.wire_value(), 5);
1818 }
1819
1820 /// `SnapshotRequestPayload` round-trips through bytes — guards
1821 /// against an accidental field reorder or a stray padding byte
1822 /// that would shift the on-wire layout for both guest writer
1823 /// and host parser.
1824 #[test]
1825 fn snapshot_request_payload_round_trip_through_bytes() {
1826 let mut tag = [0u8; SNAPSHOT_TAG_MAX];
1827 tag[..6].copy_from_slice(b"hello!");
1828 let p = SnapshotRequestPayload {
1829 request_id: 0xDEAD_BEEF,
1830 kind: SNAPSHOT_KIND_CAPTURE,
1831 tag,
1832 };
1833 let bytes = p.as_bytes();
1834 assert_eq!(bytes.len(), 8 + SNAPSHOT_TAG_MAX);
1835 let back = SnapshotRequestPayload::read_from_bytes(bytes).expect("payload deserializes");
1836 let request_id = back.request_id;
1837 let kind = back.kind;
1838 assert_eq!(request_id, 0xDEAD_BEEF);
1839 assert_eq!(kind, SNAPSHOT_KIND_CAPTURE);
1840 assert_eq!(&back.tag[..6], b"hello!");
1841 }
1842
1843 /// `SnapshotReplyPayload` round-trips through bytes.
1844 #[test]
1845 fn snapshot_reply_payload_round_trip_through_bytes() {
1846 let mut reason = [0u8; SNAPSHOT_REASON_MAX];
1847 reason[..4].copy_from_slice(b"oops");
1848 let p = SnapshotReplyPayload {
1849 request_id: 0xCAFE_BABE,
1850 status: SNAPSHOT_STATUS_ERR,
1851 reason,
1852 };
1853 let bytes = p.as_bytes();
1854 assert_eq!(bytes.len(), 8 + SNAPSHOT_REASON_MAX);
1855 let back = SnapshotReplyPayload::read_from_bytes(bytes).expect("payload deserializes");
1856 let request_id = back.request_id;
1857 let status = back.status;
1858 assert_eq!(request_id, 0xCAFE_BABE);
1859 assert_eq!(status, SNAPSHOT_STATUS_ERR);
1860 assert_eq!(&back.reason[..4], b"oops");
1861 }
1862
1863 /// Snapshot kind constants are distinct.
1864 #[test]
1865 fn snapshot_kind_constants_are_unique() {
1866 assert_ne!(SNAPSHOT_KIND_NONE, SNAPSHOT_KIND_CAPTURE);
1867 assert_ne!(SNAPSHOT_KIND_NONE, SNAPSHOT_KIND_WATCH);
1868 assert_ne!(SNAPSHOT_KIND_CAPTURE, SNAPSHOT_KIND_WATCH);
1869 }
1870
1871 /// Snapshot status constants are distinct.
1872 #[test]
1873 fn snapshot_status_constants_are_unique() {
1874 assert_ne!(SNAPSHOT_STATUS_OK, SNAPSHOT_STATUS_ERR);
1875 }
1876
1877 /// Every [`ControlEvent`] variant round-trips through
1878 /// `wire_value` → `from_wire`.
1879 #[test]
1880 fn control_event_round_trips() {
1881 let all = [
1882 ControlEvent::DeviceReady,
1883 ControlEvent::PortAdd,
1884 ControlEvent::PortRemove,
1885 ControlEvent::PortReady,
1886 ControlEvent::ConsolePort,
1887 ControlEvent::Resize,
1888 ControlEvent::PortOpen,
1889 ControlEvent::PortName,
1890 ];
1891 for variant in all {
1892 let v = variant.wire_value();
1893 assert_eq!(ControlEvent::from_wire(v), Some(variant));
1894 }
1895 }
1896
1897 /// `ControlEvent::from_wire` returns `None` for unknown values.
1898 #[test]
1899 fn control_event_from_wire_unknown_returns_none() {
1900 assert_eq!(ControlEvent::from_wire(8), None);
1901 assert_eq!(ControlEvent::from_wire(0xFFFF), None);
1902 }
1903
1904 /// `ControlEvent` discriminants match the kernel uapi numbers
1905 /// (`VIRTIO_CONSOLE_*` in `include/uapi/linux/virtio_console.h`).
1906 #[test]
1907 fn control_event_discriminants_match_uapi() {
1908 assert_eq!(ControlEvent::DeviceReady.wire_value(), 0);
1909 assert_eq!(ControlEvent::PortAdd.wire_value(), 1);
1910 assert_eq!(ControlEvent::PortRemove.wire_value(), 2);
1911 assert_eq!(ControlEvent::PortReady.wire_value(), 3);
1912 assert_eq!(ControlEvent::ConsolePort.wire_value(), 4);
1913 assert_eq!(ControlEvent::Resize.wire_value(), 5);
1914 assert_eq!(ControlEvent::PortOpen.wire_value(), 6);
1915 assert_eq!(ControlEvent::PortName.wire_value(), 7);
1916 }
1917
1918 /// `VirtioConsoleControl` is exactly 8 bytes — matches the
1919 /// kernel uapi struct.
1920 #[test]
1921 fn virtio_console_control_size_is_8() {
1922 assert_eq!(std::mem::size_of::<VirtioConsoleControl>(), 8);
1923 }
1924
1925 /// `VirtioConsoleControl` round-trips through bytes — pins the
1926 /// repr(C) layout against an accidental field reorder that would
1927 /// produce malformed control frames on the c_ivq / c_ovq queues.
1928 #[test]
1929 fn virtio_console_control_round_trip() {
1930 let c = VirtioConsoleControl {
1931 id: 1,
1932 event: ControlEvent::PortOpen.wire_value(),
1933 value: 1,
1934 };
1935 let bytes = c.as_bytes();
1936 assert_eq!(bytes.len(), 8);
1937 let back = VirtioConsoleControl::read_from_bytes(bytes).unwrap();
1938 let id = back.id;
1939 let event = back.event;
1940 let value = back.value;
1941 assert_eq!(id, 1);
1942 assert_eq!(event, ControlEvent::PortOpen.wire_value());
1943 assert_eq!(value, 1);
1944 }
1945
1946 /// `KernelOpRequestPayload` round-trips through postcard with
1947 /// every `KernelOpTarget` + `KernelOpValue` variant present —
1948 /// pins encode/decode against an accidental serde derive
1949 /// breakage on either side. The wire format the freeze coord
1950 /// (host) decodes is exactly what the guest's
1951 /// [`crate::vmm::guest_comms::request_kernel_op`] encodes, so a
1952 /// round-trip mismatch surfaces as a silent host-side parse
1953 /// failure rather than a typed error.
1954 #[test]
1955 fn kernel_op_request_payload_postcard_round_trip() {
1956 let payload = KernelOpRequestPayload {
1957 request_id: 0xCAFEBABE,
1958 mode: KernelOpMode::Cold,
1959 direction: KernelOpDirection::Write,
1960 tag: "with_uptime".into(),
1961 entries: vec![
1962 KernelOpEntry {
1963 target: KernelOpTarget::Symbol("jiffies".into()),
1964 value: KernelOpValue::U64(42),
1965 },
1966 KernelOpEntry {
1967 target: KernelOpTarget::Direct(0xffff_8000_0000_1000),
1968 value: KernelOpValue::U32(7),
1969 },
1970 KernelOpEntry {
1971 target: KernelOpTarget::Kva(0xffff_c000_dead_beef),
1972 value: KernelOpValue::Bytes(vec![1, 2, 3, 4, 5]),
1973 },
1974 KernelOpEntry {
1975 target: KernelOpTarget::PerCpuField {
1976 symbol: "runqueues".into(),
1977 field: "clock".into(),
1978 cpu: 3,
1979 },
1980 value: KernelOpValue::U64(0xDEAD_BEEF_CAFE_F00D),
1981 },
1982 KernelOpEntry {
1983 target: KernelOpTarget::TaskField {
1984 pid: 12345,
1985 expected_start_time_ns: 1_700_000_000_000,
1986 field: "scx.dsq_vtime".into(),
1987 },
1988 value: KernelOpValue::U64(30 * 86400 * 1_000_000_000),
1989 },
1990 ],
1991 };
1992 let bytes = postcard::to_allocvec(&payload).expect("encode");
1993 let back: KernelOpRequestPayload = postcard::from_bytes(&bytes).expect("decode");
1994 assert_eq!(back, payload);
1995 }
1996
1997 /// `KernelOpReplyPayload` round-trips through postcard. The
1998 /// reply carries success/failure + (for reads) the per-entry
1999 /// values the host coordinator read — both code paths must
2000 /// survive encode/decode unchanged.
2001 #[test]
2002 fn kernel_op_reply_payload_postcard_round_trip() {
2003 let success = KernelOpReplyPayload {
2004 request_id: 0x1234_5678,
2005 success: true,
2006 reason: String::new(),
2007 read_values: vec![
2008 KernelOpValue::U64(100),
2009 KernelOpValue::U32(200),
2010 KernelOpValue::Bytes(vec![0xAB, 0xCD, 0xEF]),
2011 ],
2012 };
2013 let bytes = postcard::to_allocvec(&success).expect("encode success");
2014 let back: KernelOpReplyPayload = postcard::from_bytes(&bytes).expect("decode success");
2015 assert_eq!(back, success);
2016
2017 let failure = KernelOpReplyPayload {
2018 request_id: 0xFEED_FACE,
2019 success: false,
2020 reason: "host: symbol 'jiffies' not found in vmlinux".into(),
2021 read_values: vec![],
2022 };
2023 let bytes = postcard::to_allocvec(&failure).expect("encode failure");
2024 let back: KernelOpReplyPayload = postcard::from_bytes(&bytes).expect("decode failure");
2025 assert_eq!(back, failure);
2026 }
2027
2028 /// `KERNEL_OP_REPLY_MAX` envelope check: a representative
2029 /// large reply (1024-CPU per-CPU u64) fits comfortably; the
2030 /// cap is 1 MiB which bounds OOM exposure while accommodating
2031 /// realistic batch shapes. A regression that shrunk the cap
2032 /// below ~10 KiB would silently truncate large kernel-op
2033 /// replies; one that grew it beyond 1 MiB would widen the
2034 /// OOM-attack surface.
2035 #[test]
2036 fn kernel_op_reply_max_envelope_check() {
2037 // 1024 CPUs * KernelOpValue::U64 (~9 bytes each + bookkeeping)
2038 // is well under 1 MiB. Build a representative reply and
2039 // verify its encoded size sits inside the cap.
2040 let big = KernelOpReplyPayload {
2041 request_id: 1,
2042 success: true,
2043 reason: String::new(),
2044 read_values: (0..1024u64).map(KernelOpValue::U64).collect(),
2045 };
2046 let bytes = postcard::to_allocvec(&big).expect("encode 1024-CPU reply");
2047 assert!(
2048 bytes.len() < KERNEL_OP_REPLY_MAX,
2049 "1024-CPU kernel-op reply ({} bytes) must fit under \
2050 KERNEL_OP_REPLY_MAX ({KERNEL_OP_REPLY_MAX} bytes)",
2051 bytes.len(),
2052 );
2053 // The cap is exactly 1 MiB — large enough for per-CPU 1 KiB
2054 // Bytes reads on 1024 CPUs, small enough to keep OOM
2055 // exposure bounded for a forged frame.
2056 assert_eq!(KERNEL_OP_REPLY_MAX, 1024 * 1024);
2057 }
2058
2059 // ----- KernAddrs wire-format pins -----
2060 //
    // Pin both the typed encode/decode contract AND the on-wire
    // byte layout. The byte-layout pin
    // (`kern_addrs_to_payload_byte_layout_is_le_phys_first`) catches
    // slot-swap regressions that a roundtrip-equality test alone
    // would miss when encoder and decoder both flip the same way.
2065
2066 #[test]
2067 fn kern_addrs_roundtrip_all_present() {
2068 let a = KernAddrs::new(
2069 0x12345678u64,
2070 0xffff_8880_0000_0000u64,
2071 Some(0xffff_ffff_8200_0000u64),
2072 );
2073 let payload = a.to_payload();
2074 assert_eq!(payload.len(), KernAddrs::WIRE_LEN);
2075 let b = KernAddrs::from_payload(&payload).expect("decode");
2076 assert_eq!(b.phys_base, a.phys_base);
2077 assert_eq!(b.page_offset_base, a.page_offset_base);
2078 assert_eq!(b.kernel_text_runtime_kva, a.kernel_text_runtime_kva);
2079 assert!(b.has_phys_present_bit());
2080 }
2081
2082 #[test]
2083 fn kern_addrs_roundtrip_kallsyms_absent() {
2084 // The None branch on kernel_text_runtime_kva must decode
2085 // back to None (NOT Some(u64::MAX) via wrapping_sub(1) on
2086 // a raw-0 biased slot). The biased-0 sentinel is the
2087 // wire-format "guest could not derive" marker.
2088 let a = KernAddrs::new(0u64, 0u64, None);
2089 let payload = a.to_payload();
2090 let b = KernAddrs::from_payload(&payload).expect("decode");
2091 assert_eq!(
2092 b.kernel_text_runtime_kva, None,
2093 "biased-0 runtime slot must decode to None"
2094 );
2095 // phys_base = 0 IS a valid KASLR-off value; the biased
2096 // encoder writes 1 (non-zero) so has_phys_present_bit
2097 // surfaces present.
2098 assert!(b.has_phys_present_bit());
2099 }
2100
2101 #[test]
2102 fn kern_addrs_from_payload_rejects_length_mismatch() {
2103 // Exact-length match required so a protocol-extension
2104 // partial write or truncated wire surfaces as None,
2105 // never as a zero-padded silent decode.
2106 assert!(KernAddrs::from_payload(&[]).is_none());
2107 assert!(KernAddrs::from_payload(&[0u8; KernAddrs::WIRE_LEN - 1]).is_none());
2108 assert!(KernAddrs::from_payload(&[0u8; KernAddrs::WIRE_LEN + 1]).is_none());
2109 }
2110
2111 #[test]
2112 fn stimulus_from_payload_requires_exact_24_bytes() {
2113 // Exact-length match (24 bytes): an undersized buffer would
2114 // truncate and an oversized one carries bytes the guest never
2115 // frames (send_stimulus/send_step_end write exactly 24), so a
2116 // torn / hostile frame is dropped, not promoted by a prefix read.
2117 let n = std::mem::size_of::<StimulusPayload>();
2118 assert_eq!(n, 24);
2119 assert!(StimulusEvent::from_payload(&[0u8; 23]).is_none());
2120 assert!(StimulusEvent::from_payload(&[0u8; 25]).is_none());
2121 assert!(StimulusEvent::from_payload(&[0u8; 24]).is_some());
2122 }
2123
#[test]
fn kern_addrs_has_phys_present_bit_distinguishes_zero_vs_absent() {
    // Pins the bias-sentinel contract on the present-bit accessor.
    //
    // - A struct built via `KernAddrs::new` with phys_base = 0 must
    //   surface as present (the encoder stores the biased value 1).
    // - An all-zero payload of `WIRE_LEN` bytes (the "guest never
    //   sent" wire state) must surface as absent: raw biased 0 →
    //   wrapping_sub(1) = u64::MAX → `has_phys_present_bit` returns
    //   false.
    let present = KernAddrs::new(0u64, 0u64, None);
    assert!(
        present.has_phys_present_bit(),
        "phys_base = 0 built via KernAddrs::new must surface present"
    );

    // The buffer is full-length and all-zero, NOT zero-length: an
    // empty slice would fail the exact-length check in
    // `from_payload`, so the expect message names the real shape.
    let absent = KernAddrs::from_payload(&[0u8; KernAddrs::WIRE_LEN])
        .expect("all-zero WIRE_LEN-byte payload decodes; shape is valid");
    assert!(
        !absent.has_phys_present_bit(),
        "zero-bias slot decodes to u64::MAX; has_phys_present_bit must surface absent"
    );
}
2142
#[test]
fn kern_addrs_to_payload_byte_layout_is_le_phys_first() {
    // Asserts the on-wire layout against fixed byte positions.
    // Roundtrip-equality tests cannot catch a slot reorder, because
    // encoder and decoder would flip together; checking raw offsets
    // does.
    let wire = KernAddrs::new(
        0x1111_2222_3333_4444u64,
        0xaaaa_bbbb_cccc_ddddu64,
        Some(0x5555_6666_7777_8888u64),
    )
    .to_payload();

    // Decode the little-endian u64 held in the `idx`-th 8-byte slot.
    // Evaluated lazily at each call site so the assertions fire in
    // wire order.
    let slot_le = |idx: usize| -> u64 {
        let start = idx * 8;
        u64::from_le_bytes(wire[start..start + 8].try_into().unwrap())
    };

    // Slot 0 — phys_base, stored biased by +1:
    // ...4444 + 1 = ...4445, so the first LE byte is 0x45.
    assert_eq!(wire[0], 0x45, "phys_base slot is [0..8] LE biased");
    assert_eq!(slot_le(0), 0x1111_2222_3333_4445);

    // Slot 1 — the second constructor argument, asserted verbatim
    // (no bias applied).
    assert_eq!(slot_le(1), 0xaaaa_bbbb_cccc_dddd_u64);

    // Slot 2 — kernel_text_runtime_kva, stored biased by +1:
    // ...8888 + 1 = ...8889.
    assert_eq!(slot_le(2), 0x5555_6666_7777_8889_u64);
}
2173
#[test]
fn kern_addrs_u64_max_runtime_collapses_to_absent_roundtrip() {
    // Documents the one boundary collision of the bias encoding:
    // `kernel_text_runtime_kva = Some(u64::MAX)` wraps to biased 0 on
    // encode, and biased 0 is what the decoder reads as the "guest
    // could not derive" sentinel (`None`).
    //
    // The collision is accepted because u64::MAX is non-canonical
    // and cannot be a real `_text` KVA. Per the original note on
    // this test, the downstream KERN_ADDRS dispatch arm in
    // dispatch.rs additionally gates derived offsets (kernel-half
    // threshold, non-negative slide, ≤1GiB max-slide bound, link
    // canonical) before anything reaches the shared Arc — that code
    // is not visible from here; confirm there if it matters.
    //
    // Pinning the symmetric collapse makes a future encoder refactor
    // that breaks the "absent sentinel = biased 0" contract (e.g. a
    // different sentinel value) fail loudly here.
    let wire = KernAddrs::new(0u64, 0u64, Some(u64::MAX)).to_payload();

    // Encode side: the biased runtime slot reads 0, colliding with
    // the absent encoding.
    let runtime_slot = u64::from_le_bytes(wire[16..24].try_into().unwrap());
    assert_eq!(
        runtime_slot, 0,
        "Some(u64::MAX) biased-add wraps to 0; collides with absent sentinel"
    );

    // Decode side: the roundtrip yields None, not Some(u64::MAX).
    let roundtrip = KernAddrs::from_payload(&wire).expect("decode");
    assert_eq!(
        roundtrip.kernel_text_runtime_kva, None,
        "u64::MAX runtime decodes to None via the bias collision"
    );
}
2205}