ktstr/vmm/
wire.rs

1//! Shared wire-format types for the host/guest virtio-console port-1
2//! TLV stream and the multiport control protocol.
3//!
4//! Both [`super::guest_comms`] (guest-only senders) and
5//! [`super::host_comms`] (host-only consumers) reference this module.
6//! Splitting the wire format out of the transport modules keeps the
7//! frame layout authoritative — a producer change here lands in both
8//! the guest writer and the host parser without a hand-sync step.
9//!
10//! # Postcard wire-format pin inventory (contributor guide)
11//!
12//! Every type that crosses the in-VM postcard TLV channel MUST be
13//! externally-tagged (postcard cannot decode `#[serde(untagged)]`
14//! or `#[serde(tag, content)]` enums — encode raises `WontImplement`
15//! at runtime and the host surfaces it as
16//! `ERR_NO_TEST_FUNCTION_OUTPUT`). The compile-time
17//! `#[derive(serde::Serialize, serde::Deserialize)]` itself does
18//! NOT catch the shape mismatch; the contract is verified by the
19//! per-type roundtrip pin tests listed below. A contributor adding
20//! a new postcard payload MUST add a roundtrip pin in the
21//! corresponding location and update this inventory.
22//!
23//! Inventory (type → test name → file):
24//!   - `AssertResult` → `assert_result_postcard_roundtrip`
25//!     → `src/assert/tests_serde.rs`
26//!   - `KernAddrs` → `kern_addrs_roundtrip_all_present` (+ 4 sibling
27//!     boundary pins) → `src/vmm/wire.rs` (this file)
28//!   - `KernelOpRequestPayload` / `KernelOpReplyPayload`
29//!     → 4 tests → `src/vmm/wire.rs` (this file)
30//!   - `PayloadMetrics`
31//!     → `payload_metrics_postcard_roundtrip`
32//!     → `src/test_support/payload.rs`
33//!   - `WorkloadConfig` → `payload_roundtrip`
34//!     → `src/test_support/payload.rs`
35//!   - `WorkerReport` → `worker_report_postcard_roundtrip` (+ 3 sibling
36//!     pins covering `Vec<WorkerReport>` + all `ExitInfo` variants)
37//!     → `src/workload/spawn/tests_integration.rs`
38//!   - `PersistedCastAnalysis` → see `src/vmm/cast_analysis_load`
39//!     module's tests
40//!
41//! # Frame layout
42//!
43//! Each guest→host bulk message is a 16-byte [`ShmMessage`] header
44//! followed by `length` payload bytes. The host's
45//! [`super::host_comms::parse_tlv_stream`] consumes this format. CRC32
46//! covers payload bytes only, not the header.
47//!
48//! ```text
49//! offset  size  field
50//! ------  ----  ----------------------------------------------
51//!   0      4    msg_type (u32 LE)  — see [`MsgType`]
52//!   4      4    length   (u32 LE)  — payload bytes following
53//!   8      4    crc32    (u32 LE)  — crc32fast over payload
54//!  12      4    _pad     (u32 LE)  — reserved, MUST be zero
55//!  16      N    payload  (N=length bytes)
56//! ```
57//!
58//! # Control protocol
59//!
60//! [`VirtioConsoleControl`] mirrors the kernel uapi `struct
61//! virtio_console_control` for multiport handshake messages on the
62//! c_ivq / c_ovq queues (8 bytes: id u32, event u16, value u16).
63//! [`ControlEvent`] enumerates the event discriminants the kernel and
64//! the host VMM exchange during port enumeration.
65//!
66//! Many of the typed wrappers and constants in this module are part
67//! of the public bulk API surface; the lib build does not yet read
68//! every variant from internal call sites (the typed `MsgType` enum,
69//! `ControlEvent`, `VirtioConsoleControl`, `NUM_PORTS`, `PORT1_NAME`,
70//! and the `from_wire` reverse mappings are reachable via the public
71//! crate path for downstream test code and wire-format tests). The
//! module-level `#[allow(dead_code)]` follows the `VmResult` field
//! pattern in `result.rs`: it covers public surface that the in-tree
//! readers do not exercise and that would otherwise trip the unused-X
//! lints.
75
76#![allow(dead_code)]
77
78use zerocopy::{FromBytes, IntoBytes};
79
80// ---------------------------------------------------------------------------
81// MsgType — typed message-type discriminant
82// ---------------------------------------------------------------------------
83
/// Typed discriminant identifying the kind of each frame on the bulk
/// TLV stream.
///
/// [`Self::wire_value`] maps every variant to its 32-bit on-wire value.
/// Each value is a 4-character ASCII tag written so that the integer
/// literal spells the tag when read as hex: `0x4558_4954` is `"EXIT"`
/// (`45`='E', `58`='X', `49`='I', `54`='T'). The wire format is
/// little-endian, so a raw byte-level hex dump of a captured frame
/// shows those bytes reversed — `54 49 58 45` for the `Exit` tag,
/// which reads `"TIXE"` byte-by-byte. In short: the integer spells the
/// tag, the on-wire bytes spell it backwards.
///
/// On-wire values are stable across host/guest builds. Adding a
/// variant means picking a fresh ASCII tag and teaching
/// [`Self::from_wire`] to recognise it; an existing tag must never be
/// repurposed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MsgType {
    /// StepStart frame: the stimulus event the guest step executor
    /// emits at the START of each step.
    Stimulus,
    /// StepEnd frame: emitted by the guest step executor at the end of
    /// each step's hold, while that step's workers are still alive.
    /// The body is a [`StimulusPayload`] (same 24 bytes) holding the
    /// step's coincident end-of-hold (elapsed_ms, total_iterations)
    /// and the SAME 1-indexed `step_index` as the matching StepStart.
    /// The host pairs `StepStart[k]` -> `StepEnd[k]` to compute each
    /// step's OWN throughput (step-local iteration_rate); unlike the
    /// cross-step `StepStart[k]` -> `StepStart[k+1]` delta, that
    /// pairing does not read ~0 for workers respawned per step.
    StepEnd,
    /// Marks the start of a scenario and sets a fresh watchdog
    /// deadline.
    ScenarioStart,
    /// Marks the end of a scenario. Payload: two 8-byte LE u64s —
    /// elapsed_ms followed by the final cumulative `total_iterations`
    /// (see [`SCENARIO_END_PAYLOAD_SIZE`] / [`parse_scenario_end`]).
    ScenarioEnd,
    /// Pauses the watchdog clock; wall time spent paused is not
    /// charged against the workload budget.
    ScenarioPause,
    /// Resumes the watchdog clock after a pause. The deadline is
    /// extended by the pause duration, handing the paused time back.
    ScenarioResume,
    /// Guest exit code. Payload: 4-byte LE i32.
    Exit,
    /// Test result. Payload: postcard-encoded `AssertResult`.
    TestResult,
    /// Exit of the scheduler process. Payload: 4-byte LE i32 exit
    /// code.
    SchedExit,
    /// Guest crash diagnostic (payload: UTF-8 panic + backtrace).
    /// The tag is reserved and never travels on the bulk port: panic
    /// diagnostics go straight to COM2 (`/dev/ttyS1`), because
    /// `virtio_console` TX can block on host backpressure and a
    /// blocked fault handler would deadlock the guest before the
    /// diagnostic ever reached the host.
    Crash,
    /// Metrics for one payload invocation. Payload: postcard-encoded
    /// `PayloadMetrics`.
    PayloadMetrics,
    /// Coverage profraw blob.
    Profraw,
    /// One chunk of guest stdout, guest→host. Payload: opaque UTF-8
    /// bytes read from the guest's stdout pipe. The host rebuilds the
    /// stream by concatenating chunks in arrival order. Supersedes
    /// the earlier COM2 stdout redirect.
    Stdout,
    /// One chunk of guest stderr, guest→host. Payload: opaque UTF-8
    /// bytes, chunked exactly like [`Self::Stdout`].
    Stderr,
    /// One chunk of the scheduler log, guest→host. Payload: opaque
    /// UTF-8 bytes from the scheduler child process's captured log.
    /// Supersedes the earlier COM2 SCHED_OUTPUT_START/END dump path.
    /// The host concatenates chunks in arrival order; the existing
    /// `SCHED_OUTPUT_START` / `SCHED_OUTPUT_END` delimiters and the
    /// embedded BPF verifier section pass through verbatim inside the
    /// chunk bytes.
    SchedLog,
    /// Lifecycle phase event, guest→host. Payload: a 1-byte
    /// [`LifecyclePhase`] discriminant, then an optional UTF-8 reason
    /// buffer (the suffix detail of `SchedulerNotAttached`).
    /// Supersedes the `KTSTR_INIT_STARTED` /
    /// `KTSTR_PAYLOAD_STARTING` / `SCHEDULER_DIED` /
    /// `SCHEDULER_NOT_ATTACHED` sentinel strings formerly written to
    /// COM2.
    Lifecycle,
    /// Shell-exec exit, guest→host. Payload: 4-byte LE i32 exit code
    /// from `cargo ktstr shell --exec <cmd>`. Supersedes the earlier
    /// COM2 `KTSTR_EXEC_EXIT=N` sentinel line.
    ExecExit,
    /// Kernel ring-buffer dump, guest→host. Payload: opaque UTF-8
    /// bytes from `rmesg::logs_raw`. Sent on the initramfs-extraction
    /// failure path so the host sees the kernel OOM messages without
    /// scraping COM2.
    Dmesg,
    /// Probe-pipeline JSON output, guest→host. Payload: opaque UTF-8
    /// bytes from the probe output stream. Supersedes the earlier
    /// COM2 ProbeDrain path, so probe JSON no longer interleaves with
    /// sched-log dumps on one serial port.
    ProbeOutput,
    /// On-demand snapshot request, guest→host (payload:
    /// [`SnapshotRequestPayload`]). The freeze coordinator's
    /// bulk-drain path intercepts the frame, runs the CAPTURE / WATCH
    /// dispatch, and answers with [`MsgType::SnapshotReply`] on
    /// port 1 RX.
    SnapshotRequest,
    /// Snapshot reply, host→guest (payload: [`SnapshotReplyPayload`]).
    /// Travels on port 1 RX so the guest's blocking read on
    /// `/dev/vport0p1` wakes within microseconds. The payload carries
    /// the matching request_id, the status, and a UTF-8 reason buffer
    /// for the failure path.
    SnapshotReply,
    /// Kernel-memory write/read op request, guest→host (payload:
    /// postcard-encoded [`KernelOpRequestPayload`]). Carries the
    /// `Op::WriteKernel{Hot,Cold}` / `Op::ReadKernel{Hot,Cold}`
    /// invocation from the guest's step executor. Its variable-length
    /// payload gets a distinct MSG_TYPE instead of extending the
    /// fixed-72-byte [`SnapshotRequestPayload`].
    KernelOpRequest,
    /// Reply to [`MsgType::KernelOpRequest`], host→guest (payload:
    /// postcard-encoded [`KernelOpReplyPayload`]).
    KernelOpReply,
    /// wprof Perfetto-format trace blob, guest→host (payload: raw
    /// `.pb` bytes from `wprof -T trace.pb`). The freeze coordinator
    /// writes the payload as `wprof.pb` beside the failure-dump JSON,
    /// so the operator finds it with the rest of the per-test
    /// debugging artefacts.
    WprofTrace,
    /// wprof trace CHUNK, guest→host (payload: a
    /// ≤`MAX_BULK_FRAME_PAYLOAD` slice of the `.pb`). A trace larger
    /// than the single-frame bulk-port ceiling is split into ordered
    /// `WprofTraceChunk` frames and terminated by a final
    /// [`Self::WprofTrace`] frame carrying the last slice. The host
    /// concatenates the chunk payloads in arrival order and appends
    /// the terminal `WprofTrace` payload to rebuild the `.pb`. A trace
    /// that fits in one frame ships as a lone `WprofTrace` with no
    /// chunks, which makes reassembly a no-op on that fast path.
    ///
    /// As with the `Stdout`/`Stderr`/`SchedLog` transports, a large
    /// blob is split across frames and concatenated on the host. The
    /// DIFFERENCE is the distinct terminal frame type in place of
    /// uniform same-type frames: a truncated Perfetto `.pb` is useless
    /// (a partial protobuf still passes the leading-tag/size shape
    /// check yet decodes to garbage), whereas partial stdout remains
    /// useful. The terminal frame lets the host tell a complete trace
    /// (terminal present → write the `.pb`) from a transport that tore
    /// mid-ship (chunks but no terminal → write nothing, so the
    /// post_vm `.pb`-landed assert fails loudly rather than shipping a
    /// plausible-but-corrupt artifact). Stdout/SchedLog need no such
    /// marker: their payloads are bounded by the in-band
    /// `SCHED_OUTPUT_START/END` content delimiters, not by the
    /// framing.
    WprofTraceChunk,
    /// System-ready signal, guest→host (payload: empty).
    ///
    /// The guest's `ktstr_guest_init` emits it once
    /// `mount_filesystems()` has completed, so when the host observes
    /// the frame the guest's `setup_per_cpu_areas` and KASLR
    /// randomization (both kernel-boot prerequisites) are already
    /// done. The freeze coordinator's bulk-drain dispatch promotes a
    /// CRC-valid SYS_RDY frame into the monitor's boot-complete
    /// eventfd, letting the monitor's pre-sample `epoll_wait` return
    /// within microseconds instead of waiting out the 5 s fallback
    /// timeout. It supersedes an earlier trigger that fired on the
    /// first port-0 TX byte (kernel printk via `/dev/hvc0`) and thus
    /// relied on incidental console traffic rather than an explicit
    /// readiness signal.
    SysRdy,
    /// Scheduler-swap notification, guest→host (payload: empty).
    ///
    /// The guest's `kill_current_scheduler`
    /// (`Op::DetachScheduler` / `RestartScheduler` / `ReplaceScheduler`)
    /// emits it AFTER `wait_for_scx_disabled` returns. By the time the
    /// host observes the frame the kernel has already NULLed
    /// `*scx_root` (`RCU_INIT_POINTER(scx_root, NULL)` precedes
    /// `scx_set_enable_state(SCX_DISABLED)` in kernel/sched/ext.c), so
    /// the prior scx_sched object is unlinked and its slab is subject
    /// to RCU-grace-period reuse. The freeze coordinator decodes a
    /// CRC-valid frame and SYNCHRONOUSLY invalidates the
    /// periodic-capture accessor (mirroring the watchpoint poll's
    /// Detached teardown) instead of waiting up to one SCAN_INTERVAL
    /// for the poll to notice the rebind, which collapses the
    /// post-swap periodic-capture defer window. Coordinator-internal:
    /// carries no test verdict.
    SchedSwapNotify,
}
263
264impl MsgType {
265    /// 32-bit on-wire discriminant for this message type. The value is
266    /// the big-endian ASCII representation of a 4-character tag.
267    pub const fn wire_value(self) -> u32 {
268        match self {
269            MsgType::Stimulus => MSG_TYPE_STIMULUS,
270            MsgType::StepEnd => MSG_TYPE_STEP_END,
271            MsgType::ScenarioStart => MSG_TYPE_SCENARIO_START,
272            MsgType::ScenarioEnd => MSG_TYPE_SCENARIO_END,
273            MsgType::ScenarioPause => MSG_TYPE_SCENARIO_PAUSE,
274            MsgType::ScenarioResume => MSG_TYPE_SCENARIO_RESUME,
275            MsgType::Exit => MSG_TYPE_EXIT,
276            MsgType::TestResult => MSG_TYPE_TEST_RESULT,
277            MsgType::SchedExit => MSG_TYPE_SCHED_EXIT,
278            MsgType::Crash => MSG_TYPE_CRASH,
279            MsgType::PayloadMetrics => MSG_TYPE_PAYLOAD_METRICS,
280            MsgType::Profraw => MSG_TYPE_PROFRAW,
281            MsgType::WprofTrace => MSG_TYPE_WPROF_TRACE,
282            MsgType::WprofTraceChunk => MSG_TYPE_WPROF_TRACE_CHUNK,
283            MsgType::SnapshotRequest => MSG_TYPE_SNAPSHOT_REQUEST,
284            MsgType::SnapshotReply => MSG_TYPE_SNAPSHOT_REPLY,
285            MsgType::KernelOpRequest => MSG_TYPE_KERNEL_OP_REQUEST,
286            MsgType::KernelOpReply => MSG_TYPE_KERNEL_OP_REPLY,
287            MsgType::SysRdy => MSG_TYPE_SYS_RDY,
288            MsgType::SchedSwapNotify => MSG_TYPE_SCHED_SWAP_NOTIFY,
289            MsgType::Stdout => MSG_TYPE_STDOUT,
290            MsgType::Stderr => MSG_TYPE_STDERR,
291            MsgType::SchedLog => MSG_TYPE_SCHED_LOG,
292            MsgType::Lifecycle => MSG_TYPE_LIFECYCLE,
293            MsgType::ExecExit => MSG_TYPE_EXEC_EXIT,
294            MsgType::Dmesg => MSG_TYPE_DMESG,
295            MsgType::ProbeOutput => MSG_TYPE_PROBE_OUTPUT,
296        }
297    }
298
299    /// Reverse the wire mapping. Returns `None` when `value` is not a
300    /// recognised discriminant — callers can either skip the frame or
301    /// surface the unknown tag for diagnostics.
302    pub const fn from_wire(value: u32) -> Option<Self> {
303        match value {
304            MSG_TYPE_STIMULUS => Some(MsgType::Stimulus),
305            MSG_TYPE_STEP_END => Some(MsgType::StepEnd),
306            MSG_TYPE_SCENARIO_START => Some(MsgType::ScenarioStart),
307            MSG_TYPE_SCENARIO_END => Some(MsgType::ScenarioEnd),
308            MSG_TYPE_SCENARIO_PAUSE => Some(MsgType::ScenarioPause),
309            MSG_TYPE_SCENARIO_RESUME => Some(MsgType::ScenarioResume),
310            MSG_TYPE_EXIT => Some(MsgType::Exit),
311            MSG_TYPE_TEST_RESULT => Some(MsgType::TestResult),
312            MSG_TYPE_SCHED_EXIT => Some(MsgType::SchedExit),
313            MSG_TYPE_CRASH => Some(MsgType::Crash),
314            MSG_TYPE_PAYLOAD_METRICS => Some(MsgType::PayloadMetrics),
315            MSG_TYPE_PROFRAW => Some(MsgType::Profraw),
316            MSG_TYPE_WPROF_TRACE => Some(MsgType::WprofTrace),
317            MSG_TYPE_WPROF_TRACE_CHUNK => Some(MsgType::WprofTraceChunk),
318            MSG_TYPE_SNAPSHOT_REQUEST => Some(MsgType::SnapshotRequest),
319            MSG_TYPE_SNAPSHOT_REPLY => Some(MsgType::SnapshotReply),
320            MSG_TYPE_KERNEL_OP_REQUEST => Some(MsgType::KernelOpRequest),
321            MSG_TYPE_KERNEL_OP_REPLY => Some(MsgType::KernelOpReply),
322            MSG_TYPE_SYS_RDY => Some(MsgType::SysRdy),
323            MSG_TYPE_SCHED_SWAP_NOTIFY => Some(MsgType::SchedSwapNotify),
324            MSG_TYPE_STDOUT => Some(MsgType::Stdout),
325            MSG_TYPE_STDERR => Some(MsgType::Stderr),
326            MSG_TYPE_SCHED_LOG => Some(MsgType::SchedLog),
327            MSG_TYPE_LIFECYCLE => Some(MsgType::Lifecycle),
328            MSG_TYPE_EXEC_EXIT => Some(MsgType::ExecExit),
329            MSG_TYPE_DMESG => Some(MsgType::Dmesg),
330            MSG_TYPE_PROBE_OUTPUT => Some(MsgType::ProbeOutput),
331            _ => None,
332        }
333    }
334
335    /// `true` for control frames the freeze coordinator interprets
336    /// internally and that must NOT surface as test verdict entries
337    /// in [`super::host_comms::BulkDrainResult`]. Both the
338    /// coordinator's mid-run `bulk_messages_for_closure` filter and
339    /// `collect_results`'s post-run drain key on this single
340    /// classifier so the gate stays in lockstep — adding a new
341    /// internal control frame is a one-line update here.
342    ///
343    /// The current internal set:
344    ///   - [`MsgType::SnapshotRequest`] — has its matching
345    ///     [`MsgType::SnapshotReply`] delivered over port-1 RX; the
346    ///     request itself carries no test verdict.
347    ///   - [`MsgType::SnapshotReply`] — host→guest only on port-1 RX.
348    ///     A guest TX frame stamped with this tag is illegitimate
349    ///     (only the host coordinator emits replies); drop it instead
350    ///     of bucketing it as a phantom verdict entry. Including the
351    ///     tag in the internal set keeps the dispatch and the
352    ///     `collect_results` post-run drain in lockstep — both filter
353    ///     the same way.
354    ///   - [`MsgType::KernelOpRequest`] — paired with its
355    ///     [`MsgType::KernelOpReply`] over port-1 RX (the cold-path
356    ///     kernel-op roundtrip); the request carries no test verdict.
357    ///   - [`MsgType::KernelOpReply`] — host→guest only on port-1 RX,
358    ///     same illegitimate-guest-TX reasoning as `SnapshotReply`.
359    ///   - [`MsgType::SysRdy`] — its only semantic is the eventfd
360    ///     promotion that releases the monitor's pre-sample
361    ///     `epoll_wait`.
362    ///   - [`MsgType::SchedSwapNotify`] — its only semantic is the
363    ///     synchronous periodic-capture accessor teardown the freeze
364    ///     coordinator performs on a CRC-valid frame; carries no test
365    ///     verdict.
366    pub const fn is_coordinator_internal(self) -> bool {
367        matches!(
368            self,
369            MsgType::SnapshotRequest
370                | MsgType::SnapshotReply
371                | MsgType::KernelOpRequest
372                | MsgType::KernelOpReply
373                | MsgType::SysRdy
374                | MsgType::SchedSwapNotify
375        )
376    }
377}
378
/// Phase identifier stored in the 1-byte header of a
/// [`MsgType::Lifecycle`] payload. It takes over from the
/// `KTSTR_INIT_STARTED` / `KTSTR_PAYLOAD_STARTING` /
/// `SCHEDULER_DIED` / `SCHEDULER_NOT_ATTACHED` sentinels previously
/// written to COM2.
///
/// Only `SchedulerNotAttached` may carry a UTF-8 reason suffix (the
/// bytes after the 1-byte phase header in the TLV payload); for every
/// other variant the suffix is empty.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LifecyclePhase {
    /// Init has started: devtmpfs is mounted and the initramfs is
    /// verified. Counterpart of the legacy `KTSTR_INIT_STARTED`
    /// sentinel.
    InitStarted,
    /// The payload is about to start: guest dispatch is on the point
    /// of invoking the `#[ktstr_test]` body. Counterpart of the legacy
    /// `KTSTR_PAYLOAD_STARTING` sentinel.
    PayloadStarting,
    /// The scheduler process exited during startup. Counterpart of the
    /// legacy `SCHEDULER_DIED` sentinel.
    SchedulerDied,
    /// The scheduler stayed alive yet never attached to sched_ext (BPF
    /// verifier reject, ops mismatch, sysfs absent). Counterpart of
    /// the legacy `SCHEDULER_NOT_ATTACHED:<reason>` sentinel; the
    /// reason suffix occupies the bytes after the 1-byte phase header.
    SchedulerNotAttached,
    /// The injected verifier workload was dispatched: after attach, at
    /// least one worker of the `--ktstr-verifier-workload` run made
    /// forward progress on-CPU (a positive, scheduler-agnostic
    /// dispatch proof). `ktstr_guest_init` Phase 5 emits it only when
    /// a verifier-workload run recorded a worker with non-zero
    /// `iterations` under a confirmed SCHED_EXT policy, so a
    /// fair-class fallback cannot false-confirm. Given a
    /// `PayloadStarting` frame, the ABSENCE of this frame means the
    /// scheduler attached (sched_ext `enabled`) but never dispatched
    /// the workload — a distinct, worse failure than never attaching.
    /// The suffix is empty, and there is no legacy COM2 sentinel
    /// equivalent.
    WorkloadDispatched,
}

impl LifecyclePhase {
    /// Returns the 1-byte on-wire discriminant. The value `0` is held
    /// back as the "unknown / invalid" sentinel: host parsers reject
    /// zero instead of silently mapping it onto a known phase.
    pub const fn wire_value(self) -> u8 {
        match self {
            Self::InitStarted => 1,
            Self::PayloadStarting => 2,
            Self::SchedulerDied => 3,
            Self::SchedulerNotAttached => 4,
            Self::WorkloadDispatched => 5,
        }
    }

    /// Inverse of [`Self::wire_value`]. Yields `None` for `0` (the
    /// reserved sentinel) and for any value with no corresponding
    /// variant, so host parsers can skip and log unknown phases rather
    /// than panic.
    pub const fn from_wire(value: u8) -> Option<Self> {
        let phase = match value {
            1 => Self::InitStarted,
            2 => Self::PayloadStarting,
            3 => Self::SchedulerDied,
            4 => Self::SchedulerNotAttached,
            5 => Self::WorkloadDispatched,
            // Covers the reserved 0 and every unassigned value.
            _ => return None,
        };
        Some(phase)
    }
}
447
448// ---------------------------------------------------------------------------
449// On-wire u32 discriminants
450// ---------------------------------------------------------------------------
451//
452// Kept as `pub const` for callers that compare a parsed frame's
453// `msg_type` field directly (e.g. the freeze coordinator's stream
454// filter). [`MsgType::wire_value`] is the typed entry point; the
455// constants are the same values exposed for raw-byte comparisons.
456
/// StepStart frame: stimulus event from the guest step executor.
pub const MSG_TYPE_STIMULUS: u32 = 0x5354_494D; // "STIM"

/// StepEnd frame from the guest step executor. The body is a
/// [`StimulusPayload`]; see [`MsgType::StepEnd`].
pub const MSG_TYPE_STEP_END: u32 = 0x5354_454E; // "STEN"

/// Marks the start of a scenario.
pub const MSG_TYPE_SCENARIO_START: u32 = 0x5343_5354; // "SCST"

/// Marks the end of a scenario.
pub const MSG_TYPE_SCENARIO_END: u32 = 0x5343_454E; // "SCEN"

/// Pauses the watchdog clock.
pub const MSG_TYPE_SCENARIO_PAUSE: u32 = 0x5343_5050; // "SCPP"

/// Resumes the watchdog clock after a pause.
pub const MSG_TYPE_SCENARIO_RESUME: u32 = 0x5343_5252; // "SCRR"

/// Guest exit code. Payload: 4-byte i32.
pub const MSG_TYPE_EXIT: u32 = 0x4558_4954; // "EXIT"

/// Test result. Payload: postcard-encoded AssertResult.
pub const MSG_TYPE_TEST_RESULT: u32 = 0x5445_5354; // "TEST"

/// Exit of the scheduler process. Payload: 4-byte i32 exit code.
pub const MSG_TYPE_SCHED_EXIT: u32 = 0x5343_4458; // "SCDX"

/// Guest crash diagnostic. Payload: UTF-8 panic + backtrace.
pub const MSG_TYPE_CRASH: u32 = 0x4352_5348; // "CRSH"

/// Metrics for one payload invocation. Payload: postcard-encoded
/// `crate::test_support::PayloadMetrics`.
pub const MSG_TYPE_PAYLOAD_METRICS: u32 = 0x504D_4554; // "PMET"

/// Coverage profraw blob. Payload: raw `.profraw` bytes serialized by
/// `__llvm_profile_write_buffer`.
pub const MSG_TYPE_PROFRAW: u32 = 0x5052_4157; // "PRAW"

/// wprof Perfetto-format trace blob. Payload: raw `.pb` bytes that
/// `/bin/wprof -T trace.pb` produces during auto-repro. The host's
/// freeze coordinator writes the payload to a sibling of the
/// failure-dump file, so the operator finds it under
/// [`crate::test_support::sidecar_dir`] next to the JSON dump.
pub const MSG_TYPE_WPROF_TRACE: u32 = 0x5750_5246; // "WPRF"

/// wprof trace CHUNK, guest→host: one ordered slice of a `.pb` that is
/// too large for a single bulk frame. A [`MSG_TYPE_WPROF_TRACE`] frame
/// carrying the final slice terminates the stream. See
/// [`MsgType::WprofTraceChunk`].
pub const MSG_TYPE_WPROF_TRACE_CHUNK: u32 = 0x5750_5243; // "WPRC"

/// On-demand snapshot request, guest→host. Payload:
/// [`SnapshotRequestPayload`].
pub const MSG_TYPE_SNAPSHOT_REQUEST: u32 = 0x534E_5251; // "SNRQ"

/// On-demand snapshot reply, host→guest. Payload:
/// [`SnapshotReplyPayload`].
pub const MSG_TYPE_SNAPSHOT_REPLY: u32 = 0x534E_5250; // "SNRP"

/// System-ready signal, guest→host (payload: empty).
///
/// The hex digits spell `"SRDY"`; the on-wire bytes (LE) are
/// `0x59 0x44 0x52 0x53`, i.e. `"YDRS"` byte-by-byte. The freeze
/// coordinator's bulk-drain dispatch promotes a CRC-valid
/// `MSG_TYPE_SYS_RDY` frame into the monitor's boot-complete eventfd.
/// The protocol contract is documented on [`MsgType::SysRdy`].
pub const MSG_TYPE_SYS_RDY: u32 = 0x5352_4459; // "SRDY"

/// Scheduler-swap notification, guest→host (payload: empty).
///
/// The hex digits spell `"SCSW"` (SCheduler SWap); the on-wire bytes
/// (LE) are `0x57 0x53 0x43 0x53`, i.e. `"WSCS"` byte-by-byte. The
/// guest's `kill_current_scheduler` emits the frame after
/// `wait_for_scx_disabled` returns (so `*scx_root` is already NULL),
/// and the freeze coordinator synchronously invalidates the
/// periodic-capture accessor on a CRC-valid frame. The protocol
/// contract is documented on [`MsgType::SchedSwapNotify`].
pub const MSG_TYPE_SCHED_SWAP_NOTIFY: u32 = 0x5343_5357; // "SCSW"

/// One chunk of guest stdout, guest→host (payload: opaque UTF-8
/// bytes).
///
/// Supersedes the earlier COM2 stdout redirect. The guest dups fd 1
/// onto the write-end of an internal pipe, and a forwarder thread
/// chunks the pipe's read-end into TLV frames bounded by
/// [`super::bulk::MAX_BULK_FRAME_PAYLOAD`].
pub const MSG_TYPE_STDOUT: u32 = 0x534F_5554; // "SOUT"

/// One chunk of guest stderr, guest→host (payload: opaque UTF-8
/// bytes).
///
/// The chunked redirect works as for [`MSG_TYPE_STDOUT`], but on fd 2.
pub const MSG_TYPE_STDERR: u32 = 0x5345_5252; // "SERR"

/// One chunk of the scheduler log, guest→host (payload: opaque UTF-8
/// bytes).
///
/// Supersedes the earlier COM2 SCHED_OUTPUT_START/END dump in
/// `dump_sched_output`. The host concatenates chunks in arrival order;
/// the embedded `SCHED_OUTPUT_START` / `SCHED_OUTPUT_END` markers and
/// the BPF verifier section travel verbatim inside the chunk bytes.
pub const MSG_TYPE_SCHED_LOG: u32 = 0x5343_4C47; // "SCLG"

/// Lifecycle phase event, guest→host.
///
/// Payload layout: a 1-byte [`LifecyclePhase`] discriminant, then an
/// optional UTF-8 reason buffer (the suffix detail of
/// `SchedulerNotAttached`; empty for every other phase). Supersedes
/// the COM2 `KTSTR_INIT_STARTED` / `KTSTR_PAYLOAD_STARTING` /
/// `SCHEDULER_DIED` / `SCHEDULER_NOT_ATTACHED` sentinel strings.
pub const MSG_TYPE_LIFECYCLE: u32 = 0x4C49_4645; // "LIFE"

/// Kernel address parameters, guest→host (payload: 24 bytes LE).
///
/// The frame is sent BEFORE `MSG_TYPE_SYS_RDY`, which gives the
/// monitor `phys_base` and `page_offset_base` ahead of its first
/// sample iteration. The 24-byte payload is encoded by
/// [`KernAddrs::to_payload`] as:
///   `[phys_base + 1 : u64 LE, page_offset_base : u64 LE, kernel_text_runtime_kva + 1 : u64 LE]`
/// The guest reads the values from `/proc/iomem` and `/proc/kallsyms`
/// after `mount_filesystems`. By then `__startup_64`,
/// `kernel_randomize_memory`, `cpu_init → syscall_init`, and the
/// post-relocation kallsyms table population have all run, so the
/// values are final regardless of KASLR configuration.
pub const MSG_TYPE_KERN_ADDRS: u32 = 0x4B41_4452; // "KADR"
582
/// Typed payload for [`MSG_TYPE_KERN_ADDRS`].
///
/// Three u64 fields published by the guest at boot so the host can
/// translate kernel virtual addresses without walking guest page
/// tables and recover the virt-KASLR slide without a separate
/// in-VMM derivation. The wire layout uses bias-by-1 on the
/// `phys_base` and `kernel_text_runtime_kva` slots so 0 stays the
/// "not yet received / could not derive" sentinel; `page_offset_base`
/// is unbiased (today the guest always sends 0 and the host
/// re-derives via page-table walk — left in the layout for a future
/// extension that bypasses the walk).
///
/// NOTE(review): the field doc on `page_offset_base` below describes a
/// kallsyms-derived symbol KVA, which disagrees with "always sends 0"
/// above — confirm which statement is current.
///
/// Wire layout (24 bytes, three little-endian u64 slots):
///   - bytes `0..8`: `phys_base + 1` (wrapping)
///   - bytes `8..16`: `page_offset_base`, unbiased
///   - bytes `16..24`: `kernel_text_runtime_kva + 1` (wrapping), or
///     `0` for `None`
///
/// Constructors:
///   - [`Self::new`]: bare fields, no sentinel logic. Used by
///     [`crate::vmm::guest_comms::send_kern_addrs`] on the guest
///     side.
///   - [`Self::from_payload`]: decodes a 24-byte payload, strips
///     the +1 bias on the biased slots, validates the length. Used
///     by the host dispatch arm in
///     `crate::vmm::freeze_coord::dispatch::dispatch_bulk_message`.
///
/// Field semantics:
///   - `phys_base = 0` is a legitimate KASLR-off value (the
///     payload encodes it biased as `1`, decoder strips back to
///     `0`). [`Self::has_phys_present_bit`] reports whether the
///     guest sent a non-zero biased phys_base (i.e. the payload
///     carries phys_base data at all).
///   - `kernel_text_runtime_kva` is wrapped in `Option<u64>` so
///     the decoder distinguishes "guest could not read kallsyms"
///     (`None`) from "guest read kallsyms and KASLR is off"
///     (`Some(link_kva)`). The bias-by-1 encoding handles the
///     former (biased 0 → `None`); a non-zero biased value
///     decodes to `Some(raw)`.
///
/// Derives `PartialEq`/`Eq` so an encode/decode roundtrip can be
/// asserted by value instead of field by field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct KernAddrs {
    /// Guest-derived `phys_base` (the KASLR-physical slide), or 0
    /// when KASLR-physical is off (`__startup_64` left
    /// `phys_base = 0`). Compare with the host's expected
    /// load-address to recover the physical KASLR offset.
    pub phys_base: u64,
    /// Symbol KVA of the guest's `page_offset_base` global (NOT the
    /// runtime value the symbol points at — host dereferences via
    /// `monitor::symbols::text_kva_to_pa_with_base` + `read_u64` once
    /// it has `phys_base` resolved). Read by
    /// `crate::vmm::guest_comms::read_kernel_page_offset_base_from_kallsyms`
    /// (called from `vmm::rust_init::init`) from `/proc/kallsyms`.
    /// Storage class: `.data..ro_after_init` per
    /// `arch/x86/kernel/head64.c:63` — written during
    /// `kernel_randomize_memory()` in `start_kernel`, frozen after
    /// `mark_rodata_ro`. `0` means (a) arm64 (no `page_offset_base`
    /// global — `PAGE_OFFSET` is compile-time per
    /// `arch/arm64/include/asm/memory.h:43-45`), (b)
    /// CONFIG_RANDOMIZE_MEMORY=n (symbol absent), or (c) kallsyms
    /// unreadable (kptr_restrict elevated); the host falls back to
    /// `resolve_page_offset_with_tcr` (the page-table walk) in
    /// every 0 case.
    pub page_offset_base: u64,
    /// Runtime KVA of `_text` (the kernel image start symbol)
    /// from the guest's `/proc/kallsyms`, when readable. The
    /// host derives `virt_kaslr = runtime - link_text_kva` using
    /// the link-time KVA extracted from vmlinux at coordinator
    /// init. `None` when the guest could not read kallsyms
    /// (kptr_restrict masked, /proc not mountable, symbol absent).
    pub kernel_text_runtime_kva: Option<u64>,
}

impl KernAddrs {
    /// Wire-format byte length. Exact-match check on the receive
    /// side so a future payload extension trips a decoder
    /// rejection rather than silently dropping the new bytes.
    pub const WIRE_LEN: usize = 24;

    /// Construct from bare field values. Caller owns the
    /// "did I read kallsyms?" decision via the `Option` on
    /// `kernel_text_runtime_kva`.
    pub fn new(
        phys_base: u64,
        page_offset_base: u64,
        kernel_text_runtime_kva: Option<u64>,
    ) -> Self {
        Self {
            phys_base,
            page_offset_base,
            kernel_text_runtime_kva,
        }
    }

    /// Encode to a 24-byte LE payload with `+1` bias on the
    /// biased slots. Caller transmits this on the wire. Takes
    /// `self` by value since [`Self`] is `Copy` and the encoder
    /// reads each field at most once.
    ///
    /// Both biases use wrapping arithmetic, so a field value of
    /// `u64::MAX` encodes as the raw-0 sentinel and decodes as
    /// "absent".
    pub fn to_payload(self) -> [u8; Self::WIRE_LEN] {
        // `None` maps to the raw 0 sentinel (guest could not derive);
        // `Some(kva)` is biased by +1 so it never collides with it
        // (except for the `u64::MAX` wrap noted above).
        let runtime_biased = self
            .kernel_text_runtime_kva
            .map_or(0, |kva| kva.wrapping_add(1));

        let mut buf = [0u8; Self::WIRE_LEN];
        buf[..8].copy_from_slice(&self.phys_base.wrapping_add(1).to_le_bytes());
        buf[8..16].copy_from_slice(&self.page_offset_base.to_le_bytes());
        buf[16..24].copy_from_slice(&runtime_biased.to_le_bytes());
        buf
    }

    /// Decode from a wire payload. Returns `None` on length
    /// mismatch (exact match required — short payloads never
    /// publish either slot to avoid a partial-init race; longer
    /// payloads indicate a protocol extension the decoder
    /// doesn't understand).
    pub fn from_payload(payload: &[u8]) -> Option<Self> {
        // The slice → fixed-size-array conversion succeeds only when
        // the length matches exactly, so it doubles as the length gate.
        let bytes: &[u8; Self::WIRE_LEN] = payload.try_into().ok()?;

        // Reads the `index`-th little-endian u64 slot of the payload.
        let slot = |index: usize| {
            let start = index * 8;
            let mut word = [0u8; 8];
            word.copy_from_slice(&bytes[start..start + 8]);
            u64::from_le_bytes(word)
        };

        let phys_biased = slot(0);
        let page_offset_base = slot(1);
        let runtime_biased = slot(2);

        Some(Self {
            // biased 0 means "guest didn't send" — but the
            // unbiased phys_base = 0 is legitimate (KASLR off).
            // `has_phys_present_bit` distinguishes the two on the
            // host side (biased 0 decodes to `u64::MAX` here).
            phys_base: phys_biased.wrapping_sub(1),
            page_offset_base,
            // `checked_sub` yields `None` exactly for the raw-0
            // sentinel and strips the +1 bias otherwise.
            kernel_text_runtime_kva: runtime_biased.checked_sub(1),
        })
    }

    /// True iff the encoded payload had a non-zero biased
    /// `phys_base` slot (i.e. the guest sent phys_base data).
    /// Distinguishes "guest sent phys_base = 0" (KASLR off, valid)
    /// from "guest didn't send phys_base at all" (truncated wire
    /// path, treat as absent). Computed from the post-decode
    /// `phys_base` field: encoded `phys_biased = phys_base + 1`
    /// is non-zero iff `phys_base != u64::MAX`. Wrap-around case
    /// (`phys_base = u64::MAX` encodes to biased 0) is impossible
    /// in practice — kernel `phys_base` is a low physical address,
    /// never the all-ones sentinel.
    pub fn has_phys_present_bit(&self) -> bool {
        self.phys_base != u64::MAX
    }
}
728
/// Guest→host shell-exec exit code (payload: 4-byte LE i32).
///
/// Replaces the prior COM2 `KTSTR_EXEC_EXIT=N` sentinel line
/// emitted by `cargo ktstr shell --exec <cmd>`.
///
/// The value is the four-character tag from the trailing comment,
/// packed as ASCII with the first character in the most-significant
/// byte.
pub const MSG_TYPE_EXEC_EXIT: u32 = 0x4558_4358; // "EXCX"

/// Guest→host kernel ring-buffer dump (payload: opaque UTF-8 bytes).
///
/// Sent on the initramfs-extraction failure path so the host sees
/// the kernel OOM messages without scraping COM2.
///
/// The value is the ASCII tag from the trailing comment, first
/// character in the most-significant byte.
pub const MSG_TYPE_DMESG: u32 = 0x444d_5347; // "DMSG"

/// Guest→host probe-pipeline JSON output (payload: opaque UTF-8
/// bytes).
///
/// Replaces the prior COM2 ProbeDrain path so probe output and
/// scheduler-log dumps stop interleaving on the same serial port.
///
/// The value is the ASCII tag from the trailing comment, first
/// character in the most-significant byte.
pub const MSG_TYPE_PROBE_OUTPUT: u32 = 0x5052_4f42; // "PROB"

/// Guest→host kernel-memory write/read op request (payload:
/// postcard-encoded [`KernelOpRequestPayload`]).
///
/// Carries an [`Op::WriteKernelHot`](crate::scenario::ops::Op::WriteKernelHot)
/// / [`Op::WriteKernelCold`](crate::scenario::ops::Op::WriteKernelCold)
/// / [`Op::ReadKernelHot`](crate::scenario::ops::Op::ReadKernelHot)
/// / [`Op::ReadKernelCold`](crate::scenario::ops::Op::ReadKernelCold)
/// request from the guest's step executor to the host coordinator.
/// Variable-length payload (target + value bytes do not fit in the
/// 72-byte [`SnapshotRequestPayload`]), so this rides a distinct
/// MSG_TYPE_* with a postcard-encoded body rather than extending the
/// fixed-size snapshot envelope.
///
/// The value is the ASCII tag from the trailing comment, first
/// character in the most-significant byte; it differs from
/// [`MSG_TYPE_KERNEL_OP_REPLY`] only in the last character.
pub const MSG_TYPE_KERNEL_OP_REQUEST: u32 = 0x4b4f_5251; // "KORQ"

/// Host→guest reply to a [`MSG_TYPE_KERNEL_OP_REQUEST`] (payload:
/// postcard-encoded [`KernelOpReplyPayload`]). Echoes the request id
/// the guest stamped, carries the status + reason + (for reads) the
/// value bytes the host coordinator read.
///
/// The value is the ASCII tag from the trailing comment, first
/// character in the most-significant byte.
pub const MSG_TYPE_KERNEL_OP_REPLY: u32 = 0x4b4f_5250; // "KORP"
767
768// ---------------------------------------------------------------------------
769// ShmMessage — TLV header
770// ---------------------------------------------------------------------------
771
/// 16-byte TLV header preceding each payload on the wire.
///
/// Used as the framing header for the bulk virtio-console port-1
/// channel; the type name `ShmMessage` is retained from the
/// predecessor SHM ring transport (now removed in favour of the
/// virtio-console port). CRC32 covers payload bytes only (not the
/// header).
///
/// SAFETY: `repr(C)` with four `u32` fields produces a 16-byte struct
/// with no padding (every field is 4-aligned). `_pad` is reserved for
/// future schema use; current writers MUST set it to 0 and current
/// readers ignore it. zerocopy derives produce no panics — every bit
/// pattern is valid for `u32`.
///
/// Field order is the on-wire order; do not reorder.
#[repr(C)]
#[derive(
    Clone, Copy, Default, Debug, FromBytes, IntoBytes, zerocopy::Immutable, zerocopy::KnownLayout,
)]
pub struct ShmMessage {
    /// Message type discriminant — presumably one of the `MSG_TYPE_*`
    /// constants in this module; confirm against the framing code.
    pub msg_type: u32,
    /// Payload length — presumably the byte count of the payload that
    /// follows this header (header excluded); confirm against the
    /// framing code.
    pub length: u32,
    /// CRC32 of the payload bytes only; the header is not covered.
    pub crc32: u32,
    /// Reserved. Writers MUST set this to 0; readers ignore it.
    pub _pad: u32,
}

// Compile-time guard: the header must stay exactly 16 bytes.
const _SHM_MESSAGE_SIZE: () = assert!(std::mem::size_of::<ShmMessage>() == 16);

/// Size in bytes of the on-wire [`ShmMessage`] header.
pub const FRAME_HEADER_SIZE: usize = std::mem::size_of::<ShmMessage>();
800
801// ---------------------------------------------------------------------------
802// ShmEntry — parsed TLV entry
803// ---------------------------------------------------------------------------
804
/// A single parsed message extracted from the bulk byte stream.
///
/// `crc_ok` is `true` when the recomputed payload CRC matched the
/// guest's stored value. CRC mismatches do not stop the walk — the
/// parser yields the entry with `crc_ok=false` and continues with the
/// next frame. Downstream consumers may filter on `crc_ok` to drop
/// corrupted entries.
#[derive(Debug, Clone)]
pub struct ShmEntry {
    /// Message type of the frame — presumably copied from
    /// [`ShmMessage::msg_type`]; confirm against the parser.
    pub msg_type: u32,
    /// Owned copy of the frame's payload bytes. May be corrupted when
    /// `crc_ok` is `false`.
    pub payload: Vec<u8>,
    /// `true` when the recomputed payload CRC matched the on-wire CRC.
    pub crc_ok: bool,
}
819
820// ---------------------------------------------------------------------------
821// Stimulus payload — guest step executor → host
822// ---------------------------------------------------------------------------
823
/// Payload for stimulus events written by the guest step executor.
///
/// Compact 24-byte struct describing the state after each step's ops
/// are applied. The host correlates these with monitor samples to map
/// scheduler telemetry to scenario phases.
///
/// `repr(C)` with every field naturally aligned, so there is no
/// implicit padding and the field order below is the byte layout
/// (offsets noted per field; total size pinned by the const assert).
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, IntoBytes, zerocopy::Immutable, zerocopy::KnownLayout)]
pub struct StimulusPayload {
    /// Milliseconds since scenario start. Bytes `0..4`.
    pub elapsed_ms: u32,
    /// Index of the step that was just applied. Bytes `4..6`.
    pub step_index: u16,
    /// Number of ops applied in this step. Bytes `6..8`.
    pub op_count: u16,
    /// Bitmask of Op variant discriminants present in this step.
    /// Bytes `8..12`.
    pub op_kinds: u32,
    /// Number of live cgroups after this step: sum of step-local
    /// cgroups (from the current Step's `CgroupDef`s + `Op`s) and
    /// Backdrop-owned cgroups that persist across every Step.
    /// Bytes `12..14`.
    pub cgroup_count: u16,
    /// Total worker handles after this step: sum of step-local
    /// workers and Backdrop-spawned workers that persist across
    /// every Step. Bytes `14..16`.
    pub worker_count: u16,
    /// Sum of all workers' iteration counts at this step boundary.
    /// Read from shared MAP_SHARED counters in the step executor.
    /// Bytes `16..24`.
    pub total_iterations: u64,
}

// Compile-time guard: the payload must stay exactly 24 bytes.
const _STIMULUS_SIZE: () = assert!(std::mem::size_of::<StimulusPayload>() == 24);
854
855/// Deserialized stimulus event.
856#[derive(Debug, Clone)]
857pub struct StimulusEvent {
858    pub elapsed_ms: u32,
859    pub step_index: u16,
860    pub op_count: u16,
861    pub op_kinds: u32,
862    pub cgroup_count: u16,
863    pub worker_count: u16,
864    pub total_iterations: u64,
865}
866
867impl StimulusEvent {
868    /// Deserialize from raw payload bytes. Requires EXACTLY a
869    /// [`StimulusPayload`]-sized (24-byte) buffer — a shorter buffer would
870    /// truncate and a longer one would carry trailing bytes the guest
871    /// never frames (`send_stimulus`/`send_step_end` always write exactly
872    /// 24 bytes), so both are rejected (matching [`KernAddrs::from_payload`]'s
873    /// exact-length gate). A torn or hostile oversized frame is dropped
874    /// rather than promoted by reading a 24-byte prefix.
875    pub fn from_payload(data: &[u8]) -> Option<Self> {
876        if data.len() != std::mem::size_of::<StimulusPayload>() {
877            return None;
878        }
879        Some(StimulusEvent {
880            elapsed_ms: u32::from_ne_bytes(data[0..4].try_into().ok()?),
881            step_index: u16::from_ne_bytes(data[4..6].try_into().ok()?),
882            op_count: u16::from_ne_bytes(data[6..8].try_into().ok()?),
883            op_kinds: u32::from_ne_bytes(data[8..12].try_into().ok()?),
884            cgroup_count: u16::from_ne_bytes(data[12..14].try_into().ok()?),
885            worker_count: u16::from_ne_bytes(data[14..16].try_into().ok()?),
886            total_iterations: u64::from_ne_bytes(data[16..24].try_into().ok()?),
887        })
888    }
889}
890
/// Byte length of the [`MsgType::ScenarioEnd`] payload: a pair of
/// little-endian `u64`s — the scenario-relative elapsed milliseconds,
/// then the final cumulative worker iteration count.
pub const SCENARIO_END_PAYLOAD_SIZE: usize = 16;

/// Decode the [`MsgType::ScenarioEnd`] payload produced by
/// [`crate::vmm::guest_comms::send_scenario_end`].
///
/// Returns `(elapsed_ms, total_iterations)`: `elapsed_ms` is a LE
/// `u64`, scenario-relative; `total_iterations` is a LE `u64`, the
/// cumulative worker iteration count summed across every live handle
/// at the LAST step's end. That count is the right boundary the final
/// step's `iteration_rate` delta needs — the host folds it into a
/// synthetic terminal [`crate::timeline::StimulusEvent`] (see
/// [`crate::timeline::StimulusEvent::terminal`]).
///
/// Yields `None` for a short/torn payload so a CRC-bad or truncated
/// frame is skipped rather than misread. Only the first
/// [`SCENARIO_END_PAYLOAD_SIZE`] bytes are inspected; any trailing
/// bytes are ignored.
pub fn parse_scenario_end(payload: &[u8]) -> Option<(u64, u64)> {
    // `get` returns `None` when fewer than 16 bytes are available.
    let head = payload.get(..SCENARIO_END_PAYLOAD_SIZE)?;
    let (elapsed_bytes, iteration_bytes) = head.split_at(8);
    Some((
        u64::from_le_bytes(elapsed_bytes.try_into().ok()?),
        u64::from_le_bytes(iteration_bytes.try_into().ok()?),
    ))
}
915
916// ---------------------------------------------------------------------------
917// Snapshot request/reply TLV payloads
918// ---------------------------------------------------------------------------
919
/// Maximum length, in bytes, of a snapshot tag (capture name or
/// watchpoint symbol path) carried inside the
/// [`SnapshotRequestPayload`]. Tags longer than this bound are
/// truncated by the guest before publishing; the host treats the
/// first NUL as the boundary, or stops at this size if no NUL is
/// present.
pub const SNAPSHOT_TAG_MAX: usize = 64;

/// Maximum length, in bytes, of a host-supplied reason string carried
/// inside the [`SnapshotReplyPayload`]. Same semantics as the tag
/// buffer (NUL-terminated when shorter, truncated when longer). Sized
/// to hold typed-Err diagnostics that name the failing condition
/// (e.g. `kaslr_offset == 0`, `kern_virt_kaslr` Arc state) PLUS the
/// failing symbol + KVA PLUS the actionable remediation tip (e.g.
/// `set #[ktstr_test(kaslr = false)]`). The longest such diagnostic
/// today — Fix C's high-half/zero-offset rejection at
/// `crate::vmm::freeze_coord::snapshot::arm_user_watchpoint` — is
/// ~343 bytes when rendered with a typical symbol + KVA; 512 gives
/// ~170 bytes of headroom for future diagnostics. The original
/// 64-byte buffer and an intermediate 256-byte size both truncated
/// this message before the remediation tail.
pub const SNAPSHOT_REASON_MAX: usize = 512;

/// Snapshot request kind: no request pending. Used as the sentinel
/// value for an uninitialised request slot (this discriminant must
/// not appear on the wire — the framing of a TLV with
/// `MSG_TYPE_SNAPSHOT_REQUEST` already implies a request).
pub const SNAPSHOT_KIND_NONE: u32 = 0;

/// Snapshot request kind: capture-now. The host runs
/// `freeze_and_dispatch(FreezeMode::Capture { gate_on_exit_kind: false })`
/// and stores the resulting `FailureDumpReport` on the bridge keyed
/// by the request tag.
pub const SNAPSHOT_KIND_CAPTURE: u32 = 1;

/// Snapshot request kind: hardware-watchpoint registration. The host
/// resolves the symbol path through the vmlinux ELF symtab,
/// allocates a free user watchpoint slot, programs the hardware
/// watchpoint via `KVM_SET_GUEST_DEBUG`, and replies. A future
/// guest write to the resolved KVA fires the corresponding debug
/// exit and synthesises a snapshot tagged by the symbol.
pub const SNAPSHOT_KIND_WATCH: u32 = 2;

/// Reply status: success — the host completed the requested action
/// (capture stored, or watchpoint armed).
///
/// Note the status values start at 1; no status constant is defined
/// for 0 in this module.
pub const SNAPSHOT_STATUS_OK: u32 = 1;

/// Reply status: failure — the host rejected or could not complete
/// the request. The reason buffer carries a UTF-8 diagnostic.
pub const SNAPSHOT_STATUS_ERR: u32 = 2;
969
/// Outcome of a guest-driven snapshot request: ok, error with reason,
/// or transport failure (port unavailable / not in guest / timeout).
///
/// `HostError` means a reply arrived and reported failure;
/// `TransportError` means no usable reply was obtained at all.
#[derive(Debug)]
pub enum SnapshotRequestResult {
    /// Host completed the request. For
    /// [`SNAPSHOT_KIND_CAPTURE`] this means the report
    /// was stored on the bridge under the supplied tag; for
    /// [`SNAPSHOT_KIND_WATCH`] this means the hardware
    /// watchpoint was armed.
    Ok,
    /// Host accepted the request but completed it as a failure. The
    /// reason carries the host-supplied diagnostic text (truncated to
    /// [`SNAPSHOT_REASON_MAX`] bytes).
    HostError { reason: String },
    /// Transport failed (called from host context, port not yet open,
    /// host did not reply within `timeout`, malformed reply frame).
    /// The supplied diagnostic names the underlying cause.
    TransportError { reason: String },
}
989
/// Snapshot request payload (72 bytes).
///
/// Sent guest→host as the payload of a [`MsgType::SnapshotRequest`]
/// frame on virtio-console port 1 TX. The guest fills every field
/// before publishing; the trailing zeros in `tag` form the NUL
/// terminator when the supplied tag is shorter than
/// [`SNAPSHOT_TAG_MAX`].
///
/// SAFETY: `repr(C)` with `u32 + u32 + [u8; 64]` produces a 72-byte
/// struct with no padding (every field is naturally aligned;
/// trailing array of `u8` requires no end-of-struct padding).
/// Every bit pattern is valid for `u32` and `u8`. zerocopy derives
/// produce no panics.
///
/// Field order is the on-wire order; do not reorder.
#[repr(C)]
#[derive(Copy, Clone, Debug, FromBytes, IntoBytes, zerocopy::Immutable, zerocopy::KnownLayout)]
pub struct SnapshotRequestPayload {
    /// Monotonic request id the guest stamped before publishing.
    /// The host echoes this value into the matching
    /// [`SnapshotReplyPayload::request_id`] so the guest's blocking
    /// reader can pair against the original request.
    pub request_id: u32,
    /// Request kind: one of [`SNAPSHOT_KIND_CAPTURE`] /
    /// [`SNAPSHOT_KIND_WATCH`]. [`SNAPSHOT_KIND_NONE`] is invalid on
    /// the wire — the host rejects it with [`SNAPSHOT_STATUS_ERR`].
    pub kind: u32,
    /// Tag — UTF-8, NUL-terminated when shorter than the buffer;
    /// truncated to [`SNAPSHOT_TAG_MAX`] when longer. For
    /// [`SNAPSHOT_KIND_CAPTURE`] the tag is the snapshot name (key
    /// the bridge stores the report under); for
    /// [`SNAPSHOT_KIND_WATCH`] the tag is the symbol path the host
    /// resolves through vmlinux ELF.
    pub tag: [u8; SNAPSHOT_TAG_MAX],
}

// Compile-time guard: two u32 header fields (8 bytes) plus the tag
// buffer, with no padding.
const _SNAPSHOT_REQUEST_PAYLOAD_SIZE: () =
    assert!(std::mem::size_of::<SnapshotRequestPayload>() == 8 + SNAPSHOT_TAG_MAX);
1026
/// Snapshot reply payload (520 bytes: `u32 request_id + u32 status + [u8; 512] reason`).
///
/// Sent host→guest as the payload of a [`MsgType::SnapshotReply`]
/// frame on virtio-console port 1 RX. Mirrors the request layout —
/// the guest matches `request_id` against its outstanding request
/// and reads `status`/`reason` to surface the host's verdict.
///
/// SAFETY: identical layout reasoning as [`SnapshotRequestPayload`]:
/// `repr(C)` with two `u32`s followed by a `u8` array has no padding,
/// and every bit pattern is valid for `u32` and `u8`.
///
/// Field order is the on-wire order; do not reorder.
#[repr(C)]
#[derive(Copy, Clone, Debug, FromBytes, IntoBytes, zerocopy::Immutable, zerocopy::KnownLayout)]
pub struct SnapshotReplyPayload {
    /// Echo of the request's `request_id`. The guest's blocking
    /// reader spins until it observes this value match its
    /// outstanding request.
    pub request_id: u32,
    /// Reply status: [`SNAPSHOT_STATUS_OK`] when the host completed
    /// the request, [`SNAPSHOT_STATUS_ERR`] otherwise.
    pub status: u32,
    /// Reason — UTF-8, NUL-terminated when shorter than the buffer;
    /// truncated to [`SNAPSHOT_REASON_MAX`] when longer. Empty
    /// (all-zero) on the success path.
    pub reason: [u8; SNAPSHOT_REASON_MAX],
}

// Compile-time guard: two u32 header fields (8 bytes) plus the reason
// buffer, with no padding.
const _SNAPSHOT_REPLY_PAYLOAD_SIZE: () =
    assert!(std::mem::size_of::<SnapshotReplyPayload>() == 8 + SNAPSHOT_REASON_MAX);
1053
1054// ---------------------------------------------------------------------------
1055// KernelOp request/reply payloads (postcard-encoded, variable-length)
1056// ---------------------------------------------------------------------------
1057
/// Hot/cold orchestration discriminant for kernel-memory ops on the
/// wire. Encoded inside [`KernelOpRequestPayload`].
///
/// NOTE(review): postcard identifies enum variants by declaration
/// index, so reordering the variants presumably changes the wire
/// format — confirm before reordering.
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum KernelOpMode {
    /// Hot: dispatched on a host worker thread without freeze
    /// rendezvous. Mirrors `Op::WriteKernelHot` / `Op::ReadKernelHot`
    /// orchestration. Caller is responsible for guest-side sync.
    Hot,
    /// Cold: dispatched inside a freeze rendezvous with every vCPU
    /// parked. Mirrors `Op::WriteKernelCold` / `Op::ReadKernelCold`
    /// orchestration. Coherent with respect to guest state.
    Cold,
}
1071
/// Direction discriminant: write vs read. Inside
/// [`KernelOpRequestPayload`] the kind picks WHICH `GuestKernel::*`
/// method family the host dispatcher invokes.
///
/// NOTE(review): postcard identifies enum variants by declaration
/// index, so reordering the variants presumably changes the wire
/// format — confirm before reordering.
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum KernelOpDirection {
    /// Write: each entry's [`KernelOpEntry::value`] carries the bytes
    /// to write; reply carries success/error and the per-write byte
    /// count.
    Write,
    /// Read: each entry's [`KernelOpEntry::value`] is only a
    /// value-width hint (its payload is ignored); reply carries the
    /// bytes read in `KernelOpReplyPayload::read_values`.
    Read,
}
1084
/// Wire-encoded [`crate::scenario::ops::KernelTarget`] variant tag.
/// Mirrors the `KernelTarget` enum variants 1:1; postcard encodes
/// the tag + the variant payload that follows in [`KernelOpTarget`].
///
/// NOTE(review): postcard identifies enum variants by declaration
/// index, so reordering the variants presumably changes the wire
/// format — confirm before reordering.
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum KernelOpTarget {
    /// Kernel symbol (text/data/bss), resolved at dispatch via
    /// runtime kernel image base + KASLR.
    Symbol(String),
    /// Direct-mapped KVA; translated via `kva - PAGE_OFFSET`.
    Direct(u64),
    /// Vmalloc'd KVA; translated via page-table walk through CR3.
    Kva(u64),
    /// Per-CPU field of a kernel struct. Resolved at dispatch via
    /// `symbol_kva + __per_cpu_offset[cpu] + BTF byte offset of field`.
    PerCpuField {
        /// Symbol naming the per-CPU template (e.g. `"runqueues"`).
        symbol: String,
        /// Field within the symbol's struct (e.g. `"clock"`).
        field: String,
        /// CPU index whose per-CPU instance to address.
        cpu: u32,
    },
    /// Per-task field of a `struct task_struct` — SCX-managed tasks
    /// only. Resolved at dispatch time by walking `init_task.tasks`
    /// plus each leader's `signal->thread_head` to locate the
    /// `task_struct *` whose `pid` matches AND whose `start_time`
    /// matches `expected_start_time_ns` (anti-PID-reuse identity
    /// guard), then adding the BTF-resolved nested-path byte offset
    /// of `field` within `struct task_struct`.
    ///
    /// `pid` is the GUEST-side `pid_t` (positive). Both
    /// thread-group leaders AND non-leader threads are addressable:
    /// the walker iterates leaders via `for_each_process` semantics
    /// (`include/linux/sched/signal.h:639`), and for each leader
    /// also walks `leader->signal->thread_head` via
    /// `for_each_thread` semantics (same header, lines 654-659).
    ///
    /// `expected_start_time_ns` is the value `task->start_time` had
    /// at WorkSpec spawn time. The kernel sets `start_time` once via
    /// `ktime_get_ns()` in `kernel/fork.c::copy_process`
    /// (`include/linux/sched.h:1127`); the value never changes
    /// after that. Caller records it at spawn time (e.g. via
    /// `/proc/<pid>/stat` field 22 + sysconf-to-ns conversion).
    /// The dispatcher rejects writes when the observed
    /// `task->start_time` differs — catches the PID-reuse hazard
    /// where the original worker exited and the kernel recycled
    /// the PID for an unrelated task.
    ///
    /// `field` is a dot-separated nested-member path. **SCX-only**:
    /// the dispatcher's class gate accepts ONLY tasks whose
    /// `sched_class` is `ext_sched_class`. Recommended fields:
    /// - `"scx.dsq_vtime"` — SCX DSQ priority-queue ordering key;
    ///   preserved across dequeue/enqueue cycles
    ///   (`kernel/sched/ext.c`).
    /// - `"start_boottime"` — task fork timestamp; observable in
    ///   `/proc/<pid>/stat` field 22.
    ///
    /// **DO NOT** write `"se.vruntime"` — EEVDF's `place_entity`
    /// (`kernel/sched/fair.c:5329-5414`, since 6.6) overwrites
    /// `se->vruntime` on every enqueue via `avg_vruntime(cfs_rq) -
    /// se->vlag`. Direct vruntime writes are silently discarded for
    /// sleeping tasks (which is our validation gate). TaskField
    /// rejects non-SCX tasks before reaching this field anyway.
    ///
    /// Eight-layer task validation before any write/read lands
    /// (layer 7 is retired, so seven gates are active):
    /// 1. `task->pid == requested_pid` (anti-mismatch),
    /// 2. `task->start_time` within
    ///    `[expected_start_time_ns, expected_start_time_ns + 10ms)`
    ///    (anti-PID-reuse identity; the 10ms window absorbs the
    ///    `/proc/<pid>/stat` CLK_TCK quantization since the kernel's
    ///    `start_time` carries sub-tick ns precision while the
    ///    caller's value is rounded down to a tick boundary),
    /// 3. `task->__state & TASK_DEAD == 0` (lifetime),
    /// 4. `task->on_rq == 0` (rb-tree / DSQ ordering safety per
    ///    `task_on_rq_queued` at `kernel/sched/sched.h:2399`),
    /// 5. `task->scx.dsq == NULL` AND `task->scx.runnable_node` is
    ///    list-empty (SCX maintains `runnable_node` linkage
    ///    independent of dsq pointer per
    ///    `include/linux/sched/ext.h:227`),
    /// 6. `task->sched_class == &ext_sched_class` (the canonical
    ///    SCX-managed gate),
    /// 7. (REMOVED) a former `task->policy == SCHED_EXT` gate: SCX
    ///    claims fair-policy tasks via `sched_class` without changing
    ///    `task->policy`, so a policy check would wrongly reject
    ///    SCX-managed tasks that forked under `SCHED_NORMAL`. The
    ///    number is kept so the surviving gates retain their labels.
    /// 8. `task->start_boottime != 0` (anti-slab-recycle: a
    ///    freshly-zeroed slab page reads zero; live tasks have this
    ///    set to non-zero `ktime_get_boottime_ns()` at fork).
    TaskField {
        /// Guest-side PID of the target task. Both leaders and
        /// non-leader threads are addressable via the dispatcher's
        /// per-thread walker.
        pid: u32,
        /// `task->start_time` (`u64`, nanoseconds) recorded at
        /// WorkSpec spawn time. Used by the layer-2 anti-PID-reuse
        /// identity check in the validation list on this variant.
        expected_start_time_ns: u64,
        /// Nested member path within `struct task_struct`. Dot-
        /// separated; first segment is a direct member of
        /// `task_struct`, subsequent segments descend through named
        /// composite members.
        field: String,
    },
}
1190
/// Wire-encoded [`crate::scenario::ops::KernelValue`] variant tag.
/// Mirrors the four `KernelValue` enum variants 1:1.
///
/// NOTE(review): postcard identifies enum variants by declaration
/// index, so reordering the variants presumably changes the wire
/// format — confirm before reordering.
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum KernelOpValue {
    /// 32-bit unsigned, little-endian on the wire and at the
    /// resolved PA. Atomic when the resolved host PA is 4-byte
    /// aligned (see `GuestKernel::write_*_u32` doc).
    U32(u32),
    /// 64-bit unsigned, little-endian. Atomic at 8-byte alignment.
    U64(u64),
    /// Variable-length byte payload. Written non-atomically; the
    /// dispatcher emits a Release fence after the copy.
    Bytes(Vec<u8>),
    /// 32-bit unsigned read-modify-write OR mask. The cold-path
    /// dispatcher reads the live u32 at the resolved host PA,
    /// ORs the carried mask into it, and writes the new value
    /// back as two separate `read_u32` / `write_u32` calls —
    /// atomic by quiesce because the freeze rendezvous parks
    /// every guest vCPU before the RMW runs (no concurrent
    /// kernel writer can interleave). No `compare_exchange` loop
    /// in the cold path. Mirrors
    /// [`crate::scenario::ops::KernelValue::OrU32`] — see that
    /// variant's doc for the full atomicity, ordering, and
    /// width-correctness contract (the canonical
    /// `SCX_RQ_CLK_VALID` use case + the
    /// `kernel/sched/sched.h:802` u32-width citation for the
    /// `struct scx_rq.flags` field that motivated keeping the
    /// variant u32 rather than u64). Hot-path support is a
    /// future variant — it would require `AtomicU32::from_ptr`
    /// + cmpxchg + strict alignment rejection.
    OrU32(u32),
}
1223
/// One target/value pair inside a [`KernelOpRequestPayload`] batch.
///
/// For a [`KernelOpDirection::Write`] request, `value` is the data to
/// write. For a [`KernelOpDirection::Read`] request, the payload
/// inside `value` is a placeholder the dispatcher ignores, but the
/// variant chosen (the value-width discriminant) IS still
/// load-bearing — it picks the read method family.
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct KernelOpEntry {
    /// Address to write or read.
    pub target: KernelOpTarget,
    /// Value to write (or value-width hint for a read).
    pub value: KernelOpValue,
}
1237
/// Postcard-encoded payload for [`MsgType::KernelOpRequest`].
///
/// Carries an entire `Op::WriteKernel{Hot,Cold}` /
/// `Op::ReadKernel{Hot,Cold}` invocation including the full
/// `Vec<(KernelTarget, KernelValue)>` batch — variable-length, hence
/// the postcard encoding rather than a zerocopy fixed-size struct.
///
/// For write-direction payloads the executor's adjacent-cold-op
/// auto-merge pre-pass folds N adjacent `Op::WriteKernelCold`
/// singletons into one payload with N entries — multi-CPU seeds
/// (e.g. `with_uptime` writing per-CPU `rq.clock` on every CPU)
/// land in ONE freeze rendezvous with no inter-CPU skew. Reads
/// remain one-per-rendezvous until a follow-up batch adds
/// per-entry direction + tag.
///
/// `mode`, `direction` and `tag` apply to the whole batch; only
/// `target` and `value` vary per entry.
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct KernelOpRequestPayload {
    /// Monotonic request id; the host echoes it into the matching
    /// [`KernelOpReplyPayload::request_id`].
    pub request_id: u32,
    /// Hot vs cold orchestration.
    pub mode: KernelOpMode,
    /// Write vs read direction.
    pub direction: KernelOpDirection,
    /// Bridge-keyed tag for the response. For reads the tag becomes
    /// the bridge entry key; for writes the tag is informational
    /// only (the executor surfaces it in the success record).
    pub tag: String,
    /// Ordered batch entries. For [`KernelOpDirection::Write`] all
    /// entries' `value` carries the bytes to write; for
    /// [`KernelOpDirection::Read`] only `target` + the value-width
    /// discriminant are load-bearing.
    pub entries: Vec<KernelOpEntry>,
}
1271
/// Postcard-encoded payload for [`MsgType::KernelOpReply`].
///
/// Mirrors the request id so the guest's blocking reader can pair
/// against the original request. `success` carries the outcome; on
/// failure `reason` describes the host-side error. For
/// [`KernelOpDirection::Read`] requests `read_values` carries the
/// per-entry values the host coordinator read; empty for writes.
///
/// Postcard encodes fields in declaration order with no field
/// names, so the order of the fields below is part of the wire
/// format. The encoded frame is bounded on the guest side by
/// [`KERNEL_OP_REPLY_MAX`].
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct KernelOpReplyPayload {
    /// Echo of the request's `request_id`.
    pub request_id: u32,
    /// `true` when the host completed every entry in the batch;
    /// `false` when any entry failed (reason describes the first
    /// failure).
    pub success: bool,
    /// Human-readable diagnostic on the failure path; empty on
    /// success. Bounded by [`KERNEL_OP_REASON_MAX`].
    pub reason: String,
    /// For a [`KernelOpDirection::Read`] request: one
    /// [`KernelOpValue`] per request entry in iteration order. Empty
    /// for writes.
    pub read_values: Vec<KernelOpValue>,
}
1295
/// Largest postcard-encoded [`KernelOpReplyPayload`] frame, in
/// bytes, that the guest accepts on port-1 RX: 1 MiB.
///
/// Batch shapes the cap was sized against:
/// * `with_uptime` writing per-CPU `rq.clock` on 1024 CPUs:
///   ~9 KiB (well under cap).
/// * Bulk `read_*_bytes` of a struct page (4 KiB) per CPU on a
///   128-CPU host: ~520 KiB (within cap).
/// * Per-CPU 1 KiB `Bytes` read on 1024 CPUs: ~1 MiB (right at cap).
///   NOTE(review): 1024 × 1 KiB is already exactly 1 MiB of raw
///   data before any postcard framing (per-entry variant tag and
///   length prefix), so this shape presumably lands slightly OVER
///   the cap rather than at it — confirm against the frame
///   reader's comparison before relying on it.
///
/// **Per-op entry budget**: a caller that needs a reply larger
/// than 1 MiB must split the request across multiple ops. The cap
/// rejects forged or accidentally-huge lengths BEFORE the
/// `vec![0u8; length]` allocation in
/// [`crate::vmm::guest_comms`]'s frame reader, so a hostile or
/// buggy host cannot OOM the guest's PID 1 init.
pub const KERNEL_OP_REPLY_MAX: usize = 1 << 20;
1313
/// Upper bound, in bytes, on [`KernelOpRequestPayload::tag`].
///
/// `req.tag` is a `String` and downstream formatters (the reply
/// `reason` field, tracing emits) embed it inline. Without a
/// bound, a framework bug or test-author misuse producing a
/// multi-megabyte tag would inflate the postcard-encoded reply
/// past [`KERNEL_OP_REPLY_MAX`] and the reply would silently
/// drop at the guest's RX cap, surfacing only as a 30-second
/// transport timeout.
///
/// Enforcement: tags longer than this cap are truncated at
/// decode time in `src/vmm/freeze_coord/dispatch.rs`'s
/// `MsgType::KernelOpRequest` arm with a UTF-8 char-boundary
/// walk-down to avoid the `String::truncate` mid-codepoint panic.
///
/// Sizing: 256 bytes fits operator-readable test-name and
/// scenario-phase labels with headroom; framework code that
/// benignly produces longer tags loses suffix bytes from the
/// diagnostic but the op itself continues normally.
pub const KERNEL_OP_TAG_MAX: usize = 256;
/// Upper bound, in bytes, on [`KernelOpReplyPayload::reason`]. Pairs
/// with [`KERNEL_OP_TAG_MAX`] for the reply-side bound:
/// coordinator-generated reasons embed the request tag inline and
/// otherwise format diagnostic text from typed-error payloads.
/// 256 bytes fits diagnostic messages like
/// "PA validation rejected: pa=0x... reason=wrong-half" plus the
/// request_id and the truncated tag.
///
/// NOTE(review): this cap equals [`KERNEL_OP_TAG_MAX`], so a tag at
/// its own 256-byte limit would by itself consume the whole reason
/// budget — confirm how the coordinator truncates the combined
/// text when both are long.
pub const KERNEL_OP_REASON_MAX: usize = 256;
1338
/// Outcome of a guest-driven kernel-memory op request: the host
/// returned a reply (caller inspects [`KernelOpReplyPayload::success`])
/// or the transport failed (port not open, timeout, malformed frame).
///
/// Where [`SnapshotRequestResult`] distinguishes a `host_error`
/// case, this type deliberately has no such variant — kernel-op
/// replies are postcard-encoded with arbitrary structure, so the
/// "host completed but op failed" carrier is the reply payload's
/// `success: false` + `reason`. The `TransportError` arm covers
/// cases where the guest never receives a usable reply at all.
///
/// Callers therefore have two failure layers to check: first the
/// variant (was there a reply?), then `reply.success` (did the op
/// itself succeed?).
#[derive(Debug)]
pub enum KernelOpRequestResult {
    /// Host returned a postcard-decoded reply. The caller inspects
    /// `reply.success` to distinguish op success from host-side op
    /// failure; `reply.reason` carries the failure diagnostic when
    /// `success == false`.
    Ok(KernelOpReplyPayload),
    /// Transport failed (called from host context, port not yet open,
    /// host did not reply within `timeout`, malformed reply frame).
    /// The supplied diagnostic names the underlying cause.
    TransportError { reason: String },
}
1361
1362// ---------------------------------------------------------------------------
1363// ControlEvent — multiport control protocol discriminants
1364// ---------------------------------------------------------------------------
1365
/// Event kinds of the virtio-console multiport control protocol,
/// matching `enum virtio_console_event` in the kernel uapi header
/// `include/uapi/linux/virtio_console.h`.
///
/// Each variant's discriminant is spelled out and IS its 16-bit
/// on-wire value. [`Self::wire_value`] yields the number the kernel
/// and the host VMM exchange on the c_ivq / c_ovq queues;
/// [`Self::from_wire`] is the inverse used by a host-side parser.
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
pub enum ControlEvent {
    /// Guest-side: driver finished probing, host may begin port
    /// enumeration.
    DeviceReady = 0,
    /// Host-side: announce a new port to the guest.
    PortAdd = 1,
    /// Host-side: tear down a port.
    PortRemove = 2,
    /// Guest-side: per-port driver finished setup.
    PortReady = 3,
    /// Host-side: mark a port as the system console.
    ConsolePort = 4,
    /// Host-side: terminal resize event.
    Resize = 5,
    /// Bidirectional: open/close indication for a port.
    PortOpen = 6,
    /// Host-side: PORT_NAME header followed by name bytes.
    PortName = 7,
}

impl ControlEvent {
    /// Decode table: the variant stored at index `i` is the one whose
    /// wire value is `i`. Must list the variants in discriminant
    /// order.
    const BY_WIRE_VALUE: [ControlEvent; 8] = [
        ControlEvent::DeviceReady,
        ControlEvent::PortAdd,
        ControlEvent::PortRemove,
        ControlEvent::PortReady,
        ControlEvent::ConsolePort,
        ControlEvent::Resize,
        ControlEvent::PortOpen,
        ControlEvent::PortName,
    ];

    /// 16-bit on-wire discriminant. Values match the kernel uapi
    /// constants `VIRTIO_CONSOLE_*`.
    pub const fn wire_value(self) -> u16 {
        // The explicit discriminants above are the wire values.
        self as u16
    }

    /// Inverse of [`Self::wire_value`]. Yields `None` for a
    /// discriminant this enum does not know — the host parser is
    /// expected to log + skip such frames rather than panic.
    pub const fn from_wire(value: u16) -> Option<Self> {
        let index = value as usize;
        if index < Self::BY_WIRE_VALUE.len() {
            Some(Self::BY_WIRE_VALUE[index])
        } else {
            None
        }
    }
}
1426
1427// ---------------------------------------------------------------------------
1428// VirtioConsoleControl — wire-format control message
1429// ---------------------------------------------------------------------------
1430
/// Wire-format control message exchanged on c_ivq / c_ovq.
///
/// Mirrors `struct virtio_console_control` in
/// `include/uapi/linux/virtio_console.h`: id (u32), event (u16),
/// value (u16). The kernel's wire format is little-endian; on the LE
/// hosts ktstr targets (x86_64, aarch64), `repr(C)` produces the
/// correct byte order via zerocopy `IntoBytes` / `FromBytes`.
///
/// SAFETY: `repr(C)` produces an 8-byte struct with no padding when
/// every field is naturally aligned (u32 at offset 0, u16 at offset
/// 4, u16 at offset 6). The `packed` qualifier is unnecessary because
/// the natural alignment matches the kernel's expected wire layout
/// and is checked by [`std::mem::size_of`] below. Every bit pattern
/// is valid for u32/u16. zerocopy derives produce no panics.
#[repr(C)]
#[derive(Copy, Clone, Debug, FromBytes, IntoBytes, zerocopy::Immutable, zerocopy::KnownLayout)]
pub struct VirtioConsoleControl {
    /// Port the message refers to (the uapi header documents this
    /// field as the port number).
    pub id: u32,
    /// Event discriminant; typed code produces it with
    /// [`ControlEvent::wire_value`] and decodes it with
    /// [`ControlEvent::from_wire`].
    pub event: u16,
    /// Event-specific extra information (the uapi header documents
    /// this field as extra information for the event).
    pub value: u16,
}

// Compile-time layout guard: the build fails if the struct ever stops
// being exactly 8 bytes (e.g. a field is added or widened).
const _VIRTIO_CONSOLE_CONTROL_SIZE: () = assert!(std::mem::size_of::<VirtioConsoleControl>() == 8);
1454
1455// ---------------------------------------------------------------------------
1456// Multiport device constants
1457// ---------------------------------------------------------------------------
1458
/// Number of multiport ports the device exposes.
///
/// Port 0 is the kernel console (`/dev/hvc0`); port 1 is the
/// host-bound bulk TLV stream (`/dev/vport0p1`); port 2 is the
/// scheduler stats bridge (`/dev/vport0p2`) carrying raw byte
/// passthrough between the host's [`super::sched_stats::SchedStatsClient`]
/// and the guest's `scx_stats` Unix-socket relay. Three ports →
/// eight queues per virtio-v1.2 §5.3.5 (`2 + 2 * num_ports`).
///
/// NOTE(review): in the virtio-v1.2 text the console virtqueue list
/// appears to live under §5.3.2 ("Virtqueues"), with §5.3.5 being
/// device initialization — confirm the section number cited above.
pub const NUM_PORTS: u32 = 3;
1468
/// Port-1 device-name advertised to the guest. The kernel exposes
/// this as `/sys/class/virtio-ports/vport0p1/name`; the guest init
/// reads from this path to discover the bulk channel device node.
/// The string is matched by the guest, so changing it is a
/// host/guest protocol change.
pub const PORT1_NAME: &str = "ktstr-bulk";
1473
/// Port-2 device-name advertised to the guest. The kernel exposes
/// this as `/sys/class/virtio-ports/vport0p2/name`; the guest init
/// reads from this path to discover the scheduler-stats relay
/// device node and connects it to the scheduler's
/// `/var/run/scx/root/stats` Unix socket.
/// The string is matched by the guest, so changing it is a
/// host/guest protocol change.
pub const PORT2_NAME: &str = "ktstr-stats";
1480
1481#[cfg(test)]
1482mod tests {
1483    use super::*;
1484
1485    /// `parse_scenario_end` round-trips the two LE u64s the guest
1486    /// writes, and rejects a short/torn payload (returns None rather
1487    /// than misreading) — the host folds the parsed iteration count
1488    /// into the terminal StimulusEvent for the last step's rate.
1489    #[test]
1490    fn parse_scenario_end_round_trip_and_short_payload() {
1491        let mut payload = [0u8; SCENARIO_END_PAYLOAD_SIZE];
1492        payload[0..8].copy_from_slice(&12_345u64.to_le_bytes());
1493        payload[8..16].copy_from_slice(&987_654u64.to_le_bytes());
1494        assert_eq!(parse_scenario_end(&payload), Some((12_345, 987_654)));
1495        // A short payload (e.g. only the elapsed field) is rejected.
1496        assert_eq!(parse_scenario_end(&payload[..8]), None);
1497        assert_eq!(parse_scenario_end(&[]), None);
1498    }
1499
1500    /// `ShmMessage` round-trips through bytes — guards against an
1501    /// accidental field reorder or a stray padding byte that would
1502    /// shift the on-wire layout for both guest writer and host
1503    /// reader.
1504    #[test]
1505    fn shm_message_round_trip_through_bytes() {
1506        let f = ShmMessage {
1507            msg_type: MSG_TYPE_EXIT,
1508            length: 4,
1509            crc32: 0xDEAD_BEEF,
1510            _pad: 0,
1511        };
1512        let bytes = f.as_bytes();
1513        assert_eq!(bytes.len(), FRAME_HEADER_SIZE);
1514        let back = ShmMessage::read_from_bytes(bytes).expect("16-byte slice deserializes");
1515        let msg_type = back.msg_type;
1516        let length = back.length;
1517        let crc32 = back.crc32;
1518        let pad = back._pad;
1519        assert_eq!(msg_type, MSG_TYPE_EXIT);
1520        assert_eq!(length, 4);
1521        assert_eq!(crc32, 0xDEAD_BEEF);
1522        assert_eq!(pad, 0);
1523    }
1524
1525    /// Every msg_type constant is distinct — a copy/paste error
1526    /// that aliased two ids would silently misroute messages.
1527    #[test]
1528    fn msg_type_constants_are_unique() {
1529        let ids = [
1530            MSG_TYPE_STIMULUS,
1531            MSG_TYPE_STEP_END,
1532            MSG_TYPE_SCENARIO_START,
1533            MSG_TYPE_SCENARIO_END,
1534            MSG_TYPE_SCENARIO_PAUSE,
1535            MSG_TYPE_SCENARIO_RESUME,
1536            MSG_TYPE_EXIT,
1537            MSG_TYPE_TEST_RESULT,
1538            MSG_TYPE_SCHED_EXIT,
1539            MSG_TYPE_CRASH,
1540            MSG_TYPE_PAYLOAD_METRICS,
1541            MSG_TYPE_PROFRAW,
1542            MSG_TYPE_WPROF_TRACE,
1543            MSG_TYPE_WPROF_TRACE_CHUNK,
1544            MSG_TYPE_SNAPSHOT_REQUEST,
1545            MSG_TYPE_SNAPSHOT_REPLY,
1546            MSG_TYPE_KERNEL_OP_REQUEST,
1547            MSG_TYPE_KERNEL_OP_REPLY,
1548            MSG_TYPE_SYS_RDY,
1549            MSG_TYPE_SCHED_SWAP_NOTIFY,
1550            MSG_TYPE_STDOUT,
1551            MSG_TYPE_STDERR,
1552            MSG_TYPE_SCHED_LOG,
1553            MSG_TYPE_LIFECYCLE,
1554            MSG_TYPE_EXEC_EXIT,
1555            MSG_TYPE_DMESG,
1556            MSG_TYPE_PROBE_OUTPUT,
1557        ];
1558        for (i, a) in ids.iter().enumerate() {
1559            for b in &ids[i + 1..] {
1560                assert_ne!(a, b, "duplicate MSG_TYPE id 0x{a:08x}");
1561            }
1562        }
1563    }
1564
1565    /// Pin the on-wire byte order of `msg_type` to little-endian.
1566    /// The integer literal `0x4558_4954` spells `"EXIT"` in hex digits
1567    /// (`45`='E', `58`='X', `49`='I', `54`='T'), but the LE encoding
1568    /// places the least-significant byte first — so a raw byte dump
1569    /// of a serialized `ShmMessage` shows `[0x54, 0x49, 0x58, 0x45]`,
1570    /// which spells `"TIXE"` byte-by-byte. A future change that
1571    /// flipped the host to big-endian or switched zerocopy's
1572    /// serialization order would silently break the wire contract
1573    /// with the kernel virtio_console driver and every existing
1574    /// guest writer; this test fails loudly instead.
1575    #[test]
1576    fn msg_type_exit_wire_bytes_are_le() {
1577        let f = ShmMessage {
1578            msg_type: MSG_TYPE_EXIT,
1579            length: 0,
1580            crc32: 0,
1581            _pad: 0,
1582        };
1583        let bytes = f.as_bytes();
1584        // First 4 bytes of the header are msg_type as a u32 LE.
1585        assert_eq!(&bytes[..4], &MSG_TYPE_EXIT.to_le_bytes());
1586        // Spell-out check: the LE byte sequence is "TIXE", not "EXIT".
1587        // If the wire ever flips to BE, this assertion fails before the
1588        // guest driver sees the malformed frame.
1589        assert_eq!(&bytes[..4], b"TIXE");
1590    }
1591
1592    /// `ShmMessage` header is exactly 16 bytes with no padding.
1593    #[test]
1594    fn shm_message_size_is_16() {
1595        assert_eq!(FRAME_HEADER_SIZE, 16);
1596        assert_eq!(std::mem::size_of::<ShmMessage>(), 16);
1597    }
1598
1599    /// Every [`MsgType`] variant round-trips through
1600    /// `wire_value` → `from_wire`.
1601    #[test]
1602    fn msg_type_round_trips() {
1603        let all = [
1604            MsgType::Stimulus,
1605            MsgType::StepEnd,
1606            MsgType::ScenarioStart,
1607            MsgType::ScenarioEnd,
1608            MsgType::ScenarioPause,
1609            MsgType::ScenarioResume,
1610            MsgType::Exit,
1611            MsgType::TestResult,
1612            MsgType::SchedExit,
1613            MsgType::Crash,
1614            MsgType::PayloadMetrics,
1615            MsgType::Profraw,
1616            MsgType::WprofTrace,
1617            MsgType::WprofTraceChunk,
1618            MsgType::SnapshotRequest,
1619            MsgType::SnapshotReply,
1620            MsgType::KernelOpRequest,
1621            MsgType::KernelOpReply,
1622            MsgType::SysRdy,
1623            MsgType::SchedSwapNotify,
1624            MsgType::Stdout,
1625            MsgType::Stderr,
1626            MsgType::SchedLog,
1627            MsgType::Lifecycle,
1628            MsgType::ExecExit,
1629            MsgType::Dmesg,
1630            MsgType::ProbeOutput,
1631        ];
1632        for variant in all {
1633            let v = variant.wire_value();
1634            assert_eq!(MsgType::from_wire(v), Some(variant));
1635        }
1636    }
1637
1638    /// `MsgType::from_wire` returns `None` for an unrecognised
1639    /// discriminant — the bulk parser must surface unknown tags as
1640    /// errors rather than treat them as a known variant.
1641    #[test]
1642    fn msg_type_from_wire_unknown_returns_none() {
1643        assert_eq!(MsgType::from_wire(0xDEAD_BEEF), None);
1644        assert_eq!(MsgType::from_wire(0), None);
1645    }
1646
1647    /// `MsgType::wire_value` matches the corresponding `MSG_TYPE_*`
1648    /// constant — guards against a typo that would diverge the typed
1649    /// API from the on-wire constant.
1650    #[test]
1651    fn msg_type_wire_value_matches_constants() {
1652        assert_eq!(MsgType::Stimulus.wire_value(), MSG_TYPE_STIMULUS);
1653        assert_eq!(MsgType::StepEnd.wire_value(), MSG_TYPE_STEP_END);
1654        assert_eq!(MsgType::ScenarioStart.wire_value(), MSG_TYPE_SCENARIO_START);
1655        assert_eq!(MsgType::ScenarioPause.wire_value(), MSG_TYPE_SCENARIO_PAUSE);
1656        assert_eq!(
1657            MsgType::ScenarioResume.wire_value(),
1658            MSG_TYPE_SCENARIO_RESUME
1659        );
1660        assert_eq!(MsgType::ScenarioEnd.wire_value(), MSG_TYPE_SCENARIO_END);
1661        assert_eq!(MsgType::Exit.wire_value(), MSG_TYPE_EXIT);
1662        assert_eq!(MsgType::TestResult.wire_value(), MSG_TYPE_TEST_RESULT);
1663        assert_eq!(MsgType::SchedExit.wire_value(), MSG_TYPE_SCHED_EXIT);
1664        assert_eq!(MsgType::Crash.wire_value(), MSG_TYPE_CRASH);
1665        assert_eq!(
1666            MsgType::PayloadMetrics.wire_value(),
1667            MSG_TYPE_PAYLOAD_METRICS
1668        );
1669        assert_eq!(MsgType::Profraw.wire_value(), MSG_TYPE_PROFRAW);
1670        assert_eq!(MsgType::WprofTrace.wire_value(), MSG_TYPE_WPROF_TRACE);
1671        assert_eq!(
1672            MsgType::WprofTraceChunk.wire_value(),
1673            MSG_TYPE_WPROF_TRACE_CHUNK
1674        );
1675        assert_eq!(
1676            MsgType::SnapshotRequest.wire_value(),
1677            MSG_TYPE_SNAPSHOT_REQUEST
1678        );
1679        assert_eq!(MsgType::SnapshotReply.wire_value(), MSG_TYPE_SNAPSHOT_REPLY);
1680        assert_eq!(
1681            MsgType::KernelOpRequest.wire_value(),
1682            MSG_TYPE_KERNEL_OP_REQUEST
1683        );
1684        assert_eq!(
1685            MsgType::KernelOpReply.wire_value(),
1686            MSG_TYPE_KERNEL_OP_REPLY
1687        );
1688        assert_eq!(MsgType::SysRdy.wire_value(), MSG_TYPE_SYS_RDY);
1689        assert_eq!(MsgType::Stdout.wire_value(), MSG_TYPE_STDOUT);
1690        assert_eq!(MsgType::Stderr.wire_value(), MSG_TYPE_STDERR);
1691        assert_eq!(MsgType::SchedLog.wire_value(), MSG_TYPE_SCHED_LOG);
1692        assert_eq!(MsgType::Lifecycle.wire_value(), MSG_TYPE_LIFECYCLE);
1693        assert_eq!(MsgType::ExecExit.wire_value(), MSG_TYPE_EXEC_EXIT);
1694        assert_eq!(MsgType::Dmesg.wire_value(), MSG_TYPE_DMESG);
1695        assert_eq!(MsgType::ProbeOutput.wire_value(), MSG_TYPE_PROBE_OUTPUT);
1696        assert_eq!(
1697            MsgType::SchedSwapNotify.wire_value(),
1698            MSG_TYPE_SCHED_SWAP_NOTIFY
1699        );
1700    }
1701
1702    /// `is_coordinator_internal` flips on for SnapshotRequest,
1703    /// SnapshotReply, KernelOpRequest, KernelOpReply, SysRdy, and
1704    /// SchedSwapNotify and stays off for every test-verdict-bearing
1705    /// variant. The
1706    /// Reply variants are host→guest only on port-1 RX; a guest TX
1707    /// frame stamped with one of those tags is illegitimate and
1708    /// must be dropped rather than bucketed as a phantom verdict
1709    /// entry. Pinning this matrix here means a future contributor
1710    /// adding a new control frame must explicitly opt into the
1711    /// gate (or explicitly opt out by adding a "verdict-bearing"
1712    /// entry to the test) — the freeze coord's mid-run filter and
1713    /// `collect_results`'s post-run drain both key on this single
1714    /// classifier (search for `is_coordinator_internal` in
1715    /// `crate::vmm::freeze_coord`).
1716    #[test]
1717    fn is_coordinator_internal_matches_filter_set() {
1718        let internal = [
1719            MsgType::SnapshotRequest,
1720            MsgType::SnapshotReply,
1721            MsgType::KernelOpRequest,
1722            MsgType::KernelOpReply,
1723            MsgType::SysRdy,
1724            MsgType::SchedSwapNotify,
1725        ];
1726        let verdict = [
1727            MsgType::Stimulus,
1728            MsgType::StepEnd,
1729            MsgType::ScenarioStart,
1730            MsgType::ScenarioEnd,
1731            MsgType::ScenarioPause,
1732            MsgType::ScenarioResume,
1733            MsgType::Exit,
1734            MsgType::TestResult,
1735            MsgType::SchedExit,
1736            MsgType::Crash,
1737            MsgType::PayloadMetrics,
1738            MsgType::Profraw,
1739            MsgType::WprofTrace,
1740            MsgType::WprofTraceChunk,
1741            MsgType::Stdout,
1742            MsgType::Stderr,
1743            MsgType::SchedLog,
1744            MsgType::Lifecycle,
1745            MsgType::ExecExit,
1746            MsgType::Dmesg,
1747            MsgType::ProbeOutput,
1748        ];
1749        for v in internal {
1750            assert!(
1751                v.is_coordinator_internal(),
1752                "{v:?} must be classified as coordinator-internal"
1753            );
1754        }
1755        for v in verdict {
1756            assert!(
1757                !v.is_coordinator_internal(),
1758                "{v:?} carries test verdict data and must NOT be filtered out"
1759            );
1760        }
1761    }
1762
1763    /// `MsgType::SchedSwapNotify` round-trips `wire_value` →
1764    /// `from_wire`, carries the stable `"SCSW"` (0x5343_5357)
1765    /// discriminant, and is classified coordinator-internal so the
1766    /// freeze coord's mid-run filter and `collect_results`'s post-run
1767    /// drain both drop it rather than bucketing a phantom verdict.
1768    #[test]
1769    fn sched_swap_notify_round_trips() {
1770        assert_eq!(
1771            MsgType::from_wire(MsgType::SchedSwapNotify.wire_value()),
1772            Some(MsgType::SchedSwapNotify)
1773        );
1774        assert_eq!(MsgType::SchedSwapNotify.wire_value(), 0x5343_5357);
1775        assert!(MsgType::SchedSwapNotify.is_coordinator_internal());
1776    }
1777
1778    /// `LifecyclePhase` round-trips through `wire_value` →
1779    /// `from_wire`. Phase values are byte-stable across builds so
1780    /// the host never silently misclassifies a future guest's
1781    /// phase signal.
1782    #[test]
1783    fn lifecycle_phase_round_trips() {
1784        let all = [
1785            LifecyclePhase::InitStarted,
1786            LifecyclePhase::PayloadStarting,
1787            LifecyclePhase::SchedulerDied,
1788            LifecyclePhase::SchedulerNotAttached,
1789            LifecyclePhase::WorkloadDispatched,
1790        ];
1791        for p in all {
1792            let v = p.wire_value();
1793            assert_eq!(LifecyclePhase::from_wire(v), Some(p));
1794        }
1795    }
1796
1797    /// `LifecyclePhase::from_wire(0)` returns `None` — `0` is
1798    /// reserved as the unknown / invalid sentinel so a
1799    /// zero-initialised payload byte never silently maps to
1800    /// `InitStarted`.
1801    #[test]
1802    fn lifecycle_phase_zero_is_reserved() {
1803        assert_eq!(LifecyclePhase::from_wire(0), None);
1804        assert_eq!(LifecyclePhase::from_wire(0xFF), None);
1805    }
1806
1807    /// Pin the `LifecyclePhase` discriminants. Wire values are part
1808    /// of the protocol contract — a future change that reorders
1809    /// the enum variants would silently shift this mapping unless
1810    /// pinned by an explicit assertion here.
1811    #[test]
1812    fn lifecycle_phase_wire_values_are_stable() {
1813        assert_eq!(LifecyclePhase::InitStarted.wire_value(), 1);
1814        assert_eq!(LifecyclePhase::PayloadStarting.wire_value(), 2);
1815        assert_eq!(LifecyclePhase::SchedulerDied.wire_value(), 3);
1816        assert_eq!(LifecyclePhase::SchedulerNotAttached.wire_value(), 4);
1817        assert_eq!(LifecyclePhase::WorkloadDispatched.wire_value(), 5);
1818    }
1819
1820    /// `SnapshotRequestPayload` round-trips through bytes — guards
1821    /// against an accidental field reorder or a stray padding byte
1822    /// that would shift the on-wire layout for both guest writer
1823    /// and host parser.
1824    #[test]
1825    fn snapshot_request_payload_round_trip_through_bytes() {
1826        let mut tag = [0u8; SNAPSHOT_TAG_MAX];
1827        tag[..6].copy_from_slice(b"hello!");
1828        let p = SnapshotRequestPayload {
1829            request_id: 0xDEAD_BEEF,
1830            kind: SNAPSHOT_KIND_CAPTURE,
1831            tag,
1832        };
1833        let bytes = p.as_bytes();
1834        assert_eq!(bytes.len(), 8 + SNAPSHOT_TAG_MAX);
1835        let back = SnapshotRequestPayload::read_from_bytes(bytes).expect("payload deserializes");
1836        let request_id = back.request_id;
1837        let kind = back.kind;
1838        assert_eq!(request_id, 0xDEAD_BEEF);
1839        assert_eq!(kind, SNAPSHOT_KIND_CAPTURE);
1840        assert_eq!(&back.tag[..6], b"hello!");
1841    }
1842
1843    /// `SnapshotReplyPayload` round-trips through bytes.
1844    #[test]
1845    fn snapshot_reply_payload_round_trip_through_bytes() {
1846        let mut reason = [0u8; SNAPSHOT_REASON_MAX];
1847        reason[..4].copy_from_slice(b"oops");
1848        let p = SnapshotReplyPayload {
1849            request_id: 0xCAFE_BABE,
1850            status: SNAPSHOT_STATUS_ERR,
1851            reason,
1852        };
1853        let bytes = p.as_bytes();
1854        assert_eq!(bytes.len(), 8 + SNAPSHOT_REASON_MAX);
1855        let back = SnapshotReplyPayload::read_from_bytes(bytes).expect("payload deserializes");
1856        let request_id = back.request_id;
1857        let status = back.status;
1858        assert_eq!(request_id, 0xCAFE_BABE);
1859        assert_eq!(status, SNAPSHOT_STATUS_ERR);
1860        assert_eq!(&back.reason[..4], b"oops");
1861    }
1862
1863    /// Snapshot kind constants are distinct.
1864    #[test]
1865    fn snapshot_kind_constants_are_unique() {
1866        assert_ne!(SNAPSHOT_KIND_NONE, SNAPSHOT_KIND_CAPTURE);
1867        assert_ne!(SNAPSHOT_KIND_NONE, SNAPSHOT_KIND_WATCH);
1868        assert_ne!(SNAPSHOT_KIND_CAPTURE, SNAPSHOT_KIND_WATCH);
1869    }
1870
1871    /// Snapshot status constants are distinct.
1872    #[test]
1873    fn snapshot_status_constants_are_unique() {
1874        assert_ne!(SNAPSHOT_STATUS_OK, SNAPSHOT_STATUS_ERR);
1875    }
1876
1877    /// Every [`ControlEvent`] variant round-trips through
1878    /// `wire_value` → `from_wire`.
1879    #[test]
1880    fn control_event_round_trips() {
1881        let all = [
1882            ControlEvent::DeviceReady,
1883            ControlEvent::PortAdd,
1884            ControlEvent::PortRemove,
1885            ControlEvent::PortReady,
1886            ControlEvent::ConsolePort,
1887            ControlEvent::Resize,
1888            ControlEvent::PortOpen,
1889            ControlEvent::PortName,
1890        ];
1891        for variant in all {
1892            let v = variant.wire_value();
1893            assert_eq!(ControlEvent::from_wire(v), Some(variant));
1894        }
1895    }
1896
1897    /// `ControlEvent::from_wire` returns `None` for unknown values.
1898    #[test]
1899    fn control_event_from_wire_unknown_returns_none() {
1900        assert_eq!(ControlEvent::from_wire(8), None);
1901        assert_eq!(ControlEvent::from_wire(0xFFFF), None);
1902    }
1903
1904    /// `ControlEvent` discriminants match the kernel uapi numbers
1905    /// (`VIRTIO_CONSOLE_*` in `include/uapi/linux/virtio_console.h`).
1906    #[test]
1907    fn control_event_discriminants_match_uapi() {
1908        assert_eq!(ControlEvent::DeviceReady.wire_value(), 0);
1909        assert_eq!(ControlEvent::PortAdd.wire_value(), 1);
1910        assert_eq!(ControlEvent::PortRemove.wire_value(), 2);
1911        assert_eq!(ControlEvent::PortReady.wire_value(), 3);
1912        assert_eq!(ControlEvent::ConsolePort.wire_value(), 4);
1913        assert_eq!(ControlEvent::Resize.wire_value(), 5);
1914        assert_eq!(ControlEvent::PortOpen.wire_value(), 6);
1915        assert_eq!(ControlEvent::PortName.wire_value(), 7);
1916    }
1917
1918    /// `VirtioConsoleControl` is exactly 8 bytes — matches the
1919    /// kernel uapi struct.
1920    #[test]
1921    fn virtio_console_control_size_is_8() {
1922        assert_eq!(std::mem::size_of::<VirtioConsoleControl>(), 8);
1923    }
1924
1925    /// `VirtioConsoleControl` round-trips through bytes — pins the
1926    /// repr(C) layout against an accidental field reorder that would
1927    /// produce malformed control frames on the c_ivq / c_ovq queues.
1928    #[test]
1929    fn virtio_console_control_round_trip() {
1930        let c = VirtioConsoleControl {
1931            id: 1,
1932            event: ControlEvent::PortOpen.wire_value(),
1933            value: 1,
1934        };
1935        let bytes = c.as_bytes();
1936        assert_eq!(bytes.len(), 8);
1937        let back = VirtioConsoleControl::read_from_bytes(bytes).unwrap();
1938        let id = back.id;
1939        let event = back.event;
1940        let value = back.value;
1941        assert_eq!(id, 1);
1942        assert_eq!(event, ControlEvent::PortOpen.wire_value());
1943        assert_eq!(value, 1);
1944    }
1945
1946    /// `KernelOpRequestPayload` round-trips through postcard with
1947    /// every `KernelOpTarget` + `KernelOpValue` variant present —
1948    /// pins encode/decode against an accidental serde derive
1949    /// breakage on either side. The wire format the freeze coord
1950    /// (host) decodes is exactly what the guest's
1951    /// [`crate::vmm::guest_comms::request_kernel_op`] encodes, so a
1952    /// round-trip mismatch surfaces as a silent host-side parse
1953    /// failure rather than a typed error.
1954    #[test]
1955    fn kernel_op_request_payload_postcard_round_trip() {
1956        let payload = KernelOpRequestPayload {
1957            request_id: 0xCAFEBABE,
1958            mode: KernelOpMode::Cold,
1959            direction: KernelOpDirection::Write,
1960            tag: "with_uptime".into(),
1961            entries: vec![
1962                KernelOpEntry {
1963                    target: KernelOpTarget::Symbol("jiffies".into()),
1964                    value: KernelOpValue::U64(42),
1965                },
1966                KernelOpEntry {
1967                    target: KernelOpTarget::Direct(0xffff_8000_0000_1000),
1968                    value: KernelOpValue::U32(7),
1969                },
1970                KernelOpEntry {
1971                    target: KernelOpTarget::Kva(0xffff_c000_dead_beef),
1972                    value: KernelOpValue::Bytes(vec![1, 2, 3, 4, 5]),
1973                },
1974                KernelOpEntry {
1975                    target: KernelOpTarget::PerCpuField {
1976                        symbol: "runqueues".into(),
1977                        field: "clock".into(),
1978                        cpu: 3,
1979                    },
1980                    value: KernelOpValue::U64(0xDEAD_BEEF_CAFE_F00D),
1981                },
1982                KernelOpEntry {
1983                    target: KernelOpTarget::TaskField {
1984                        pid: 12345,
1985                        expected_start_time_ns: 1_700_000_000_000,
1986                        field: "scx.dsq_vtime".into(),
1987                    },
1988                    value: KernelOpValue::U64(30 * 86400 * 1_000_000_000),
1989                },
1990            ],
1991        };
1992        let bytes = postcard::to_allocvec(&payload).expect("encode");
1993        let back: KernelOpRequestPayload = postcard::from_bytes(&bytes).expect("decode");
1994        assert_eq!(back, payload);
1995    }
1996
1997    /// `KernelOpReplyPayload` round-trips through postcard. The
1998    /// reply carries success/failure + (for reads) the per-entry
1999    /// values the host coordinator read — both code paths must
2000    /// survive encode/decode unchanged.
2001    #[test]
2002    fn kernel_op_reply_payload_postcard_round_trip() {
2003        let success = KernelOpReplyPayload {
2004            request_id: 0x1234_5678,
2005            success: true,
2006            reason: String::new(),
2007            read_values: vec![
2008                KernelOpValue::U64(100),
2009                KernelOpValue::U32(200),
2010                KernelOpValue::Bytes(vec![0xAB, 0xCD, 0xEF]),
2011            ],
2012        };
2013        let bytes = postcard::to_allocvec(&success).expect("encode success");
2014        let back: KernelOpReplyPayload = postcard::from_bytes(&bytes).expect("decode success");
2015        assert_eq!(back, success);
2016
2017        let failure = KernelOpReplyPayload {
2018            request_id: 0xFEED_FACE,
2019            success: false,
2020            reason: "host: symbol 'jiffies' not found in vmlinux".into(),
2021            read_values: vec![],
2022        };
2023        let bytes = postcard::to_allocvec(&failure).expect("encode failure");
2024        let back: KernelOpReplyPayload = postcard::from_bytes(&bytes).expect("decode failure");
2025        assert_eq!(back, failure);
2026    }
2027
2028    /// `KERNEL_OP_REPLY_MAX` envelope check: a representative
2029    /// large reply (1024-CPU per-CPU u64) fits comfortably; the
2030    /// cap is 1 MiB which bounds OOM exposure while accommodating
2031    /// realistic batch shapes. A regression that shrunk the cap
2032    /// below ~10 KiB would silently truncate large kernel-op
2033    /// replies; one that grew it beyond 1 MiB would widen the
2034    /// OOM-attack surface.
2035    #[test]
2036    fn kernel_op_reply_max_envelope_check() {
2037        // 1024 CPUs * KernelOpValue::U64 (~9 bytes each + bookkeeping)
2038        // is well under 1 MiB. Build a representative reply and
2039        // verify its encoded size sits inside the cap.
2040        let big = KernelOpReplyPayload {
2041            request_id: 1,
2042            success: true,
2043            reason: String::new(),
2044            read_values: (0..1024u64).map(KernelOpValue::U64).collect(),
2045        };
2046        let bytes = postcard::to_allocvec(&big).expect("encode 1024-CPU reply");
2047        assert!(
2048            bytes.len() < KERNEL_OP_REPLY_MAX,
2049            "1024-CPU kernel-op reply ({} bytes) must fit under \
2050             KERNEL_OP_REPLY_MAX ({KERNEL_OP_REPLY_MAX} bytes)",
2051            bytes.len(),
2052        );
2053        // The cap is exactly 1 MiB — large enough for per-CPU 1 KiB
2054        // Bytes reads on 1024 CPUs, small enough to keep OOM
2055        // exposure bounded for a forged frame.
2056        assert_eq!(KERNEL_OP_REPLY_MAX, 1024 * 1024);
2057    }
2058
    // ----- KernAddrs wire-format pins -----
    //
    // Pin both the typed encode/decode contract AND the on-wire
    // byte layout. The byte-layout pin
    // (`kern_addrs_to_payload_byte_layout_is_le_phys_first`) catches
    // slot-swap regressions that a roundtrip-equality test alone
    // would miss when encoder and decoder both flip the same way.
2065
2066    #[test]
2067    fn kern_addrs_roundtrip_all_present() {
2068        let a = KernAddrs::new(
2069            0x12345678u64,
2070            0xffff_8880_0000_0000u64,
2071            Some(0xffff_ffff_8200_0000u64),
2072        );
2073        let payload = a.to_payload();
2074        assert_eq!(payload.len(), KernAddrs::WIRE_LEN);
2075        let b = KernAddrs::from_payload(&payload).expect("decode");
2076        assert_eq!(b.phys_base, a.phys_base);
2077        assert_eq!(b.page_offset_base, a.page_offset_base);
2078        assert_eq!(b.kernel_text_runtime_kva, a.kernel_text_runtime_kva);
2079        assert!(b.has_phys_present_bit());
2080    }
2081
2082    #[test]
2083    fn kern_addrs_roundtrip_kallsyms_absent() {
2084        // The None branch on kernel_text_runtime_kva must decode
2085        // back to None (NOT Some(u64::MAX) via wrapping_sub(1) on
2086        // a raw-0 biased slot). The biased-0 sentinel is the
2087        // wire-format "guest could not derive" marker.
2088        let a = KernAddrs::new(0u64, 0u64, None);
2089        let payload = a.to_payload();
2090        let b = KernAddrs::from_payload(&payload).expect("decode");
2091        assert_eq!(
2092            b.kernel_text_runtime_kva, None,
2093            "biased-0 runtime slot must decode to None"
2094        );
2095        // phys_base = 0 IS a valid KASLR-off value; the biased
2096        // encoder writes 1 (non-zero) so has_phys_present_bit
2097        // surfaces present.
2098        assert!(b.has_phys_present_bit());
2099    }
2100
2101    #[test]
2102    fn kern_addrs_from_payload_rejects_length_mismatch() {
2103        // Exact-length match required so a protocol-extension
2104        // partial write or truncated wire surfaces as None,
2105        // never as a zero-padded silent decode.
2106        assert!(KernAddrs::from_payload(&[]).is_none());
2107        assert!(KernAddrs::from_payload(&[0u8; KernAddrs::WIRE_LEN - 1]).is_none());
2108        assert!(KernAddrs::from_payload(&[0u8; KernAddrs::WIRE_LEN + 1]).is_none());
2109    }
2110
2111    #[test]
2112    fn stimulus_from_payload_requires_exact_24_bytes() {
2113        // Exact-length match (24 bytes): an undersized buffer would
2114        // truncate and an oversized one carries bytes the guest never
2115        // frames (send_stimulus/send_step_end write exactly 24), so a
2116        // torn / hostile frame is dropped, not promoted by a prefix read.
2117        let n = std::mem::size_of::<StimulusPayload>();
2118        assert_eq!(n, 24);
2119        assert!(StimulusEvent::from_payload(&[0u8; 23]).is_none());
2120        assert!(StimulusEvent::from_payload(&[0u8; 25]).is_none());
2121        assert!(StimulusEvent::from_payload(&[0u8; 24]).is_some());
2122    }
2123
2124    #[test]
2125    fn kern_addrs_has_phys_present_bit_distinguishes_zero_vs_absent() {
2126        // Pins the bias-sentinel contract on the present-bit
2127        // accessor. A struct constructed via KernAddrs::new with
2128        // phys_base=0 surfaces as present (encoded biased = 1).
2129        // A hand-decoded all-zero payload (the "guest never
2130        // sent" wire state) surfaces as absent (raw biased 0 →
2131        // wrapping_sub(1) = u64::MAX → has_phys_present_bit
2132        // returns false).
2133        let present = KernAddrs::new(0u64, 0u64, None);
2134        assert!(present.has_phys_present_bit());
2135        let absent = KernAddrs::from_payload(&[0u8; KernAddrs::WIRE_LEN])
2136            .expect("zero-length payload decodes; shape is valid");
2137        assert!(
2138            !absent.has_phys_present_bit(),
2139            "zero-bias slot decodes to u64::MAX; has_phys_present_bit must surface absent"
2140        );
2141    }
2142
2143    #[test]
2144    fn kern_addrs_to_payload_byte_layout_is_le_phys_first() {
2145        // Pin the on-wire byte layout directly via slot offsets.
2146        // A reordering refactor that swapped slots would silently
2147        // pass the roundtrip-equality tests above (encoder and
2148        // decoder would flip together) — this test catches the
2149        // slot-swap class by asserting against fixed byte
2150        // positions.
2151        let a = KernAddrs::new(
2152            0x1111_2222_3333_4444u64,
2153            0xaaaa_bbbb_cccc_ddddu64,
2154            Some(0x5555_6666_7777_8888u64),
2155        );
2156        let p = a.to_payload();
2157        // phys_base biased: ...4444 + 1 = ...4445; LE first byte = 0x45.
2158        assert_eq!(p[0], 0x45, "phys_base slot is [0..8] LE biased");
2159        assert_eq!(
2160            u64::from_le_bytes(p[..8].try_into().unwrap()),
2161            0x1111_2222_3333_4445
2162        );
2163        assert_eq!(
2164            u64::from_le_bytes(p[8..16].try_into().unwrap()),
2165            0xaaaa_bbbb_cccc_ddddu64
2166        );
2167        // kernel_text_runtime_kva biased: ...8888 + 1 = ...8889.
2168        assert_eq!(
2169            u64::from_le_bytes(p[16..24].try_into().unwrap()),
2170            0x5555_6666_7777_8889u64
2171        );
2172    }
2173
2174    #[test]
2175    fn kern_addrs_u64_max_runtime_collapses_to_absent_roundtrip() {
2176        // Documents the bias-encoding boundary collision: a
2177        // `kernel_text_runtime_kva` of `u64::MAX` wraps to biased 0
2178        // on encode, which the decoder reads as the "guest could
2179        // not derive" sentinel (None). u64::MAX is non-canonical
2180        // and impossible as a real `_text` KVA, AND the
2181        // downstream KERN_ADDRS dispatch arm in dispatch.rs
2182        // triple-gates derived offsets (kernel-half threshold +
2183        // non-negative slide + ≤1GiB max-slide-bound + link
2184        // canonical) which catch any synthesized variant of this
2185        // collision before it reaches the shared Arc. Test pins
2186        // the symmetric collapse so a future encoder refactor
2187        // that broke the "absent sentinel = biased 0" contract
2188        // (e.g. switched to a different sentinel value) trips
2189        // here loudly.
2190        let max_runtime = KernAddrs::new(0u64, 0u64, Some(u64::MAX));
2191        let payload = max_runtime.to_payload();
2192        // Biased slot reads 0 — collides with the absent encoding.
2193        assert_eq!(
2194            u64::from_le_bytes(payload[16..24].try_into().unwrap()),
2195            0,
2196            "Some(u64::MAX) biased-add wraps to 0; collides with absent sentinel"
2197        );
2198        // Roundtrip surfaces it as None, not Some(u64::MAX).
2199        let decoded = KernAddrs::from_payload(&payload).expect("decode");
2200        assert_eq!(
2201            decoded.kernel_text_runtime_kva, None,
2202            "u64::MAX runtime decodes to None via the bias collision"
2203        );
2204    }
2205}