ktstr/scenario/ops/types/
op.rs

1//! `Op` operation taxonomy + `CpusetSpec` enum with its constructor
2//! impl block. Owns the variant-set (and `OpKind` discriminator
3//! enum) plus the cpuset-spec construction surface. Resolution-time
4//! `CpusetSpec` logic lives in a sibling impl block in
5//! [`super::resolve`] — Rust permits multiple impl blocks across
6//! files in the same crate; the split tracks the construction /
7//! resolution responsibility boundary.
8//!
9//! See the parent module ([`super`]) for the file-layout overview
10//! and the cross-impl-block convention.
11
12use std::borrow::Cow;
13use std::collections::BTreeSet;
14
15use crate::workload::{AffinityIntent, WorkSpec};
16
17use super::CgroupDef;
18
19/// Atomic operation on the cgroup topology.
20///
21/// Names use `Cow<'static, str>` so ops can reference compile-time
22/// literals (zero-cost) or runtime-generated strings (owned).
23///
24/// # `#[non_exhaustive]`
25///
26/// `Op` is `#[non_exhaustive]` — see [`crate::non_exhaustive`] for
27/// the cross-crate pattern-match rule. `Op`-specific construction
28/// convention: prefer the per-op constructors (e.g. `Op::add_cgroup`,
29/// `Op::run_payload`) over naming variants directly; new
30/// constructors are added alongside new variants and are the stable
31/// surface.
32#[derive(Clone, Debug, strum::EnumDiscriminants)]
33#[strum_discriminants(name(OpKind))]
34#[strum_discriminants(derive(strum::EnumIter))]
35#[strum_discriminants(vis(pub))]
36#[non_exhaustive]
37pub enum Op {
38    /// Create a new cgroup under the managed cgroup parent, with no
39    /// cpuset, no controller knobs, and no workers — the
40    /// operator-friendly way to declare an empty move-target cgroup
41    /// that later receives tasks via [`Op::MoveAllTasks`] or
42    /// similar. For mid-step cgroups that need cpuset / cpu /
43    /// memory / io / pids / workers, use [`Op::add_cgroup_def`]
44    /// instead; for setup-time cgroups with the same knobs, declare
45    /// via [`super::super::Step::with_defs`].
46    AddCgroup { name: Cow<'static, str> },
47    /// Create a cgroup mid-step from a full [`CgroupDef`] — cpuset,
48    /// cpu/memory/io/pids knobs, and worker spawns all apply in one
49    /// op, mirroring the way `Step::with_defs` materializes a
50    /// step-local CgroupDef at setup time. Use this when the
51    /// add-cgroup-with-cpuset-and-workers sequence needs to happen
52    /// after the step's setup pass (e.g. driven by an earlier op's
53    /// observed state) instead of as part of the step's setup. The
54    /// embedded `def` is dedup-checked the same way `apply_setup`
55    /// rejects collisions with prior Backdrop or step-local
56    /// CgroupDef declarations.
57    AddCgroupDef { def: CgroupDef },
58    /// Remove a cgroup (stops its workers first). Permitted against
59    /// both step-local and Backdrop-owned cgroups; removing a
60    /// Backdrop cgroup mid-scenario drops it from the Backdrop
61    /// tracking list so a later `Op::AddCgroup` with the same name
62    /// can re-create the cgroup. A typo'd cgroup name surfaces
63    /// later as a kernel-layer "cgroup missing" error on the next
64    /// op that references the name, not at the RemoveCgroup site.
65    RemoveCgroup { cgroup: Cow<'static, str> },
66    /// Set a cgroup's cpuset to the resolved CPU set.
67    SetCpuset {
68        cgroup: Cow<'static, str>,
69        cpus: CpusetSpec,
70    },
71    /// Clear a cgroup's cpuset (allow all CPUs).
72    ClearCpuset { cgroup: Cow<'static, str> },
73    /// Read both cgroups' cpusets and swap them.
74    SwapCpusets {
75        a: Cow<'static, str>,
76        b: Cow<'static, str>,
77    },
78    /// Spawn workers and place them according to `placement`.
79    ///
80    /// The work type is used as-is; gauntlet `work_type_override` does
81    /// not apply. Use [`CgroupDef`] with `swappable(true)` when the
82    /// work type should be overridable.
83    ///
84    /// Placement contract (bullets follow [`SpawnPlacement`] variant
85    /// declaration order):
86    ///   * [`SpawnPlacement::RunnerCgroup`] — spawn workers in the
87    ///     spawner's own cgroup; the handler issues ZERO cgroup ops
88    ///     and the workers inherit whatever cgroup the test runner
89    ///     sits in. `WorkSpec::workers_pct` is rejected for this
90    ///     placement because there's no managed cgroup whose cpuset
91    ///     would supply the percentage denominator.
92    ///   * [`SpawnPlacement::Cgroup`] — spawn workers and move them
93    ///     into the named cgroup; the cgroup must already exist
94    ///     (declared via [`CgroupDef`] in `Step.setup`, via
95    ///     [`Op::AddCgroup`] / [`Op::AddCgroupDef`] earlier in the
96    ///     same step, or on the persistent
97    ///     [`Backdrop`](crate::scenario::Backdrop)).
98    Spawn {
99        placement: SpawnPlacement,
100        work: WorkSpec,
101    },
102    /// Stop all workers in a cgroup (does not remove the cgroup).
103    /// Permitted against both step-local and Backdrop-owned cgroups;
104    /// stopping a Backdrop cgroup's workers mid-scenario leaves the
105    /// cgroup hierarchy intact but makes subsequent ops that expect
106    /// those workers (e.g. wait/kill payload) fail to find them.
107    StopCgroup { cgroup: Cow<'static, str> },
108    /// Set worker affinity in a cgroup. Resolved at apply time via
109    /// `resolve_affinity_for_cgroup()`.
110    SetAffinity {
111        cgroup: Cow<'static, str>,
112        affinity: AffinityIntent,
113    },
114    /// Move all tasks from one cgroup to another.
115    ///
116    /// Each task is moved via `cgroup.procs`. If any move fails, the
117    /// error propagates and handle name keys are left unchanged (workers
118    /// remain addressed under `from`). On success, handle name keys are
119    /// updated to `to` so subsequent ops address the moved workers.
120    ///
121    /// # Self-move rejection
122    ///
123    /// A self-move (`from == to`) is rejected at handler entry — the
124    /// kernel cgroup.procs write is idempotent on same-cgroup targets
125    /// so the op would silently no-op, masking either a stale op the
126    /// test author forgot to remove or a typo. The bail names both
127    /// sides so the operator can pick the right fix. The check also
128    /// catches the symmetric empty-string pair (`("", "")`), which
129    /// would otherwise no-op a RunnerCgroup-to-RunnerCgroup transfer.
130    ///
131    /// # Empty-string source
132    ///
133    /// Passing `from = ""` matches workers spawned by
134    /// [`Op::Spawn`] with [`SpawnPlacement::RunnerCgroup`] —
135    /// RunnerCgroup-placement handles are tracked under the
136    /// empty-string key (workers stay in the spawner's own cgroup,
137    /// outside any managed hierarchy). `Op::move_all_tasks("",
138    /// "named")` is the canonical way to materialize
139    /// RunnerCgroup-placement workers into a managed cgroup
140    /// mid-scenario; after the move the captured handles re-key
141    /// to `"named"` and lose their empty-string identity,
142    /// behaving like any other managed worker (lifetime tied to
143    /// `"named"`'s ownership slot per the table below).
144    ///
145    /// # Lifetime / ownership-direction asymmetry
146    ///
147    /// `MoveAllTasks` is asymmetric with respect to cgroup ownership:
148    /// the legality of a move depends on the relative lifetimes of
149    /// the `from` and `to` cgroups, not just on which one is the
150    /// source.
151    ///
152    /// | `from` ownership      | `to` ownership        | Outcome |
153    /// |-----------------------|-----------------------|---------|
154    /// | step-local            | step-local            | Allowed; both die at step teardown together. |
155    /// | step-local            | Backdrop (persistent) | Allowed; handle ownership transfers from step-local set to Backdrop set so the worker survives step teardown. |
156    /// | Backdrop              | Backdrop              | Allowed; both persist for the scenario. |
157    /// | Backdrop              | step-local            | **Rejected at apply time.** A persistent worker would be stranded inside a cgroup that gets `rmdir`'d at step boundary; the kernel migrates the orphaned task to the cgroup root with a frozen-task warning in dmesg. The `bail!` diagnostic names the offending pair and tells the operator to either declare the destination in the Backdrop too, or move the worker back into a Backdrop-owned cgroup. |
158    ///
159    /// The Backdrop→Backdrop and step→step cases are unconditionally
160    /// allowed because both endpoints share a lifetime; the
161    /// step→Backdrop case is allowed because the kernel moves
162    /// the reference count once and the framework's
163    /// `ScenarioState::rename_handles`
164    /// transfers the handle into the persistent slot in the same
165    /// step. The Backdrop→step case is the only one that produces
166    /// a guaranteed orphan, hence the asymmetric rejection.
167    ///
168    /// # Backdrop-setup exemption
169    ///
170    /// `MoveAllTasks` ops running INSIDE a Backdrop's `setup_ops`
171    /// pass (`state.target_backdrop=true`) are exempt from the
172    /// Backdrop→step-local check: at that point, "step-local"
173    /// cgroups don't exist yet (the Backdrop is the only cgroup
174    /// scope), and the rule reduces to a pure source-ownership
175    /// check that the apply path handles already.
176    MoveAllTasks {
177        from: Cow<'static, str>,
178        to: Cow<'static, str>,
179    },
180    /// Spawn a userspace [`Payload`](crate::test_support::Payload)
181    /// binary in the background and track its
182    /// [`PayloadHandle`](crate::scenario::payload_run::PayloadHandle)
183    /// under the step's payload-handle set.
184    ///
185    /// Subsequent [`Op::WaitPayload`] / [`Op::KillPayload`] address
186    /// the running child by the composite
187    /// (`Payload::name`, `cgroup`) key — the same payload can run
188    /// concurrently in two different cgroups without a dedup
189    /// collision, but the lookup from the waiting op must match
190    /// the pair the run op recorded. See [`Op::WaitPayload`] /
191    /// [`Op::KillPayload`] for the ambiguity rules when the
192    /// waiting op supplies only the name.
193    ///
194    /// Only [`PayloadKind::Binary`](crate::test_support::PayloadKind::Binary)
195    /// payloads are spawnable; scheduler-kind payloads are rejected at
196    /// apply time with an actionable error.
197    ///
198    /// `args` is appended to `payload.default_args`. `cgroup`, when
199    /// set, places the child in the named cgroup (resolved relative
200    /// to the scenario's parent cgroup) via
201    /// [`PayloadRun::in_cgroup`](crate::scenario::payload_run::PayloadRun::in_cgroup);
202    /// unset inherits the spawning process's cgroup.
203    ///
204    /// Handles not explicitly consumed by `WaitPayload` / `KillPayload`
205    /// are drained at step-teardown by `collect_step` (step-local) or
206    /// at scenario end by `collect_backdrop` (when the handle lives on
207    /// the Backdrop), matching the [`CgroupDef::workload`] semantics.
208    ///
209    /// # Scheduler-kind rejection across surfaces
210    ///
211    /// Three surfaces accept a `&Payload` and each rejects a
212    /// scheduler-kind Payload differently — deliberately, to match
213    /// the lifecycle of the caller:
214    ///
215    /// | Surface                                                                                   | Rejection             | When          |
216    /// |-------------------------------------------------------------------------------------------|-----------------------|---------------|
217    /// | [`PayloadRun::run`](crate::scenario::payload_run::PayloadRun::run) (`ctx.payload(&X)...`) | `Err(anyhow::Error)`  | scenario-time |
218    /// | [`CgroupDef::workload`]                                                                   | `panic!`              | declaration-time |
219    /// | `Op::RunPayload` (this variant)                                                           | `Err(anyhow::Error)`  | apply-ops-time |
220    ///
221    /// Rationale: `CgroupDef::workload` is a builder invoked during
222    /// test construction (nextest `--list` phase) — a panic there
223    /// surfaces the misuse before any VM boot, with a full
224    /// backtrace pointing at the offending call. `ctx.payload()`
225    /// and `Op::RunPayload` both run inside an executing scenario
226    /// where one bad misuse should not crash the whole test run;
227    /// they `bail!` with an actionable message and let the
228    /// surrounding step-sequence skip to teardown. The three
229    /// paths are symmetric in *what* they reject (scheduler-kind
230    /// Payloads in non-scheduler slots); they differ only in
231    /// *how* the misuse is surfaced, matched to caller context.
232    RunPayload {
233        payload: &'static crate::test_support::Payload,
234        args: Vec<String>,
235        cgroup: Option<Cow<'static, str>>,
236    },
237    /// Block until the payload named `name` exits naturally, then
238    /// evaluate its checks and record metrics to the per-test sidecar.
239    ///
240    /// The target is looked up by composite key (`name`, `cgroup`).
241    /// `cgroup: None` matches the unique live copy (whatever its
242    /// placement); if two or more copies of the same payload are
243    /// live in different cgroups, the lookup bails with an
244    /// "ambiguous — specify cgroup" error so the test doesn't
245    /// silently wait on the wrong one. Use
246    /// [`Op::wait_payload_in_cgroup`] to disambiguate.
247    ///
248    /// A consumed or unknown `(name, cgroup)` pair returns `Err`
249    /// with an actionable message — test authors must not silently
250    /// wait for payloads that were never started or have already
251    /// been consumed by a prior `WaitPayload`/`KillPayload`.
252    ///
253    /// **No timeout.** `WaitPayload` waits indefinitely for the
254    /// child to exit. A binary that never terminates (e.g. a
255    /// benchmark configured without `--runtime=N`, or a stress-ng
256    /// run without `--timeout`) will hang the step until the
257    /// outer test watchdog fires. For time-boxed long-running
258    /// payloads, prefer [`KillPayload`](Self::KillPayload) paired
259    /// with a [`super::super::HoldSpec::fixed`] / [`super::super::HoldSpec::frac`] step
260    /// boundary that guarantees forward progress; the payload's
261    /// own CLI (`--runtime`, `--timeout`) is the reliable way to
262    /// cap a single invocation's runtime.
263    ///
264    /// Check failures from the payload are recorded to the sidecar
265    /// for regression analysis but do NOT fail the step or the test
266    /// in-process. Use
267    /// [`ctx.payload(&X).run()`](crate::scenario::payload_run::PayloadRun::run)
268    /// directly if the test body needs to gate on check results.
269    WaitPayload {
270        name: Cow<'static, str>,
271        cgroup: Option<Cow<'static, str>>,
272    },
273    /// SIGKILL the payload named `name`, reap the child, evaluate
274    /// checks, and record metrics. Mirrors the behavior of
275    /// step-teardown drain for an explicitly-targeted payload.
276    ///
277    /// The target is looked up by composite key (`name`, `cgroup`)
278    /// — see [`Op::WaitPayload`] for the ambiguity rules.
279    ///
280    /// A consumed or unknown `(name, cgroup)` pair returns `Err`
281    /// with an actionable message, identical to [`Op::WaitPayload`]'s
282    /// lookup semantics.
283    ///
284    /// Check failures from the payload are recorded to the sidecar
285    /// for regression analysis but do NOT fail the step or the test
286    /// in-process. Use
287    /// [`ctx.payload(&X).run()`](crate::scenario::payload_run::PayloadRun::run)
288    /// directly if the test body needs to gate on check results.
289    KillPayload {
290        name: Cow<'static, str>,
291        cgroup: Option<Cow<'static, str>>,
292    },
293    /// Freeze every task in the named cgroup via `cgroup.freeze`.
294    ///
295    /// Writes `"1"` to the cgroup's `cgroup.freeze` file. The kernel's
296    /// `cgroup_freeze_write` dispatches the asynchronous freeze path;
297    /// tasks transition to the frozen state without external SIGSTOP,
298    /// and `cgroup.events` reaches `frozen 1` once every task has
299    /// parked. Idempotent — freezing an already-frozen cgroup is a
300    /// no-op.
301    ///
302    /// # Auto-unfreeze at teardown
303    ///
304    /// `Op::FreezeCgroup` is paired with [`Op::UnfreezeCgroup`] to
305    /// release. A test that omits the unfreeze still tears down
306    /// cleanly: [`crate::cgroup::CgroupManager::remove_cgroup`]
307    /// auto-unfreezes the cgroup before draining tasks (see the
308    /// kernel's `cgroup_freezer_migrate_task`, which clears the
309    /// task's freeze state when it migrates to an unfrozen
310    /// destination), so step teardown is robust to a stuck-frozen
311    /// cgroup. Pair the ops explicitly when the scenario needs
312    /// observable unfreeze timing inside the step body.
313    ///
314    /// # Worked example
315    ///
316    /// Three-Step suspend/resume sequence: a `Backdrop`-resident
317    /// long-running workload is paused mid-scenario and resumed
318    /// later, exercising how the scheduler responds to a sudden
319    /// idle window.
320    ///
321    /// ```text
322    /// Step 1 (run): apply cgroup; workload spins for 2s.
323    /// Step 2 (suspend): Op::freeze_cgroup("workers"); hold 1s.
324    ///                   The cgroup's tasks park via cgroup.freeze,
325    ///                   schedstat gauges drop to zero, and the
326    ///                   scheduler observes a sudden idle subtree.
327    /// Step 3 (resume): Op::unfreeze_cgroup("workers"); hold 2s.
328    ///                  Tasks return to runnable state, the
329    ///                  scheduler must re-pick them onto the
330    ///                  cgroup's CPUs without spuriously preempting
331    ///                  unrelated workloads.
332    /// ```
333    ///
334    /// # Observer-cgroup deadlock warning
335    ///
336    /// Do NOT freeze a cgroup that hosts the test's own observation
337    /// machinery. The freeze path stops every task in the cgroup —
338    /// including any thread that:
339    /// - opens `/proc/<pid>/sched` or other procfs entries owned by
340    ///   tasks inside the frozen cgroup, then waits on the read,
341    /// - holds a futex shared with frozen tasks (the unfreeze must
342    ///   land before the wait can complete),
343    /// - synchronously waits on a stalled-task pipe whose
344    ///   producer is in the frozen cgroup.
345    ///
346    /// The framework's stimulus-event SHM ring and the `BlkWorker`
347    /// epoll loop both run outside the test cgroup tree, so they
348    /// are unaffected — but a test author who explicitly places an
349    /// observer thread inside the same cgroup as its observation
350    /// targets will deadlock the scenario when the freeze fires.
351    /// Place observers in a sibling cgroup (or in the parent) so
352    /// `cgroup.freeze` is scoped to the workload subtree alone.
353    ///
354    /// Pair with [`Op::UnfreezeCgroup`] to release. Useful for
355    /// scheduler suspend/resume tests where the test body wants to
356    /// observe how the scheduler handles a suddenly-frozen workload
357    /// and the resumption sequence afterwards.
358    ///
359    /// Treats a missing cgroup as a step failure: the
360    /// `cgroup.freeze` write fails with `ENOENT` and the error
361    /// propagates via the `apply_ops` `with_context` chain.
362    /// Freezing a non-existent cgroup is NOT a no-op; only
363    /// freezing an already-frozen cgroup is.
364    FreezeCgroup { cgroup: Cow<'static, str> },
365    /// Unfreeze every task in the named cgroup via `cgroup.freeze`.
366    ///
367    /// Writes `"0"` to the cgroup's `cgroup.freeze` file. Inverse of
368    /// [`Op::FreezeCgroup`]. Idempotent.
369    UnfreezeCgroup { cgroup: Cow<'static, str> },
370    /// Capture a host-side diagnostic snapshot under `name`. The
371    /// freeze coordinator pauses every vCPU long enough to read
372    /// the BPF map state, vCPU registers, and per-CPU
373    /// counters into a
374    /// [`FailureDumpReport`](crate::monitor::dump::FailureDumpReport),
375    /// then resumes the guest. The report is keyed by `name` on
376    /// the active
377    /// [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge);
378    /// downstream test code reads it via
379    /// [`Snapshot`](crate::scenario::snapshot::Snapshot).
380    ///
381    /// On-demand snapshots are orthogonal to the error-class
382    /// freeze trigger — the request flows through a separate
383    /// channel, does not transition the coordinator's
384    /// `freeze_state`, and is serviced even after `Done`. The only
385    /// scheduling rule: at most one capture in flight at a time
386    /// (each request waits for the previous freeze's vCPUs to
387    /// fully resume before issuing).
388    ///
389    /// **Guest → host wire.** In-guest scenarios submit the request
390    /// over the virtio-console port-1 TLV stream: `request_snapshot`
391    /// builds a `SnapshotRequestPayload` and writes it via
392    /// `write_msg(MsgType::SnapshotRequest, ...)` to `/dev/vport0p1`
393    /// (`src/vmm/guest_comms.rs`). The host coordinator decodes the
394    /// `MSG_TYPE_SNAPSHOT_REQUEST` frame, runs
395    /// `freeze_and_dispatch(FreezeMode::Capture { .. })`, and the
396    /// installed `CaptureCallback` returns the resulting report
397    /// through a paired reply frame. See
398    /// [`CaptureCallback`](crate::scenario::snapshot::CaptureCallback)
399    /// for the full protocol.
400    ///
401    /// **No active bridge ⇒ no-op.** When the executor runs in a
402    /// context with no installed
403    /// [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge)
404    /// (e.g. unit tests that exercise the executor without
405    /// spinning up a VM), this op emits a `tracing::warn!` and
406    /// continues. Existing scenarios that never declare snapshot
407    /// ops keep their behavior unchanged.
408    ///
409    /// # Example
410    ///
411    /// Declare a snapshot mid-step, fetch the captured report
412    /// after the scenario completes, and assert against a
413    /// BTF-rendered field:
414    ///
415    /// ```ignore
416    /// use ktstr::scenario::ops::{CgroupDef, HoldSpec, Op, Step, execute_steps};
417    /// use ktstr::scenario::snapshot::{Snapshot, SnapshotBridge};
418    ///
419    /// // Wire up the bridge before execute_steps runs (host-side
420    /// // VM setup typically performs this step automatically).
421    /// let bridge = SnapshotBridge::new(/* capture callback */);
422    /// let _guard = bridge.clone().set_thread_local();
423    ///
424    /// let steps = vec![Step {
425    ///     setup: vec![CgroupDef::named("workers").workers(2)].into(),
426    ///     ops: vec![Op::capture_snapshot("after_spawn")],
427    ///     hold: HoldSpec::FULL,
428    /// }];
429    /// execute_steps(ctx, steps)?;
430    ///
431    /// // Inspection.
432    /// let captured = bridge.drain();
433    /// let report = captured.get("after_spawn").expect("snapshot recorded");
434    /// let snap = Snapshot::new(report);
435    /// let nr_cpus = snap.var("nr_cpus_onln").as_u64()?;
436    /// assert!(nr_cpus > 0, "snapshot captured live nr_cpus_onln");
437    /// ```
438    CaptureSnapshot { name: Cow<'static, str> },
439    /// Capture a snapshot whenever the guest writes to the named
440    /// kernel symbol. The snapshot is tagged with the symbol
441    /// itself; one fire = one capture.
442    ///
443    /// Symbol resolution at op execution time is a verbatim match
444    /// against the vmlinux ELF symbol table: the freeze coordinator
445    /// walks `Elf::syms` and accepts the symbol whose strtab entry
446    /// equals the requested string byte-for-byte. There is no
447    /// prefix stripping, BTF lookup, kallsyms walk, or per-CPU
448    /// offset arithmetic — the string must match an entry that
449    /// `nm vmlinux` would print (e.g. `"jiffies_64"`,
450    /// `"scx_watchdog_timestamp"`).
451    ///
452    /// The `register_watch` callback on a host-side
453    /// [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge)
454    /// is for **host-side unit testing only** — it lets in-process
455    /// executor tests record the symbol and return without arming
456    /// any hardware. Production in-VM scenarios run via the
457    /// virtio-console port 1 `MSG_TYPE_SNAPSHOT_REQUEST` TLV frame
458    /// and the host coordinator's `arm_user_watchpoint` path
459    /// (`src/vmm/freeze_coord/mod.rs`); the thread-local bridge is
460    /// never installed inside the guest.
461    ///
462    /// # Guard rails
463    ///
464    /// - **Maximum of 3 watch ops per scenario.** The KVM
465    ///   hardware-watchpoint plumbing reserves slot 0 for the
466    ///   existing `*scx_root->exit_kind` trigger (used by the
467    ///   error-trigger path); only the remaining three user
468    ///   watchpoint slots are available for on-demand watches. The
469    ///   bridge's `register_watch` rejects a 4th
470    ///   `Op::WatchSnapshot` and fails the step when the cap is
471    ///   exceeded.
472    /// - **Symbol resolution failures bail immediately.** A
473    ///   missing symbol or unaligned address surfaces as an `Err`
474    ///   from `execute_steps` so the test author notices the
475    ///   watch did not attach. Silent degradation would leave the
476    ///   scenario running with no captures and look identical to
477    ///   a healthy passing run.
478    /// - **4-byte alignment.** The resolved KVA must be 4-byte
479    ///   aligned: the framework arms 4-byte data-write watches,
480    ///   which require `addr & 0x3 == 0` on every supported
481    ///   architecture. Mis-aligned addresses bail at setup with
482    ///   the resolved KVA in the error.
483    /// - **Silent-misfire detection (KASLR-on guests).** When the
484    ///   host coordinator's `kaslr_offset` is zero AND the
485    ///   resolved kernel symbol lives in the x86_64 high-half
486    ///   address range, `arm_user_watchpoint` emits a
487    ///   `tracing::warn!` (once per unique `(symbol, link_kva)`
488    ///   per process) noting the arm targets the link-time KVA
489    ///   while the runtime symbol lives at `link_kva +
490    ///   runtime_kaslr_slide`. The arm STILL completes (rejecting
491    ///   it would regress every caller running before the host
492    ///   coordinator's runtime-KASLR-slide derivation lands);
493    ///   operators who hit the warn can boot the guest with the
494    ///   `nokaslr` cmdline to use `Op::WatchSnapshot`, or omit
495    ///   the op from KASLR-on test runs entirely.
496    ///
497    /// **Guest → host wire.** The registration request rides the
498    /// same virtio-console port-1 TLV stream as
499    /// [`Op::CaptureSnapshot`] (separate tag namespace), so symbol
500    /// resolution + user watchpoint slot allocation +
501    /// `KVM_SET_GUEST_DEBUG` arming happen on the host without a
502    /// vCPU userspace exit. Once armed, the `KVM_EXIT_DEBUG`
503    /// dispatch path drives the resulting captures directly into
504    /// the freeze coordinator (no per-fire guest request needed). See
505    /// [`WatchRegisterCallback`](crate::scenario::snapshot::WatchRegisterCallback)
506    /// for the full protocol.
507    ///
508    /// Note: watching a high-frequency variable (rq counters,
509    /// jiffies) will trigger the watch every few microseconds and
510    /// produce thousands of captures (each overwriting the prior
511    /// capture under the same tag); the framework does not rate-limit
512    /// captures, so the test author owns the frequency choice.
513    /// Use [`Op::CaptureSnapshot`] for time-driven captures when
514    /// frequency is the concern.
515    WatchSnapshot { symbol: Cow<'static, str> },
516    /// Live-vCPU write of one or more [`KernelTarget`] / [`KernelValue`]
517    /// pairs into running guest memory. The host coordinator routes
518    /// each pair to the appropriate `GuestKernel::write_*` helper
519    /// (no freeze rendezvous, vCPUs keep executing). A Release fence
520    /// is issued after the last write so a weakly-ordered guest's
521    /// `smp_load_acquire` observes the bytes in write order — but
522    /// concurrent guest readers can still race against in-flight
523    /// stores, and the caller owns any guest-side synchronisation
524    /// the test requires (`READ_ONCE` / `smp_load_acquire` on the
525    /// target field).
526    ///
527    /// Same orchestration pattern as the existing
528    /// `BpfMapAccessor::write_value` path: synchronous host-side
529    /// memory mutation on a worker thread, no vCPU pause. Use this
530    /// for scratch fields, debug flags, scx-ktstr-private state,
531    /// and anything the guest reads with proper barriers.
532    ///
533    /// **Batch shape.** `writes` carries 1+ pairs; the executor
534    /// issues them in order. For a single write the
535    /// [`Op::write_kernel_hot`](#method.write_kernel_hot) singleton
536    /// constructor wraps a 1-element vec.
537    ///
538    /// **Dispatch.** The executor's arm calls
539    /// `dispatch_kernel_op_request` (`src/scenario/ops/dispatch.rs`), which
540    /// uses the in-process `SnapshotBridge` callback when one is
541    /// installed (the test-fixture seam) and falls back to the
542    /// virtio-console port-1 wire path (`MsgType::KernelOpRequest`)
543    /// in-guest. The wire request is consumed by
544    /// `dispatch_kernel_op_batch` (`src/vmm/freeze_coord/kernel_op_dispatch.rs`),
545    /// invoked from the freeze coordinator's apply path.
546    ///
547    /// **See also.** [`KernelTarget`] — scroll to the
548    /// "Semantic risk" section for the single source of truth
549    /// on which scheduler-bookkeeping targets are safe vs
550    /// silently load-bearing.
551    WriteKernelHot {
552        /// Ordered list of `(target, value)` pairs to write.
553        writes: Vec<(KernelTarget, KernelValue)>,
554    },
555    /// Auto-freezing batched write of one or more
556    /// [`KernelTarget`] / [`KernelValue`] pairs while every vCPU is
557    /// parked at the freeze rendezvous. Reuses the same coordinator
558    /// path that [`Op::CaptureSnapshot`] triggers: one rendezvous,
559    /// every write in the batch lands while paused, then resume.
560    ///
561    /// **Batching is a hard correctness requirement.** Multi-CPU
562    /// seeds (e.g. a planned `with_uptime` helper writing per-CPU
563    /// `rq.clock` on every CPU at the same instant) must land in
564    /// ONE freeze window —
565    /// N separate cold-write ops would mean N rendezvous cycles
566    /// and observable inter-CPU skew. The variant payload is a
567    /// `Vec` precisely to make batched writes the natural shape.
568    /// The executor's `apply_ops` pre-pass auto-merges adjacent
569    /// singleton `Op::WriteKernelCold` ops into one merged op as
570    /// a safety net — N adjacent `write_kernel_cold(...)` calls
571    /// collapse into one rendezvous regardless of whether the
572    /// caller used [`crate::scenario::ops::Op::write_kernel_cold_batch`]
573    /// or chained singletons.
574    ///
575    /// **Dispatch.** The executor's arm calls
576    /// `dispatch_kernel_op_request` (`src/scenario/ops/dispatch.rs`), which
577    /// uses the in-process `SnapshotBridge` callback when one is
578    /// installed (the test-fixture seam) and falls back to the
579    /// virtio-console port-1 wire path (`MsgType::KernelOpRequest`)
580    /// in-guest. The wire request lands at the freeze coordinator's
581    /// rendezvous boundary via
582    /// `dispatch_kernel_op_batch` (`src/vmm/freeze_coord/kernel_op_dispatch.rs`).
583    ///
584    /// Use this for: multi-field atomic writes, all-CPUs-at-once
585    /// seeding, one-shot setup that must complete before the guest
586    /// observes any partial state. Use [`Op::WriteKernelHot`] when
587    /// the guest is OK with live-write semantics + caller-side
588    /// synchronisation.
589    ///
590    /// **See also.** [`KernelTarget`] — scroll to the
591    /// "Semantic risk" section for the single source of truth
592    /// on which scheduler-bookkeeping targets are safe vs
593    /// silently load-bearing.
594    WriteKernelCold {
595        /// Ordered list of `(target, value)` pairs to write inside
596        /// a single freeze rendezvous.
597        writes: Vec<(KernelTarget, KernelValue)>,
598    },
599    /// Live-vCPU read of a [`KernelTarget`] into the
600    /// [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge)
601    /// drain log keyed by `tag`. Mirrors [`Op::WriteKernelHot`]:
602    /// no freeze rendezvous, host-side worker thread issues the
    /// read while the guest keeps executing. The caller assumes
    /// the read may race against guest writes; for read-write
    /// coherency, pair the op with a guest-side
    /// `smp_store_release` on the target.
607    ///
608    /// Use this for: read-back of values previously written via
609    /// [`Op::WriteKernelHot`], lightweight polling of single fields
610    /// the test wants to observe without pausing the guest.
611    ///
612    /// **Width.** The `width` field picks which
613    /// `crate::monitor::guest::GuestKernel` `read_*` family the
614    /// host dispatcher invokes — `u32` / `u64` / `Bytes(len)`.
615    /// The reply lands as a [`crate::vmm::wire::KernelOpValue`] of
616    /// the matching shape in the bridge's drain log; a u32 field
617    /// must be read with `KernelValueWidth::u32()` (a u64 read of
618    /// a u32 field returns the field's bytes plus 4 adjacent
619    /// bytes).
620    ///
621    /// **Dispatch.** Same bridge-first / wire-fallback model as
622    /// [`Op::WriteKernelHot`]; the wire request is consumed by
623    /// `dispatch_kernel_op_batch` (`src/vmm/freeze_coord/kernel_op_dispatch.rs`).
624    ReadKernelHot {
625        /// Bridge-keyed tag under which the read result lands.
626        tag: Cow<'static, str>,
627        /// Address to read.
628        target: KernelTarget,
629        /// Width specifier: picks the read family + the reply
630        /// value shape.
631        width: KernelValueWidth,
632    },
633    /// Auto-freezing read of a [`KernelTarget`] into the
634    /// [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge)
635    /// drain log keyed by `tag`, taken while every vCPU is parked
636    /// at the freeze rendezvous. Reuses the same coordinator path
637    /// that [`Op::CaptureSnapshot`] triggers. Coherent with
638    /// respect to guest state — no concurrent guest write can race
639    /// against the read.
640    ///
641    /// Use this for: ground-truth reads that must reflect a stable
642    /// guest state, snapshot-style point-in-time reads. Note: each
643    /// `Op::ReadKernelCold` triggers its OWN freeze rendezvous —
644    /// `apply_ops`'s pre-pass folds adjacent
645    /// `Op::WriteKernelCold` ops into one rendezvous but does NOT
646    /// fold reads (per-entry wire tags are needed for the
647    /// multi-read reply-routing contract; queued as a wire-format
648    /// follow-up). For multi-read coherent snapshots, prefer
649    /// [`Op::CaptureSnapshot`] (which already orchestrates a single
650    /// rendezvous for all snapshot reads).
651    ///
652    /// **Width.** Same `width` semantics as [`Op::ReadKernelHot`]:
653    /// pick the read family explicitly so the dispatcher invokes
654    /// the matching `GuestKernel::read_*` helper.
655    ///
656    /// **Dispatch.** Bridge-first / wire-fallback like the other
657    /// `*Kernel*` variants; the wire request lands at the freeze
658    /// coordinator's rendezvous boundary via
659    /// `dispatch_kernel_op_batch` (`src/vmm/freeze_coord/kernel_op_dispatch.rs`).
660    ReadKernelCold {
661        /// Bridge-keyed tag under which the read result lands.
662        tag: Cow<'static, str>,
663        /// Address to read.
664        target: KernelTarget,
665        /// Width specifier: picks the read family + the reply
666        /// value shape.
667        width: KernelValueWidth,
668    },
669    /// Attach a scheduler mid-scenario: spawn the named staged
670    /// scheduler from `/staging/schedulers/<name>/` inside the guest
671    /// and wait for it to publish its first BPF object accessors.
672    ///
673    /// **Dispatch** (`dispatch_attach_scheduler` at
674    /// `src/scenario/ops/dispatch.rs:2032`): waits up to 60s for the
675    /// accessor-init worker to quiesce (handles the case where the
676    /// boot scheduler's first publish is still in flight), captures
677    /// the pre-spawn publish seqno, spawns the staged scheduler
678    /// binary, re-installs the sched_exit_monitor against the new
679    /// SCHED_PID, then waits up to 30s for a fresh accessor publish.
680    ///
681    /// **Already-attached behavior.** No framework-level idempotency
682    /// guard: if a scheduler is already running, the kernel rejects
683    /// the new attach at the `scx_enable_state() != SCX_DISABLED`
684    /// gate (`kernel/sched/ext.c:6837`, returns `-EBUSY`); the
685    /// spawned binary exits, no fresh publish lands, and the dispatch
686    /// bails on the 30s publish-wait timeout. Use
687    /// [`Op::DetachScheduler`] (then `AttachScheduler`) or
688    /// [`Op::ReplaceScheduler`] to swap schedulers.
689    ///
690    /// The `scheduler` reference holds a `'static` lifetime: the
691    /// test author declares each [`crate::test_support::Scheduler`]
692    /// at static scope (via `declare_scheduler!` or a
693    /// `static MY_SCHED: Scheduler = ...` item) and passes the
694    /// borrow into the constructor. The staging slot that ships the
695    /// binary into the initramfs is `KtstrTestEntry::staged_schedulers`;
696    /// the dispatch arm reads its path via
697    /// `test_support::staged::staged_scheduler_binary_path`.
698    AttachScheduler {
699        scheduler: &'static crate::test_support::Scheduler,
700    },
701    /// Detach the currently-running scheduler.
702    ///
703    /// **Dispatch** (`dispatch_detach_scheduler` →
704    /// `kill_current_scheduler` at `src/scenario/ops/dispatch.rs:1896`):
705    /// stops the host's sched_exit_monitor so the intentional kill
706    /// isn't promoted into a test-fatal scheduler-died signal,
707    /// writes `'S'` to `/proc/sysrq-trigger` to start the kernel-
708    /// side `scx_disable` cascade asynchronously (avoiding the
709    /// D-state stall inside `scx_flush_disable_work`'s
710    /// `kthread_flush_work(&sch->disable_work)` at
711    /// `kernel/sched/ext.c:6145`, reached on the struct_ops detach
712    /// path via `bpf_scx_unreg` at `kernel/sched/ext.c:7666`), sends
713    /// `SIGTERM` to the
714    /// scheduler pid, waits up to `SCHED_LIFECYCLE_KILL_GRACE` (10s)
715    /// for the kernel BPF state to reach `SCX_DISABLED`, then
716    /// clears the `SCHED_PID` atomic (defined in
717    /// `src/vmm/rust_init/mod.rs`) so subsequent
718    /// `crate::vmm::rust_init::sched_pid()` reads return `None`.
719    ///
720    /// Bails when no scheduler is currently attached (SCHED_PID is
721    /// 0), when the SIGTERM syscall fails, or when the
722    /// `SCX_DISABLED` wait times out. NOT idempotent: a second
723    /// detach with no scheduler attached bails rather than no-oping.
724    /// For defensive "ensure clean slate" scaffolds, gate on
725    /// `crate::vmm::rust_init::sched_pid()` returning `Some` before
726    /// emitting the Detach step rather than relying on no-op
727    /// tolerance.
728    DetachScheduler,
729    /// Kill the currently-running scheduler and respawn the BOOT
730    /// scheduler. Useful for hot-restart validation of the boot
731    /// scheduler. Bails if no scheduler is currently attached.
732    ///
733    /// **v0 limitation.** Always respawns the boot scheduler at
734    /// `/scheduler` + `/sched_args` regardless of which scheduler
735    /// was most-recently attached — after an `Op::AttachScheduler`
736    /// or `Op::ReplaceScheduler` to a staged scheduler, this op
737    /// restarts the BOOT scheduler, not the most-recently-attached
738    /// one. For restarting a staged scheduler, use
739    /// [`Op::ReplaceScheduler`] with the same staged spec.
740    ///
741    /// **Dispatch** (`dispatch_restart_scheduler` at
742    /// `src/scenario/ops/dispatch.rs:2129`): kills the current scheduler
743    /// via the shared `kill_current_scheduler` helper, spawns the
744    /// boot scheduler from the hardcoded `/scheduler` + `/sched_args`
745    /// paths with log at `/tmp/sched.log`, then re-installs the
746    /// sched_exit_monitor against the re-spawned boot pid.
747    RestartScheduler,
748    /// Detach the currently-running scheduler and attach a different
749    /// one. Equivalent to `[DetachScheduler, AttachScheduler {
750    /// scheduler: new }]` but expressed as a single op so the
751    /// no-scheduler window is bounded and the per-phase scheduler
752    /// tagging on the sidecar can record the transition atomically.
753    ///
754    /// The mid-experiment swap case the operator typically wants:
755    /// run scheduler A for the first phase of a multi-step test, swap
756    /// to scheduler B (or A-with-different-CLI-args, modeled as a
757    /// distinct `Scheduler` declaration) for the second phase, and
758    /// assert a per-phase metric delta across the boundary.
759    ///
760    /// Bails if no scheduler is currently attached — there is no
761    /// scheduler to detach from, so the "replace" semantic has no
762    /// meaning. Use [`Op::AttachScheduler`] for the first attach.
763    ///
764    /// **Dispatch** (`dispatch_replace_scheduler` at
765    /// `src/scenario/ops/dispatch.rs:2153`): kills the current scheduler
766    /// via the shared `kill_current_scheduler` helper, spawns the
767    /// named staged scheduler binary from
768    /// `/staging/schedulers/<name>/`, re-installs the
769    /// sched_exit_monitor against the new SCHED_PID, waits up to
770    /// `REPLACE_NOT_TRYING_DEADLINE_S` (5s) for the accessor-init
771    /// worker to quiesce, captures the pre-publish seqno, then
772    /// waits up to 10s for fresh accessors to publish against the
773    /// new BPF object. The 10s budget aligns with
774    /// `SCHED_LIFECYCLE_KILL_GRACE` and covers a cold-cache vmlinux
775    /// re-parse during the worker reinit.
776    ReplaceScheduler {
777        scheduler: &'static crate::test_support::Scheduler,
778    },
779    /// Open a BPF map fd by name and hold it for the scenario lifetime.
780    ///
781    /// **Why this exists.** `Op::ReplaceScheduler` kills the outgoing
782    /// scheduler process; libbpf's drop path then releases the map
783    /// fds the loader was holding. Once the last refcount on a map
784    /// drops, the kernel frees it — typically before any post-swap
785    /// freeze captures, so the multi-bss "same-binary swap window"
786    /// case (two `<obj>.bss` copies coexisting briefly) closes too
787    /// fast to be reliably observed in a test. `PinBpfMap` holds an
788    /// extra refcount on the named map so the kernel keeps it alive
789    /// until the scenario ends.
790    ///
791    /// **Semantics.** Walks the kernel's map ID space (via
792    /// [`libbpf_rs::query::MapInfoIter`], which wraps
793    /// `BPF_MAP_GET_NEXT_ID` + `BPF_MAP_GET_FD_BY_ID` +
794    /// `BPF_OBJ_GET_INFO_BY_FD`) and keeps the fd whose name matches.
795    /// The held fd lives in the scenario's Backdrop state and drops
796    /// (via std `OwnedFd` `Drop`) at scenario teardown. Multiple
797    /// `PinBpfMap` ops with **distinct** names accumulate; pinning the
798    /// **same** name twice is a no-op (the second call returns without
799    /// re-opening the fd, so the originally-pinned map instance is the
800    /// one held — not the second-call-time instance).
801    ///
802    /// **Name truncation.** BPF map names are capped at
803    /// `BPF_OBJ_NAME_LEN = 16` bytes including the trailing NUL, so
804    /// 15 usable chars max per `kernel/bpf/syscall.c`'s
805    /// `bpf_obj_name_cpy`. Pass the kernel-visible name (typically
806    /// `<obj>.bss` / `<obj>.data` / `<obj>.rodata`). When a libbpf
807    /// object name + section suffix exceeds the 15-char cap, libbpf
808    /// truncates the object prefix at load time and the kernel-side
809    /// name is the truncated form; the framework does not auto-
810    /// truncate the user-supplied string, so pass the post-truncation
811    /// form. Reading the map names from a prior
812    /// [`crate::monitor::dump::FailureDumpReport`]'s `maps[].name`
813    /// or via `bpftool map list` is the safe way to discover the
814    /// exact string the kernel sees.
815    ///
816    /// **Order.** Place this op AFTER the scheduler that owns the
817    /// target map has attached (typically a small fixed hold suffices
818    /// — ~100ms for the small scx-ktstr fixture, longer for
819    /// heavyweight schedulers). For the same-binary swap-window
820    /// scenario specifically: pin the **outgoing** scheduler's bss
821    /// **before** `Op::ReplaceScheduler` runs — pinning after the
822    /// swap is too late because the outgoing scheduler's bss has
823    /// already been freed by libbpf's drop path. The pin walker
824    /// picks the lowest-id matching map, so the outgoing copy (the
825    /// older id) is the one held; the incoming scheduler's load
826    /// then creates a second copy that's also kept alive because
827    /// the outgoing refcount blocks the kernel from freeing the id.
828    ///
829    /// **Failure surface.** The pin runs at Step apply time inside
830    /// `execute_steps` / `execute_scenario`. A failure (no matching
831    /// map found in the walk) bails out of the apply path as an
832    /// `Err` from `execute_steps`; the scenario stops before the
833    /// next Step runs and the `post_vm` callback is not invoked.
834    /// The underlying [`libbpf_rs::query::MapInfoIter`] silently
835    /// terminates iteration on any non-`ENOENT` errno from the BPF
836    /// ID walk (including `EPERM` from missing `CAP_SYS_ADMIN`), so
837    /// such errors surface as the no-matching-map case rather than
838    /// a distinct EPERM error — acceptable because ktstr always runs
839    /// as root inside the guest, so the CAP_SYS_ADMIN gates at
840    /// `kernel/bpf/syscall.c:4761` (`BPF_MAP_GET_NEXT_ID` walk) and
841    /// `:4869` (`BPF_MAP_GET_FD_BY_ID`) are always satisfied and the
842    /// EPERM path is unreachable in practice.
843    ///
844    /// **Example.**
845    /// ```ignore
846    /// let steps = vec![
847    ///     // Phase 0: primary scheduler runs alone; pin BEFORE the swap.
848    ///     Step::with_op(
849    ///         Op::pin_bpf_map("<obj>.bss"),
850    ///         HoldSpec::frac(0.3),
851    ///     ),
852    ///     // Phase 1: swap to a same-binary alt — the pinned map
853    ///     // keeps the OUTGOING bss alive across the teardown.
854    ///     Step::with_op(
855    ///         Op::replace_scheduler(&STAGED_ALT_SCHED),
856    ///         HoldSpec::frac(0.7),
857    ///     ),
858    /// ];
859    /// ```
860    ///
861    /// **See also.** [`crate::scenario::bpf_pin::open_bpf_map_fd_by_name`]
862    /// for the underlying helper and `tests/live_var_disambiguation_e2e.rs`
863    /// for the swap-window conditional walker-fired gate this pin is
864    /// designed to make deterministic.
865    PinBpfMap { name: Cow<'static, str> },
866    /// Capture the current `cgroup.procs` of `cgroup` and store the
867    /// PID list on the active [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge)
868    /// under `tag`.
869    ///
870    /// Synchronous read of the cgroup-v2 `cgroup.procs` pseudofile in
871    /// the dispatching thread (in-scenario — runs wherever
872    /// `execute_scenario` runs; inside the guest VM for `#[ktstr_test]`
873    /// e2e tests, on the host for host-only scenarios). Returns the
874    /// thread-group leaders (PIDs / TGIDs) the kernel reports at apply
875    /// time. The snapshot is appended to the bridge's per-tag drain
876    /// log; test bodies drain via
877    /// [`SnapshotBridge::drain_cgroup_procs`](crate::scenario::snapshot::SnapshotBridge::drain_cgroup_procs)
878    /// (or the by-tag lookup
879    /// [`SnapshotBridge::cgroup_procs_by_tag`](crate::scenario::snapshot::SnapshotBridge::cgroup_procs_by_tag))
880    /// after the scenario completes to read the captured pids back.
881    ///
882    /// Distinct from [`Op::CaptureSnapshot`]: that op routes through
883    /// the host-side freeze coordinator (TLV transport in production,
884    /// thread-local bridge in test fixtures); this op runs entirely
885    /// in-process against the local cgroupfs.
886    ///
887    /// # Use cases
888    ///
889    /// Pin "did my workers land in cgroup X" assertions without the
890    /// shell-probe + tmpfs-roundtrip pattern. Typical shape:
891    ///
892    /// ```ignore
893    /// use ktstr::prelude::SnapshotBridge;
894    /// use std::sync::Arc;
895    ///
896    /// // Install a bridge (dummy capture cb — only cgroup-procs drain
897    /// // is used). MUST clone before set_thread_local, which consumes
898    /// // self — the clone shares the Arc-internal state and is what
899    /// // we drain on after the scenario completes.
900    /// let bridge = SnapshotBridge::new(Arc::new(|_| None));
901    /// let bridge_for_drain = bridge.clone();
902    /// let _guard = bridge.set_thread_local();
903    ///
904    /// let backdrop = Backdrop::new().push_op(Op::add_cgroup("workers"));
905    /// let steps = vec![
906    ///     Step::new(
907    ///         vec![
908    ///             Op::spawn(SpawnPlacement::cgroup("workers"),
909    ///                       WorkSpec::default().workers(4)),
910    ///             Op::capture_cgroup_procs("after_spawn", "workers"),
911    ///         ],
912    ///         HoldSpec::fixed(Duration::ZERO),
913    ///     ),
914    /// ];
915    /// let _ = execute_scenario(&ctx, backdrop, steps)?;
916    ///
917    /// // Either drain the whole log or look up by tag.
918    /// let after = bridge_for_drain.cgroup_procs_by_tag("after_spawn")
919    ///     .expect("Op::CaptureCgroupProcs(\"after_spawn\", ...) snapshot");
920    /// assert_eq!(after.pids.len(), 4);
921    /// ```
922    ///
923    /// # Within-Step ordering
924    ///
925    /// Ops in a single Step apply sequentially in vec order, so a
926    /// `Op::CaptureCgroupProcs` placed AFTER `Op::Spawn` /
927    /// `Op::MoveAllTasks` observes the post-spawn / post-migrate
928    /// kernel state. The producing ops complete synchronously (their
929    /// `cgroup.procs` writes block on kernel commit), so the capture
930    /// sees every PID those ops placed.
931    ///
932    /// # PID vs TID grain
933    ///
934    /// Reads `cgroup.procs` (thread-group leaders), NOT `cgroup.threads`
935    /// (per-thread TIDs). Grain implications by spawn op:
936    ///
937    /// - `Op::Spawn` → ktstr workers are 1-thread-per-worker, so
938    ///   `workers(N)` produces `N` pids in `cgroup.procs`.
939    /// - `Op::RunPayload` → an `execve`'d binary is ONE process; even
940    ///   if the binary spawns 100 threads, `cgroup.procs` reports the
941    ///   single thread-group leader. Tests asserting per-thread
942    ///   placement would need a sibling `cgroup.threads` accessor
943    ///   (future Op variant if a use case arises).
944    ///
945    /// # Tag uniqueness
946    ///
947    /// `tag` is the snapshot key the test body uses to find the
948    /// capture in the drain log. The apply-ops dispatch rejects an
949    /// empty `tag` with an actionable bail. Multiple captures of
950    /// the same `cgroup` under DIFFERENT tags surface as separate
951    /// entries (lets a scenario capture pre/post snapshots of the
952    /// same cgroup); multiple captures with the same `(tag, cgroup)`
953    /// also append rather than overwrite — tag uniqueness is a caller
954    /// convention, not a framework-enforced contract. The by-tag
955    /// lookup [`SnapshotBridge::cgroup_procs_by_tag`](crate::scenario::snapshot::SnapshotBridge::cgroup_procs_by_tag)
956    /// returns the FIRST match; callers who care about multiplicity
957    /// must use [`SnapshotBridge::drain_cgroup_procs`](crate::scenario::snapshot::SnapshotBridge::drain_cgroup_procs)
958    /// and filter the Vec manually.
959    ///
960    /// # Empty / unknown cgroup
961    ///
962    /// - Empty cgroup (exists but holds no tasks): captured snapshot
963    ///   has `pids = vec![]`. Lets callers assert "no tasks landed
964    ///   here" without conflating with "no such cgroup."
965    /// - Unknown cgroup (directory missing): apply bails with a
966    ///   layered anyhow chain — the outer wrap names the op + tag +
967    ///   cgroup; the inner [`crate::cgroup::CgroupOps::read_procs`]
968    ///   context surfaces the resolved path + the actionable hint
969    ///   about `Op::AddCgroup` / `workload_root_cgroup`. Use
970    ///   `format!("{err:#}")` (alternate display) to flatten both
971    ///   layers in test assertions.
972    ///
973    /// # See also
974    ///
975    /// - [`Op::CaptureSnapshot`] — diagnostic-snapshot capture (full
976    ///   scheduler state dump via FailureDumpReport). Distinct from
977    ///   this op's cgroup-procs read AND drains via a separate
978    ///   `SnapshotBridge::drain` / `drain_ordered` channel, not
979    ///   `drain_cgroup_procs`.
980    /// - [`crate::cgroup::CgroupOps::read_procs`] — the underlying
981    ///   trait method this op dispatches through.
982    CaptureCgroupProcs {
983        /// Snapshot key. Must be non-empty. Used by
984        /// [`SnapshotBridge::drain_cgroup_procs`](crate::scenario::snapshot::SnapshotBridge::drain_cgroup_procs)
985        /// consumers to find this capture in the drain log.
986        tag: Cow<'static, str>,
987        /// Cgroup to read `cgroup.procs` from. Must be a name
988        /// already tracked by the scenario (created via
989        /// `Op::AddCgroup`, a `CgroupDef` in setup, or pushed on
990        /// the Backdrop). Must be non-empty.
991        cgroup: Cow<'static, str>,
992    },
993    /// Re-steer a hardware IRQ to a single CPU by writing
994    /// `/proc/irq/<N>/smp_affinity_list` in the guest — the knob
995    /// that drives the kernel's `write_irq_affinity` →
996    /// `irq_set_affinity` → `irq_do_set_affinity` → irqchip
997    /// `set_affinity` path (`kernel/irq/proc.c`,
998    /// `kernel/irq/manage.c`). Use it to place a NIC's
999    /// RX-completion interrupt on a chosen CPU so the hardirq, the
1000    /// `NET_RX` softirq it raises, and any task that path wakes all
1001    /// land where the scenario wants them: the steering half of an
1002    /// IRQ-locality test whose generating half is
1003    /// [`crate::workload::WorkType::NetTraffic`] and whose observing
1004    /// half is the per-CPU IRQ metric axis (`max_cpu_hardirqs`,
1005    /// `max_cpu_softirq_net_rx`, and their `*_concentration` ratios).
1006    ///
1007    /// # In-guest file write, NOT a kernel-memory poke
1008    ///
1009    /// A write to the `irq_desc` affinity mask in kernel memory would
1010    /// NOT re-route delivery — only the `smp_affinity_list` write
1011    /// runs the full set-affinity path that reprograms the interrupt
1012    /// controller (MSI-X message / IOAPIC RTE). So this Op is
1013    /// dispatched as a plain `std::fs::write` from the executor
1014    /// in-guest (mirroring the `/proc/sysrq-trigger` write
1015    /// [`Op::DetachScheduler`] performs), NOT through the
1016    /// kernel-memory rendezvous path of [`Op::WriteKernelHot`] /
1017    /// [`Op::WriteKernelCold`].
1018    ///
1019    /// # Online-CPU requirement
1020    ///
1021    /// The kernel intersects the requested mask with
1022    /// `cpu_online_mask` before programming the irqchip
1023    /// (`irq_do_set_affinity`); a single-CPU target that is offline
1024    /// leaves no online CPU in the mask and the write returns
1025    /// `-EINVAL` (the `!cpumask_intersects(new_value,
1026    /// cpu_online_mask)` arm of `write_irq_affinity`). The
1027    /// dispatcher pre-checks `cpu` against
1028    /// `/sys/devices/system/cpu/online` and bails with an actionable
1029    /// message before the write, so an out-of-range / offline target
1030    /// names the CPU instead of surfacing a bare `EINVAL`. IRQ
1031    /// affinity is a system-wide property, NOT scoped to the writing
1032    /// task's cpuset — the target need not be in the runner's
1033    /// allowed set.
1034    ///
1035    /// Construct via [`Op::steer_irq`].
1036    SteerIrq {
1037        /// Which IRQ to steer — a literal Linux IRQ number or a
1038        /// `/proc/interrupts` action-name label. See [`IrqSelector`].
1039        irq: IrqSelector,
1040        /// Target Linux processor number — the value written to
1041        /// `smp_affinity_list`. Must be online (see the variant's
1042        /// online-CPU requirement above).
1043        cpu: usize,
1044    },
1045}
1046
/// Where the workers started by [`Op::Spawn`] end up.
///
/// An earlier taxonomy modelled the two placement choices as two
/// separate ops (`SpawnWorkers` and `SpawnHost`). The unified
/// `Op::Spawn` variant carries the placement as data instead, so
/// the framework has exactly ONE spawn op. Because
/// `SpawnPlacement` is `#[non_exhaustive]`, additional placements
/// are introduced as variants of this enum rather than as new `Op`
/// variants.
///
/// # `#[non_exhaustive]`
///
/// `SpawnPlacement` is `#[non_exhaustive]`; the cross-crate
/// pattern-match and construction rules that apply to every such
/// type are described in [`crate::non_exhaustive`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum SpawnPlacement {
    /// Leave the workers in the spawner's own cgroup, i.e. the
    /// cgroup of the test runner — NOT a managed workload cgroup
    /// declared through [`CgroupDef`] or [`Op::AddCgroup`]. The
    /// handler performs ZERO cgroup ops for this placement; the
    /// workers simply inherit the cgroup the test runner is in.
    ///
    /// In a guest VM the runner usually sits in the root cgroup
    /// (cgid=1), so RunnerCgroup workers show up in snapshots
    /// under the root cgroup and not under the workload's named
    /// hierarchy.
    ///
    /// `WorkSpec::workers_pct` is rejected with this placement:
    /// no managed cgroup is involved, so there is no cpuset to
    /// act as the percentage denominator. Either give an explicit
    /// `.workers(N)` count, or use `Cgroup(name)` with a cgroup
    /// whose cpuset supplies that denominator.
    ///
    /// # Why "RunnerCgroup"?
    ///
    /// This placement used to be spelled `SpawnHost`, where
    /// "host" meant the spawner's own cgroup (compare the
    /// scheduler-observability "host tasks vs workload tasks"
    /// distinction in sched_ext schedulers, e.g. mitosis's cell
    /// 0). `RunnerCgroup` states the target exactly — the
    /// test-runner process's cgroup — and drops the
    /// host-vs-guest-machine ambiguity the word "host" brought
    /// with it.
    RunnerCgroup,
    /// Spawn the workers, then move them into the managed cgroup
    /// with the given name. That cgroup has to exist by the time
    /// the spawn op applies: declare it via [`CgroupDef`] in
    /// `Step.setup`, via an [`Op::AddCgroup`] /
    /// [`Op::AddCgroupDef`] placed earlier in the same step, or
    /// on the persistent [`Backdrop`](crate::scenario::Backdrop).
    Cgroup(Cow<'static, str>),
}

impl SpawnPlacement {
    /// Build a [`SpawnPlacement::Cgroup`] from anything
    /// string-like (`&'static str`, `String`,
    /// `Cow<'static, str>`). Follows the same
    /// `impl Into<Cow<'static, str>>` convention as the
    /// cgroup-name constructors on [`Op`] (`Op::add_cgroup`, ...),
    /// which lets callers write `"name"` instead of
    /// `"name".into()`.
    pub fn cgroup(name: impl Into<Cow<'static, str>>) -> Self {
        Self::Cgroup(name.into())
    }

    /// Build a [`SpawnPlacement::RunnerCgroup`]. Declared
    /// `const fn` so the value can be produced inside `const`
    /// scenarios + builds.
    pub const fn runner_cgroup() -> Self {
        Self::RunnerCgroup
    }
}
1117
/// Identifies the IRQ an [`Op::SteerIrq`] acts on.
///
/// The same hardware IRQ can be named in two ways:
///
/// - [`ByNumber`](Self::ByNumber) — the literal Linux IRQ number,
///   i.e. the leading column of `/proc/interrupts` and the `<N>`
///   in `/proc/irq/<N>/`. Pick this when the scenario has already
///   resolved the number (for example through per-NIC IRQ
///   discovery the test performs itself).
/// - [`ByLabel`](Self::ByLabel) — the `/proc/interrupts` action
///   name, meaning the last whitespace token on the IRQ's line.
///   The dispatcher scans `/proc/interrupts`, takes the first line
///   whose last token equals the label, and steers that IRQ. Pick
///   this when the number is not known up front but the device
///   label is stable. On the virtio-MMIO transport ktstr boots,
///   the NIC registers ONE shared IRQ whose action name is the
///   bare device basename (e.g. `"virtio1"`), so the label
///   resolves uniquely. Limitation: only the line's last token is
///   compared — for a shared IRQ (a comma-separated action chain)
///   just the LAST action matches, and a multi-word action name
///   never matches. The comparison is deliberately not widened to
///   any token, because the per-CPU count / chip / hwirq columns
///   would then false-match; use [`ByNumber`](Self::ByNumber) for
///   a shared or multi-word-named IRQ.
///
/// # `#[non_exhaustive]`
///
/// `IrqSelector` is `#[non_exhaustive]`; the cross-crate
/// pattern-match and construction rules that apply to every such
/// type are described in [`crate::non_exhaustive`]. Prefer the
/// [`by_number`](Self::by_number) / [`by_label`](Self::by_label)
/// constructors to naming the variants directly.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum IrqSelector {
    /// Literal Linux IRQ number: the leading column of
    /// `/proc/interrupts`, the `<N>` in
    /// `/proc/irq/<N>/smp_affinity_list`.
    ByNumber(u32),
    /// `/proc/interrupts` action-name label (the last whitespace
    /// token of the line); turned into an IRQ number at dispatch.
    ByLabel(Cow<'static, str>),
}

impl IrqSelector {
    /// Pick an IRQ by its literal Linux IRQ number. Declared
    /// `const fn` so the value can be produced inside `const`
    /// scenarios + builds.
    pub const fn by_number(irq: u32) -> Self {
        Self::ByNumber(irq)
    }

    /// Pick an IRQ by its `/proc/interrupts` action-name label —
    /// the last whitespace token on the IRQ's line, such as a
    /// virtio-net device basename like `"virtio1"`. Any
    /// string-like input is accepted (`&'static str`, `String`,
    /// `Cow<'static, str>`), matching the cgroup-name constructor
    /// convention on [`Op`].
    pub fn by_label(label: impl Into<Cow<'static, str>>) -> Self {
        Self::ByLabel(label.into())
    }
}
1177
/// How to compute a cpuset from topology.
///
/// # `#[non_exhaustive]`
///
/// `CpusetSpec` is `#[non_exhaustive]` — see
/// [`crate::non_exhaustive`] for the cross-crate pattern-match and
/// construction rules shared by every such type.
///
/// Variant-specific guidance for `CpusetSpec`: prefer the
/// associated constructor functions — [`Self::llc`], [`Self::numa`],
/// [`Self::range`], [`Self::disjoint`], [`Self::overlap`], and
/// [`Self::exact`] — over naming variant literals like
/// `CpusetSpec::Llc(0)` or `CpusetSpec::Range { start_frac,
/// end_frac }`. Two reasons:
///
/// 1. **Stability across variant reshaping.** A future commit that
///    adds a field to `Range` (e.g. a stride parameter) breaks every
///    caller that spelled out `CpusetSpec::Range { start_frac,
///    end_frac }`; the `Self::range(..)` constructor can keep its
///    signature and fill the new field with a default value. The
///    `#[non_exhaustive]` attribute is what reserves that freedom
///    for the enum; the constructor convention is how callers opt
///    into benefiting from it.
/// 2. **Semantic consistency with [`Self::exact`].** The `exact`
///    constructor accepts any `IntoIterator<Item = usize>` (arrays,
///    ranges, `Vec`, `BTreeSet`) and converts to `BTreeSet<usize>`
///    internally; callers that bypass it and write
///    `CpusetSpec::Exact(set)` directly must hand-build the
///    `BTreeSet` — duplicate bookkeeping a future-proofed constructor
///    erases.
///
/// Test code that needs to *inspect* a variant via pattern match
/// necessarily references the variant literal (the name is
/// load-bearing for the match), so the construction-side rule is a
/// convention for *production* call sites, not a hard constraint.
/// Inside this crate, matchers obey the pattern-side rule above;
/// constructors obey this rule.
///
/// `Clone + Debug + PartialEq`. `Eq` / `Hash` are impossible
/// because [`Range`](Self::Range) and [`Overlap`](Self::Overlap)
/// carry `f64` fractions; `Default` has no honest value (`Llc(0)`
/// vs. `Range(0..1)` vs. `Exact(empty)` are all different
/// "no-op" semantics).
///
/// Note: `f64::NAN != f64::NAN` per IEEE 754, so a `CpusetSpec`
/// containing NaN fractions will not equal a clone of itself;
/// `validate()` rejects NaN inputs.
#[derive(Clone, Debug, PartialEq)]
#[non_exhaustive]
pub enum CpusetSpec {
    /// All CPUs in a given LLC index.
    Llc(usize),
    /// All CPUs in a given NUMA node index.
    Numa(usize),
    /// Fractional range of usable CPUs `[start_frac..end_frac)`.
    Range { start_frac: f64, end_frac: f64 },
    /// Partition usable CPUs into `of` equal disjoint sets; take the `index`-th.
    Disjoint { index: usize, of: usize },
    /// Like `Disjoint` but each set overlaps neighbors by `frac` of its size.
    Overlap { index: usize, of: usize, frac: f64 },
    /// Exact CPU set (no topology resolution).
    Exact(BTreeSet<usize>),
}

impl CpusetSpec {
    /// Construct an `Exact` cpuset from any iterator of CPU indices.
    ///
    /// Accepts arrays, ranges, `Vec`, `BTreeSet`, or any
    /// `IntoIterator<Item = usize>`. The indices are collected into
    /// a `BTreeSet`, so duplicates collapse and the input order is
    /// irrelevant; an empty iterator yields an empty set.
    pub fn exact(cpus: impl IntoIterator<Item = usize>) -> Self {
        let set: BTreeSet<usize> = cpus.into_iter().collect();
        Self::Exact(set)
    }

    /// Partition usable CPUs into `of` equal disjoint sets; take the `index`-th.
    ///
    /// Pure data constructor: the arguments are stored as given and
    /// not checked here.
    pub const fn disjoint(index: usize, of: usize) -> Self {
        Self::Disjoint { index, of }
    }

    /// Like [`disjoint`](Self::disjoint) but each set overlaps neighbors by `frac` of its size.
    ///
    /// Pure data constructor: the arguments are stored as given and
    /// not checked here.
    pub const fn overlap(index: usize, of: usize, frac: f64) -> Self {
        Self::Overlap { index, of, frac }
    }

    /// Fractional range of usable CPUs `[start_frac..end_frac)`.
    ///
    /// Pure data constructor: the fractions are stored as given and
    /// not checked here.
    pub const fn range(start_frac: f64, end_frac: f64) -> Self {
        Self::Range {
            start_frac,
            end_frac,
        }
    }

    /// All CPUs in a given LLC index.
    pub const fn llc(index: usize) -> Self {
        Self::Llc(index)
    }

    /// All CPUs in a given NUMA node index.
    pub const fn numa(index: usize) -> Self {
        Self::Numa(index)
    }
}
1278
/// Host-side write/read target for the kernel-memory ops
/// ([`Op::WriteKernelHot`] / [`Op::WriteKernelCold`] /
/// [`Op::ReadKernelHot`] / [`Op::ReadKernelCold`]).
///
/// Each variant names a kernel address by the resolution path the
/// host coordinator will take when the op fires; the actual
/// `GuestKernel` write helpers consume the resolved KVA. The variant
/// chosen here picks WHICH translation path (KASLR-aware kernel-image
/// base for [`Self::Symbol`], `PAGE_OFFSET` for [`Self::Direct`],
/// page-table walk for [`Self::Kva`], per-CPU dereference for
/// [`Self::PerCpuField`], or task-list walk for
/// [`Self::TaskField`]).
///
/// # Semantic risk — writing to load-bearing scheduler state
///
/// ktstr does not gate or filter target addresses. The framework
/// trusts the test author to know what they are pointing at. That
/// trust includes a class of fields where a raw write silently
/// breaks downstream kernel invariants the test author did not
/// intend to perturb. By design, mitigation is documentation-only:
/// the framework will not refuse a write nor emit a runtime warn —
/// the test author owns the choice. The cases to know about:
///
/// **Per-runqueue counters maintained by the scheduler classes.**
/// Raw writes skip the side-effects the kernel encodes in the
/// maintainer functions, leaving cross-class accounting in an
/// inconsistent state.
///
/// * **`struct rq.nr_running`** — the per-CPU runqueue task count.
///   `add_nr_running` / `sub_nr_running` (`kernel/sched/sched.h`)
///   also (a) fire the `sched_update_nr_running_tp` tracepoint and
///   (b) call `sched_update_tick_dependency(rq)` (the
///   `NOHZ_FULL` per-CPU tick gating logic); `add_nr_running`
///   additionally sets the root-domain `overloaded` bit
///   (`rq->rd->overloaded`) on the `prev_nr < 2 && new_nr >= 2`
///   transition. A bare 8-byte store skips all of those; the
///   counter and the root-domain overload signal diverge, the
///   NOHZ_FULL CPU may stop or start receiving ticks against the
///   test author's intent, and downstream load-balance decisions
///   read a count that no longer matches reality.
/// * **`struct cfs_rq.h_nr_runnable` / `h_nr_queued` /
///   `h_nr_idle`** (`kernel/sched/sched.h` `struct cfs_rq`) —
///   hierarchical CFS task counts maintained by
///   `account_entity_enqueue` / `dequeue` with cascade up the task
///   group tree. Raw write skips parent-cfs_rq propagation and
///   breaks group scheduling accounting.
/// * **`struct rt_rq.rt_nr_running`** (`kernel/sched/sched.h`
///   `struct rt_rq`) — RT class runqueue task count; updated by
///   `inc_rt_tasks` / `dec_rt_tasks` which also maintain the
///   per-rt_rq `overloaded` bit and the `highest_prio.curr/next`
///   priority-pushable tracking.
/// * **`struct dl_rq.dl_nr_running` / `running_bw` / `this_bw`**
///   (`kernel/sched/sched.h` `struct dl_rq`) — DEADLINE class
///   counters and bandwidth tracking; `add_running_bw` /
///   `sub_running_bw` (in `kernel/sched/deadline.c`) implement the
///   admission-control accounting that SUGOV's `cpu_bw_dl()`
///   consumes for frequency selection. A raw write to any of
///   these breaks admission control + DVFS.
///
/// **PELT (Per-Entity Load Tracking) averages.** These are
/// exponential moving averages whose internal `_sum` accumulators
/// are advanced against `cfs_rq_clock_pelt(cfs_rq)` (see
/// `kernel/sched/fair.c update_load_avg`, which calls into
/// `kernel/sched/pelt.c __update_load_avg_se` /
/// `__update_load_avg_cfs_rq`). Writing only the visible
/// `_avg` value desynchronises it from the `_sum` it was
/// computed from; the next `update_load_avg` decays both and
/// corrupts the next several passes.
///
/// * **`struct sched_avg`** fields on `task_struct.se.avg` and
///   `cfs_rq.avg`: `load_avg`, `runnable_avg`, `util_avg`,
///   `util_est`, plus `load_sum` / `runnable_sum` / `util_sum`
///   / `last_update_time` / `period_contrib` (see
///   `include/linux/sched.h struct sched_avg`).
/// * **`cfs_rq.removed.{load_avg,util_avg,runnable_avg}`** —
///   pending-decay buffer for departing entities; flushed at the
///   next `update_load_avg`.
/// * **`rq.cpu_capacity`** — set by `update_cpu_capacity`
///   (`kernel/sched/fair.c`, called from the load-balance path
///   `update_group_capacity`) from per-CPU RT capacity scaling;
///   initialized at boot in `kernel/sched/core.c sched_init`.
///   Raw writes are overwritten on the next load-balance tick
///   that triggers a capacity recomputation.
///
/// **Cgroup / task-group accounting.** Updating the task-group
/// hierarchy bypasses the cascade that the kernel performs over
/// every group entity.
///
/// * **`task_group.shares`** — cgroup CPU shares, normally set
///   via `sched_group_set_shares` (`kernel/sched/fair.c`) which
///   cascades into `update_load_set` + walks every task in the
///   group. Raw write skips the cascade and produces
///   inconsistent per-entity load weights.
/// * **`task_group.cfs_bandwidth.{quota, period, runtime}`** —
///   CFS bandwidth control. `tg_set_cfs_bandwidth`
///   (`kernel/sched/core.c`) is the cgroup-fs writer; the
///   per-cfs_rq runtime distribution is performed by
///   `__refill_cfs_bandwidth_runtime` (`kernel/sched/fair.c`)
///   gated by the `cfs_bandwidth_used()` static-key
///   (`kernel/sched/fair.c`) registered via
///   `start_cfs_bandwidth` (`kernel/sched/fair.c`). Raw writes
///   skip all of those.
///
/// **The right shape for influencing these fields is to drive the
/// kernel into the desired state through real activity** —
/// [`Op::Spawn`] with [`SpawnPlacement::RunnerCgroup`] (inherits the
/// spawner's cgroup, typically cgid=1 inside guest VMs) or
/// [`Op::Spawn`] with [`SpawnPlacement::Cgroup`] (runs inside a
/// named cgroup) of a synthetic
/// [`WorkloadConfig`](crate::workload::WorkloadConfig)
/// for fake-load, real preemption pressure for sched_avg.
///
/// ## Fields that ARE safe to write raw (with caveats)
///
/// * **`jiffies_64`** (`include/linux/jiffies.h`) — the global
///   timekeeping tick counter. Safe to advance FORWARD only;
///   backward jumps trigger soft-lockup watchdog warnings and
///   can stall `time_after_eq` waiters whose expiry now appears
///   to be in the past in a way the timer wheel cannot
///   reconcile.
/// * **Per-CPU `rq.clock`** (`struct rq.clock`,
///   `kernel/sched/sched.h`) — the scheduler's per-CPU
///   wall-time clock. Not generically safe: `update_rq_clock`
///   (`kernel/sched/core.c`) overwrites it at every
///   scheduling tick + every enqueue/dequeue from
///   `sched_clock_cpu(cpu)`, so a raw write lasts at most until
///   the next tick (~1 ms with `HZ=1000`). The
///   `rq_clock_skip_update()` helper sets `RQCF_REQ_SKIP` in
///   `rq->clock_update_flags`, which suppresses one
///   `update_rq_clock` call, but its semantics are tightly
///   coupled to the RQCF_ACT_SKIP / RQCF_REQ_SKIP state
///   machine in `__schedule` — a self-contained "freeze
///   rq.clock at value X across step Y" pattern is the
///   framework's responsibility (planned), not a one-shot
///   raw-write primitive. Bumping `rq.clock_task` directly
///   is also NOT safe — that field is computed by
///   `update_rq_clock_task` from `rq->clock` minus IRQ and
///   steal-time deltas (`prev_irq_time` and
///   `prev_steal_time_rq`) and a raw write desynchronises it
///   from the inputs.
/// * **Per-CPU `rq.scx.clock`** (sched_ext per-CPU clock) — safe
///   ONLY when paired with setting `SCX_RQ_CLK_VALID` in
///   `rq.scx.flags`. The flag gates `scx_bpf_now()` reads;
///   writing the clock without the flag leaves `scx_bpf_now()`
///   returning stale data, and clearing the flag without
///   resetting the clock makes downstream BPF readers fall
///   back to the host TSC unexpectedly. Atomic bit-set without
///   read-back is provided by [`KernelValue::OrU32`] — the RMW
///   variant whose width matches `struct scx_rq.flags` (`u32`
///   at `kernel/sched/sched.h:803`). Note there is no
///   `OrU64` sibling: a 64-bit RMW at this field address would
///   corrupt the adjacent `u32 nr_immed` field at
///   `kernel/sched/sched.h:804`. Width is the variant tag, so
///   wrong-width writes are a compile-time error rather than a
///   silent field-overflow bug at runtime. Pair
///   `OrU32(SCX_RQ_CLK_VALID)` with the prior `U64(clock_val)`
///   write in a single `Op::WriteKernelCold` batch so both land
///   under one freeze rendezvous and the kernel's documented
///   write-clock-BEFORE-OR-flag ordering (per
///   `kernel/sched/sched.h:1848-1854` `scx_rq_clock_update`)
///   holds.
/// * **`scx-ktstr` private bss / per-CPU scratch** — the
///   fixture scheduler exposes a dedicated write surface for
///   test use; raw writes there don't propagate into core
///   sched code by construction.
///
/// # `#[non_exhaustive]`
///
/// `KernelTarget` is `#[non_exhaustive]` — see
/// [`crate::non_exhaustive`] for the cross-crate pattern-match rule.
/// Prefer the per-variant constructors ([`Self::symbol`],
/// [`Self::direct`], [`Self::kva`], [`Self::per_cpu_field`],
/// [`Self::task_field`]) over naming variant literals.
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum KernelTarget {
    /// Kernel text/data/bss symbol. The host resolves
    /// `name → KVA → PA` via the runtime kernel image base + KASLR
    /// `phys_base`, exactly as
    /// `crate::monitor::guest::GuestKernel::write_symbol_u64`
    /// already does for the existing write-symbol helper.
    Symbol(Cow<'static, str>),
    /// Direct-mapped kernel virtual address — translated via
    /// `kva - PAGE_OFFSET`. Use this when the caller has already
    /// resolved a SLAB / per-CPU / physmem KVA and just wants the
    /// host to write at that address.
    Direct(u64),
    /// Vmalloc'd / vmap'd kernel virtual address — translated via
    /// page-table walk through the guest's `CR3`. Use this for BPF
    /// maps, vmalloc'd memory, and any other address that does NOT
    /// live in the direct map.
    Kva(u64),
    /// Per-CPU field of a kernel struct, resolved at op dispatch
    /// time. The variant carries the symbolic intent only (`symbol`,
    /// `field`, `cpu`); the dispatcher looks up `symbol` in the
    /// vmlinux symbol table, adds `__per_cpu_offset[cpu]`, and adds
    /// the BTF-resolved byte offset of `field` within `symbol`'s
    /// struct type to yield the per-CPU field's runtime KVA.
    ///
    /// `symbol` must be in the v1 supported set: `runqueues` →
    /// `struct rq`, `kernel_cpustat` → `struct kernel_cpustat`,
    /// `kstat` → `struct kernel_stat`, `tick_cpu_sched` →
    /// `struct tick_sched`. Unknown symbols fail with a typed error
    /// (the wire variant doesn't carry struct type, so the
    /// dispatcher maps via a hardcoded table — extend it AND
    /// `KernelSymbols::from_elf` to add). KASLR-on round-trip
    /// coverage is an outstanding follow-up; ktstr defaults to
    /// `nokaslr` so the kaslr_offset slide is 0 on the standard
    /// test path.
    ///
    /// Lazy resolution keeps the construction surface pure-data
    /// (the test author needs no `GuestKernel`/BTF/symbol-table
    /// handle to construct the variant); resolution failures
    /// surface as op-execution errors at the same layer as
    /// missing-symbol failures in other snapshot ops.
    PerCpuField {
        /// Kernel symbol naming the per-CPU template
        /// (e.g. `"runqueues"`).
        symbol: Cow<'static, str>,
        /// Field name within the symbol's struct
        /// (e.g. `"clock"` for `struct rq.clock`).
        field: Cow<'static, str>,
        /// CPU index whose per-CPU instance to address.
        cpu: u32,
    },
    /// Per-task field of `struct task_struct` — SCX-managed tasks
    /// only (the dispatcher's L6 sched_class gate rejects non-SCX
    /// tasks). Resolved at dispatch by walking `init_task.tasks`
    /// plus each leader's `signal->thread_head` to locate the task
    /// with matching `pid` AND matching `expected_start_time_ns`
    /// (anti-PID-reuse identity), then adding the BTF-resolved
    /// nested-path byte offset of `field` within `task_struct`.
    /// See `crate::vmm::wire::KernelOpTarget::TaskField` for the
    /// 7-layer validation chain the dispatcher applies.
    ///
    /// `expected_start_time_ns` is `task->start_time` captured at
    /// WorkSpec spawn time. Get it via
    /// [`crate::workload::WorkloadHandle::worker_pids`] for
    /// the PID list, then read `/proc/<pid>/stat` field 22 +
    /// convert from clock ticks to ns via
    /// `* 1_000_000_000 / sysconf(_SC_CLK_TCK)`.
    TaskField {
        /// Guest-side `pid_t` of the target task. Both leaders and
        /// non-leader threads are addressable.
        pid: u32,
        /// `task->start_time` (ns) recorded at spawn time. The
        /// dispatcher's L2 check rejects writes when the observed
        /// `task->start_time` differs (PID-reuse identity guard).
        expected_start_time_ns: u64,
        /// Dot-separated nested-member path within `task_struct`.
        /// SCX-only fields recommended (e.g. `"scx.dsq_vtime"`,
        /// `"start_boottime"`). `"se.vruntime"` writes are
        /// silently discarded by EEVDF's `place_entity` on enqueue
        /// (`kernel/sched/fair.c:5381-5514` since 6.6) AND rejected
        /// by the SCX-only class gate; do not use.
        field: Cow<'static, str>,
    },
}

impl KernelTarget {
    /// Kernel text/data/bss symbol target. Resolves at op-dispatch
    /// time via the runtime kernel image base + KASLR `phys_base`.
    ///
    /// Accepts any string-like `name` (`&'static str`, `String`,
    /// `Cow<'static, str>`) and returns [`KernelTarget::Symbol`].
    ///
    /// **Heads up.** See the `# Semantic risk` section on the
    /// enclosing [`KernelTarget`] type doc before pointing this
    /// at a scheduler-bookkeeping symbol.
    pub fn symbol(name: impl Into<Cow<'static, str>>) -> Self {
        Self::Symbol(name.into())
    }

    /// Direct-mapped KVA target. Translates via `kva - PAGE_OFFSET`.
    /// For per-CPU bases the caller must add
    /// `__per_cpu_offset[cpu]` to the base symbol KVA before
    /// constructing the variant; use [`Self::per_cpu_field`]
    /// instead for the framework-resolved per-CPU shape.
    ///
    /// **Heads up.** See the `# Semantic risk` section on the
    /// enclosing [`KernelTarget`] type doc before pointing this
    /// at a scheduler-bookkeeping address.
    pub const fn direct(kva: u64) -> Self {
        Self::Direct(kva)
    }

    /// Vmalloc'd / vmap'd KVA target. Translates via page-table
    /// walk through the guest's `CR3`.
    ///
    /// **Heads up.** See the `# Semantic risk` section on the
    /// enclosing [`KernelTarget`] type doc before pointing this
    /// at a scheduler-bookkeeping address.
    pub const fn kva(kva: u64) -> Self {
        Self::Kva(kva)
    }

    /// Per-CPU field of a kernel struct. Resolves at op-dispatch
    /// time via `symbol_kva + __per_cpu_offset[cpu] + BTF byte
    /// offset of field`.
    ///
    /// `symbol` and `field` accept any string-like input; nothing
    /// is resolved or validated at construction time.
    ///
    /// **Heads up.** See the `# Semantic risk` section on the
    /// enclosing [`KernelTarget`] type doc before pointing this
    /// at a per-CPU scheduler-bookkeeping field.
    pub fn per_cpu_field(
        symbol: impl Into<Cow<'static, str>>,
        field: impl Into<Cow<'static, str>>,
        cpu: u32,
    ) -> Self {
        let symbol = symbol.into();
        let field = field.into();
        Self::PerCpuField { symbol, field, cpu }
    }

    /// Per-task `struct task_struct` field target — SCX-managed
    /// tasks only. Resolves at dispatch via `init_task.tasks` +
    /// per-leader `signal->thread_head` walks to find the task
    /// with matching `pid` AND matching `expected_start_time_ns`
    /// (anti-PID-reuse), then BTF nested-path offset of `field`.
    ///
    /// `expected_start_time_ns` is `task->start_time` (set once by
    /// `kernel/fork.c::copy_process` via `ktime_get_ns()`).
    /// Get worker PIDs via
    /// [`crate::workload::WorkloadHandle::worker_pids`] then
    /// read `/proc/<pid>/stat` field 22 at spawn time and convert
    /// to ns: `field_22_ticks * 1_000_000_000 /
    /// sysconf(_SC_CLK_TCK)`.
    ///
    /// NOTE(review): proc(5) documents field 22 (`starttime`) in
    /// clock ticks measured from boot, so the converted value only
    /// has tick granularity — confirm it reproduces the value the
    /// dispatcher compares against `task->start_time`.
    ///
    /// `field` is dot-separated nested-member path. The dispatcher
    /// applies a 7-layer validation chain (pid match, start_time
    /// identity, lifetime, on_rq=0, scx queued-empty, ext
    /// sched_class, start_boottime != 0) before
    /// the write/read lands — see
    /// `crate::vmm::wire::KernelOpTarget::TaskField` for the full
    /// contract.
    ///
    /// **SCX-only.** The dispatcher rejects non-SCX tasks via the
    /// class+policy gates. Recommended fields: `"scx.dsq_vtime"`
    /// (DSQ priority key, preserved across dequeue/enqueue),
    /// `"start_boottime"` (task fork timestamp).
    ///
    /// **Do NOT write `"se.vruntime"`.** EEVDF's `place_entity`
    /// (`kernel/sched/fair.c:5381-5514`, since 6.6) overwrites
    /// `se->vruntime` on every enqueue; direct vruntime writes are
    /// silently discarded for sleeping tasks (our validation gate).
    /// CFS-class tasks are rejected before reaching the write
    /// regardless, but the field-level warning is the actionable
    /// guidance for "why won't my vruntime write stick" debugging.
    ///
    /// **Heads up.** The dispatcher's L4 (`on_rq == 0`) + L5
    /// (`scx.dsq == NULL` AND `scx.runnable_node` empty) gates
    /// reject writes on queued/running tasks per CFS rb-tree + SCX
    /// DSQ ordering safety. Test authors must use blocking workload
    /// patterns (e.g. [`crate::workload::WorkType::FutexPingPong`],
    /// `WorkType::WaitOnFutex`, `WorkType::Sleep`) so workers are
    /// sleeping when the cold-path Op fires.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// // Escape-hatch primitive: seed a specific worker's
    /// // scx.dsq_vtime to ~30 days. WorkSpec.uptime (separate API)
    /// // wraps this; use the escape hatch when the scenario knows
    /// // the exact PID + start_time tuple.
    /// use ktstr::prelude::*;
    /// use std::time::Duration;
    ///
    /// let workers = handle.worker_pids();         // Vec<libc::pid_t>
    /// let worker_pid = workers[0] as u32;
    /// // Read `/proc/<pid>/stat` field 22, convert from clock ticks
    /// // to nanoseconds via `* 1_000_000_000 / sysconf(_SC_CLK_TCK)`.
    /// // (Helper expected to land alongside WorkSpec.uptime.)
    /// let start_time_ns: u64 = read_start_time_ns(worker_pid)?;
    ///
    /// let seed_vtime_ns = (30 * 86_400_u64) * 1_000_000_000; // 30 days
    /// let writes = vec![(
    ///     KernelTarget::task_field(worker_pid, start_time_ns, "scx.dsq_vtime"),
    ///     KernelValue::u64(seed_vtime_ns),
    /// )];
    /// // Worker MUST be in a blocking pattern (FutexPingPong, etc.)
    /// // at op-fire time; the dispatcher's 7-layer validation
    /// // rejects writes against runnable/queued tasks.
    /// ```
    pub fn task_field(
        pid: u32,
        expected_start_time_ns: u64,
        field: impl Into<Cow<'static, str>>,
    ) -> Self {
        let field = field.into();
        Self::TaskField {
            pid,
            expected_start_time_ns,
            field,
        }
    }
}
1670
/// Value payload for the kernel-memory write ops, and the result
/// shape for the read ops.
///
/// The variant tag picks both the width (`u32` vs `u64` vs a byte
/// slice) and the underlying `crate::monitor::guest::GuestKernel`
/// write helper the host coordinator will invoke (`write_*_u32`,
/// `write_*_u64`, `write_*_bytes` per the [`KernelTarget`] class).
///
/// # `#[non_exhaustive]`
///
/// `KernelValue` is `#[non_exhaustive]` so new value widths can be
/// added without breaking external pattern-matchers. Prefer the
/// per-variant constructors ([`Self::u32`], [`Self::u64`],
/// [`Self::bytes`], [`Self::or_u32`]) over naming variant literals.
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum KernelValue {
    /// 32-bit unsigned little-endian write. Atomic when the
    /// resolved host PA is 4-byte aligned. Misaligned PAs fall
    /// through to a per-byte volatile loop in
    /// `crate::monitor::reader::GuestMem`
    /// `write_volatile_bytes` (the 4-byte fast path branches on
    /// `ptr.align_offset(align_of::<u32>()) == 0` and only emits
    /// a single `write_volatile` when alignment holds); torn
    /// intermediate state is observable to concurrent guest readers
    /// in the fallback case.
    ///
    /// **For setting individual bits without disturbing the
    /// surrounding value**, use [`Self::OrU32`] instead — that
    /// variant performs read-modify-write OR semantics under the
    /// freeze rendezvous (e.g. setting `SCX_RQ_CLK_VALID` in
    /// `rq.scx.flags` without clobbering the other 31 flag bits).
    /// A plain `U32(value)` write replaces every bit; OrU32 sets
    /// only the bits in the mask.
    U32(u32),
    /// 64-bit unsigned little-endian write. Atomic when the
    /// resolved host PA is 8-byte aligned. See the alignment note
    /// on [`Self::U32`] for the misaligned fall-through behaviour.
    ///
    /// **No `OrU64` sibling exists by design.** The canonical
    /// scheduler-flags use case ([`KernelValue::OrU32`] →
    /// `struct scx_rq.flags`) is on a `u32` field per
    /// `kernel/sched/sched.h:803`; a 64-bit RMW at that address
    /// would corrupt the adjacent `u32 nr_immed` field at
    /// `kernel/sched/sched.h:804`. If a future u64 RMW use case
    /// emerges with a verified width, add the variant then.
    U64(u64),
    /// Variable-length byte payload. Written non-atomically; the
    /// `GuestKernel::write_*_bytes` helpers emit a Release fence
    /// after the copy so a weakly-ordered guest's
    /// `smp_load_acquire` observes the bytes in write order — the
    /// fence orders the stores but does NOT atomicize the
    /// multi-byte write versus a concurrent guest reader.
    Bytes(Vec<u8>),
    /// 32-bit unsigned read-modify-write OR. The dispatcher reads
    /// the live u32 at the resolved host PA, ORs the carried mask
    /// into it, and writes the new value back. Width is u32 — the
    /// canonical use case is OR-ing a single-bit kernel flag (e.g.
    /// `SCX_RQ_CLK_VALID = 1 << 5`) into `struct scx_rq.flags`,
    /// declared `u32` at `kernel/sched/sched.h:803` inside the
    /// struct opened at L793. A 64-bit RMW at a u32 field address
    /// would either silently truncate the upper 32 bits or
    /// corrupt the adjacent `u32 nr_immed` field at
    /// `kernel/sched/sched.h:804`, so the variant tag itself
    /// picks the width and rules out width mismatch at the call
    /// site.
    ///
    /// **Atomicity** (cold-path dispatcher): the host coordinator
    /// holds the freeze rendezvous for the duration of the RMW —
    /// every guest vCPU is parked on a futex inside `handle_freeze`
    /// (no kernel-side writer is scheduled), and the host
    /// coordinator is the only writer of guest memory in scope.
    /// `read_u32 → OR mask → write_u32` therefore runs atomic
    /// **by quiesce**: no concurrent kernel writer can interleave
    /// between the load and the store. No `compare_exchange` loop
    /// is required for cold-path dispatch.
    ///
    /// At the host CPU level the read and write are separate
    /// (non-instruction-atomic) operations: a hypothetical
    /// concurrent host writer of guest memory would be a race.
    /// The freeze coordinator is the sole such writer by design
    /// (per the cold-path threat model documented at
    /// [`Op::WriteKernelCold`]), so the parked-vCPU
    /// contract is sufficient.
    ///
    /// **Alignment**: the dispatcher delegates u32 reads/writes
    /// to `crate::monitor::guest::GuestKernel`'s
    /// `read_*_u32` / `write_*_u32` helpers, which use a
    /// single-instruction `write_volatile` at 4-byte-aligned host
    /// PAs and fall through to a per-byte volatile loop on
    /// misalignment. Under the freeze rendezvous the per-byte
    /// fallback is safe (no concurrent kernel writer), so
    /// misaligned PAs do not produce a torn-RMW race —
    /// but kernel ABI alignment for `u32` fields is enforced by
    /// the compiler at the kernel side regardless, so misaligned
    /// PAs for legitimate symbol/field writes do not arise in
    /// practice.
    ///
    /// **Hot-path future** (when [`Op::WriteKernelHot`]
    /// gains `OrU32` support — currently rejected per the
    /// [`Op::WriteKernelHot`] doc): the live-guest race
    /// model requires a `compare_exchange` loop over
    /// `core::sync::atomic::AtomicU32::from_ptr` (Rust 1.75+) at
    /// 4-byte alignment, with explicit rejection of misaligned
    /// PAs (per-byte fallback cannot be made atomic vs. a live
    /// kernel writer).
    ///
    /// **Ordering**: cold-path dispatch happens while every vCPU
    /// is parked at the freeze rendezvous, so no concurrent
    /// guest write races our RMW for single-op use cases. The
    /// `SCX_RQ_CLK_VALID` case specifically requires
    /// **write-clock-BEFORE-OR-flag** ordering per the kernel's
    /// own `scx_rq_clock_update` at `kernel/sched/sched.h:1848-1854`
    /// (which does `WRITE_ONCE(rq->scx.clock, val)` then
    /// `smp_store_release(&rq->scx.flags, flags |
    /// SCX_RQ_CLK_VALID)`); a host-side caller that wants the
    /// same observable invariant must batch the clock write +
    /// the OR-flag in the same `Op::WriteKernelCold` batch and
    /// rely on the freeze rendezvous's vCPU-pause to serialise
    /// against guest readers.
    OrU32(u32),
}

impl KernelValue {
    /// 32-bit unsigned value. Returns [`KernelValue::U32`]; see
    /// that variant for the alignment / atomicity notes.
    pub const fn u32(val: u32) -> Self {
        Self::U32(val)
    }

    /// 64-bit unsigned value. Returns [`KernelValue::U64`]; see
    /// that variant for the alignment / atomicity notes.
    pub const fn u64(val: u64) -> Self {
        Self::U64(val)
    }

    /// Variable-length byte payload. Accepts anything convertible
    /// into a `Vec<u8>` and returns [`KernelValue::Bytes`] holding
    /// the converted bytes.
    pub fn bytes(data: impl Into<Vec<u8>>) -> Self {
        Self::Bytes(data.into())
    }

    /// 32-bit unsigned read-modify-write OR mask. See
    /// [`Self::OrU32`] for the width-, atomicity-, and ordering-
    /// contract. The canonical use case is OR-ing a single-bit
    /// kernel flag like `SCX_RQ_CLK_VALID` into `struct scx_rq.flags`.
    pub const fn or_u32(mask: u32) -> Self {
        Self::OrU32(mask)
    }
}
1817
1818impl From<&KernelTarget> for crate::vmm::wire::KernelOpTarget {
1819    /// 1:1 mapping of every Op-side [`KernelTarget`] variant to its
1820    /// wire-side peer. `Cow → String` coercion for the symbolic
1821    /// forms; copy for the integer/`u32` forms. Used by the
1822    /// executor's `Op::WriteKernel*` / `Op::ReadKernel*` dispatch
1823    /// arms when building [`crate::vmm::wire::KernelOpRequestPayload`].
1824    fn from(target: &KernelTarget) -> Self {
1825        match target {
1826            KernelTarget::Symbol(name) => Self::Symbol(name.to_string()),
1827            KernelTarget::Direct(kva) => Self::Direct(*kva),
1828            KernelTarget::Kva(kva) => Self::Kva(*kva),
1829            KernelTarget::PerCpuField { symbol, field, cpu } => Self::PerCpuField {
1830                symbol: symbol.to_string(),
1831                field: field.to_string(),
1832                cpu: *cpu,
1833            },
1834            KernelTarget::TaskField {
1835                pid,
1836                expected_start_time_ns,
1837                field,
1838            } => Self::TaskField {
1839                pid: *pid,
1840                expected_start_time_ns: *expected_start_time_ns,
1841                field: field.to_string(),
1842            },
1843        }
1844    }
1845}
1846
1847impl From<&KernelValue> for crate::vmm::wire::KernelOpValue {
1848    /// 1:1 mapping of every Op-side [`KernelValue`] variant to its
1849    /// wire-side peer. The `Bytes` arm clones the inner `Vec<u8>`
1850    /// so the source variant remains usable after dispatch (large
1851    /// payloads pay the clone cost — see
1852    /// [`crate::vmm::wire::KernelOpValue::Bytes`] for the wire
1853    /// representation).
1854    fn from(value: &KernelValue) -> Self {
1855        match value {
1856            KernelValue::U32(v) => Self::U32(*v),
1857            KernelValue::U64(v) => Self::U64(*v),
1858            KernelValue::Bytes(b) => Self::Bytes(b.clone()),
1859            KernelValue::OrU32(mask) => Self::OrU32(*mask),
1860        }
1861    }
1862}
1863
/// Read-width selector for the [`Op::ReadKernelHot`] and
/// [`Op::ReadKernelCold`] ops. It chooses which member of the
/// `crate::monitor::guest::GuestKernel` `read_*_u32` /
/// `read_*_u64` / `read_*_bytes` family the host dispatcher calls.
///
/// The variant tags parallel [`KernelValue`]'s, minus the payload:
/// a read has no outgoing value, only a width hint the dispatcher
/// uses to size the [`crate::vmm::wire::KernelOpValue`] it sends
/// back in the reply.
///
/// # `#[non_exhaustive]`
///
/// The enum is `#[non_exhaustive]` so further widths can be added
/// without breaking external pattern-matchers. Build values with
/// the per-variant constructors ([`Self::u32`], [`Self::u64`],
/// [`Self::bytes`]) rather than spelling out variant literals.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum KernelValueWidth {
    /// Little-endian `u32` read. The read is atomic when the
    /// resolved host PA is 4-byte aligned; the alignment note on
    /// [`KernelValue::U32`] covers the misaligned fall-through.
    U32,
    /// Little-endian `u64` read. The read is atomic at 8-byte
    /// alignment and uses a per-byte loop otherwise, the same
    /// fall-through as [`KernelValue::U64`].
    U64,
    /// Raw read of exactly `len` bytes. Not atomic; serviced by
    /// the chunked-page primitive behind the
    /// `crate::monitor::guest::GuestKernel` `read_*_bytes`
    /// helpers.
    Bytes(usize),
}
1896
1897impl KernelValueWidth {
1898    /// `u32` read width.
1899    pub const fn u32() -> Self {
1900        KernelValueWidth::U32
1901    }
1902
1903    /// `u64` read width.
1904    pub const fn u64() -> Self {
1905        KernelValueWidth::U64
1906    }
1907
1908    /// `len`-byte read width. Produces a
1909    /// [`crate::vmm::wire::KernelOpValue::Bytes`] of exactly `len`
1910    /// bytes in the reply.
1911    pub const fn bytes(len: usize) -> Self {
1912        KernelValueWidth::Bytes(len)
1913    }
1914}
1915
1916impl From<&KernelValueWidth> for crate::vmm::wire::KernelOpValue {
1917    /// Map a [`KernelValueWidth`] to a zero-filled
1918    /// [`crate::vmm::wire::KernelOpValue`] of the requested width
1919    /// for the read-entry's value-hint slot. The wire payload's
1920    /// `value` discriminant tells the host dispatcher which read
1921    /// family to invoke; the byte contents are written by the
1922    /// host before replying.
1923    fn from(width: &KernelValueWidth) -> Self {
1924        match width {
1925            KernelValueWidth::U32 => Self::U32(0),
1926            KernelValueWidth::U64 => Self::U64(0),
1927            KernelValueWidth::Bytes(len) => Self::Bytes(vec![0u8; *len]),
1928        }
1929    }
1930}