ktstr/scenario/ops/types/op.rs
1//! `Op` operation taxonomy + `CpusetSpec` enum with its constructor
2//! impl block. Owns the variant-set (and `OpKind` discriminator
3//! enum) plus the cpuset-spec construction surface. Resolution-time
4//! `CpusetSpec` logic lives in a sibling impl block in
5//! [`super::resolve`] — Rust permits multiple impl blocks across
6//! files in the same crate; the split tracks the construction /
7//! resolution responsibility boundary.
8//!
9//! See the parent module ([`super`]) for the file-layout overview
10//! and the cross-impl-block convention.
11
12use std::borrow::Cow;
13use std::collections::BTreeSet;
14
15use crate::workload::{AffinityIntent, WorkSpec};
16
17use super::CgroupDef;
18
19/// Atomic operation on the cgroup topology.
20///
21/// Names use `Cow<'static, str>` so ops can reference compile-time
22/// literals (zero-cost) or runtime-generated strings (owned).
23///
24/// # `#[non_exhaustive]`
25///
26/// `Op` is `#[non_exhaustive]` — see [`crate::non_exhaustive`] for
27/// the cross-crate pattern-match rule. `Op`-specific construction
28/// convention: prefer the per-op constructors (e.g. `Op::add_cgroup`,
29/// `Op::run_payload`) over naming variants directly; new
30/// constructors are added alongside new variants and are the stable
31/// surface.
32#[derive(Clone, Debug, strum::EnumDiscriminants)]
33#[strum_discriminants(name(OpKind))]
34#[strum_discriminants(derive(strum::EnumIter))]
35#[strum_discriminants(vis(pub))]
36#[non_exhaustive]
37pub enum Op {
38 /// Create a new cgroup under the managed cgroup parent, with no
39 /// cpuset, no controller knobs, and no workers — the
40 /// operator-friendly way to declare an empty move-target cgroup
41 /// that later receives tasks via [`Op::MoveAllTasks`] or
42 /// similar. For mid-step cgroups that need cpuset / cpu /
43 /// memory / io / pids / workers, use [`Op::add_cgroup_def`]
44 /// instead; for setup-time cgroups with the same knobs, declare
45 /// via [`super::super::Step::with_defs`].
46 AddCgroup { name: Cow<'static, str> },
47 /// Create a cgroup mid-step from a full [`CgroupDef`] — cpuset,
48 /// cpu/memory/io/pids knobs, and worker spawns all apply in one
49 /// op, mirroring the way `Step::with_defs` materializes a
50 /// step-local CgroupDef at setup time. Use this when the
51 /// add-cgroup-with-cpuset-and-workers sequence needs to happen
52 /// after the step's setup pass (e.g. driven by an earlier op's
53 /// observed state) instead of as part of the step's setup. The
54 /// embedded `def` is dedup-checked the same way `apply_setup`
55 /// rejects collisions with prior Backdrop or step-local
56 /// CgroupDef declarations.
57 AddCgroupDef { def: CgroupDef },
58 /// Remove a cgroup (stops its workers first). Permitted against
59 /// both step-local and Backdrop-owned cgroups; removing a
60 /// Backdrop cgroup mid-scenario drops it from the Backdrop
61 /// tracking list so a later `Op::AddCgroup` with the same name
62 /// can re-create the cgroup. A typo'd cgroup name surfaces
63 /// later as a kernel-layer "cgroup missing" error on the next
64 /// op that references the name, not at the RemoveCgroup site.
65 RemoveCgroup { cgroup: Cow<'static, str> },
66 /// Set a cgroup's cpuset to the resolved CPU set.
67 SetCpuset {
68 cgroup: Cow<'static, str>,
69 cpus: CpusetSpec,
70 },
71 /// Clear a cgroup's cpuset (allow all CPUs).
72 ClearCpuset { cgroup: Cow<'static, str> },
73 /// Read both cgroups' cpusets and swap them.
74 SwapCpusets {
75 a: Cow<'static, str>,
76 b: Cow<'static, str>,
77 },
78 /// Spawn workers and place them according to `placement`.
79 ///
80 /// The work type is used as-is; gauntlet `work_type_override` does
81 /// not apply. Use [`CgroupDef`] with `swappable(true)` when the
82 /// work type should be overridable.
83 ///
84 /// Placement contract (bullets follow [`SpawnPlacement`] variant
85 /// declaration order):
86 /// * [`SpawnPlacement::RunnerCgroup`] — spawn workers in the
87 /// spawner's own cgroup; the handler issues ZERO cgroup ops
88 /// and the workers inherit whatever cgroup the test runner
89 /// sits in. `WorkSpec::workers_pct` is rejected for this
90 /// placement because there's no managed cgroup whose cpuset
91 /// would supply the percentage denominator.
92 /// * [`SpawnPlacement::Cgroup`] — spawn workers and move them
93 /// into the named cgroup; the cgroup must already exist
94 /// (declared via [`CgroupDef`] in `Step.setup`, via
95 /// [`Op::AddCgroup`] / [`Op::AddCgroupDef`] earlier in the
96 /// same step, or on the persistent
97 /// [`Backdrop`](crate::scenario::Backdrop)).
98 Spawn {
99 placement: SpawnPlacement,
100 work: WorkSpec,
101 },
102 /// Stop all workers in a cgroup (does not remove the cgroup).
103 /// Permitted against both step-local and Backdrop-owned cgroups;
104 /// stopping a Backdrop cgroup's workers mid-scenario leaves the
105 /// cgroup hierarchy intact but makes subsequent ops that expect
106 /// those workers (e.g. wait/kill payload) fail to find them.
107 StopCgroup { cgroup: Cow<'static, str> },
108 /// Set worker affinity in a cgroup. Resolved at apply time via
109 /// `resolve_affinity_for_cgroup()`.
110 SetAffinity {
111 cgroup: Cow<'static, str>,
112 affinity: AffinityIntent,
113 },
114 /// Move all tasks from one cgroup to another.
115 ///
116 /// Each task is moved via `cgroup.procs`. If any move fails, the
117 /// error propagates and handle name keys are left unchanged (workers
118 /// remain addressed under `from`). On success, handle name keys are
119 /// updated to `to` so subsequent ops address the moved workers.
120 ///
121 /// # Self-move rejection
122 ///
123 /// A self-move (`from == to`) is rejected at handler entry — the
124 /// kernel cgroup.procs write is idempotent on same-cgroup targets
125 /// so the op would silently no-op, masking either a stale op the
126 /// test author forgot to remove or a typo. The bail names both
127 /// sides so the operator can pick the right fix. The check also
128 /// catches the symmetric empty-string pair (`("", "")`), which
129 /// would otherwise no-op a RunnerCgroup-to-RunnerCgroup transfer.
130 ///
131 /// # Empty-string source
132 ///
133 /// Passing `from = ""` matches workers spawned by
134 /// [`Op::Spawn`] with [`SpawnPlacement::RunnerCgroup`] —
135 /// RunnerCgroup-placement handles are tracked under the
136 /// empty-string key (workers stay in the spawner's own cgroup,
137 /// outside any managed hierarchy). `Op::move_all_tasks("",
138 /// "named")` is the canonical way to materialize
139 /// RunnerCgroup-placement workers into a managed cgroup
140 /// mid-scenario; after the move the captured handles re-key
141 /// to `"named"` and lose their empty-string identity,
142 /// behaving like any other managed worker (lifetime tied to
143 /// `"named"`'s ownership slot per the table below).
144 ///
145 /// # Lifetime / ownership-direction asymmetry
146 ///
147 /// `MoveAllTasks` is asymmetric with respect to cgroup ownership:
148 /// the legality of a move depends on the relative lifetimes of
149 /// the `from` and `to` cgroups, not just on which one is the
150 /// source.
151 ///
152 /// | `from` ownership | `to` ownership | Outcome |
153 /// |-----------------------|-----------------------|---------|
154 /// | step-local | step-local | Allowed; both die at step teardown together. |
155 /// | step-local | Backdrop (persistent) | Allowed; handle ownership transfers from step-local set to Backdrop set so the worker survives step teardown. |
156 /// | Backdrop | Backdrop | Allowed; both persist for the scenario. |
157 /// | Backdrop | step-local | **Rejected at apply time.** A persistent worker would be stranded inside a cgroup that gets `rmdir`'d at step boundary; the kernel migrates the orphaned task to the cgroup root with a frozen-task warning in dmesg. The `bail!` diagnostic names the offending pair and tells the operator to either declare the destination in the Backdrop too, or move the worker back into a Backdrop-owned cgroup. |
158 ///
159 /// The Backdrop→Backdrop and step→step cases are unconditionally
160 /// allowed because both endpoints share a lifetime; the
161 /// step→Backdrop case is allowed because the kernel moves
162 /// reference-count once and the framework's
163 /// `ScenarioState::rename_handles`
164 /// transfers the handle into the persistent slot in the same
165 /// step. The Backdrop→step case is the only one that produces
166 /// a guaranteed orphan, hence the asymmetric reject.
167 ///
168 /// # Backdrop-setup exemption
169 ///
170 /// `MoveAllTasks` ops running INSIDE a Backdrop's `setup_ops`
171 /// pass (`state.target_backdrop=true`) are exempt from the
172 /// Backdrop→step-local check: at that point, "step-local"
173 /// cgroups don't exist yet (the Backdrop is the only cgroup
174 /// scope), and the rule reduces to a pure source-ownership
175 /// check that the apply path handles already.
176 MoveAllTasks {
177 from: Cow<'static, str>,
178 to: Cow<'static, str>,
179 },
180 /// Spawn a userspace [`Payload`](crate::test_support::Payload)
181 /// binary in the background and track its
182 /// [`PayloadHandle`](crate::scenario::payload_run::PayloadHandle)
183 /// under the step's payload-handle set.
184 ///
185 /// Subsequent [`Op::WaitPayload`] / [`Op::KillPayload`] address
186 /// the running child by the composite
187 /// (`Payload::name`, `cgroup`) key — the same payload can run
188 /// concurrently in two different cgroups without a dedup
189 /// collision, but the lookup from the waiting op must match
190 /// the pair the run op recorded. See [`Op::WaitPayload`] /
191 /// [`Op::KillPayload`] for the ambiguity rules when the
192 /// waiting op supplies only the name.
193 ///
194 /// Only [`PayloadKind::Binary`](crate::test_support::PayloadKind::Binary)
195 /// payloads are spawnable; scheduler-kind payloads are rejected at
196 /// apply time with an actionable error.
197 ///
198 /// `args` is appended to `payload.default_args`. `cgroup`, when
199 /// set, places the child in the named cgroup (resolved relative
200 /// to the scenario's parent cgroup) via
201 /// [`PayloadRun::in_cgroup`](crate::scenario::payload_run::PayloadRun::in_cgroup);
202 /// unset inherits the spawning process's cgroup.
203 ///
204 /// Handles not explicitly consumed by `WaitPayload` / `KillPayload`
205 /// are drained at step-teardown by `collect_step` (step-local) or
206 /// at scenario end by `collect_backdrop` (when the handle lives on
207 /// the Backdrop), matching the [`CgroupDef::workload`] semantics.
208 ///
209 /// # Scheduler-kind rejection across surfaces
210 ///
211 /// Three surfaces accept a `&Payload` and each rejects a
212 /// scheduler-kind Payload differently — deliberately, to match
213 /// the lifecycle of the caller:
214 ///
215 /// | Surface | Rejection | When |
216 /// |-------------------------------------------------------------------------------------------|-----------------------|---------------|
217 /// | [`PayloadRun::run`](crate::scenario::payload_run::PayloadRun::run) (`ctx.payload(&X)...`) | `Err(anyhow::Error)` | scenario-time |
218 /// | [`CgroupDef::workload`] | `panic!` | declaration-time |
219 /// | `Op::RunPayload` (this variant) | `Err(anyhow::Error)` | apply-ops-time |
220 ///
221 /// Rationale: `CgroupDef::workload` is a builder invoked during
222 /// test construction (nextest `--list` phase) — a panic there
223 /// surfaces the misuse before any VM boot, with a full
224 /// backtrace pointing at the offending call. `ctx.payload()`
225 /// and `Op::RunPayload` both run inside an executing scenario
226 /// where one bad misuse should not crash the whole test run;
227 /// they `bail!` with an actionable message and let the
228 /// surrounding step-sequence skip to teardown. The three
229 /// paths are symmetric in *what* they reject (scheduler-kind
230 /// Payloads in non-scheduler slots); they differ only in
231 /// *how* the misuse is surfaced, matched to caller context.
232 RunPayload {
233 payload: &'static crate::test_support::Payload,
234 args: Vec<String>,
235 cgroup: Option<Cow<'static, str>>,
236 },
237 /// Block until the payload named `name` exits naturally, then
238 /// evaluate its checks and record metrics to the per-test sidecar.
239 ///
240 /// The target is looked up by composite key (`name`, `cgroup`).
241 /// `cgroup: None` matches the unique live copy (whatever its
242 /// placement); if two or more copies of the same payload are
243 /// live in different cgroups, the lookup bails with an
244 /// "ambiguous — specify cgroup" error so the test doesn't
245 /// silently wait on the wrong one. Use
246 /// [`Op::wait_payload_in_cgroup`] to disambiguate.
247 ///
248 /// A consumed or unknown `(name, cgroup)` pair returns `Err`
249 /// with an actionable message — test authors must not silently
250 /// wait for payloads that were never started or have already
251 /// been consumed by a prior `WaitPayload`/`KillPayload`.
252 ///
253 /// **No timeout.** `WaitPayload` waits indefinitely for the
254 /// child to exit. A binary that never terminates (e.g. a
255 /// benchmark configured without `--runtime=N`, or a stress-ng
256 /// run without `--timeout`) will hang the step until the
257 /// outer test watchdog fires. For time-boxed long-running
258 /// payloads, prefer [`KillPayload`](Self::KillPayload) paired
259 /// with a [`super::super::HoldSpec::fixed`] / [`super::super::HoldSpec::frac`] step
260 /// boundary that guarantees forward progress; the payload's
261 /// own CLI (`--runtime`, `--timeout`) is the reliable way to
262 /// cap a single invocation's runtime.
263 ///
264 /// Check failures from the payload are recorded to the sidecar
265 /// for regression analysis but do NOT fail the step or the test
266 /// in-process. Use
267 /// [`ctx.payload(&X).run()`](crate::scenario::payload_run::PayloadRun::run)
268 /// directly if the test body needs to gate on check results.
269 WaitPayload {
270 name: Cow<'static, str>,
271 cgroup: Option<Cow<'static, str>>,
272 },
273 /// SIGKILL the payload named `name`, reap the child, evaluate
274 /// checks, and record metrics. Mirrors the behavior of
275 /// step-teardown drain for an explicitly-targeted payload.
276 ///
277 /// The target is looked up by composite key (`name`, `cgroup`)
278 /// — see [`Op::WaitPayload`] for the ambiguity rules.
279 ///
280 /// A consumed or unknown `(name, cgroup)` pair returns `Err`
281 /// with an actionable message, identical to [`Op::WaitPayload`]'s
282 /// lookup semantics.
283 ///
284 /// Check failures from the payload are recorded to the sidecar
285 /// for regression analysis but do NOT fail the step or the test
286 /// in-process. Use
287 /// [`ctx.payload(&X).run()`](crate::scenario::payload_run::PayloadRun::run)
288 /// directly if the test body needs to gate on check results.
289 KillPayload {
290 name: Cow<'static, str>,
291 cgroup: Option<Cow<'static, str>>,
292 },
293 /// Freeze every task in the named cgroup via `cgroup.freeze`.
294 ///
295 /// Writes `"1"` to the cgroup's `cgroup.freeze` file. The kernel's
296 /// `cgroup_freeze_write` dispatches the asynchronous freeze path;
297 /// tasks transition to the frozen state without external SIGSTOP,
298 /// and `cgroup.events` reaches `frozen 1` once every task has
299 /// parked. Idempotent — freezing an already-frozen cgroup is a
300 /// no-op.
301 ///
302 /// # Auto-unfreeze at teardown
303 ///
304 /// `Op::FreezeCgroup` is paired with [`Op::UnfreezeCgroup`] to
305 /// release. A test that omits the unfreeze still tears down
306 /// cleanly: [`crate::cgroup::CgroupManager::remove_cgroup`]
307 /// auto-unfreezes the cgroup before draining tasks (see the
308 /// kernel's `cgroup_freezer_migrate_task`, which clears the
309 /// task's freeze state when it migrates to an unfrozen
310 /// destination), so step teardown is robust to a stuck-frozen
311 /// cgroup. Pair the ops explicitly when the scenario needs
312 /// observable unfreeze timing inside the step body.
313 ///
314 /// # Worked example
315 ///
316 /// Three-Step suspend/resume sequence: a `Backdrop`-resident
317 /// long-running workload is paused mid-scenario and resumed
318 /// later, exercising how the scheduler responds to a sudden
319 /// idle window.
320 ///
321 /// ```text
322 /// Step 1 (run): apply cgroup; workload spins for 2s.
323 /// Step 2 (suspend): Op::freeze_cgroup("workers"); hold 1s.
324 /// The cgroup's tasks park via cgroup.freeze,
325 /// schedstat gauges drop to zero, and the
326 /// scheduler observes a sudden idle subtree.
327 /// Step 3 (resume): Op::unfreeze_cgroup("workers"); hold 2s.
328 /// Tasks return to runnable state, the
329 /// scheduler must re-pick them onto the
330 /// cgroup's CPUs without spuriously preempting
331 /// unrelated workloads.
332 /// ```
333 ///
334 /// # Observer-cgroup deadlock warning
335 ///
336 /// Do NOT freeze a cgroup that hosts the test's own observation
337 /// machinery. The freeze path stops every task in the cgroup —
338 /// including any thread that:
339 /// - opens `/proc/<pid>/sched` or other procfs entries owned by
340 /// tasks inside the frozen cgroup, then waits on the read,
341 /// - holds a futex shared with frozen tasks (the unfreeze must
342 /// land before the wait can complete),
343 /// - synchronously waits on a stalled-task pipe whose
344 /// producer is in the frozen cgroup.
345 ///
346 /// The framework's stimulus-event SHM ring and the `BlkWorker`
347 /// epoll loop both run outside the test cgroup tree, so they
348 /// are unaffected — but a test author who explicitly places an
349 /// observer thread inside the same cgroup as its observation
350 /// targets will deadlock the scenario when the freeze fires.
351 /// Place observers in a sibling cgroup (or in the parent) so
352 /// `cgroup.freeze` is scoped to the workload subtree alone.
353 ///
354 /// Pair with [`Op::UnfreezeCgroup`] to release. Useful for
355 /// scheduler suspend/resume tests where the test body wants to
356 /// observe how the scheduler handles a suddenly-frozen workload
357 /// and the resumption sequence afterwards.
358 ///
359 /// Treats a missing cgroup as a step failure: the
360 /// `cgroup.freeze` write fails with `ENOENT` and the error
361 /// propagates via the `apply_ops` `with_context` chain.
362 /// Freezing a non-existent cgroup is NOT a no-op; only
363 /// freezing an already-frozen cgroup is.
364 FreezeCgroup { cgroup: Cow<'static, str> },
365 /// Unfreeze every task in the named cgroup via `cgroup.freeze`.
366 ///
367 /// Writes `"0"` to the cgroup's `cgroup.freeze` file. Inverse of
368 /// [`Op::FreezeCgroup`]. Idempotent.
369 UnfreezeCgroup { cgroup: Cow<'static, str> },
370 /// Capture a host-side diagnostic snapshot under `name`. The
371 /// freeze coordinator pauses every vCPU long enough to read
372 /// the BPF map state, vCPU registers, and per-CPU
373 /// counters into a
374 /// [`FailureDumpReport`](crate::monitor::dump::FailureDumpReport),
375 /// then resumes the guest. The report is keyed by `name` on
376 /// the active
377 /// [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge);
378 /// downstream test code reads it via
379 /// [`Snapshot`](crate::scenario::snapshot::Snapshot).
380 ///
381 /// On-demand snapshots are orthogonal to the error-class
382 /// freeze trigger — the request flows through a separate
383 /// channel, does not transition the coordinator's
384 /// `freeze_state`, and is serviced even after `Done`. The only
385 /// scheduling rule: at most one capture in flight at a time
386 /// (each request waits for the previous freeze's vCPUs to
387 /// fully resume before issuing).
388 ///
389 /// **Guest → host wire.** In-guest scenarios submit the request
390 /// over the virtio-console port-1 TLV stream: `request_snapshot`
391 /// builds a `SnapshotRequestPayload` and writes it via
392 /// `write_msg(MsgType::SnapshotRequest, ...)` to `/dev/vport0p1`
393 /// (`src/vmm/guest_comms.rs`). The host coordinator decodes the
394 /// `MSG_TYPE_SNAPSHOT_REQUEST` frame, runs
395 /// `freeze_and_dispatch(FreezeMode::Capture { .. })`, and the
396 /// installed `CaptureCallback` returns the resulting report
397 /// through a paired reply frame. See
398 /// [`CaptureCallback`](crate::scenario::snapshot::CaptureCallback)
399 /// for the full protocol.
400 ///
401 /// **No active bridge ⇒ no-op.** When the executor runs in a
402 /// context with no installed
403 /// [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge)
404 /// (e.g. unit tests that exercise the executor without
405 /// spinning up a VM), this op emits a `tracing::warn!` and
406 /// continues. Existing scenarios that never declare snapshot
407 /// ops keep their behavior unchanged.
408 ///
409 /// # Example
410 ///
411 /// Declare a snapshot mid-step, fetch the captured report
412 /// after the scenario completes, and assert against a
413 /// BTF-rendered field:
414 ///
415 /// ```ignore
416 /// use ktstr::scenario::ops::{CgroupDef, HoldSpec, Op, Step, execute_steps};
417 /// use ktstr::scenario::snapshot::{Snapshot, SnapshotBridge};
418 ///
419 /// // Wire up the bridge before execute_steps runs (host-side
420 /// // VM setup typically performs this step automatically).
421 /// let bridge = SnapshotBridge::new(/* capture callback */);
422 /// let _guard = bridge.clone().set_thread_local();
423 ///
424 /// let steps = vec![Step {
425 /// setup: vec![CgroupDef::named("workers").workers(2)].into(),
426 /// ops: vec![Op::capture_snapshot("after_spawn")],
427 /// hold: HoldSpec::FULL,
428 /// }];
429 /// execute_steps(ctx, steps)?;
430 ///
431 /// // Inspection.
432 /// let captured = bridge.drain();
433 /// let report = captured.get("after_spawn").expect("snapshot recorded");
434 /// let snap = Snapshot::new(report);
435 /// let nr_cpus = snap.var("nr_cpus_onln").as_u64()?;
436 /// assert!(nr_cpus > 0, "snapshot captured live nr_cpus_onln");
437 /// ```
438 CaptureSnapshot { name: Cow<'static, str> },
439 /// Capture a snapshot whenever the guest writes to the named
440 /// kernel symbol. The snapshot is tagged with the symbol
441 /// itself; one fire = one capture.
442 ///
443 /// Symbol resolution at op execution time is a verbatim match
444 /// against the vmlinux ELF symbol table: the freeze coordinator
445 /// walks `Elf::syms` and accepts the symbol whose strtab entry
446 /// equals the requested string byte-for-byte. There is no
447 /// prefix stripping, BTF lookup, kallsyms walk, or per-CPU
448 /// offset arithmetic — the string must match an entry that
449 /// `nm vmlinux` would print (e.g. `"jiffies_64"`,
450 /// `"scx_watchdog_timestamp"`).
451 ///
452 /// The `register_watch` callback on a host-side
453 /// [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge)
454 /// is for **host-side unit testing only** — it lets in-process
455 /// executor tests record the symbol and return without arming
456 /// any hardware. Production in-VM scenarios run via the
457 /// virtio-console port 1 `MSG_TYPE_SNAPSHOT_REQUEST` TLV frame
458 /// and the host coordinator's `arm_user_watchpoint` path
459 /// (`src/vmm/freeze_coord/mod.rs`); the thread-local bridge is
460 /// never installed inside the guest.
461 ///
462 /// # Guard rails
463 ///
464 /// - **Maximum of 3 watch ops per scenario.** The KVM
465 /// hardware-watchpoint plumbing reserves slot 0 for the
466 /// existing `*scx_root->exit_kind` trigger (used by the
467 /// error-trigger path); only the remaining three user
468 /// watchpoint slots are available for on-demand watches. The
469 /// bridge's `register_watch` rejects a 4th
470 /// `Op::WatchSnapshot` and fails the step when the cap is
471 /// exceeded.
472 /// - **Symbol resolution failures bail immediately.** A
473 /// missing symbol or unaligned address surfaces as an `Err`
474 /// from `execute_steps` so the test author notices the
475 /// watch did not attach. Silent degradation would leave the
476 /// scenario running with no captures and look identical to
477 /// a healthy passing run.
478 /// - **4-byte alignment.** The resolved KVA must be 4-byte
479 /// aligned: the framework arms 4-byte data-write watches,
480 /// which require `addr & 0x3 == 0` on every supported
481 /// architecture. Mis-aligned addresses bail at setup with
482 /// the resolved KVA in the error.
483 /// - **Silent-misfire detection (KASLR-on guests).** When the
484 /// host coordinator's `kaslr_offset` is zero AND the
485 /// resolved kernel symbol lives in the x86_64 high-half
486 /// address range, `arm_user_watchpoint` emits a
487 /// `tracing::warn!` (once per unique `(symbol, link_kva)`
488 /// per process) noting the arm targets the link-time KVA
489 /// while the runtime symbol lives at `link_kva +
490 /// runtime_kaslr_slide`. The arm STILL completes (rejecting
491 /// it would regress every caller running before the host
492 /// coordinator's runtime-KASLR-slide derivation lands);
493 /// operators who hit the warn can boot the guest with the
494 /// `nokaslr` cmdline to use `Op::WatchSnapshot`, or omit
495 /// the op from KASLR-on test runs entirely.
496 ///
497 /// **Guest → host wire.** The registration request rides the
498 /// same virtio-console port-1 TLV stream as [`Op::CaptureSnapshot`]
499 /// (separate tag namespace), so symbol resolution + user watchpoint slot
500 /// allocation + `KVM_SET_GUEST_DEBUG` arming happen on the host
501 /// without a vCPU userspace exit. Once armed, the
502 /// `KVM_EXIT_DEBUG` dispatch path drives the resulting
503 /// captures directly into the freeze coordinator (no
504 /// per-fire guest request needed). See
505 /// [`WatchRegisterCallback`](crate::scenario::snapshot::WatchRegisterCallback)
506 /// for the full protocol.
507 ///
508 /// Note: high-frequency variables (rq counters, jiffies)
509 /// will trigger the watch every few microseconds, producing
510 /// thousands of captures (each overwriting the prior capture
511 /// under the same tag); the framework does not rate-limit
512 /// captures, so the test author owns the frequency choice.
513 /// Use [`Op::CaptureSnapshot`] for time-driven captures when
514 /// frequency is the concern.
515 WatchSnapshot { symbol: Cow<'static, str> },
516 /// Live-vCPU write of one or more [`KernelTarget`] / [`KernelValue`]
517 /// pairs into running guest memory. The host coordinator routes
518 /// each pair to the appropriate `GuestKernel::write_*` helper
519 /// (no freeze rendezvous, vCPUs keep executing). A Release fence
520 /// is issued after the last write so a weakly-ordered guest's
521 /// `smp_load_acquire` observes the bytes in write order — but
522 /// concurrent guest readers can still race against in-flight
523 /// stores, and the caller owns any guest-side synchronisation
524 /// the test requires (`READ_ONCE` / `smp_load_acquire` on the
525 /// target field).
526 ///
527 /// Same orchestration pattern as the existing
528 /// `BpfMapAccessor::write_value` path: synchronous host-side
529 /// memory mutation on a worker thread, no vCPU pause. Use this
530 /// for scratch fields, debug flags, scx-ktstr-private state,
531 /// and anything the guest reads with proper barriers.
532 ///
533 /// **Batch shape.** `writes` carries 1+ pairs; the executor
534 /// issues them in order. For a single write the
535 /// [`Op::write_kernel_hot`](#method.write_kernel_hot) singleton
536 /// constructor wraps a 1-element vec.
537 ///
538 /// **Dispatch.** The executor's arm calls
539 /// `dispatch_kernel_op_request` (`src/scenario/ops/dispatch.rs:2386`), which
540 /// uses the in-process `SnapshotBridge` callback when one is
541 /// installed (the test-fixture seam) and falls back to the
542 /// virtio-console port-1 wire path (`MsgType::KernelOpRequest`)
543 /// in-guest. The wire request is consumed by
544 /// `dispatch_kernel_op_batch` (`src/vmm/freeze_coord/kernel_op_dispatch.rs`),
545 /// invoked from the freeze coordinator's apply path.
546 ///
547 /// **See also.** [`KernelTarget`] — scroll to the
548 /// "Semantic risk" section for the single source of truth
549 /// on which scheduler-bookkeeping targets are safe vs
550 /// silently load-bearing.
551 WriteKernelHot {
552 /// Ordered list of `(target, value)` pairs to write.
553 writes: Vec<(KernelTarget, KernelValue)>,
554 },
555 /// Auto-freezing batched write of one or more
556 /// [`KernelTarget`] / [`KernelValue`] pairs while every vCPU is
557 /// parked at the freeze rendezvous. Reuses the same coordinator
558 /// path that [`Op::CaptureSnapshot`] triggers: one rendezvous,
559 /// every write in the batch lands while paused, then resume.
560 ///
561 /// **Batching is a hard correctness requirement.** Multi-CPU
562 /// seeds (e.g. a planned `with_uptime` helper writing per-CPU
563 /// `rq.clock` on every CPU at the same instant) must land in
564 /// ONE freeze window —
565 /// N separate cold-write ops would mean N rendezvous cycles
566 /// and observable inter-CPU skew. The variant payload is a
567 /// `Vec` precisely to make batched writes the natural shape.
568 /// The executor's `apply_ops` pre-pass auto-merges adjacent
569 /// singleton `Op::WriteKernelCold` ops into one merged op as
570 /// a safety net — N adjacent `write_kernel_cold(...)` calls
571 /// collapse into one rendezvous regardless of whether the
572 /// caller used [`crate::scenario::ops::Op::write_kernel_cold_batch`]
573 /// or chained singletons.
574 ///
575 /// **Dispatch.** The executor's arm calls
576 /// `dispatch_kernel_op_request` (`src/scenario/ops/dispatch.rs:2386`), which
577 /// uses the in-process `SnapshotBridge` callback when one is
578 /// installed (the test-fixture seam) and falls back to the
579 /// virtio-console port-1 wire path (`MsgType::KernelOpRequest`)
580 /// in-guest. The wire request lands at the freeze coordinator's
581 /// rendezvous boundary via
582 /// `dispatch_kernel_op_batch` (`src/vmm/freeze_coord/kernel_op_dispatch.rs`).
583 ///
584 /// Use this for: multi-field atomic writes, all-CPUs-at-once
585 /// seeding, one-shot setup that must complete before the guest
586 /// observes any partial state. Use [`Op::WriteKernelHot`] when
587 /// the guest is OK with live-write semantics + caller-side
588 /// synchronisation.
589 ///
590 /// **See also.** [`KernelTarget`] — scroll to the
591 /// "Semantic risk" section for the single source of truth
592 /// on which scheduler-bookkeeping targets are safe vs
593 /// silently load-bearing.
594 WriteKernelCold {
595 /// Ordered list of `(target, value)` pairs to write inside
596 /// a single freeze rendezvous.
597 writes: Vec<(KernelTarget, KernelValue)>,
598 },
599 /// Live-vCPU read of a [`KernelTarget`] into the
600 /// [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge)
601 /// drain log keyed by `tag`. Mirrors [`Op::WriteKernelHot`]:
602 /// no freeze rendezvous, host-side worker thread issues the
603 /// read while the guest keeps executing. The caller assumes
604 /// the read may race against guest writes; for read-write
605 /// coherency pair the op with a guest-side `smp_store_release`
606 /// on the target.
607 ///
608 /// Use this for: read-back of values previously written via
609 /// [`Op::WriteKernelHot`], lightweight polling of single fields
610 /// the test wants to observe without pausing the guest.
611 ///
612 /// **Width.** The `width` field picks which
613 /// `crate::monitor::guest::GuestKernel` `read_*` family the
614 /// host dispatcher invokes — `u32` / `u64` / `Bytes(len)`.
615 /// The reply lands as a [`crate::vmm::wire::KernelOpValue`] of
616 /// the matching shape in the bridge's drain log; a u32 field
617 /// must be read with `KernelValueWidth::u32()` (a u64 read of
618 /// a u32 field returns the field's bytes plus 4 adjacent
619 /// bytes).
620 ///
621 /// **Dispatch.** Same bridge-first / wire-fallback model as
622 /// [`Op::WriteKernelHot`]; the wire request is consumed by
623 /// `dispatch_kernel_op_batch` (`src/vmm/freeze_coord/kernel_op_dispatch.rs`).
624 ReadKernelHot {
625 /// Bridge-keyed tag under which the read result lands.
626 tag: Cow<'static, str>,
627 /// Address to read.
628 target: KernelTarget,
629 /// Width specifier: picks the read family + the reply
630 /// value shape.
631 width: KernelValueWidth,
632 },
633 /// Auto-freezing read of a [`KernelTarget`] into the
634 /// [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge)
635 /// drain log keyed by `tag`, taken while every vCPU is parked
636 /// at the freeze rendezvous. Reuses the same coordinator path
637 /// that [`Op::CaptureSnapshot`] triggers. Coherent with
638 /// respect to guest state — no concurrent guest write can race
639 /// against the read.
640 ///
641 /// Use this for: ground-truth reads that must reflect a stable
642 /// guest state, snapshot-style point-in-time reads. Note: each
643 /// `Op::ReadKernelCold` triggers its OWN freeze rendezvous —
644 /// `apply_ops`'s pre-pass folds adjacent
645 /// `Op::WriteKernelCold` ops into one rendezvous but does NOT
646 /// fold reads (per-entry wire tags are needed for the
647 /// multi-read reply-routing contract; queued as a wire-format
648 /// follow-up). For multi-read coherent snapshots, prefer
649 /// [`Op::CaptureSnapshot`] (which already orchestrates a single
650 /// rendezvous for all snapshot reads).
651 ///
652 /// **Width.** Same `width` semantics as [`Op::ReadKernelHot`]:
653 /// pick the read family explicitly so the dispatcher invokes
654 /// the matching `GuestKernel::read_*` helper.
655 ///
656 /// **Dispatch.** Bridge-first / wire-fallback like the other
657 /// `*Kernel*` variants; the wire request lands at the freeze
658 /// coordinator's rendezvous boundary via
659 /// `dispatch_kernel_op_batch` (`src/vmm/freeze_coord/kernel_op_dispatch.rs`).
660 ReadKernelCold {
661 /// Bridge-keyed tag under which the read result lands.
662 tag: Cow<'static, str>,
663 /// Address to read.
664 target: KernelTarget,
665 /// Width specifier: picks the read family + the reply
666 /// value shape.
667 width: KernelValueWidth,
668 },
669 /// Attach a scheduler mid-scenario: spawn the named staged
670 /// scheduler from `/staging/schedulers/<name>/` inside the guest
671 /// and wait for it to publish its first BPF object accessors.
672 ///
673 /// **Dispatch** (`dispatch_attach_scheduler` at
674 /// `src/scenario/ops/dispatch.rs:2032`): waits up to 60s for the
675 /// accessor-init worker to quiesce (handles the case where the
676 /// boot scheduler's first publish is still in flight), captures
677 /// the pre-spawn publish seqno, spawns the staged scheduler
678 /// binary, re-installs the sched_exit_monitor against the new
679 /// SCHED_PID, then waits up to 30s for a fresh accessor publish.
680 ///
681 /// **Already-attached behavior.** No framework-level idempotency
682 /// guard: if a scheduler is already running, the kernel rejects
683 /// the new attach at the `scx_enable_state() != SCX_DISABLED`
684 /// gate (`kernel/sched/ext.c:6837`, returns `-EBUSY`); the
685 /// spawned binary exits, no fresh publish lands, and the dispatch
686 /// bails on the 30s publish-wait timeout. Use
687 /// [`Op::DetachScheduler`] (then `AttachScheduler`) or
688 /// [`Op::ReplaceScheduler`] to swap schedulers.
689 ///
690 /// The `scheduler` reference holds a `'static` lifetime: the
691 /// test author declares each [`crate::test_support::Scheduler`]
692 /// at static scope (via `declare_scheduler!` or a
693 /// `static MY_SCHED: Scheduler = ...` item) and passes the
694 /// borrow into the constructor. The staging slot that ships the
695 /// binary into the initramfs is `KtstrTestEntry::staged_schedulers`;
696 /// the dispatch arm reads its path via
697 /// `test_support::staged::staged_scheduler_binary_path`.
698 AttachScheduler {
699 scheduler: &'static crate::test_support::Scheduler,
700 },
701 /// Detach the currently-running scheduler.
702 ///
703 /// **Dispatch** (`dispatch_detach_scheduler` →
704 /// `kill_current_scheduler` at `src/scenario/ops/dispatch.rs:1896`):
705 /// stops the host's sched_exit_monitor so the intentional kill
706 /// isn't promoted into a test-fatal scheduler-died signal,
707 /// writes `'S'` to `/proc/sysrq-trigger` to start the kernel-
708 /// side `scx_disable` cascade asynchronously (avoiding the
709 /// D-state stall inside `scx_flush_disable_work`'s
710 /// `kthread_flush_work(&sch->disable_work)` at
711 /// `kernel/sched/ext.c:6145`, reached on the struct_ops detach
712 /// path via `bpf_scx_unreg` at `kernel/sched/ext.c:7666`), sends
713 /// `SIGTERM` to the
714 /// scheduler pid, waits up to `SCHED_LIFECYCLE_KILL_GRACE` (10s)
715 /// for the kernel BPF state to reach `SCX_DISABLED`, then
716 /// clears the `SCHED_PID` atomic (defined in
717 /// `src/vmm/rust_init/mod.rs`) so subsequent
718 /// `crate::vmm::rust_init::sched_pid()` reads return `None`.
719 ///
720 /// Bails when no scheduler is currently attached (SCHED_PID is
721 /// 0), when the SIGTERM syscall fails, or when the
722 /// `SCX_DISABLED` wait times out. NOT idempotent: a second
723 /// detach with no scheduler attached bails rather than no-oping.
724 /// For defensive "ensure clean slate" scaffolds, gate on
725 /// `crate::vmm::rust_init::sched_pid()` returning `Some` before
726 /// emitting the Detach step rather than relying on no-op
727 /// tolerance.
728 DetachScheduler,
729 /// Kill the currently-running scheduler and respawn the BOOT
730 /// scheduler. Useful for hot-restart validation of the boot
731 /// scheduler. Bails if no scheduler is currently attached.
732 ///
733 /// **v0 limitation.** Always respawns the boot scheduler at
734 /// `/scheduler` + `/sched_args` regardless of which scheduler
735 /// was most-recently attached — after an `Op::AttachScheduler`
736 /// or `Op::ReplaceScheduler` to a staged scheduler, this op
737 /// restarts the BOOT scheduler, not the most-recently-attached
738 /// one. For restarting a staged scheduler, use
739 /// [`Op::ReplaceScheduler`] with the same staged spec.
740 ///
741 /// **Dispatch** (`dispatch_restart_scheduler` at
742 /// `src/scenario/ops/dispatch.rs:2129`): kills the current scheduler
743 /// via the shared `kill_current_scheduler` helper, spawns the
744 /// boot scheduler from the hardcoded `/scheduler` + `/sched_args`
745 /// paths with log at `/tmp/sched.log`, then re-installs the
746 /// sched_exit_monitor against the re-spawned boot pid.
747 RestartScheduler,
748 /// Detach the currently-running scheduler and attach a different
749 /// one. Equivalent to `[DetachScheduler, AttachScheduler {
750 /// scheduler: new }]` but expressed as a single op so the
751 /// no-scheduler window is bounded and the per-phase scheduler
752 /// tagging on the sidecar can record the transition atomically.
753 ///
754 /// The mid-experiment swap case the operator typically wants:
755 /// run scheduler A for the first phase of a multi-step test, swap
756 /// to scheduler B (or A-with-different-CLI-args, modeled as a
757 /// distinct `Scheduler` declaration) for the second phase, and
758 /// assert a per-phase metric delta across the boundary.
759 ///
760 /// Bails if no scheduler is currently attached — there is no
761 /// scheduler to detach from, so the "replace" semantic has no
762 /// meaning. Use [`Op::AttachScheduler`] for the first attach.
763 ///
764 /// **Dispatch** (`dispatch_replace_scheduler` at
765 /// `src/scenario/ops/dispatch.rs:2153`): kills the current scheduler
766 /// via the shared `kill_current_scheduler` helper, spawns the
767 /// named staged scheduler binary from
768 /// `/staging/schedulers/<name>/`, re-installs the
769 /// sched_exit_monitor against the new SCHED_PID, waits up to
770 /// `REPLACE_NOT_TRYING_DEADLINE_S` (5s) for the accessor-init
771 /// worker to quiesce, captures the pre-publish seqno, then
772 /// waits up to 10s for fresh accessors to publish against the
773 /// new BPF object. The 10s budget aligns with
774 /// `SCHED_LIFECYCLE_KILL_GRACE` and covers a cold-cache vmlinux
775 /// re-parse during the worker reinit.
776 ReplaceScheduler {
777 scheduler: &'static crate::test_support::Scheduler,
778 },
779 /// Open a BPF map fd by name and hold it for the scenario lifetime.
780 ///
781 /// **Why this exists.** `Op::ReplaceScheduler` kills the outgoing
782 /// scheduler process; libbpf's drop path then releases the map
783 /// fds the loader was holding. Once the last refcount on a map
784 /// drops, the kernel frees it — typically before any post-swap
785 /// freeze captures, so the multi-bss "same-binary swap window"
786 /// case (two `<obj>.bss` copies coexisting briefly) closes too
787 /// fast to be reliably observed in a test. `PinBpfMap` holds an
788 /// extra refcount on the named map so the kernel keeps it alive
789 /// until the scenario ends.
790 ///
791 /// **Semantics.** Walks the kernel's map ID space (via
792 /// [`libbpf_rs::query::MapInfoIter`], which wraps
793 /// `BPF_MAP_GET_NEXT_ID` + `BPF_MAP_GET_FD_BY_ID` +
794 /// `BPF_OBJ_GET_INFO_BY_FD`) and keeps the fd whose name matches.
795 /// The held fd lives in the scenario's Backdrop state and drops
796 /// (via std `OwnedFd` `Drop`) at scenario teardown. Multiple
797 /// `PinBpfMap` ops with **distinct** names accumulate; pinning the
798 /// **same** name twice is a no-op (the second call returns without
799 /// re-opening the fd, so the originally-pinned map instance is the
800 /// one held — not the second-call-time instance).
801 ///
802 /// **Name truncation.** BPF map names are capped at
803 /// `BPF_OBJ_NAME_LEN = 16` bytes including the trailing NUL, so
804 /// 15 usable chars max per `kernel/bpf/syscall.c`'s
805 /// `bpf_obj_name_cpy`. Pass the kernel-visible name (typically
806 /// `<obj>.bss` / `<obj>.data` / `<obj>.rodata`). When a libbpf
807 /// object name + section suffix exceeds the 15-char cap, libbpf
808 /// truncates the object prefix at load time and the kernel-side
809 /// name is the truncated form; the framework does not auto-
810 /// truncate the user-supplied string, so pass the post-truncation
811 /// form. Reading the map names from a prior
812 /// [`crate::monitor::dump::FailureDumpReport`]'s `maps[].name`
813 /// or via `bpftool map list` is the safe way to discover the
814 /// exact string the kernel sees.
815 ///
816 /// **Order.** Place this op AFTER the scheduler that owns the
817 /// target map has attached (typically a small fixed hold suffices
818 /// — ~100ms for the small scx-ktstr fixture, longer for
819 /// heavyweight schedulers). For the same-binary swap-window
820 /// scenario specifically: pin the **outgoing** scheduler's bss
821 /// **before** `Op::ReplaceScheduler` runs — pinning after the
822 /// swap is too late because the outgoing scheduler's bss has
823 /// already been freed by libbpf's drop path. The pin walker
824 /// picks the lowest-id matching map, so the outgoing copy (the
825 /// older id) is the one held; the incoming scheduler's load
826 /// then creates a second copy that's also kept alive because
827 /// the outgoing refcount blocks the kernel from freeing the id.
828 ///
829 /// **Failure surface.** The pin runs at Step apply time inside
830 /// `execute_steps` / `execute_scenario`. A failure (no matching
831 /// map found in the walk) bails out of the apply path as an
832 /// `Err` from `execute_steps`; the scenario stops before the
833 /// next Step runs and the `post_vm` callback is not invoked.
834 /// The underlying [`libbpf_rs::query::MapInfoIter`] silently
835 /// terminates iteration on any non-`ENOENT` errno from the BPF
836 /// ID walk (including `EPERM` from missing `CAP_SYS_ADMIN`), so
837 /// such errors surface as the no-matching-map case rather than
838 /// a distinct EPERM error — acceptable because ktstr always runs
839 /// as root inside the guest, so the CAP_SYS_ADMIN gates at
840 /// `kernel/bpf/syscall.c:4761` (`BPF_MAP_GET_NEXT_ID` walk) and
841 /// `:4869` (`BPF_MAP_GET_FD_BY_ID`) are always satisfied and the
842 /// EPERM path is unreachable in practice.
843 ///
844 /// **Example.**
845 /// ```ignore
846 /// let steps = vec![
847 /// // Phase 0: primary scheduler runs alone; pin BEFORE the swap.
848 /// Step::with_op(
849 /// Op::pin_bpf_map("<obj>.bss"),
850 /// HoldSpec::frac(0.3),
851 /// ),
852 /// // Phase 1: swap to a same-binary alt — the pinned map
853 /// // keeps the OUTGOING bss alive across the teardown.
854 /// Step::with_op(
855 /// Op::replace_scheduler(&STAGED_ALT_SCHED),
856 /// HoldSpec::frac(0.7),
857 /// ),
858 /// ];
859 /// ```
860 ///
861 /// **See also.** [`crate::scenario::bpf_pin::open_bpf_map_fd_by_name`]
862 /// for the underlying helper and `tests/live_var_disambiguation_e2e.rs`
863 /// for the swap-window conditional walker-fired gate this pin is
864 /// designed to make deterministic.
865 PinBpfMap { name: Cow<'static, str> },
866 /// Capture the current `cgroup.procs` of `cgroup` and store the
867 /// PID list on the active [`SnapshotBridge`](crate::scenario::snapshot::SnapshotBridge)
868 /// under `tag`.
869 ///
870 /// Synchronous read of the cgroup-v2 `cgroup.procs` pseudofile in
871 /// the dispatching thread (in-scenario — runs wherever
872 /// `execute_scenario` runs; inside the guest VM for `#[ktstr_test]`
873 /// e2e tests, on the host for host-only scenarios). Returns the
874 /// thread-group leaders (PIDs / TGIDs) the kernel reports at apply
875 /// time. The snapshot is appended to the bridge's per-tag drain
876 /// log; test bodies drain via
877 /// [`SnapshotBridge::drain_cgroup_procs`](crate::scenario::snapshot::SnapshotBridge::drain_cgroup_procs)
878 /// (or the by-tag lookup
879 /// [`SnapshotBridge::cgroup_procs_by_tag`](crate::scenario::snapshot::SnapshotBridge::cgroup_procs_by_tag))
880 /// after the scenario completes to read the captured pids back.
881 ///
882 /// Distinct from [`Op::CaptureSnapshot`]: that op routes through
883 /// the host-side freeze coordinator (TLV transport in production,
884 /// thread-local bridge in test fixtures); this op runs entirely
885 /// in-process against the local cgroupfs.
886 ///
887 /// # Use cases
888 ///
889 /// Pin "did my workers land in cgroup X" assertions without the
890 /// shell-probe + tmpfs-roundtrip pattern. Typical shape:
891 ///
892 /// ```ignore
893 /// use ktstr::prelude::SnapshotBridge;
894 /// use std::sync::Arc;
895 ///
896 /// // Install a bridge (dummy capture cb — only cgroup-procs drain
897 /// // is used). MUST clone before set_thread_local, which consumes
898 /// // self — the clone shares the Arc-internal state and is what
899 /// // we drain on after the scenario completes.
900 /// let bridge = SnapshotBridge::new(Arc::new(|_| None));
901 /// let bridge_for_drain = bridge.clone();
902 /// let _guard = bridge.set_thread_local();
903 ///
904 /// let backdrop = Backdrop::new().push_op(Op::add_cgroup("workers"));
905 /// let steps = vec![
906 /// Step::new(
907 /// vec![
908 /// Op::spawn(SpawnPlacement::cgroup("workers"),
909 /// WorkSpec::default().workers(4)),
910 /// Op::capture_cgroup_procs("after_spawn", "workers"),
911 /// ],
912 /// HoldSpec::fixed(Duration::ZERO),
913 /// ),
914 /// ];
915 /// let _ = execute_scenario(&ctx, backdrop, steps)?;
916 ///
917 /// // Either drain the whole log or look up by tag.
918 /// let after = bridge_for_drain.cgroup_procs_by_tag("after_spawn")
919 /// .expect("Op::CaptureCgroupProcs(\"after_spawn\", ...) snapshot");
920 /// assert_eq!(after.pids.len(), 4);
921 /// ```
922 ///
923 /// # Within-Step ordering
924 ///
925 /// Ops in a single Step apply sequentially in vec order, so a
926 /// `Op::CaptureCgroupProcs` placed AFTER `Op::Spawn` /
927 /// `Op::MoveAllTasks` observes the post-spawn / post-migrate
928 /// kernel state. The producing ops complete synchronously (their
929 /// `cgroup.procs` writes block on kernel commit), so the capture
930 /// sees every PID those ops placed.
931 ///
932 /// # PID vs TID grain
933 ///
934 /// Reads `cgroup.procs` (thread-group leaders), NOT `cgroup.threads`
935 /// (per-thread TIDs). Grain implications by spawn op:
936 ///
937 /// - `Op::Spawn` → ktstr workers are 1-thread-per-worker, so
938 /// `workers(N)` produces `N` pids in `cgroup.procs`.
939 /// - `Op::RunPayload` → an `execve`'d binary is ONE process; even
940 /// if the binary spawns 100 threads, `cgroup.procs` reports the
941 /// single thread-group leader. Tests asserting per-thread
942 /// placement would need a sibling `cgroup.threads` accessor
943 /// (future Op variant if a use case arises).
944 ///
945 /// # Tag uniqueness
946 ///
947 /// `tag` is the snapshot key the test body uses to find the
948 /// capture in the drain log. The apply-ops dispatch rejects an
949 /// empty `tag` with an actionable bail. Multiple captures of
950 /// the same `cgroup` under DIFFERENT tags surface as separate
951 /// entries (lets a scenario capture pre/post snapshots of the
952 /// same cgroup); multiple captures with the same `(tag, cgroup)`
953 /// also append rather than overwrite — tag uniqueness is a caller
954 /// convention, not a framework-enforced contract. The by-tag
955 /// lookup [`SnapshotBridge::cgroup_procs_by_tag`](crate::scenario::snapshot::SnapshotBridge::cgroup_procs_by_tag)
956 /// returns the FIRST match; callers who care about multiplicity
957 /// must use [`SnapshotBridge::drain_cgroup_procs`](crate::scenario::snapshot::SnapshotBridge::drain_cgroup_procs)
958 /// and filter the Vec manually.
959 ///
960 /// # Empty / unknown cgroup
961 ///
962 /// - Empty cgroup (exists but holds no tasks): captured snapshot
963 /// has `pids = vec![]`. Lets callers assert "no tasks landed
964 /// here" without conflating with "no such cgroup."
965 /// - Unknown cgroup (directory missing): apply bails with a
966 /// layered anyhow chain — the outer wrap names the op + tag +
967 /// cgroup; the inner [`crate::cgroup::CgroupOps::read_procs`]
968 /// context surfaces the resolved path + the actionable hint
969 /// about `Op::AddCgroup` / `workload_root_cgroup`. Use
970 /// `format!("{err:#}")` (alternate display) to flatten both
971 /// layers in test assertions.
972 ///
973 /// # See also
974 ///
975 /// - [`Op::CaptureSnapshot`] — diagnostic-snapshot capture (full
976 /// scheduler state dump via FailureDumpReport). Distinct from
977 /// this op's cgroup-procs read AND drains via a separate
978 /// `SnapshotBridge::drain` / `drain_ordered` channel, not
979 /// `drain_cgroup_procs`.
980 /// - [`crate::cgroup::CgroupOps::read_procs`] — the underlying
981 /// trait method this op dispatches through.
982 CaptureCgroupProcs {
983 /// Snapshot key. Must be non-empty. Used by
984 /// [`SnapshotBridge::drain_cgroup_procs`](crate::scenario::snapshot::SnapshotBridge::drain_cgroup_procs)
985 /// consumers to find this capture in the drain log.
986 tag: Cow<'static, str>,
987 /// Cgroup to read `cgroup.procs` from. Must be a name
988 /// already tracked by the scenario (created via
989 /// `Op::AddCgroup`, a `CgroupDef` in setup, or pushed on
990 /// the Backdrop). Must be non-empty.
991 cgroup: Cow<'static, str>,
992 },
993 /// Re-steer a hardware IRQ to a single CPU by writing
994 /// `/proc/irq/<N>/smp_affinity_list` in the guest — the knob
995 /// that drives the kernel's `write_irq_affinity` →
996 /// `irq_set_affinity` → `irq_do_set_affinity` → irqchip
997 /// `set_affinity` path (`kernel/irq/proc.c`,
998 /// `kernel/irq/manage.c`). Use it to place a NIC's
999 /// RX-completion interrupt on a chosen CPU so the hardirq, the
1000 /// `NET_RX` softirq it raises, and any task that path wakes all
1001 /// land where the scenario wants them: the steering half of an
1002 /// IRQ-locality test whose generating half is
1003 /// [`crate::workload::WorkType::NetTraffic`] and whose observing
1004 /// half is the per-CPU IRQ metric axis (`max_cpu_hardirqs`,
1005 /// `max_cpu_softirq_net_rx`, and their `*_concentration` ratios).
1006 ///
1007 /// # In-guest file write, NOT a kernel-memory poke
1008 ///
1009 /// A write to the `irq_desc` affinity mask in kernel memory would
1010 /// NOT re-route delivery — only the `smp_affinity_list` write
1011 /// runs the full set-affinity path that reprograms the interrupt
1012 /// controller (MSI-X message / IOAPIC RTE). So this Op is
1013 /// dispatched as a plain `std::fs::write` from the executor
1014 /// in-guest (mirroring the `/proc/sysrq-trigger` write
1015 /// [`Op::DetachScheduler`] performs), NOT through the
1016 /// kernel-memory rendezvous path of [`Op::WriteKernelHot`] /
1017 /// [`Op::WriteKernelCold`].
1018 ///
1019 /// # Online-CPU requirement
1020 ///
1021 /// The kernel intersects the requested mask with
1022 /// `cpu_online_mask` before programming the irqchip
1023 /// (`irq_do_set_affinity`); a single-CPU target that is offline
1024 /// leaves no online CPU in the mask and the write returns
1025 /// `-EINVAL` (the `!cpumask_intersects(new_value,
1026 /// cpu_online_mask)` arm of `write_irq_affinity`). The
1027 /// dispatcher pre-checks `cpu` against
1028 /// `/sys/devices/system/cpu/online` and bails with an actionable
1029 /// message before the write, so an out-of-range / offline target
1030 /// names the CPU instead of surfacing a bare `EINVAL`. IRQ
1031 /// affinity is a system-wide property, NOT scoped to the writing
1032 /// task's cpuset — the target need not be in the runner's
1033 /// allowed set.
1034 ///
1035 /// Construct via [`Op::steer_irq`].
1036 SteerIrq {
1037 /// Which IRQ to steer — a literal Linux IRQ number or a
1038 /// `/proc/interrupts` action-name label. See [`IrqSelector`].
1039 irq: IrqSelector,
1040 /// Target Linux processor number — the value written to
1041 /// `smp_affinity_list`. Must be online (see the variant's
1042 /// online-CPU requirement above).
1043 cpu: usize,
1044 },
1045}
1046
/// Where the workers created by [`Op::Spawn`] are placed.
///
/// Earlier revisions of the taxonomy expressed the two placement
/// choices as two distinct ops (`SpawnWorkers` and `SpawnHost`).
/// The unified `Op::Spawn` variant carries the placement as data
/// instead, so the framework has ONE spawn op. Because
/// `SpawnPlacement` is `#[non_exhaustive]`, additional placements
/// are introduced as new variants of this enum rather than as new
/// `Op` variants.
///
/// # `#[non_exhaustive]`
///
/// `SpawnPlacement` is `#[non_exhaustive]`. The cross-crate
/// pattern-match and construction rules common to every such type
/// are documented in [`crate::non_exhaustive`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum SpawnPlacement {
    /// Leave the workers in the spawner's own cgroup, i.e. the
    /// cgroup of the test runner. This is NOT one of the managed
    /// workload cgroups declared through [`CgroupDef`] or
    /// [`Op::AddCgroup`]: the handler performs ZERO cgroup ops and
    /// the workers simply inherit the cgroup the test runner
    /// currently sits in.
    ///
    /// In a guest VM that cgroup is typically the root (cgid=1).
    /// RunnerCgroup workers therefore show up in snapshots under
    /// the root cgroup, not under the workload's named hierarchy.
    ///
    /// `WorkSpec::workers_pct` is rejected with this placement,
    /// since no managed cgroup exists whose cpuset could serve as
    /// the denominator of the percentage. Either give an explicit
    /// `.workers(N)` count or use `Cgroup(name)` with a cgroup
    /// whose cpuset provides that denominator.
    ///
    /// # Why "RunnerCgroup"?
    ///
    /// The older `SpawnHost` name used "host" for the spawner's
    /// own cgroup, by analogy with the "host tasks vs workload
    /// tasks" split that sched_ext schedulers use for
    /// observability (e.g. mitosis's cell 0). `RunnerCgroup`
    /// states the placement target exactly (the cgroup of the
    /// test-runner process) and drops the host-vs-guest-machine
    /// ambiguity that came with "host".
    RunnerCgroup,
    /// Spawn the workers, then move them into the managed cgroup
    /// with the given name. That cgroup has to exist by the time
    /// the spawn op applies. It can be declared via [`CgroupDef`]
    /// in `Step.setup`, via [`Op::AddCgroup`] /
    /// [`Op::AddCgroupDef`] earlier in the same step, or on the
    /// persistent [`Backdrop`](crate::scenario::Backdrop).
    Cgroup(Cow<'static, str>),
}
1099
1100impl SpawnPlacement {
1101 /// Construct [`SpawnPlacement::Cgroup`] from any string-like
1102 /// input (`&'static str`, `String`, `Cow<'static, str>`).
1103 /// Mirrors the [`impl Into<Cow<'static, str>>`] convention
1104 /// used by every other cgroup-name constructor on [`Op`]
1105 /// (`Op::add_cgroup`, `Op::spawn_workers`, `Op::move_all_tasks`,
1106 /// ...) so callers pass `"name"` not `"name".into()`.
1107 pub fn cgroup(name: impl Into<Cow<'static, str>>) -> Self {
1108 SpawnPlacement::Cgroup(name.into())
1109 }
1110
1111 /// Construct [`SpawnPlacement::RunnerCgroup`]. Const so it
1112 /// composes inside `const` scenarios + builds.
1113 pub const fn runner_cgroup() -> Self {
1114 SpawnPlacement::RunnerCgroup
1115 }
1116}
1117
/// Identifies the IRQ that [`Op::SteerIrq`] acts on.
///
/// The same hardware IRQ can be named in two ways:
///
/// - [`ByNumber`](Self::ByNumber): the literal Linux IRQ number,
///   i.e. the leading column of `/proc/interrupts` and the `<N>` in
///   `/proc/irq/<N>/`. Choose this when the scenario has already
///   resolved the number (for instance through per-NIC IRQ
///   discovery the test performs itself).
/// - [`ByLabel`](Self::ByLabel): the `/proc/interrupts` action name,
///   which is the last whitespace token on the IRQ's line. The
///   dispatcher scans `/proc/interrupts`, takes the first line
///   whose last token equals the label, and steers that IRQ.
///   Choose this when the number is not known up front but the
///   device label is stable. On the virtio-MMIO transport ktstr
///   boots, the NIC registers ONE shared IRQ whose action name is
///   the bare device basename (e.g. `"virtio1"`), so the lookup
///   resolves uniquely.
///
///   Limitation: only the line's last token is compared. For a
///   shared IRQ (a comma-separated action chain) just the LAST
///   action matches, and a multi-word action name never matches.
///   The comparison is deliberately not widened to any token,
///   because the per-CPU count / chip / hwirq columns would then
///   produce false matches. Steer by [`ByNumber`](Self::ByNumber)
///   when the IRQ is shared or has a multi-word name.
///
/// # `#[non_exhaustive]`
///
/// `IrqSelector` is `#[non_exhaustive]`. The cross-crate
/// pattern-match and construction rules common to every such type
/// are documented in [`crate::non_exhaustive`]. Prefer the
/// [`by_number`](Self::by_number) / [`by_label`](Self::by_label)
/// constructors to naming the variants directly.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum IrqSelector {
    /// A literal Linux IRQ number: the leading column of
    /// `/proc/interrupts`, and the `<N>` in
    /// `/proc/irq/<N>/smp_affinity_list`.
    ByNumber(u32),
    /// A `/proc/interrupts` action-name label (the last whitespace
    /// token of the line). It is resolved to an IRQ number at
    /// dispatch.
    ByLabel(Cow<'static, str>),
}
1159
1160impl IrqSelector {
1161 /// Select an IRQ by its literal Linux IRQ number. Const so it
1162 /// composes inside `const` scenarios + builds.
1163 pub const fn by_number(irq: u32) -> Self {
1164 IrqSelector::ByNumber(irq)
1165 }
1166
1167 /// Select an IRQ by its `/proc/interrupts` action-name label
1168 /// (the last whitespace token on the IRQ's line, e.g. a
1169 /// virtio-net device basename like `"virtio1"`). Accepts any
1170 /// string-like input (`&'static str`, `String`,
1171 /// `Cow<'static, str>`), mirroring the cgroup-name constructor
1172 /// convention on [`Op`].
1173 pub fn by_label(label: impl Into<Cow<'static, str>>) -> Self {
1174 IrqSelector::ByLabel(label.into())
1175 }
1176}
1177
/// Recipe for deriving a cpuset from the topology.
///
/// # `#[non_exhaustive]`
///
/// `CpusetSpec` is `#[non_exhaustive]`. The cross-crate
/// pattern-match and construction rules common to every such type
/// are documented in [`crate::non_exhaustive`].
///
/// # Prefer the constructors
///
/// Build values through the associated constructor functions
/// ([`Self::llc`], [`Self::numa`], [`Self::range`],
/// [`Self::disjoint`], [`Self::overlap`], [`Self::exact`]) instead
/// of spelling out variant literals such as `CpusetSpec::Llc(0)` or
/// `CpusetSpec::Range { start_frac, end_frac }`. There are two
/// reasons:
///
/// 1. **Stability across variant reshaping.** If a later commit
///    adds a field to `Range` (a stride parameter, say), every
///    caller that wrote `CpusetSpec::Range { start_frac,
///    end_frac }` breaks, whereas `Self::range(..)` can supply a
///    default for the new field on the caller's behalf.
///    `#[non_exhaustive]` is what reserves that freedom for the
///    enum; the constructor convention is how callers opt into
///    benefiting from it.
/// 2. **Semantic consistency with [`Self::exact`].** `exact` takes
///    any `IntoIterator<Item = usize>` (arrays, ranges, `Vec`,
///    `BTreeSet`) and converts it to a `BTreeSet<usize>`
///    internally. A caller that bypasses it with
///    `CpusetSpec::Exact(set)` has to hand-build the `BTreeSet`,
///    which is duplicate bookkeeping the constructor removes.
///
/// Test code that *inspects* a variant through a pattern match has
/// to name the variant literal (the name is load-bearing for the
/// match). The construction-side rule is therefore a convention
/// for *production* call sites and not a hard constraint. Inside
/// this crate, matchers follow the pattern-side rule from
/// [`crate::non_exhaustive`] and constructors follow this one.
///
/// # Trait coverage
///
/// `Clone + Debug + PartialEq`. `Eq` / `Hash` are impossible
/// because [`Range`](Self::Range) and [`Overlap`](Self::Overlap)
/// hold `f64` fractions. `Default` has no honest value: `Llc(0)`,
/// `Range(0..1)` and `Exact(empty)` each carry a different "no-op"
/// meaning.
///
/// Note: IEEE 754 defines `f64::NAN != f64::NAN`, so a
/// `CpusetSpec` holding NaN fractions does not compare equal to a
/// clone of itself; `validate()` rejects NaN inputs.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum CpusetSpec {
    /// Every CPU that belongs to the LLC with this index.
    Llc(usize),
    /// Every CPU that belongs to the NUMA node with this index.
    Numa(usize),
    /// The fractional slice [start_frac..end_frac) of the usable CPUs.
    Range { start_frac: f64, end_frac: f64 },
    /// Split the usable CPUs into `of` equal disjoint sets and pick
    /// the `index`-th one.
    Disjoint { index: usize, of: usize },
    /// Same as `Disjoint`, except that each set overlaps its
    /// neighbors by `frac` of its size.
    Overlap { index: usize, of: usize, frac: f64 },
    /// An explicit CPU set; no topology resolution takes place.
    Exact(BTreeSet<usize>),
}
1241
1242impl CpusetSpec {
1243 /// Construct an `Exact` cpuset from any iterator of CPU indices.
1244 ///
1245 /// Accepts arrays, ranges, `Vec`, `BTreeSet`, or any `IntoIterator<Item = usize>`.
1246 pub fn exact(cpus: impl IntoIterator<Item = usize>) -> Self {
1247 CpusetSpec::Exact(cpus.into_iter().collect())
1248 }
1249
1250 /// Partition usable CPUs into `of` equal disjoint sets; take the `index`-th.
1251 pub const fn disjoint(index: usize, of: usize) -> Self {
1252 CpusetSpec::Disjoint { index, of }
1253 }
1254
1255 /// Like [`disjoint`](Self::disjoint) but each set overlaps neighbors by `frac` of its size.
1256 pub const fn overlap(index: usize, of: usize, frac: f64) -> Self {
1257 CpusetSpec::Overlap { index, of, frac }
1258 }
1259
1260 /// Fractional range of usable CPUs `[start_frac..end_frac)`.
1261 pub const fn range(start_frac: f64, end_frac: f64) -> Self {
1262 CpusetSpec::Range {
1263 start_frac,
1264 end_frac,
1265 }
1266 }
1267
1268 /// All CPUs in a given LLC index.
1269 pub const fn llc(index: usize) -> Self {
1270 CpusetSpec::Llc(index)
1271 }
1272
1273 /// All CPUs in a given NUMA node index.
1274 pub const fn numa(index: usize) -> Self {
1275 CpusetSpec::Numa(index)
1276 }
1277}
1278
1279/// Host-side write/read target for the kernel-memory ops
1280/// ([`Op::WriteKernelHot`] / [`Op::WriteKernelCold`] /
1281/// [`Op::ReadKernelHot`] / [`Op::ReadKernelCold`]).
1282///
/// Each variant names a kernel address by the resolution path the
/// host coordinator will take when the op fires; the actual
/// `GuestKernel` write helpers consume the resolved KVA. The variant
/// chosen here picks WHICH translation path (KASLR-aware kernel-image
/// base for [`Self::Symbol`], `PAGE_OFFSET` for [`Self::Direct`],
/// page-table walk for [`Self::Kva`], per-CPU dereference for
/// [`Self::PerCpuField`], or task-list walk plus BTF field offset
/// for [`Self::TaskField`]).
1290///
1291/// # Semantic risk — writing to load-bearing scheduler state
1292///
1293/// ktstr does not gate or filter target addresses. The framework
1294/// trusts the test author to know what they are pointing at. That
1295/// trust includes a class of fields where a raw write silently
1296/// breaks downstream kernel invariants the test author did not
1297/// intend to perturb. By design, mitigation is documentation-only:
1298/// the framework will not refuse a write nor emit a runtime warn —
1299/// the test author owns the choice. The cases to know about:
1300///
1301/// **Per-runqueue counters maintained by the scheduler classes.**
1302/// Raw writes skip the side-effects the kernel encodes in the
1303/// maintainer functions, leaving cross-class accounting in an
1304/// inconsistent state.
1305///
1306/// * **`struct rq.nr_running`** — the per-CPU runqueue task count.
1307/// `add_nr_running` / `sub_nr_running` (`kernel/sched/sched.h`)
1308/// also (a) fire the `sched_update_nr_running_tp` tracepoint and
1309/// (b) call `sched_update_tick_dependency(rq)` (the
1310/// `NOHZ_FULL` per-CPU tick gating logic); `add_nr_running`
1311/// additionally sets the root-domain `overloaded` bit
1312/// (`rq->rd->overloaded`) on the `prev_nr < 2 && new_nr >= 2`
1313/// transition. A bare 8-byte store skips all of those; the
1314/// counter and the root-domain overload signal diverge, the
1315/// NOHZ_FULL CPU may stop or start receiving ticks against the
1316/// test author's intent, and downstream load-balance decisions
1317/// read a count that no longer matches reality.
1318/// * **`struct cfs_rq.h_nr_runnable` / `h_nr_queued` /
1319/// `h_nr_idle`** (`kernel/sched/sched.h` `struct cfs_rq`) —
1320/// hierarchical CFS task counts maintained by
1321/// `account_entity_enqueue` / `dequeue` with cascade up the task
1322/// group tree. Raw write skips parent-cfs_rq propagation and
1323/// breaks group scheduling accounting.
1324/// * **`struct rt_rq.rt_nr_running`** (`kernel/sched/sched.h`
1325/// `struct rt_rq`) — RT class runqueue task count; updated by
1326/// `inc_rt_tasks` / `dec_rt_tasks` which also maintain the
1327/// per-rt_rq `overloaded` bit and the `highest_prio.curr/next`
1328/// priority-pushable tracking.
1329/// * **`struct dl_rq.dl_nr_running` / `running_bw` / `this_bw`**
1330/// (`kernel/sched/sched.h` `struct dl_rq`) — DEADLINE class
1331/// counters and bandwidth tracking; `add_running_bw` /
1332/// `sub_running_bw` (in `kernel/sched/deadline.c`) implement the
1333/// admission-control accounting that SUGOV's `cpu_bw_dl()`
1334/// consumes for frequency selection. A raw write to any of
1335/// these breaks admission control + DVFS.
1336///
1337/// **PELT (Per-Entity Load Tracking) averages.** These are
1338/// exponential moving averages whose internal `_sum` accumulators
1339/// are advanced against `cfs_rq_clock_pelt(cfs_rq)` (see
1340/// `kernel/sched/fair.c update_load_avg`, which calls into
1341/// `kernel/sched/pelt.c __update_load_avg_se` /
1342/// `__update_load_avg_cfs_rq`). Writing only the visible
1343/// `_avg` value desynchronises it from the `_sum` it was
1344/// computed from; the next `update_load_avg` decays both and
1345/// corrupts the next several passes.
1346///
1347/// * **`struct sched_avg`** fields on `task_struct.se.avg` and
1348/// `cfs_rq.avg`: `load_avg`, `runnable_avg`, `util_avg`,
1349/// `util_est`, plus `load_sum` / `runnable_sum` / `util_sum`
1350/// / `last_update_time` / `period_contrib` (see
1351/// `include/linux/sched.h struct sched_avg`).
1352/// * **`cfs_rq.removed.{load_avg,util_avg,runnable_avg}`** —
1353/// pending-decay buffer for departing entities; flushed at the
1354/// next `update_load_avg`.
1355/// * **`rq.cpu_capacity`** — set by `update_cpu_capacity`
1356/// (`kernel/sched/fair.c`, called from the load-balance path
1357/// `update_group_capacity`) from per-CPU RT capacity scaling;
1358/// initialized at boot in `kernel/sched/core.c sched_init`.
1359/// Raw writes are overwritten on the next load-balance tick
1360/// that triggers a capacity recomputation.
1361///
1362/// **Cgroup / task-group accounting.** Updating the task-group
1363/// hierarchy bypasses the cascade that the kernel performs over
1364/// every group entity.
1365///
1366/// * **`task_group.shares`** — cgroup CPU shares, normally set
1367/// via `sched_group_set_shares` (`kernel/sched/fair.c`) which
1368/// cascades into `update_load_set` + walks every task in the
1369/// group. Raw write skips the cascade and produces
1370/// inconsistent per-entity load weights.
1371/// * **`task_group.cfs_bandwidth.{quota, period, runtime}`** —
1372/// CFS bandwidth control. `tg_set_cfs_bandwidth`
1373/// (`kernel/sched/core.c`) is the cgroup-fs writer; the
1374/// per-cfs_rq runtime distribution is performed by
1375/// `__refill_cfs_bandwidth_runtime` (`kernel/sched/fair.c`)
1376/// gated by the `cfs_bandwidth_used()` static-key
1377/// (`kernel/sched/fair.c`) registered via
1378/// `start_cfs_bandwidth` (`kernel/sched/fair.c`). Raw writes
1379/// skip all of those.
1380///
1381/// **The right shape for influencing these fields is to drive the
1382/// kernel into the desired state through real activity** —
1383/// [`Op::Spawn`] with [`SpawnPlacement::RunnerCgroup`] (inherits the spawner's cgroup, typically
1384/// cgid=1 inside guest VMs) or
1385/// [`Op::Spawn`] with [`SpawnPlacement::Cgroup`] (runs inside a named cgroup) of a
1386/// synthetic [`WorkloadConfig`](crate::workload::WorkloadConfig)
1387/// for fake-load, real preemption pressure for sched_avg.
1388///
1389/// ## Fields that ARE safe to write raw (with caveats)
1390///
1391/// * **`jiffies_64`** (`include/linux/jiffies.h`) — the global
1392/// timekeeping tick counter. Safe to advance FORWARD only;
1393/// backward jumps trigger soft-lockup watchdog warnings and
1394/// can stall `time_after_eq` waiters whose expiry now appears
1395/// to be in the past in a way the timer wheel cannot
1396/// reconcile.
1397/// * **Per-CPU `rq.clock`** (`struct rq.clock`,
1398/// `kernel/sched/sched.h`) — the scheduler's per-CPU
1399/// wall-time clock. Not generically safe: `update_rq_clock`
1400/// (`kernel/sched/core.c`) overwrites it at every
1401/// scheduling tick + every enqueue/dequeue from
1402/// `sched_clock_cpu(cpu)`, so a raw write lasts at most until
1403/// the next tick (~1 ms with `HZ=1000`). The
1404/// `rq_clock_skip_update()` helper sets `RQCF_REQ_SKIP` in
1405/// `rq->clock_update_flags`, which suppresses one
1406/// `update_rq_clock` call, but its semantics are tightly
1407/// coupled to the RQCF_ACT_SKIP / RQCF_REQ_SKIP state
1408/// machine in `__schedule` — a self-contained "freeze
1409/// rq.clock at value X across step Y" pattern is the
1410/// framework's responsibility (planned), not a one-shot
1411/// raw-write primitive. Bumping `rq.clock_task` directly
1412/// is also NOT safe — that field is computed by
1413/// `update_rq_clock_task` from `rq->clock` minus IRQ and
1414/// steal-time deltas (`prev_irq_time` and
1415/// `prev_steal_time_rq`) and a raw write desynchronises it
1416/// from the inputs.
1417/// * **Per-CPU `rq.scx.clock`** (sched_ext per-CPU clock) — safe
1418/// ONLY when paired with setting `SCX_RQ_CLK_VALID` in
1419/// `rq.scx.flags`. The flag gates `scx_bpf_now()` reads;
1420/// writing the clock without the flag leaves `scx_bpf_now()`
1421/// returning stale data, and clearing the flag without
1422/// resetting the clock makes downstream BPF readers fall
1423/// back to the host TSC unexpectedly. Atomic bit-set without
1424/// read-back is provided by [`KernelValue::OrU32`] — the RMW
1425/// variant whose width matches `struct scx_rq.flags` (`u32`
1426/// at `kernel/sched/sched.h:803`). Note there is no
1427/// `OrU64` sibling: a 64-bit RMW at this field address would
1428/// corrupt the adjacent `u32 nr_immed` field at
1429/// `kernel/sched/sched.h:804`. Width is the variant tag, so
1430/// wrong-width writes are a compile-time error rather than a
1431/// silent field-overflow bug at runtime. Pair `OrU32(SCX_RQ_CLK_VALID)`
1432/// with the prior `U64(clock_val)` write in a single
1433/// `Op::WriteKernelCold` batch so both land under one freeze
1434/// rendezvous and the kernel's documented
1435/// write-clock-BEFORE-OR-flag ordering (per
1436/// `kernel/sched/sched.h:1848-1854` `scx_rq_clock_update`)
1437/// holds.
1438/// * **`scx-ktstr` private bss / per-CPU scratch** — the
1439/// fixture scheduler exposes a dedicated write surface for
1440/// test use; raw writes there don't propagate into core
1441/// sched code by construction.
1442///
1443/// # `#[non_exhaustive]`
1444///
1445/// `KernelTarget` is `#[non_exhaustive]` — see
1446/// [`crate::non_exhaustive`] for the cross-crate pattern-match rule.
/// Prefer the per-variant constructors ([`Self::symbol`],
/// [`Self::direct`], [`Self::kva`], [`Self::per_cpu_field`],
/// [`Self::task_field`]) over naming variant literals.
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum KernelTarget {
    /// A symbol in kernel text/data/bss. Host-side resolution runs
    /// `name → KVA → PA` using the runtime kernel image base and the
    /// KASLR `phys_base` — the same path the existing write-symbol
    /// helper `crate::monitor::guest::GuestKernel::write_symbol_u64`
    /// already takes.
    Symbol(Cow<'static, str>),
    /// A kernel virtual address inside the direct map; translation
    /// is `kva - PAGE_OFFSET`. Pick this when the caller already
    /// holds a resolved SLAB / per-CPU / physmem KVA and only needs
    /// the host to write at that address.
    Direct(u64),
    /// A vmalloc'd / vmap'd kernel virtual address; translation is a
    /// page-table walk through the guest's `CR3`. Pick this for BPF
    /// maps, vmalloc'd memory, and every other address that does NOT
    /// live in the direct map.
    Kva(u64),
    /// A field of a per-CPU kernel struct, resolved when the op is
    /// dispatched. Only the symbolic intent (`symbol`, `field`,
    /// `cpu`) travels with the variant. At dispatch the `symbol` is
    /// looked up in the vmlinux symbol table, `__per_cpu_offset[cpu]`
    /// is added, and the BTF-resolved byte offset of `field` inside
    /// `symbol`'s struct type is added on top; the sum is the
    /// runtime KVA of the per-CPU field.
    ///
    /// The v1 supported set for `symbol` is: `runqueues` →
    /// `struct rq`, `kernel_cpustat` → `struct kernel_cpustat`,
    /// `kstat` → `struct kernel_stat`, `tick_cpu_sched` →
    /// `struct tick_sched`. Any other symbol fails with a typed
    /// error: the wire variant carries no struct type, so the
    /// dispatcher maps through a hardcoded table (to add an entry,
    /// extend that table AND `KernelSymbols::from_elf`). Round-trip
    /// coverage with KASLR enabled is an outstanding follow-up;
    /// ktstr defaults to `nokaslr`, so the kaslr_offset slide is 0
    /// on the standard test path.
    ///
    /// Resolving lazily keeps construction pure-data — a test author
    /// needs no `GuestKernel`/BTF/symbol-table handle to build the
    /// variant. Resolution failures show up as op-execution errors,
    /// at the same layer as missing-symbol failures in other
    /// snapshot ops.
    PerCpuField {
        /// Name of the kernel symbol that serves as the per-CPU
        /// template (e.g. `"runqueues"`).
        symbol: Cow<'static, str>,
        /// Name of the field inside that symbol's struct
        /// (e.g. `"clock"` for `struct rq.clock`).
        field: Cow<'static, str>,
        /// Index of the CPU whose per-CPU instance is addressed.
        cpu: u32,
    },
    /// A field of a task's `struct task_struct` — SCX-managed tasks
    /// only (non-SCX tasks are rejected by the dispatcher's L6
    /// sched_class gate). At dispatch the host walks
    /// `init_task.tasks` plus each leader's `signal->thread_head`
    /// until it finds the task whose `pid` AND
    /// `expected_start_time_ns` both match (anti-PID-reuse
    /// identity), then adds the BTF-resolved nested-path byte offset
    /// of `field` within `task_struct`. The 7-layer validation chain
    /// applied by the dispatcher is described on
    /// `crate::vmm::wire::KernelOpTarget::TaskField`.
    ///
    /// `expected_start_time_ns` is the `task->start_time` captured
    /// when the WorkSpec was spawned. Obtain the PID list from
    /// [`crate::workload::WorkloadHandle::worker_pids`], then read
    /// field 22 of `/proc/<pid>/stat` and convert jiffies to ns with
    /// `* 1_000_000_000 / sysconf(_SC_CLK_TCK)`.
    TaskField {
        /// Guest-side `pid_t` of the task to address. Leaders and
        /// non-leader threads can both be targeted.
        pid: u32,
        /// `task->start_time` (ns) as recorded at spawn time. When
        /// the observed `task->start_time` differs, the dispatcher's
        /// L2 check rejects the write (PID-reuse identity guard).
        expected_start_time_ns: u64,
        /// Nested-member path within `task_struct`, dot-separated.
        /// SCX-only fields are recommended (e.g. `"scx.dsq_vtime"`,
        /// `"start_boottime"`). Do not use `"se.vruntime"`: such
        /// writes are silently discarded by EEVDF's `place_entity`
        /// on enqueue (`kernel/sched/fair.c:5381-5514` since 6.6)
        /// AND rejected by the SCX-only class gate.
        field: Cow<'static, str>,
    },
}

impl KernelTarget {
    /// Target a kernel text/data/bss symbol. Resolution happens at
    /// op-dispatch time from the runtime kernel image base plus the
    /// KASLR `phys_base`.
    ///
    /// **Heads up.** Read the `# Semantic risk` section of the
    /// enclosing [`KernelTarget`] type doc before aiming this at a
    /// scheduler-bookkeeping symbol.
    pub fn symbol(name: impl Into<Cow<'static, str>>) -> Self {
        let name = name.into();
        Self::Symbol(name)
    }

    /// Target a direct-mapped KVA; translation is
    /// `kva - PAGE_OFFSET`. When the address is a per-CPU base, the
    /// caller is responsible for adding `__per_cpu_offset[cpu]` to
    /// the base symbol KVA before building the variant — or use
    /// [`Self::per_cpu_field`] to get the framework-resolved per-CPU
    /// shape instead.
    ///
    /// **Heads up.** Read the `# Semantic risk` section of the
    /// enclosing [`KernelTarget`] type doc before aiming this at a
    /// scheduler-bookkeeping address.
    pub const fn direct(kva: u64) -> Self {
        Self::Direct(kva)
    }

    /// Target a vmalloc'd / vmap'd KVA; translation is a page-table
    /// walk through the guest's `CR3`.
    ///
    /// **Heads up.** Read the `# Semantic risk` section of the
    /// enclosing [`KernelTarget`] type doc before aiming this at a
    /// scheduler-bookkeeping address.
    pub const fn kva(kva: u64) -> Self {
        Self::Kva(kva)
    }

    /// Target a field of a per-CPU kernel struct. Resolution happens
    /// at op-dispatch time as `symbol_kva + __per_cpu_offset[cpu] +
    /// BTF byte offset of field`.
    ///
    /// **Heads up.** Read the `# Semantic risk` section of the
    /// enclosing [`KernelTarget`] type doc before aiming this at a
    /// per-CPU scheduler-bookkeeping field.
    pub fn per_cpu_field(
        symbol: impl Into<Cow<'static, str>>,
        field: impl Into<Cow<'static, str>>,
        cpu: u32,
    ) -> Self {
        let symbol = symbol.into();
        let field = field.into();
        Self::PerCpuField { symbol, field, cpu }
    }

    /// Target a field of a task's `struct task_struct` — SCX-managed
    /// tasks only. At dispatch the host walks `init_task.tasks` and
    /// each leader's `signal->thread_head` to find the task whose
    /// `pid` AND `expected_start_time_ns` both match
    /// (anti-PID-reuse), then applies the BTF nested-path offset of
    /// `field`.
    ///
    /// `expected_start_time_ns` is `task->start_time` (set once by
    /// `kernel/fork.c::copy_process` via `ktime_get_ns()`). Fetch
    /// the worker PIDs from
    /// [`crate::workload::WorkloadHandle::worker_pids`], read field
    /// 22 of `/proc/<pid>/stat` at spawn time, and convert to ns:
    /// `field_22_jiffies * 1_000_000_000 / sysconf(_SC_CLK_TCK)`.
    ///
    /// `field` is a dot-separated nested-member path. Before the
    /// write/read lands, the dispatcher runs a 7-layer validation
    /// chain (pid match, start_time identity, lifetime, on_rq=0, scx
    /// queued-empty, ext sched_class, start_boottime != 0); the full
    /// contract is documented on
    /// `crate::vmm::wire::KernelOpTarget::TaskField`.
    ///
    /// **SCX-only.** Non-SCX tasks are rejected by the dispatcher's
    /// class+policy gates. Recommended fields: `"scx.dsq_vtime"`
    /// (DSQ priority key, preserved across dequeue/enqueue) and
    /// `"start_boottime"` (task fork timestamp).
    ///
    /// **Do NOT write `"se.vruntime"`.** EEVDF's `place_entity`
    /// (`kernel/sched/fair.c:5381-5514`, since 6.6) overwrites
    /// `se->vruntime` on every enqueue, so direct vruntime writes
    /// are silently discarded for sleeping tasks (the state our
    /// validation gates require). CFS-class tasks are rejected
    /// before the write is reached anyway; the field-level warning
    /// is here as the actionable answer to "why won't my vruntime
    /// write stick".
    ///
    /// **Heads up.** Writes against queued/running tasks are
    /// rejected by the dispatcher's L4 (`on_rq == 0`) + L5
    /// (`scx.dsq == NULL` AND `scx.runnable_node` empty) gates, for
    /// CFS rb-tree + SCX DSQ ordering safety. Test authors must use
    /// blocking workload patterns (e.g.
    /// [`crate::workload::WorkType::FutexPingPong`],
    /// `WorkType::WaitOnFutex`, `WorkType::Sleep`) so the workers
    /// are sleeping when the cold-path Op fires.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// // Escape-hatch primitive: seed a specific worker's
    /// // scx.dsq_vtime to ~30 days. WorkSpec.uptime (separate API)
    /// // wraps this; use the escape hatch when the scenario knows
    /// // the exact PID + start_time tuple.
    /// use ktstr::prelude::*;
    /// use std::time::Duration;
    ///
    /// let workers = handle.worker_pids(); // Vec<libc::pid_t>
    /// let worker_pid = workers[0] as u32;
    /// // Read `/proc/<pid>/stat` field 22, convert from jiffies to
    /// // nanoseconds via `* 1_000_000_000 / sysconf(_SC_CLK_TCK)`.
    /// // (Helper expected to land alongside WorkSpec.uptime.)
    /// let start_time_ns: u64 = read_start_time_ns(worker_pid)?;
    ///
    /// let seed_vtime_ns = (30 * 86_400_u64) * 1_000_000_000; // 30 days
    /// let writes = vec![(
    ///     KernelTarget::task_field(worker_pid, start_time_ns, "scx.dsq_vtime"),
    ///     KernelValue::u64(seed_vtime_ns),
    /// )];
    /// // Worker MUST be in a blocking pattern (FutexPingPong, etc.)
    /// // at op-fire time; the dispatcher's 7-layer validation
    /// // rejects writes against runnable/queued tasks.
    /// ```
    pub fn task_field(
        pid: u32,
        expected_start_time_ns: u64,
        field: impl Into<Cow<'static, str>>,
    ) -> Self {
        let field = field.into();
        Self::TaskField {
            pid,
            expected_start_time_ns,
            field,
        }
    }
}
1670
/// Payload carried by the kernel-memory write ops; also the shape
/// in which the read ops return their result.
///
/// The variant tag selects two things at once: the width (`u32` vs
/// `u64` vs a byte slice) and which
/// `crate::monitor::guest::GuestKernel` write helper the host
/// coordinator invokes (`write_*_u32`, `write_*_u64`,
/// `write_*_bytes` per the [`KernelTarget`] class).
///
/// # `#[non_exhaustive]`
///
/// `KernelValue` is `#[non_exhaustive]`, which lets new value
/// widths be added without breaking external pattern-matchers.
/// Prefer the per-variant constructors over naming variant
/// literals.
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum KernelValue {
    /// Write of a 32-bit unsigned little-endian value. The write is
    /// atomic when the resolved host PA is 4-byte aligned. A
    /// misaligned PA falls through to a per-byte volatile loop in
    /// `crate::monitor::reader::GuestMem`
    /// `write_volatile_bytes` (its 4-byte fast path branches on
    /// `ptr.align_offset(align_of::<u32>()) == 0` and emits a single
    /// `write_volatile` only when alignment holds); in that fallback
    /// case concurrent guest readers can observe torn intermediate
    /// state.
    ///
    /// **To set individual bits without disturbing the surrounding
    /// value**, reach for [`Self::OrU32`] instead — it performs a
    /// read-modify-write OR under the freeze rendezvous (e.g.
    /// setting `SCX_RQ_CLK_VALID` in `rq.scx.flags` without
    /// clobbering the other 31 flag bits). A plain `U32(value)`
    /// write replaces every bit; OrU32 sets only the bits in the
    /// mask.
    U32(u32),
    /// Write of a 64-bit unsigned little-endian value. The write is
    /// atomic when the resolved host PA is 8-byte aligned. The
    /// alignment note on [`Self::U32`] describes the misaligned
    /// fall-through behaviour.
    ///
    /// **There is deliberately no `OrU64` sibling.** The canonical
    /// scheduler-flags use case ([`KernelValue::OrU32`] →
    /// `struct scx_rq.flags`) targets a `u32` field per
    /// `kernel/sched/sched.h:803`; a 64-bit RMW at that address
    /// would corrupt the adjacent `u32 nr_immed` field at
    /// `kernel/sched/sched.h:804`. Should a u64 RMW use case with a
    /// verified width emerge later, add the variant then.
    U64(u64),
    /// Byte payload of arbitrary length. The write is not atomic.
    /// The `GuestKernel::write_*_bytes` helpers emit a Release fence
    /// after the copy so a weakly-ordered guest's
    /// `smp_load_acquire` observes the bytes in write order — the
    /// fence orders the stores but does NOT atomicize the multi-byte
    /// write versus a concurrent guest reader.
    Bytes(Vec<u8>),
    /// Read-modify-write OR of a 32-bit unsigned mask. The
    /// dispatcher loads the live u32 at the resolved host PA, ORs
    /// the carried mask into it, and stores the result back. The
    /// width is u32 because the canonical use case is OR-ing a
    /// single-bit kernel flag (e.g. `SCX_RQ_CLK_VALID = 1 << 5`)
    /// into `struct scx_rq.flags`, declared `u32` at
    /// `kernel/sched/sched.h:803` inside the struct opened at line
    /// 793. A 64-bit RMW at a u32 field address would either
    /// silently truncate the upper 32 bits or corrupt the adjacent
    /// `u32 nr_immed` field at `kernel/sched/sched.h:804`, so the
    /// variant tag itself fixes the width and rules out a width
    /// mismatch at the call site.
    ///
    /// **Atomicity** (cold-path dispatcher): for the duration of the
    /// RMW the host coordinator holds the freeze rendezvous — every
    /// guest vCPU is parked on a futex inside `handle_freeze` (no
    /// kernel-side writer is scheduled), and the host coordinator is
    /// the only writer of guest memory in scope.
    /// `read_u32 → OR mask → write_u32` is therefore atomic
    /// **by quiesce**: no concurrent kernel writer can interleave
    /// between the load and the store, and cold-path dispatch needs
    /// no `compare_exchange` loop.
    ///
    /// At the host CPU level the read and the write remain separate
    /// (non-instruction-atomic) operations, so a hypothetical
    /// concurrent host writer of guest memory would be a race. By
    /// design the freeze coordinator is the sole such writer (per
    /// the cold-path threat model documented at
    /// [`super::Op::WriteKernelCold`]), which makes the parked-vCPU
    /// contract sufficient.
    ///
    /// **Alignment**: u32 reads/writes are delegated to
    /// `crate::monitor::guest::GuestKernel`'s
    /// `read_*_u32` / `write_*_u32` helpers. Those use a
    /// single-instruction `write_volatile` at 4-byte-aligned host
    /// PAs and fall through to a per-byte volatile loop on
    /// misalignment. The per-byte fallback is safe under the freeze
    /// rendezvous (no concurrent kernel writer), so a misaligned PA
    /// does not produce a torn-RMW race — and because the compiler
    /// enforces kernel ABI alignment for `u32` fields on the kernel
    /// side regardless, misaligned PAs for legitimate symbol/field
    /// writes do not arise in practice.
    ///
    /// **Hot-path future** (once [`super::Op::WriteKernelHot`]
    /// gains `OrU32` support — currently rejected per the
    /// [`super::Op::WriteKernelHot`] doc): the live-guest race model
    /// requires a `compare_exchange` loop over
    /// `core::sync::atomic::AtomicU32::from_ptr` (Rust 1.75+) at
    /// 4-byte alignment, with misaligned PAs explicitly rejected
    /// (the per-byte fallback cannot be made atomic vs. a live
    /// kernel writer).
    ///
    /// **Ordering**: cold-path dispatch runs while every vCPU is
    /// parked at the freeze rendezvous, so for single-op use cases
    /// no concurrent guest write races our RMW. The
    /// `SCX_RQ_CLK_VALID` case specifically requires
    /// **write-clock-BEFORE-OR-flag** ordering, matching the
    /// kernel's own `scx_rq_clock_update` at
    /// `kernel/sched/sched.h:1848-1854` (which does
    /// `WRITE_ONCE(rq->scx.clock, val)` then
    /// `smp_store_release(&rq->scx.flags, flags |
    /// SCX_RQ_CLK_VALID)`). A host-side caller that wants the same
    /// observable invariant must put the clock write and the OR-flag
    /// in the same `Op::WriteKernelCold` batch and rely on the
    /// freeze rendezvous's vCPU-pause to serialise against guest
    /// readers.
    OrU32(u32),
}

impl KernelValue {
    /// Wrap a 32-bit unsigned value.
    pub const fn u32(val: u32) -> Self {
        Self::U32(val)
    }

    /// Wrap a 64-bit unsigned value.
    pub const fn u64(val: u64) -> Self {
        Self::U64(val)
    }

    /// Wrap a variable-length byte payload.
    pub fn bytes(data: impl Into<Vec<u8>>) -> Self {
        let payload: Vec<u8> = data.into();
        Self::Bytes(payload)
    }

    /// Wrap a 32-bit unsigned read-modify-write OR mask. The width-,
    /// atomicity-, and ordering-contract is spelled out on
    /// [`Self::OrU32`]. Canonical use: OR-ing a single-bit kernel
    /// flag such as `SCX_RQ_CLK_VALID` into `struct scx_rq.flags`.
    pub const fn or_u32(mask: u32) -> Self {
        Self::OrU32(mask)
    }
}
1817
1818impl From<&KernelTarget> for crate::vmm::wire::KernelOpTarget {
1819 /// 1:1 mapping of every Op-side [`KernelTarget`] variant to its
1820 /// wire-side peer. `Cow → String` coercion for the symbolic
1821 /// forms; copy for the integer/`u32` forms. Used by the
1822 /// executor's `Op::WriteKernel*` / `Op::ReadKernel*` dispatch
1823 /// arms when building [`crate::vmm::wire::KernelOpRequestPayload`].
1824 fn from(target: &KernelTarget) -> Self {
1825 match target {
1826 KernelTarget::Symbol(name) => Self::Symbol(name.to_string()),
1827 KernelTarget::Direct(kva) => Self::Direct(*kva),
1828 KernelTarget::Kva(kva) => Self::Kva(*kva),
1829 KernelTarget::PerCpuField { symbol, field, cpu } => Self::PerCpuField {
1830 symbol: symbol.to_string(),
1831 field: field.to_string(),
1832 cpu: *cpu,
1833 },
1834 KernelTarget::TaskField {
1835 pid,
1836 expected_start_time_ns,
1837 field,
1838 } => Self::TaskField {
1839 pid: *pid,
1840 expected_start_time_ns: *expected_start_time_ns,
1841 field: field.to_string(),
1842 },
1843 }
1844 }
1845}
1846
1847impl From<&KernelValue> for crate::vmm::wire::KernelOpValue {
1848 /// 1:1 mapping of every Op-side [`KernelValue`] variant to its
1849 /// wire-side peer. The `Bytes` arm clones the inner `Vec<u8>`
1850 /// so the source variant remains usable after dispatch (large
1851 /// payloads pay the clone cost — see
1852 /// [`crate::vmm::wire::KernelOpValue::Bytes`] for the wire
1853 /// representation).
1854 fn from(value: &KernelValue) -> Self {
1855 match value {
1856 KernelValue::U32(v) => Self::U32(*v),
1857 KernelValue::U64(v) => Self::U64(*v),
1858 KernelValue::Bytes(b) => Self::Bytes(b.clone()),
1859 KernelValue::OrU32(mask) => Self::OrU32(*mask),
1860 }
1861 }
1862}
1863
/// Read width for the [`Op::ReadKernelHot`] /
/// [`Op::ReadKernelCold`] ops. It selects which
/// `crate::monitor::guest::GuestKernel`
/// `read_*_u32` / `read_*_u64` / `read_*_bytes` family the host
/// dispatcher calls to perform the read. The variant tags mirror
/// [`KernelValue`]'s, minus the payload: a read carries no outgoing
/// value, only a width hint the dispatcher uses to size the
/// [`crate::vmm::wire::KernelOpValue`] it puts in the reply.
///
/// # `#[non_exhaustive]`
///
/// `KernelValueWidth` is `#[non_exhaustive]`, which lets new widths
/// be added without breaking external pattern-matchers. Prefer the
/// per-variant constructors ([`Self::u32`], [`Self::u64`],
/// [`Self::bytes`]) over naming variant literals.
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum KernelValueWidth {
    /// Little-endian `u32` read. Atomic when the resolved host PA is
    /// 4-byte aligned (the alignment note on [`KernelValue::U32`]
    /// covers the misaligned fall-through behaviour).
    U32,
    /// Little-endian `u64` read. Atomic at 8-byte alignment; a
    /// per-byte loop is used otherwise (same fall-through as
    /// [`KernelValue::U64`]).
    U64,
    /// Read of exactly `len` raw bytes. Non-atomic; it goes through
    /// the chunked-page primitive of the
    /// `crate::monitor::guest::GuestKernel` `read_*_bytes` helpers.
    Bytes(usize),
}

impl KernelValueWidth {
    /// Width of a `u32` read.
    pub const fn u32() -> Self {
        Self::U32
    }

    /// Width of a `u64` read.
    pub const fn u64() -> Self {
        Self::U64
    }

    /// Width of a `len`-byte read. The reply carries a
    /// [`crate::vmm::wire::KernelOpValue::Bytes`] holding exactly
    /// `len` bytes.
    pub const fn bytes(len: usize) -> Self {
        Self::Bytes(len)
    }
}
1915
1916impl From<&KernelValueWidth> for crate::vmm::wire::KernelOpValue {
1917 /// Map a [`KernelValueWidth`] to a zero-filled
1918 /// [`crate::vmm::wire::KernelOpValue`] of the requested width
1919 /// for the read-entry's value-hint slot. The wire payload's
1920 /// `value` discriminant tells the host dispatcher which read
1921 /// family to invoke; the byte contents are written by the
1922 /// host before replying.
1923 fn from(width: &KernelValueWidth) -> Self {
1924 match width {
1925 KernelValueWidth::U32 => Self::U32(0),
1926 KernelValueWidth::U64 => Self::U64(0),
1927 KernelValueWidth::Bytes(len) => Self::Bytes(vec![0u8; *len]),
1928 }
1929 }
1930}