ktstr/cli/kernel_build/build.rs
1//! Top-level kernel build orchestration.
2//!
3//! Holds [`kernel_build_pipeline`] (the post-acquisition orchestrator
4//! that runs `clean` → configure → build → validate → cache-store),
5//! the two-phase reservation acquisition
6//! ([`acquire_build_reservation`]) for LLC flock + cgroup sandbox +
7//! `make -jN` hint, and the source-tree flock helper
8//! ([`acquire_source_tree_lock`]) that serializes parallel builds
9//! against the same on-disk source tree.
10
11use std::path::Path;
12
13use anyhow::{Context, Result};
14
15use super::super::kernel_cmd::{
16 DIRTY_TREE_CACHE_SKIP_HINT, EMBEDDED_KCONFIG, NON_GIT_TREE_CACHE_SKIP_HINT,
17 embedded_kconfig_hash,
18};
19use super::super::util::{success, warn};
20use super::kconfig::{
21 all_fragment_lines_present, configure_kernel, validate_kernel_config,
22 warn_dropped_extra_kconfig_lines, warn_extra_kconfig_overrides_baked_in,
23};
24use super::make::{build_make_args, make_kernel_with_output, run_make, run_make_with_output};
25
/// Result of the post-acquisition kernel build pipeline.
///
/// Returned by [`kernel_build_pipeline`] so callers can inspect
/// the cache entry and built image path.
///
/// Marked `#[non_exhaustive]`: code outside this crate cannot build
/// the struct with a literal or match it exhaustively, so fields can
/// be added later without a breaking change.
#[non_exhaustive]
pub struct KernelBuildResult {
    /// Cache entry, if the build was cached. `None` for dirty trees
    /// or when cache store fails. On the post-wait cache-hit
    /// short-circuit in [`kernel_build_pipeline`] this is the entry a
    /// peer build already stored, not one produced by this call.
    pub entry: Option<crate::cache::CacheEntry>,
    /// Path to the built kernel image. On the cache-hit short-circuit
    /// this is the cached entry's `image_path()` rather than a file
    /// freshly produced in the source tree.
    pub image_path: std::path::PathBuf,
    /// Whether the source tree was dirty as observed by the build
    /// pipeline. `true` if either the acquire-time inspection
    /// reported dirty OR the post-build re-check observed a
    /// mid-build mutation (worktree edit, branch flip, mid-build
    /// commit). The downstream label decoration in cargo-ktstr's
    /// `resolve_one` uses this to append `_dirty` so a
    /// non-reproducible run is distinguishable from a clean rebuild
    /// of the same path.
    ///
    /// The cache-hit short-circuit always reports `false`: it is only
    /// taken when the post-wait classification is `Clean`.
    pub post_build_is_dirty: bool,
}
47
/// Two-phase build reservation handles (LLC flock plan + cgroup v2
/// sandbox + make -jN hint). Consumed by
/// [`kernel_build_pipeline`]; the factored-out
/// [`acquire_build_reservation`] builds it from `cpu_cap` without
/// depending on kernel source, enabling integration tests that
/// exercise the reservation logic against synthetic topologies.
///
/// Drop order is load-bearing: `_sandbox` is declared first and
/// drops first per Rust's declaration-order field-drop rule;
/// this ensures the cgroup sandbox is removed before the LLC
/// flock is released. Otherwise a peer could observe the LLC
/// released before the cgroup is gone and mint a conflicting
/// plan.
///
/// CAVEAT: the field-order guarantee only applies while the struct
/// is dropped as a whole. A caller that destructures it into locals
/// (as [`kernel_build_pipeline`] does) gets the locals' drop order
/// instead — reverse order of binding — and must bind the plan
/// BEFORE the sandbox to preserve "sandbox first, flocks second".
/// Do not reorder the fields here, and do not reorder the bindings
/// in such a pattern.
#[derive(Debug)]
pub(crate) struct BuildReservation {
    /// cgroup v2 sandbox. `None` when `plan` is `None` (no reservation
    /// to enforce). Drops FIRST per struct field order — cgroup
    /// rmdir runs while LLC flocks are still held. `_` prefix
    /// keeps the binding alive through Drop but marks it as
    /// not-read — the RAII invariant IS the read.
    pub(crate) _sandbox: Option<crate::vmm::cgroup_sandbox::BuildSandbox>,
    /// LLC plan (flock fds + cpus + mems). `None` under
    /// `KTSTR_BYPASS_LLC_LOCKS=1` or sysfs-unreadable host without
    /// `--cpu-cap`. Drops SECOND per struct field order —
    /// flocks release AFTER the sandbox rmdir lands.
    pub(crate) plan: Option<crate::vmm::host_topology::LlcPlan>,
    /// `make -jN` parallelism hint. `Some(N)` under an active
    /// `plan`; `None` when no reservation exists (caller falls
    /// back to `nproc`). Plain data: its position in the drop order
    /// has no side effects.
    pub(crate) make_jobs: Option<usize>,
}
79
80/// Acquire the two-phase reservation (LLC flocks + cgroup sandbox)
81/// for a kernel build. Factored out of [`kernel_build_pipeline`]
82/// so integration tests can exercise the cpu_cap → acquire →
83/// sandbox → make_jobs decision tree without requiring a real
84/// kernel source tree.
85///
86/// Returns a `BuildReservation` whose fields are the three values
87/// `kernel_build_pipeline` used to bind inline. `_sandbox` is
88/// declared first and drops first per Rust's declaration-order
89/// field-drop rule; this ensures the cgroup sandbox is removed
90/// before the LLC flock is released.
91///
92/// `cli_label` prefixes operator-facing error text.
93///
94/// `cpu_cap` is the resolved CPU-count cap from
95/// [`CpuCap::resolve`](crate::vmm::host_topology::CpuCap::resolve);
96/// `None` means "reserve 30% of the calling process's allowed-CPU
97/// set", applied inside the planner at acquire time.
98pub(crate) fn acquire_build_reservation(
99 cli_label: &str,
100 cpu_cap: Option<crate::vmm::host_topology::CpuCap>,
101) -> Result<BuildReservation> {
102 let bypass = crate::bypass_llc_locks_active();
103 // INVARIANT: `_sandbox` is declared first and drops first per
104 // Rust's declaration-order field-drop rule; this ensures the
105 // cgroup sandbox is removed before the LLC flock is released.
106 // Reordering either would either
107 // (a) unlock LLCs while the sandbox still enforces the
108 // cpuset — a concurrent peer could claim the LLC and stomp
109 // gcc children that haven't exited — or (b) leave the cgroup
110 // hierarchy non-empty when its parent tries to rmdir.
111 let plan: Option<crate::vmm::host_topology::LlcPlan> = if bypass {
112 if cpu_cap.is_some() {
113 anyhow::bail!(
114 "{cli_label}: --cpu-cap conflicts with KTSTR_BYPASS_LLC_LOCKS=1; \
115 unset one of them. --cpu-cap is a resource contract; bypass \
116 disables the contract entirely."
117 );
118 }
119 None
120 } else if let Ok(host_topo) = crate::vmm::host_topology::HostTopology::from_sysfs() {
121 let test_topo = crate::topology::TestTopology::from_system()?;
122 let acquired_plan =
123 crate::vmm::host_topology::acquire_llc_plan(&host_topo, &test_topo, cpu_cap)?;
124 crate::vmm::host_topology::warn_if_cross_node_spill(&acquired_plan, &host_topo);
125 Some(acquired_plan)
126 } else {
127 if cpu_cap.is_some() {
128 anyhow::bail!(
129 "{cli_label}: --cpu-cap set but host LLC topology unreadable \
130 from sysfs — cannot enforce the resource budget. Run on a \
131 host with /sys/devices/system/cpu populated, or drop \
132 --cpu-cap to build without enforcement."
133 );
134 }
135 tracing::warn!(
136 "{cli_label}: could not read host LLC topology from sysfs; \
137 skipping kernel-build LLC reservation. Concurrent perf-mode \
138 runs on this host will NOT be serialized against this build"
139 );
140 None
141 };
142
143 // Phase 2: cgroup v2 sandbox that enforces cpu+mem binding on
144 // make/gcc children. `hard_error_on_degrade` is driven by
145 // whether `--cpu-cap` was set explicitly: degradation is fatal
146 // under the flag (the flag promises enforcement), and warn-only
147 // when the 30%-of-allowed default was expanded (the default
148 // contract is best-effort — a parent cgroup narrowing the
149 // reservation should not fail the build).
150 let sandbox: Option<crate::vmm::cgroup_sandbox::BuildSandbox> = match plan.as_ref() {
151 Some(p) => Some(crate::vmm::cgroup_sandbox::BuildSandbox::try_create(
152 &p.cpus,
153 &p.mems,
154 cpu_cap.is_some(),
155 )?),
156 None => None,
157 };
158
159 // `make -jN` parallelism hint. `N` = `plan.cpus.len()` via
160 // `make_jobs_for_plan` — the reserved CPU count, whether that
161 // came from an explicit `--cpu-cap N` or the 30%-of-allowed
162 // default. See `make_kernel_with_output` for the resolution.
163 let make_jobs = plan
164 .as_ref()
165 .map(crate::vmm::host_topology::make_jobs_for_plan);
166
167 Ok(BuildReservation {
168 plan,
169 _sandbox: sandbox,
170 make_jobs,
171 })
172}
173
174/// Acquire an exclusive flock on a per-source-canonical-path lockfile
175/// so two concurrent `cargo ktstr test --kernel <path>` runs against
176/// the SAME source tree don't race in `make` (defconfig vs
177/// olddefconfig vs compile_commands.json) and stomp each other's
178/// `.config` and build artifacts.
179///
180/// The lockfile lives at
181/// `{KTSTR_CACHE_DIR}/.locks/source-{path_hash}.lock` where
182/// `{path_hash}` is the full 8-char CRC32 hex of the canonical
183/// source-path bytes (same shape and helper the
184/// `local-unknown-{path_hash}` cache key uses, see
185/// [`crate::fetch::canonical_path_hash`] /
186/// [`crate::fetch::compose_local_cache_key`]) — one per-tree
187/// identifier ties the source-tree flock to the cache key it gates.
188///
189/// Lockfile placement piggybacks on the cache root's `.locks/`
190/// subdirectory ([`crate::flock::LOCK_DIR_NAME`]) so source-tree
191/// flocks share the same filesystem-residency story as cache-entry
192/// flocks: never under `/tmp`, where `tmpwatch` (or the equivalent
193/// `systemd-tmpfiles` cleanup) can sweep stale-mtime files out from
194/// under an active flock holder. flock(2) does NOT update the
195/// inode's mtime, so a /tmp-resident lockfile would be a candidate
196/// for sweep on every run, with the resulting `unlink(2)` racing
197/// any peer trying to `open(2)` the same path. The `.locks/`
198/// directory under the user-controlled cache root is exempt from
199/// those sweeps.
200///
201/// Try-then-wait: attempts a non-blocking acquire first. If
202/// contended, logs the holder (pid + cmdline from /proc/locks)
203/// and falls through to a blocking acquire that parks until the
204/// peer releases. When the blocking acquire returns, the peer's
205/// build is done and the cache likely contains the artifact —
206/// the caller checks the cache after we return and skips the
207/// build if the slot is populated.
208///
209/// Distinct from the cache-entry flock acquired inside
210/// [`crate::cache::CacheDir::store`]: that lock serializes the
211/// atomic install of an artifact bundle into a cache slot; this
212/// lock serializes the BUILD itself against the source-tree
213/// `make` invocations.
214pub(crate) fn acquire_source_tree_lock(
215 canonical: &Path,
216 cli_label: &str,
217) -> Result<std::os::fd::OwnedFd> {
218 use anyhow::Context;
219
220 // Share the per-path CRC32 with `local-unknown-{hash}` cache
221 // keys so a single per-tree identifier ties the source-tree
222 // flock to the cache slot it gates.
223 let path_hash = crate::fetch::canonical_path_hash(canonical);
224 let cache = crate::cache::CacheDir::new()
225 .with_context(|| "open cache root for source-tree lockfile placement")?;
226 cache
227 .ensure_lock_dir()
228 .with_context(|| "create cache `.locks/` subdir for source-tree lock")?;
229 let lock_path = cache.lock_path(&format!("source-{path_hash}"));
230
231 match crate::flock::try_flock(&lock_path, crate::flock::FlockMode::Exclusive)
232 .with_context(|| format!("acquire source-tree flock {}", lock_path.display()))?
233 {
234 Some(fd) => Ok(fd),
235 None => {
236 // Non-blocking acquire failed (EWOULDBLOCK) — a live
237 // peer holds the lock. Surface the holder, then block
238 // until they release. When the blocking acquire
239 // returns, the peer's build is done and the cache
240 // likely contains the artifact we need — the caller
241 // checks the cache after we return, so it will skip
242 // the build if the peer populated the slot.
243 let holders = crate::flock::read_holders(&lock_path).unwrap_or_default();
244 let holder_text = if holders.is_empty() {
245 String::from("(holder not identified via /proc/locks)")
246 } else {
247 crate::flock::format_holder_list(&holders)
248 };
249 eprintln!(
250 "{cli_label}: source tree {} is locked by a concurrent ktstr \
251 build — waiting for it to finish.\n{holder_text}",
252 canonical.display(),
253 );
254 crate::flock::block_flock(&lock_path, crate::flock::FlockMode::Exclusive).with_context(
255 || format!("blocking wait on source-tree flock {}", lock_path.display()),
256 )
257 }
258 }
259}
260
/// What the source tree looks like when it is re-probed right after
/// the source-tree EX wait inside [`kernel_build_pipeline`].
///
/// The re-probe exists so that a mutation landing during the wait
/// (operator edit, branch flip, commit on top) invalidates the
/// cache-skip short-circuit rather than returning a cache slot keyed
/// on the pre-wait identity. Five variants keep cause-attribution
/// honest in the operator diagnostic from
/// [`MidWaitState::diagnostic`]: a `git commit` during the wait is
/// not "your edits"; an operator who started dirty did not dirty the
/// tree because of the wait; a probe failure is not a confirmed
/// mutation, just unknowable state.
#[derive(Debug, PartialEq, Eq)]
enum MidWaitState {
    /// Nothing changed across the wait (or the source is non-local,
    /// where the wait has no source-tree implication). The pipeline
    /// goes on to the cache_lookup short-circuit.
    Clean,
    /// The tree was already dirty BEFORE the source-tree EX wait was
    /// taken. The wait did not cause it, so no diagnostic is emitted
    /// (`diagnostic` returns `None`) — anything else would fabricate
    /// wait-related attribution.
    PreAcquireDirty,
    /// A tracked file was edited DURING the wait (acquire-time probe
    /// clean, post-wait probe dirty). Forces a rebuild and emits a
    /// "your local edits" diagnostic.
    DirtyEdit,
    /// HEAD moved (commit / branch flip) during the wait: the
    /// acquire-time short-hash differs from the post-wait short-hash
    /// while the post-wait worktree is clean. Forces a rebuild and
    /// emits a "HEAD advanced" diagnostic.
    HashAdvanced,
    /// The post-wait probe returned `Err` (corrupt git state, removed
    /// source dir, or a gix internal error). Forces a conservative
    /// rebuild — unknowable state cannot be assumed Clean.
    ProbeFailed,
}

impl MidWaitState {
    /// Body of the operator-facing diagnostic for this state, without
    /// the `{cli_label}: ` prefix (the caller composes it via
    /// `eprintln!("{cli_label}: {body}")`).
    ///
    /// `None` for [`Self::Clean`] (the cache-skip gate emits its own
    /// message) and for [`Self::PreAcquireDirty`] (the wait was not
    /// the cause of the dirty state, so a wait-related diagnostic
    /// would fabricate attribution).
    fn diagnostic(&self) -> Option<&'static str> {
        let body = match self {
            // Silent states bail out before any text is chosen.
            Self::Clean | Self::PreAcquireDirty => return None,
            Self::DirtyEdit => {
                "source tree changed during peer's build wait \
                 — rebuilding to capture your local edits"
            }
            Self::HashAdvanced => {
                "source HEAD advanced during peer's build wait \
                 — rebuilding for the new commit"
            }
            Self::ProbeFailed => {
                "source-tree dirty re-check failed during peer's \
                 build wait — rebuilding conservatively (re-run with \
                 RUST_LOG=warn for the probe error)"
            }
        };
        Some(body)
    }
}
326
/// Body of the operator-facing message printed when the
/// post-mid-wait `cache_lookup` short-circuit fires. Carries no
/// `{cli_label}: ` prefix — the caller composes it via
/// `eprintln!("{cli_label}: {body}")`, matching the
/// [`MidWaitState::diagnostic`] convention.
///
/// Kept apart from [`MidWaitState::diagnostic`] because the cache hit
/// is downstream of the variant classification: the message fires
/// only when `mid_wait_clean`, a populated cache slot, and an extant
/// image file all line up. Tying it to a single MidWaitState variant
/// would misrepresent that conjunction.
///
/// `cache_key` is inserted verbatim into the message.
fn cache_hit_diagnostic(cache_key: &str) -> String {
    [
        "concurrent ktstr build populated cache slot ",
        cache_key,
        " during peer's build wait — skipping redundant rebuild",
    ]
    .concat()
}
343
/// Post-acquisition kernel build pipeline.
///
/// Handles: clean, configure, build, validate config, generate
/// compile_commands.json (gated on `!acquired.is_temp`), find image,
/// strip vmlinux, compute metadata, cache store, and remote cache
/// store (when enabled). Callers handle source acquisition.
///
/// Before any of that it acquires the build reservation, takes the
/// source-tree flock for local sources, and re-checks the cache: if a
/// peer populated the slot while this call waited on the flock and
/// the tree is unchanged, it returns the cached entry without
/// building.
///
/// `cli_label` prefixes diagnostic status output (e.g. `"ktstr"` or
/// `"cargo ktstr"`).
///
/// `is_local_source` should be true when the source is a local
/// kernel source tree, regardless of how the caller arrived there
/// (`kernel build --kernel <path>`, `cargo ktstr test --kernel <path>`,
/// or any other Path-spec entry that funnels through
/// [`super::super::resolve_kernel_dir`] /
/// [`super::super::resolve_kernel_dir_to_entry`]). It controls the
/// mrproper warning and `source_tree_path` in metadata; in this
/// function it also gates the source-tree flock and the post-wait
/// source re-probe.
///
/// `extra_kconfig` is an optional user-supplied kconfig fragment
/// merged on top of [`EMBEDDED_KCONFIG`] before `configure_kernel`
/// (which runs olddefconfig only when new lines are needed).
/// `Some(content)` appends the fragment AFTER the baked-in fragment
/// so kbuild's last-occurrence-wins semantics
/// (`scripts/kconfig/confdata.c::conf_read_simple`) make user values
/// override baked-in ones on conflict, and forces a re-configure pass
/// even when `.config` already carries `CONFIG_SCHED_CLASS_EXT=y`
/// (the user fragment may add or invert symbols the baked-in pass
/// alone wouldn't have produced).
///
/// Two metadata fields capture the build inputs separately:
/// - `ktstr_kconfig_hash` always holds the bare baked-in hash
///   (`crate::kconfig_hash()` of `EMBEDDED_KCONFIG`) so
///   `KconfigStatus::Matches/Stale/Untracked` keeps comparing
///   against the live baked-in fragment.
/// - `extra_kconfig_hash` holds `Some(crate::extra_kconfig_hash(content))`
///   when extras were supplied, `None` otherwise. Drives the
///   `(extra kconfig)` tag in `kernel list`.
///
/// Callers that don't expose `--extra-kconfig` (test/coverage/
/// shell/verifier) pass `None`.
///
/// # Errors
///
/// Propagates failures from reservation acquisition, the source-tree
/// flock, `make mrproper`, configure/build, post-build config
/// validation, compile_commands generation, image discovery, and the
/// final metadata/cache-store step.
#[allow(clippy::too_many_arguments)]
pub fn kernel_build_pipeline(
    acquired: &crate::fetch::AcquiredSource,
    cache: &crate::cache::CacheDir,
    cli_label: &str,
    clean: bool,
    is_local_source: bool,
    cpu_cap: Option<crate::vmm::host_topology::CpuCap>,
    extra_kconfig: Option<&str>,
    progress: Option<&crate::cli::FetchProgress>,
) -> Result<KernelBuildResult> {
    let source_dir = &acquired.source_dir;
    let (arch, image_name) = crate::fetch::arch_info();

    // Bind a guaranteed-live progress group for the build phase: the
    // caller's group (the parallel resolve's shared group, or a
    // single-shot caller's local one), or a fresh local group when the
    // caller passed none. The build phase renders through this group and
    // NEVER through a standalone `Spinner`, so concurrent builds in the
    // parallel resolve cannot race the process-global `SPINNER_ACTIVE`
    // guard. Off-TTY the group is hidden and inert.
    //
    // `owned_group` is declared uninitialised so that, when it is
    // needed, it outlives the `&owned_group` borrow stored in
    // `progress`.
    let owned_group;
    let progress = match progress {
        Some(p) => p,
        None => {
            owned_group = crate::cli::FetchProgress::new();
            &owned_group
        }
    };

    // Two-phase reservation. A concurrent perf-mode test run must
    // not have its measured CPUs stomped by a `make -j$(nproc)`
    // explosion of gcc children, and vice-versa a concurrent
    // kernel build must not have its compile window extended by
    // a test pinning RT-FIFO on shared cores. Phase 1 of the
    // reservation is the LLC-level flock from
    // [`acquire_llc_plan`]: whole-LLC flocks whose count is
    // chosen to cover the CPU budget (either an explicit
    // `--cpu-cap N` or the 30%-of-allowed default). Phase 2 is
    // the cgroup v2 sandbox from
    // [`BuildSandbox::try_create`] that binds make/gcc's
    // cpu+mem sets to the plan's CPUs + NUMA nodes so the
    // parallelism hint is enforced, not just advisory.
    //
    // Escape hatches:
    //   - `KTSTR_BYPASS_LLC_LOCKS=1`: skip the LLC plan+flock
    //     acquisition entirely; the build proceeds immediately
    //     without coordinating with any concurrent perf-mode run.
    //     Use when the operator explicitly accepts measurement
    //     noise (one shell doing unrelated work, an isolated
    //     developer workstation, or a CI queue that already
    //     serializes jobs at a higher layer). Mutually exclusive
    //     with `--cpu-cap` at CLI parse time — see the CLI
    //     binaries' pre-dispatch conflict check.
    //     `acquire_build_reservation` rejects the combination too.
    //   - Sysfs-unreadable host (non-Linux, degraded container):
    //     `HostTopology::from_sysfs()` returns `Err`. Without
    //     `--cpu-cap`, we emit a `tracing::warn!` and proceed
    //     without locks. With `--cpu-cap`, the flag cannot be
    //     honoured and we fail hard — cpu_cap is a contract, not
    //     a hint: a silent degrade would let a build exceed the
    //     declared resource budget without surfacing.
    //
    // DROP ORDER IS LOAD-BEARING. `_plan` + `_sandbox` are kept alive
    // via RAII — their Drops release the LLC flocks and the cgroup on
    // scope exit. Because the reservation is destructured into locals
    // here, `BuildReservation`'s field order no longer decides the
    // order; the BINDING ORDER OF THIS PATTERN does. Locals drop in
    // reverse order of declaration, so `_sandbox` (bound after
    // `_plan`) drops first: the build pid is migrated out of the
    // cgroup and the child is rmdir'd while the LLC flocks are still
    // held. Otherwise a peer could observe the LLC released before
    // the cgroup is gone, mint a new plan against the same LLCs, and
    // see an orphan cgroup lingering for up to the 24h sweep window.
    // Do NOT move `_sandbox` ahead of `plan: _plan` in the pattern.
    let BuildReservation {
        plan: _plan,
        _sandbox,
        make_jobs,
    } = acquire_build_reservation(cli_label, cpu_cap)?;

    // Source-tree flock for local sources. Two parallel
    // `cargo ktstr test --kernel ./linux` runs would otherwise race
    // in `make` against the same source tree (e.g. one's
    // `make defconfig` racing with another's `make compile_commands.json`)
    // and produce inconsistent .config / build artifacts. The flock is
    // taken on the SOURCE TREE itself (per canonical path), distinct from
    // the cache-entry flock acquired inside `cache.store` (per cache key).
    // The two are complementary: the source-tree flock serializes the
    // build phase; the cache-entry flock serializes the atomic install.
    //
    // Held via `OwnedFd` for the lifetime of `_source_lock` — drops at
    // end of pipeline. It is declared after the reservation bindings,
    // so on scope exit it is released before the sandbox and the LLC
    // flocks. Skipped under `KTSTR_BYPASS_LLC_LOCKS` to share
    // the operator's escape hatch with the LLC-flock bypass; that
    // env var already declares "I accept noise from concurrent runs."
    //
    // `acquire_source_tree_lock` does a non-blocking `try_flock`
    // first; on EWOULDBLOCK it surfaces the holder via
    // `/proc/locks` (so the operator's terminal shows which peer is
    // holding the lock) and then parks in a blocking `flock(LOCK_EX)`
    // until the holder releases. The wait is intentional: when the
    // peer's build finishes, the cache slot is likely populated and
    // the post-acquire cache check below short-circuits the
    // redundant rebuild. The pre-wait `eprintln!` inside
    // `acquire_source_tree_lock` ensures the operator sees what
    // they're waiting on rather than a silent stall.
    let _source_lock = if is_local_source && !crate::bypass_llc_locks_active() {
        Some(acquire_source_tree_lock(source_dir, cli_label)?)
    } else {
        None
    };

    // Post-acquire cache re-check. N peers racing on a cold cache all
    // queue on the source-tree EX above. When the first peer's build
    // completes and releases, the cache slot is populated — every
    // subsequent peer should observe the hit and skip a redundant
    // rebuild rather than serially repeat the same work. The
    // pre-acquire `cache_lookup` in `resolve_kernel_dir_to_entry`
    // catches the warm-cache case (no lock taken at all); this check
    // catches the cold-then-warmed-during-wait case.
    //
    // The re-probe comes first: only a `Clean` classification may
    // take the cache shortcut; every other state falls through to a
    // build.
    let mid_wait_state = compute_mid_wait_state(acquired, source_dir, is_local_source, cli_label);
    let mid_wait_clean = mid_wait_state == MidWaitState::Clean;

    // `Clean` and `PreAcquireDirty` yield no diagnostic.
    if let Some(body) = mid_wait_state.diagnostic() {
        eprintln!("{cli_label}: {body}");
    }

    // Cache-hit short-circuit: requires a clean re-probe, a populated
    // slot, AND the image file actually present on disk.
    if mid_wait_clean
        && let Some(entry) =
            crate::cli::resolve::cache_lookup(cache, &acquired.cache_key, cli_label)
        && entry.image_path().exists()
    {
        eprintln!("{cli_label}: {}", cache_hit_diagnostic(&acquired.cache_key));
        let image_path = entry.image_path();
        return Ok(KernelBuildResult {
            entry: Some(entry),
            image_path,
            post_build_is_dirty: false,
        });
    }

    // `--clean`: only acts on a local tree; for any other source it
    // just prints a notice and the build continues.
    if clean {
        if !is_local_source {
            eprintln!(
                "{cli_label}: --clean is only meaningful with a --kernel <path> source (downloaded/cloned sources start clean)"
            );
        } else {
            eprintln!("{cli_label}: make mrproper");
            run_make(source_dir, &["mrproper"])?;
        }
    }

    reconfigure_and_build(source_dir, extra_kconfig, cli_label, make_jobs, progress)?;

    // Validate critical config options were not silently disabled.
    // When `--extra-kconfig` is set, attach an actionable hint
    // pointing at the user fragment as a likely cause. The most
    // plausible failure mode is a user override that disables a
    // baked-in invariant (e.g. a fragment containing
    // `# CONFIG_BPF is not set` defeats the BPF dep chain), so
    // name `--extra-kconfig` in the wrap context.
    validate_kernel_config(source_dir).with_context(|| {
        if extra_kconfig.is_some() {
            "post-build kernel config validation failed; check that your \
             --extra-kconfig fragment does not disable a CONFIG_X required by \
             ktstr (e.g. CONFIG_BPF, CONFIG_DEBUG_INFO_BTF, CONFIG_FTRACE, \
             CONFIG_SCHED_CLASS_EXT)"
                .to_string()
        } else {
            "post-build kernel config validation failed".to_string()
        }
    })?;

    // compile_commands.json is skipped for temporary source
    // directories.
    if !acquired.is_temp {
        generate_compile_commands(source_dir, progress)?;
    }

    let (image_path, vmlinux_opt) = find_built_image(source_dir, cli_label)?;
    let vmlinux_ref = vmlinux_opt.as_deref();

    // Cache (skip for dirty local trees).
    if acquired.is_dirty {
        eprintln!("{cli_label}: kernel built at {}", image_path.display());
        // Branch the hint wording: commit/stash is only an actionable
        // remediation for an actual git repo. A non-git source tree
        // is force-marked dirty (see `acquire_local_source` in
        // `fetch.rs`) because dirty detection is impossible, and
        // telling the operator to "commit or stash" leads nowhere.
        let hint = dirty_cache_skip_hint(acquired.is_git);
        eprintln!("{cli_label}: {hint}");
        return Ok(KernelBuildResult {
            entry: None,
            image_path,
            post_build_is_dirty: true,
        });
    }

    // Post-build dirty re-check: when the helper returns a result the
    // cache store is skipped and that result is returned as-is.
    if let Some(skip_result) = post_build_dirty_skip(
        acquired,
        source_dir,
        is_local_source,
        &image_path,
        cli_label,
    ) {
        return Ok(skip_result);
    }

    // Clean build: compute metadata and store into the cache.
    build_metadata_and_store(
        acquired,
        cache,
        cli_label,
        is_local_source,
        arch,
        image_name,
        extra_kconfig,
        source_dir,
        image_path,
        vmlinux_ref,
    )
}
602
603/// Classify the source tree at the post-acquire re-probe site.
604///
605/// Mid-wait edit guard: the operator may edit a tracked file in
606/// the source tree DURING our EX wait (long peer build = long
607/// window). `acquired.is_dirty` snapshots clean-at-acquire; a fresh
608/// probe via `inspect_local_source_state` catches edits that landed
609/// during the wait. If dirty/hash-changed, the operator's intent
610/// is "build what's on disk" — skip the cache re-check and fall
611/// through to the build branch, where the post-build dirty re-check
612/// at the cache-store site will recognise the mutation and skip
613/// caching. Probe errors are warnings (not fatal) — same Err
614/// disposition as the post-build re-check.
615/// PreAcquireDirty distinguishes "operator started with a dirty
616/// tree" (the wait wasn't the cause) from "operator dirtied the
617/// tree during the wait" (DirtyEdit). The split keeps the enum
618/// variants honest about cause-attribution per the
619/// [`MidWaitState::diagnostic`] dispatch in [`kernel_build_pipeline`].
620///
621/// TOCTOU acceptance: the source tree can mutate between this
622/// probe and the `cache_lookup` call in the caller — a microsecond
623/// window (typically) where an operator edit, background
624/// autoformatter write, `git commit`, or IDE pre-commit hook would
625/// slip through both this guard AND the post-build dirty re-check at
626/// the cache-store site (the cache-hit return in the caller
627/// short-circuits before `make`, so the post-build re-check never
628/// runs for the racing-into-hit path). Publication-side staleness is
629/// gated by the separate post-build dirty re-check at the cache-store
630/// site; this paragraph is about the consumer-side stale-serving
631/// window only. The cache slot is keyed on `acquired.cache_key`
632/// (frozen at acquire time inside `local_source`), so the served
633/// artifact's identity is the acquire-time HEAD — a mid-window
634/// mutation produces a cache hit that serves a slightly-stale
635/// source state without destroying the operator's later state.
636/// Bounded race; the operator's next invocation re-acquires and
637/// observes the new state.
638///
639/// "Next invocation correct" is NOT a remediation for: single-shot
640/// CI pipelines without retry, `git bisect run` invocations (each
641/// commit is independent), or pipelined CI flows where one job
642/// builds the kernel and a downstream job consumes the cached
643/// image without re-probing the source tree. Operators in those
644/// workflows should treat cache hits as acquire-time-correct, not
645/// invocation-time correct. Holding an EX flock across [probe,
646/// cache_lookup] or re-probing after the lookup were considered
647/// and rejected as adding common-path latency for a microsecond-
648/// wide window.
649fn compute_mid_wait_state(
650 acquired: &crate::fetch::AcquiredSource,
651 source_dir: &Path,
652 is_local_source: bool,
653 cli_label: &str,
654) -> MidWaitState {
655 if is_local_source && !acquired.is_dirty {
656 match crate::fetch::inspect_local_source_state(source_dir) {
657 Ok(post) => {
658 let hash_changed = post.short_hash
659 != acquired
660 .kernel_source
661 .as_local_git_hash()
662 .map(str::to_string);
663 if post.is_dirty {
664 MidWaitState::DirtyEdit
665 } else if hash_changed {
666 MidWaitState::HashAdvanced
667 } else {
668 MidWaitState::Clean
669 }
670 }
671 Err(e) => {
672 tracing::warn!(
673 cli_label = cli_label,
674 err = %format!("{e:#}"),
675 "mid-wait dirty re-check failed; proceeding to build",
676 );
677 MidWaitState::ProbeFailed
678 }
679 }
680 } else if acquired.is_dirty {
681 MidWaitState::PreAcquireDirty
682 } else {
683 MidWaitState::Clean
684 }
685}
686
687/// Merge the kconfig fragments, reconfigure when stale, then build.
688///
689/// Builds the merged fragment ONCE so the configure call observes
690/// the byte layout `{EMBEDDED_KCONFIG}\n{extra}` (with a `\n`
691/// interleave) defined in [`crate::merge_kconfig_fragments`]. The
692/// helper returns a `Cow<'_, str>` so the no-extras path borrows
693/// `EMBEDDED_KCONFIG` without allocating; only the user-fragment
694/// case heaps the merged string. Unit tests pin the exact
695/// ordering kbuild's last-wins rule operates on.
696///
697/// Reconfigures when any merged-fragment line is missing from the
698/// current `.config`. The prior `has_sched_ext` probe was a proxy for
699/// "configured" — but a stale `.config` from an earlier build can carry
700/// sched_ext while MISSING a changed baked-in value (e.g. an edited
701/// CONFIG_NR_CPUS in ktstr.kconfig) or every user `--extra-kconfig` line,
702/// silently ignoring the edit. `all_fragment_lines_present` checks the
703/// actual merged fragment (exact-line) instead, so an edited baked-in
704/// symbol or a user extra both trigger the merged configure.
705fn reconfigure_and_build(
706 source_dir: &Path,
707 extra_kconfig: Option<&str>,
708 cli_label: &str,
709 make_jobs: Option<usize>,
710 progress: &crate::cli::FetchProgress,
711) -> Result<()> {
712 let merged_fragment = crate::merge_kconfig_fragments(EMBEDDED_KCONFIG, extra_kconfig);
713
714 // Surface a `tracing::warn!` for each user fragment line that
715 // overrides a baked-in symbol from `EMBEDDED_KCONFIG`. The build
716 // proceeds with the user value winning (last-wins is the design
717 // intent) — the warning lets the operator see they are shadowing
718 // a baked-in setting before configure_kernel (which runs
719 // olddefconfig only when new lines are needed), which is when
720 // an over-aggressive override can still be addressed by editing
721 // the fragment. A separate post-build `validate_kernel_config`
722 // pass catches critical-baked-in disablement (e.g. CONFIG_BPF).
723 if let Some(extra) = extra_kconfig {
724 warn_extra_kconfig_overrides_baked_in(extra, cli_label);
725 }
726
727 let config_now = std::fs::read_to_string(source_dir.join(".config")).unwrap_or_default();
728 let needs_configure =
729 extra_kconfig.is_some() || !all_fragment_lines_present(&merged_fragment, &config_now);
730 if needs_configure {
731 let bar = progress.step_bar("Configuring kernel...");
732 let configure_result = configure_kernel(source_dir, &merged_fragment, Some(progress));
733 bar.finish();
734 // Wrap configure errors with `--extra-kconfig` context when
735 // extras are present so the user can pinpoint which input is
736 // responsible for an olddefconfig failure (e.g. a malformed
737 // `CONFIG_X=` line in their fragment).
738 configure_result.with_context(|| {
739 if extra_kconfig.is_some() {
740 "kernel configure failed (with --extra-kconfig fragment merged on top of \
741 baked-in ktstr.kconfig); check the fragment for syntax errors or \
742 conflicting symbol declarations"
743 .to_string()
744 } else {
745 "kernel configure failed".to_string()
746 }
747 })?;
748
749 // Post-olddefconfig validation — warn (not error) when a
750 // user-requested option from `--extra-kconfig` did not
751 // survive into the final `.config` (typically because
752 // olddefconfig dropped it for an unmet dependency). Emits
753 // one `tracing::warn!` per dropped line naming the
754 // requested setting and the actual final value.
755 // The hard-fail "user override killed a baked-in invariant"
756 // case (e.g. user disabled `CONFIG_BPF`) is caught at
757 // `validate_kernel_config` post-build with extra context.
758 if let Some(extra) = extra_kconfig {
759 warn_dropped_extra_kconfig_lines(source_dir, extra, cli_label);
760 }
761 }
762
763 let bar = progress.step_bar("Building kernel...");
764 let build_result = make_kernel_with_output(source_dir, Some(progress), make_jobs);
765 bar.finish();
766 build_result?;
767 Ok(())
768}
769
770/// Generate `compile_commands.json` for local trees (LSP support).
771///
772/// The args MUST match `make_kernel_with_output`'s
773/// (`-jN`, `KCFLAGS=-Wno-error`) — otherwise make's "command line
774/// flag changed" detection invalidates the build's object cache
775/// and recompiles every translation unit single-threaded under
776/// the compile_commands.json rule. Build the same `-jN +
777/// KCFLAGS=-Wno-error` prefix via `build_make_args`, then append
778/// the target.
779fn generate_compile_commands(
780 source_dir: &Path,
781 progress: &crate::cli::FetchProgress,
782) -> Result<()> {
783 let nproc = std::thread::available_parallelism()
784 .map(|n| n.get())
785 .unwrap_or(1);
786 let mut cc_args = build_make_args(nproc);
787 cc_args.push("compile_commands.json".into());
788 let cc_arg_refs: Vec<&str> = cc_args.iter().map(|s| s.as_str()).collect();
789 let bar = progress.step_bar("Generating compile_commands.json...");
790 let cc_result = run_make_with_output(source_dir, &cc_arg_refs, Some(progress));
791 bar.finish();
792 cc_result?;
793 Ok(())
794}
795
796/// Find the built kernel image and vmlinux. Returns `(image_path,
797/// vmlinux)` where `vmlinux` is `Some` only when `<source_dir>/vmlinux`
798/// exists; emits the operator-facing caching/missing diagnostic.
799fn find_built_image(
800 source_dir: &Path,
801 cli_label: &str,
802) -> Result<(std::path::PathBuf, Option<std::path::PathBuf>)> {
803 let image_path = crate::kernel_path::find_image_in_dir(source_dir)
804 .ok_or_else(|| anyhow::anyhow!("no kernel image found in {}", source_dir.display()))?;
805 let vmlinux_path = source_dir.join("vmlinux");
806 let vmlinux_opt = if vmlinux_path.exists() {
807 let orig_mib = std::fs::metadata(&vmlinux_path)
808 .map(|m| m.len() as f64 / (1024.0 * 1024.0))
809 .unwrap_or(0.0);
810 eprintln!("{cli_label}: caching vmlinux ({orig_mib:.0} MiB, will be stripped)");
811 Some(vmlinux_path)
812 } else {
813 eprintln!("{cli_label}: warning: vmlinux not found, BTF will not be cached");
814 None
815 };
816 Ok((image_path, vmlinux_opt))
817}
818
819/// Post-build dirty re-check. Returns `Some(result)` when the cache
820/// store must be skipped because the source tree changed during the
821/// build; `None` (proceed to store) otherwise.
822///
823/// `local_source` captures `is_dirty` ONCE at acquire time. The
824/// operator may then edit a tracked file (`.config` mutation, source
825/// patch) DURING the build window. The acquire-time `is_dirty=false`
826/// would say "safe to cache" but the on-disk content actually built
827/// differs from the HEAD commit recorded in the cache key — a future
828/// cache hit on that key would serve a build that no longer matches
829/// its identity. Re-running the same gix probes catches the race. On
830/// any change (dirty flip OR HEAD-hash shift from a concurrent
831/// commit), skip the cache store and emit a one-liner explaining why
832/// the cache slot was passed over.
833///
834/// Errors from the re-check are surfaced as a warning rather than a
835/// hard fail — the build itself succeeded; refusing to store on a
836/// re-check probe failure would penalize an otherwise-clean run for a
837/// transient gix glitch. The cache store proceeds with the original
838/// key, on the same pessimistic basis as a tree the re-check could not
839/// classify.
840fn post_build_dirty_skip(
841 acquired: &crate::fetch::AcquiredSource,
842 source_dir: &Path,
843 is_local_source: bool,
844 image_path: &Path,
845 cli_label: &str,
846) -> Option<KernelBuildResult> {
847 if is_local_source {
848 match crate::fetch::inspect_local_source_state(source_dir) {
849 Ok(post) => {
850 let (skip, hash_changed) =
851 post_build_cache_store_skip(&post, acquired.kernel_source.as_local_git_hash());
852 if skip {
853 eprintln!(
854 "{cli_label}: source tree changed during build \
855 (acquire-time dirty={}, post-build dirty={}; \
856 hash_changed={hash_changed}); skipping cache store \
857 to avoid recording a stale identity. Re-run after \
858 the working tree settles to populate the cache.",
859 acquired.is_dirty, post.is_dirty,
860 );
861 return Some(KernelBuildResult {
862 entry: None,
863 image_path: image_path.to_path_buf(),
864 // Mid-build mutation flips the run's
865 // reproducibility — the cache key recorded at
866 // acquire time no longer identifies the actual
867 // build input. Mirror that into the outcome so
868 // the kernel-label downstream gets the
869 // `_dirty` suffix.
870 post_build_is_dirty: true,
871 });
872 }
873 }
874 Err(e) => {
875 tracing::warn!(
876 cli_label = cli_label,
877 err = %format!("{e:#}"),
878 "post-build dirty re-check failed; proceeding to cache store",
879 );
880 }
881 }
882 }
883 None
884}
885
886/// Build the kernel metadata + artifact set, store to cache, and
887/// return the clean (non-dirty) [`KernelBuildResult`].
888///
889/// `too_many_arguments` allow: the cache-store tail threads the
890/// acquire-time inputs (`acquired`, `arch`, `image_name`,
891/// `extra_kconfig`, `is_local_source`) plus the build outputs
892/// (`source_dir`, `image_path`, `vmlinux_ref`) the metadata records;
893/// bundling them into a struct would add indirection without
894/// changing the data flow.
895#[allow(clippy::too_many_arguments)]
896fn build_metadata_and_store(
897 acquired: &crate::fetch::AcquiredSource,
898 cache: &crate::cache::CacheDir,
899 cli_label: &str,
900 is_local_source: bool,
901 arch: &str,
902 image_name: &str,
903 extra_kconfig: Option<&str>,
904 source_dir: &Path,
905 image_path: std::path::PathBuf,
906 vmlinux_ref: Option<&Path>,
907) -> Result<KernelBuildResult> {
908 let config_hash = config_hash_for(source_dir)?;
909
910 // Two-segment metadata: the bare baked-in hash stays in
911 // `ktstr_kconfig_hash` so `kernel list`'s matches/stale/
912 // untracked verdict (see `CacheEntry::kconfig_status`) keeps
913 // comparing against the live `EMBEDDED_KCONFIG`, and the user
914 // extras hash lives in its own slot. Matches the cache-key
915 // suffix shape `kc{baked}-xkc{extra}` produced by
916 // [`crate::cache_key_suffix_with_extra`].
917 let kconfig_hash = embedded_kconfig_hash();
918 let extra_kconfig_hash_value = extra_kconfig.map(crate::extra_kconfig_hash);
919
920 // Source-tree vmlinux stat (size + mtime seconds) so a later
921 // `prefer_source_tree_for_dwarf` lookup can detect a user
922 // rebuild between cache store and DWARF read. Only meaningful
923 // for local sources whose vmlinux survived the build —
924 // `vmlinux_ref` is `None` if vmlinux wasn't found, in which
925 // case there's nothing to stat. mtime read is best-effort:
926 // failure leaves the validation pair `None` and prefers the
927 // pre-validation behavior for this entry.
928 let source_vmlinux_stat = source_vmlinux_stat_for(vmlinux_ref);
929
930 let mut metadata = crate::cache::KernelMetadata::new(
931 acquired.kernel_source.clone(),
932 arch,
933 image_name,
934 crate::test_support::now_iso8601(),
935 )
936 .with_ktstr_kconfig_hash(kconfig_hash);
937 if let Some(v) = acquired.version.clone() {
938 metadata = metadata.with_version(v);
939 }
940 if let Some(h) = config_hash {
941 metadata = metadata.with_config_hash(h);
942 }
943 if let Some(h) = extra_kconfig_hash_value {
944 metadata = metadata.with_extra_kconfig_hash(h);
945 }
946 if is_local_source && let Some((size, mtime_secs)) = source_vmlinux_stat {
947 metadata = metadata.with_source_vmlinux_stat(size, mtime_secs);
948 }
949
950 let mut artifacts = crate::cache::CacheArtifacts::new(&image_path);
951 if let Some(v) = vmlinux_ref {
952 artifacts = artifacts.with_vmlinux(v);
953 }
954 let entry = match cache.store(&acquired.cache_key, &artifacts, &metadata) {
955 Ok(entry) => {
956 success(&format!("\u{2713} Kernel cached: {}", acquired.cache_key));
957 eprintln!("{cli_label}: image: {}", entry.image_path().display());
958 if crate::remote_cache::is_enabled() {
959 crate::remote_cache::remote_store(&entry, cli_label);
960 }
961 Some(entry)
962 }
963 Err(e) => {
964 warn(&format!("{cli_label}: cache store failed: {e:#}"));
965 None
966 }
967 };
968
969 Ok(KernelBuildResult {
970 entry,
971 image_path,
972 post_build_is_dirty: false,
973 })
974}
975
976/// CRC32 of `<source_dir>/.config` rendered as a fixed 8-hex-digit
977/// lowercase string, or `None` when no `.config` exists. The
978/// zero-padded `{:08x}` width is the cache-key suffix contract — a
979/// `{:x}` (no pad) would drop a leading-zero nibble and silently
980/// re-key every cached build, defeating cache hits. Pulled out of
981/// [`kernel_build_pipeline`] so the derivation is unit-testable
982/// without a real `make`.
983pub(crate) fn config_hash_for(source_dir: &Path) -> std::io::Result<Option<String>> {
984 let config_path = source_dir.join(".config");
985 if config_path.exists() {
986 let data = std::fs::read(&config_path)?;
987 Ok(Some(format!("{:08x}", crc32fast::hash(&data))))
988 } else {
989 Ok(None)
990 }
991}
992
/// `(size, mtime_secs)` of the source-tree vmlinux, or `None` when no
/// path was given, the path cannot be stat'ed, or its modification
/// time is unavailable.
///
/// `mtime_secs` is signed whole seconds since the Unix epoch. A
/// pre-epoch mtime (clock skew) becomes a negative count instead of
/// discarding the stat. Returning `None` for a missing vmlinux keeps a
/// later `prefer_source_tree_for_dwarf` staleness check from comparing
/// against a phantom `(0, _)`. Pulled out of
/// [`kernel_build_pipeline`] for unit-testability.
pub(crate) fn source_vmlinux_stat_for(vmlinux_ref: Option<&Path>) -> Option<(u64, i64)> {
    let meta = std::fs::metadata(vmlinux_ref?).ok()?;
    let modified = meta.modified().ok()?;
    let mtime_secs = match modified.duration_since(std::time::UNIX_EPOCH) {
        Ok(after_epoch) => after_epoch.as_secs() as i64,
        // The error carries the (positive) distance from `modified`
        // forward to the epoch; negate it for a pre-epoch timestamp.
        Err(before_epoch) => -(before_epoch.duration().as_secs() as i64),
    };
    Some((meta.len(), mtime_secs))
}
1016
1017/// The cache-skip hint wording for a dirty local source tree. A git
1018/// repo can be remediated by commit/stash; a non-git tree is
1019/// force-marked dirty (dirty detection is impossible) so commit/stash
1020/// advice is unactionable — it gets the put-under-git wording
1021/// instead. Pulled out of [`kernel_build_pipeline`] for
1022/// unit-testability.
1023pub(crate) fn dirty_cache_skip_hint(is_git: bool) -> &'static str {
1024 if is_git {
1025 DIRTY_TREE_CACHE_SKIP_HINT
1026 } else {
1027 NON_GIT_TREE_CACHE_SKIP_HINT
1028 }
1029}
1030
1031/// The post-build cache-store-skip decision. After `make` returns,
1032/// `inspect_local_source_state` is re-run; the store is skipped when
1033/// the worktree went dirty during the build OR HEAD advanced (a
1034/// concurrent commit), because the acquire-time cache key no longer
1035/// identifies the built input. Returns `(skip, hash_changed)` so the
1036/// caller both decides and reports `hash_changed` in the skip message.
1037/// `acquired_local_git_hash` is the acquire-time hash from the kernel
1038/// source's `as_local_git_hash`. Pulled out of
1039/// [`kernel_build_pipeline`] for unit-testability.
1040pub(crate) fn post_build_cache_store_skip(
1041 post: &crate::fetch::LocalSourceState,
1042 acquired_local_git_hash: Option<&str>,
1043) -> (bool, bool) {
1044 let hash_changed = post.short_hash != acquired_local_git_hash.map(str::to_string);
1045 (post.is_dirty || hash_changed, hash_changed)
1046}
1047
1048#[cfg(test)]
1049mod tests {
1050 use super::super::super::kernel_cmd::KernelCommand;
1051 use super::*;
1052
/// Reports whether a `git` executable can be spawned (`git --version`).
///
/// Tests that drive a real git repo in a tempdir call this first and
/// `return` early on `false`, so CI without git silently skips instead
/// of failing on a hard error. Only the spawn result is inspected; the
/// exit status of `git --version` is not.
fn git_available() -> bool {
    let probe = std::process::Command::new("git").arg("--version").output();
    matches!(probe, Ok(_))
}
1063
/// Runs `git <args>` inside `canonical` with the sandboxed environment
/// shared by the mid-wait tests.
///
/// The global and system git configs are pointed at `/dev/null` (so a
/// CI host's `~/.gitconfig` or `/etc/gitconfig` cannot pollute the test
/// repo), and the author/committer identity is pinned so `commit`
/// succeeds without depending on host config.
///
/// # Panics
///
/// Panics when git cannot be spawned, or when it exits unsuccessfully;
/// in the latter case the panic message carries git's stderr.
fn run_git(canonical: &Path, args: &[&str]) {
    let sandbox_env = [
        ("GIT_CONFIG_GLOBAL", "/dev/null"),
        ("GIT_CONFIG_SYSTEM", "/dev/null"),
        ("GIT_AUTHOR_NAME", "ktstr-test"),
        ("GIT_AUTHOR_EMAIL", "ktstr-test@localhost"),
        ("GIT_COMMITTER_NAME", "ktstr-test"),
        ("GIT_COMMITTER_EMAIL", "ktstr-test@localhost"),
    ];

    let mut git = std::process::Command::new("git");
    git.args(args).current_dir(canonical).envs(sandbox_env);
    let out = git.output().expect("git");

    if !out.status.success() {
        panic!(
            "git {args:?} failed: {}",
            String::from_utf8_lossy(&out.stderr)
        );
    }
}
1089
/// Pins the post-acquire cache re-check at `kernel_build_pipeline`:
/// the early-return path taken when a peer publishes the cache slot
/// while we wait for the source-tree EX lock.
///
/// That early-return gate has three prongs: `!acquired.is_dirty` AND
/// `cache_lookup(...).is_some()` AND `entry.image_path().exists()`.
/// Dropping any prong (e.g. "simplifying" away the exists check) would
/// let stale-manifest entries through, and the runtime would crash
/// later on a phantom image.
///
/// Single-threaded and deterministic: the "after EX wait" semantic
/// reduces to "after the lookup, observe the planted state." Real
/// thread orchestration is covered by
/// `acquire_source_tree_lock_blocks_on_contention_then_succeeds`
/// elsewhere in this module.
#[test]
fn cache_lookup_observes_peer_published_entry_after_ex_wait() {
    const CACHE_KEY: &str = "test-cache-key-7f8a9b";
    const BUILT_AT: &str = "2026-04-12T10:00:00Z";

    let _env_lock = crate::test_support::test_helpers::lock_env();
    let cache_tmp = tempfile::TempDir::new().expect("cache tempdir");
    let _cache_env = crate::test_support::test_helpers::EnvVarGuard::set(
        crate::KTSTR_CACHE_DIR_ENV,
        cache_tmp.path(),
    );
    let cache = crate::cache::CacheDir::with_root(cache_tmp.path().to_path_buf());

    // Plant the entry through `CacheDir::store` (the production helper)
    // rather than hand-writing metadata.json, so the test stays honest
    // against schema drift.
    let (arch, image_name) = crate::fetch::arch_info();
    let staging = tempfile::TempDir::new().expect("staging tempdir");
    let fake_image = staging.path().join(image_name);
    std::fs::write(&fake_image, b"fake kernel image bytes").expect("write fake image");
    let local_source = crate::cache::KernelSource::Local {
        source_tree_path: None,
        git_hash: None,
    };
    let metadata = crate::cache::KernelMetadata::new(local_source, arch, image_name, BUILT_AT);
    let artifacts = crate::cache::CacheArtifacts::new(&fake_image);
    cache
        .store(CACHE_KEY, &artifacts, &metadata)
        .expect("plant cache entry");

    // Walk the gate. `cache_lookup` is the helper `kernel_build_pipeline`
    // calls at the post-acquire re-check, and `image_path().exists()` is
    // the check that follows it. The `is_dirty` prong lives upstream:
    // this test assumes a clean source by construction, since
    // `acquired.is_dirty` is the caller's responsibility.
    let entry = crate::cli::resolve::cache_lookup(&cache, CACHE_KEY, "test")
        .expect("cache_lookup must surface the planted entry");
    assert!(
        entry.image_path().exists(),
        "image_path existence check must hold for the planted entry",
    );
    assert_eq!(entry.metadata.built_at, BUILT_AT);
}
1151
/// Pins the HashAdvanced branch of the [`MidWaitState`] classification
/// at `kernel_build_pipeline`: the operator moved HEAD
/// (`git commit`/`checkout`) during the peer's build wait, so the
/// worktree is clean but the short_hash is different.
///
/// Failure mode pinned: a "simplification" that drops the
/// `hash_changed` check and trusts only `post.is_dirty` would silently
/// accept a cache slot keyed on the pre-commit hash even though the
/// operator committed on top (clean post-state) during the wait. The
/// served slot would correspond to an older HEAD than the operator's
/// current tree.
#[test]
fn mid_wait_hash_change_invalidates_cache_hit_skip() {
    if !git_available() {
        eprintln!("mid_wait_hash_change_invalidates_cache_hit_skip: git unavailable, skipping");
        return;
    }

    let tmp = tempfile::TempDir::new().unwrap();
    let repo = tmp.path();
    run_git(repo, &["init", "-q", "-b", "main"]);
    std::fs::write(repo.join("seed.txt"), "initial").unwrap();
    run_git(repo, &["add", "seed.txt"]);
    run_git(repo, &["commit", "-q", "-m", "initial"]);

    let pre = crate::fetch::inspect_local_source_state(repo).expect("acquire-time probe");
    let acquired_hash = pre
        .short_hash
        .clone()
        .expect("clean repo must carry a short_hash");

    // A second commit lands during the wait, so HEAD no longer matches
    // the acquire-time hash.
    std::fs::write(repo.join("file.txt"), "amended mid-wait").unwrap();
    run_git(repo, &["add", "file.txt"]);
    run_git(repo, &["commit", "-q", "-m", "mid-wait commit"]);

    let post = crate::fetch::inspect_local_source_state(repo).expect("post-wait probe");

    assert!(
        !post.is_dirty,
        "committed changes leave the worktree clean; the hash \
         change is what must invalidate the cache hit (not is_dirty)",
    );
    assert!(
        post.short_hash.is_some(),
        "clean post-wait state must carry a short_hash",
    );
    assert_ne!(
        post.short_hash.as_ref(),
        Some(&acquired_hash),
        "the new commit must yield a different short_hash than the \
         acquire-time hash",
    );

    // Same decision table as the production mid-wait classification in
    // `kernel_build_pipeline`: dirty wins, then a moved hash, else clean.
    let hash_changed = post.short_hash != Some(acquired_hash);
    let state = match (post.is_dirty, hash_changed) {
        (true, _) => MidWaitState::DirtyEdit,
        (false, true) => MidWaitState::HashAdvanced,
        (false, false) => MidWaitState::Clean,
    };
    assert_eq!(
        state,
        MidWaitState::HashAdvanced,
        "clean worktree + advanced HEAD must classify as HashAdvanced",
    );
    assert!(
        state != MidWaitState::Clean,
        "hash_changed=true must falsify mid_wait_clean, forcing a \
         rebuild for the new cache key",
    );
}
1230
/// Pins the Clean branch of the [`MidWaitState`] classification at
/// `kernel_build_pipeline`: the positive path where the peer's build
/// wait ends with the source tree untouched and the `cache_lookup`
/// short-circuit fires.
///
/// Failure mode pinned: a refactor that swaps the `if post.is_dirty` /
/// `else if hash_changed` order, or inverts a `!is_dirty` check, would
/// route a no-mutation post-wait probe into DirtyEdit or HashAdvanced
/// and force a redundant rebuild every time. This test keeps the no-op
/// path returning [`MidWaitState::Clean`] so the cache short-circuit
/// at the consumer site stays reachable.
#[test]
fn mid_wait_clean_path_allows_cache_hit_skip() {
    if !git_available() {
        eprintln!("mid_wait_clean_path_allows_cache_hit_skip: git unavailable, skipping");
        return;
    }

    let tmp = tempfile::TempDir::new().unwrap();
    let repo = tmp.path();
    run_git(repo, &["init", "-q", "-b", "main"]);
    std::fs::write(repo.join("seed.txt"), "initial").unwrap();
    run_git(repo, &["add", "seed.txt"]);
    run_git(repo, &["commit", "-q", "-m", "initial"]);

    let pre = crate::fetch::inspect_local_source_state(repo).expect("acquire-time probe");
    let acquired_hash = pre
        .short_hash
        .clone()
        .expect("clean repo must carry a short_hash");

    // Nothing changes during the wait, so the second probe has to see
    // the same hash and a clean worktree.
    let post = crate::fetch::inspect_local_source_state(repo).expect("post-wait probe");

    assert!(
        !post.is_dirty,
        "no mid-wait mutation must leave the post-wait probe clean",
    );
    assert_eq!(
        post.short_hash.as_ref(),
        Some(&acquired_hash),
        "no mid-wait commit must leave the short_hash unchanged",
    );

    // Same decision table as the production mid-wait classification.
    let hash_changed = post.short_hash != Some(acquired_hash);
    let state = match (post.is_dirty, hash_changed) {
        (true, _) => MidWaitState::DirtyEdit,
        (false, true) => MidWaitState::HashAdvanced,
        (false, false) => MidWaitState::Clean,
    };
    assert_eq!(
        state,
        MidWaitState::Clean,
        "no-mutation post-wait state must classify as Clean so the \
         cache_lookup short-circuit fires",
    );
    assert_eq!(
        state.diagnostic(),
        None,
        "Clean must be silent — the cache-skip gate emits its own \
         diagnostic when the lookup hits",
    );
}
1301
/// Pins the DirtyEdit branch of the [`MidWaitState`] classification at
/// `kernel_build_pipeline`: the operator edited a tracked file during
/// the peer's build wait, so the post-wait probe reports
/// `is_dirty=true` without a HEAD advance.
///
/// Failure mode pinned: a change that elides the `post.is_dirty` arm
/// (e.g. trusting only `hash_changed`) would silently return a cache
/// slot keyed on the pre-edit HEAD even though the operator's worktree
/// no longer matches it. A rebuilt artifact would reflect the local
/// edits; the served cache slot would not.
#[test]
fn mid_wait_dirty_edit_invalidates_cache_hit_skip() {
    if !git_available() {
        eprintln!("mid_wait_dirty_edit_invalidates_cache_hit_skip: git unavailable, skipping");
        return;
    }

    let tmp = tempfile::TempDir::new().unwrap();
    let repo = tmp.path();
    run_git(repo, &["init", "-q", "-b", "main"]);
    std::fs::write(repo.join("seed.txt"), "initial").unwrap();
    run_git(repo, &["add", "seed.txt"]);
    run_git(repo, &["commit", "-q", "-m", "initial"]);

    let pre = crate::fetch::inspect_local_source_state(repo).expect("acquire-time probe");
    let acquired_hash = pre
        .short_hash
        .clone()
        .expect("clean repo must carry a short_hash");

    // Overwrite a tracked file during the wait without committing. The
    // post-wait probe must report a dirty worktree, which classifies as
    // DirtyEdit.
    std::fs::write(repo.join("seed.txt"), "operator edit during wait").unwrap();

    let post = crate::fetch::inspect_local_source_state(repo).expect("post-wait probe");

    assert!(
        post.is_dirty,
        "uncommitted edit to a tracked file must mark the post-wait \
         probe dirty",
    );

    // Same decision table as the production mid-wait classification.
    let hash_changed = post.short_hash != Some(acquired_hash);
    let state = match (post.is_dirty, hash_changed) {
        (true, _) => MidWaitState::DirtyEdit,
        (false, true) => MidWaitState::HashAdvanced,
        (false, false) => MidWaitState::Clean,
    };
    assert_eq!(
        state,
        MidWaitState::DirtyEdit,
        "dirty worktree without HEAD advance must classify as DirtyEdit",
    );
    assert!(
        state != MidWaitState::Clean,
        "DirtyEdit must falsify mid_wait_clean — the cache slot \
         corresponds to pre-edit state",
    );
}
1368
/// Pins the ProbeFailed branch of the [`MidWaitState`] classification
/// at `kernel_build_pipeline`: the probe that re-checks the source tree
/// returned `Err`, and the pipeline conservatively rebuilds.
///
/// Provoke strategy: init + commit, then truncate `.git/HEAD` to empty.
/// `gix::discover` still succeeds (the `.git` dir exists) but
/// `repo.head_id()` fails on the malformed ref, which is
/// `inspect_local_source_state`'s only route to `Result::Err`. The
/// non-git arm of `gix::discover` returns `Ok((None, true, false))`,
/// NOT an `Err`, so merely removing `.git` does not reach ProbeFailed.
///
/// Failure mode pinned: a refactor that treats probe errors as Clean
/// would silently return a cache slot keyed on unknowable post-wait
/// state. The conservative-rebuild disposition is correct precisely
/// because the alternative hides genuine corruption from the operator.
#[test]
fn mid_wait_probe_failure_invalidates_cache_hit_skip() {
    if !git_available() {
        eprintln!("mid_wait_probe_failure_invalidates_cache_hit_skip: git unavailable, skipping");
        return;
    }

    let tmp = tempfile::TempDir::new().unwrap();
    let repo = tmp.path();
    run_git(repo, &["init", "-q", "-b", "main"]);
    std::fs::write(repo.join("seed.txt"), "initial").unwrap();
    run_git(repo, &["add", "seed.txt"]);
    run_git(repo, &["commit", "-q", "-m", "initial"]);

    let pre = crate::fetch::inspect_local_source_state(repo).expect("acquire-time probe");
    assert!(
        pre.short_hash.is_some(),
        "pre-corruption probe must succeed (the corruption happens \
         mid-wait, not at acquire time)",
    );

    // Empty out HEAD during the wait. Discovery still finds `.git/`,
    // but resolving the head id fails on the empty ref and
    // `inspect_local_source_state` propagates that error.
    std::fs::write(repo.join(".git/HEAD"), b"").expect("truncate .git/HEAD");

    let post = crate::fetch::inspect_local_source_state(repo);
    assert!(
        post.is_err(),
        "truncated .git/HEAD must surface as a probe error, not a \
         silent Clean classification — found: {post:?}",
    );

    // Same dispatch as production: a probe `Err` becomes ProbeFailed,
    // which falsifies mid_wait_clean and forces a rebuild.
    let state = if post.is_ok() {
        MidWaitState::Clean
    } else {
        MidWaitState::ProbeFailed
    };
    assert_eq!(
        state,
        MidWaitState::ProbeFailed,
        "probe Err must classify as ProbeFailed",
    );
    assert!(
        state != MidWaitState::Clean,
        "ProbeFailed must falsify mid_wait_clean — unknowable state \
         cannot be assumed Clean",
    );
}
1440
/// Pins the non-local-source branch of the [`MidWaitState`]
/// classification at `kernel_build_pipeline`. When the source came from
/// a non-local kernel spec (e.g. `Git+ref`, `Tarball`, downloaded
/// archive), the outer `if is_local_source && !acquired.is_dirty` guard
/// skips the probe entirely and the fall-through lands on
/// [`MidWaitState::Clean`] via the final `else` arm.
///
/// Failure mode pinned: a refactor that inverts the outer guard (e.g.
/// mistakenly calling `inspect_local_source_state` on a Git+ref source,
/// which has no meaningful local probe target) would send a non-local
/// source into the probe branch and likely surface ProbeFailed against
/// a non-git tree, a noisy regression. This test pins the no-probe
/// short-circuit.
#[test]
fn mid_wait_non_local_source_classifies_as_clean() {
    // Replays the outer production switch with is_local_source=false.
    // No probe runs: the `is_local_source && !acquired.is_dirty` guard
    // is false, so control falls to the dirty / clean arms.
    let is_local_source = false;
    let acquired_is_dirty = false;
    let state = match (is_local_source, acquired_is_dirty) {
        (true, false) => unreachable!(
            "is_local_source=false must skip the probe branch — the \
             outer guard requires both is_local_source AND \
             !acquired.is_dirty to reach the probe arm"
        ),
        (_, true) => MidWaitState::PreAcquireDirty,
        (false, false) => MidWaitState::Clean,
    };
    assert_eq!(
        state,
        MidWaitState::Clean,
        "non-local clean source must classify as Clean — the cache \
         short-circuit applies to any source whose state we cannot \
         probe (or did not need to probe)",
    );
    assert_eq!(
        state.diagnostic(),
        None,
        "Clean non-local source must be silent",
    );
}
1487
/// Pins the PreAcquireDirty variant and its silent diagnostic:
/// `MidWaitState::PreAcquireDirty.diagnostic()` returns `None` because
/// the wait was not what made the tree dirty.
///
/// SCOPE: this does NOT exercise the caller-side dispatch order in
/// `kernel_build_pipeline`. The `if is_local_source &&
/// !acquired.is_dirty / else if acquired.is_dirty / else` chain is
/// reconstructed inline, because PreAcquireDirty is produced without
/// any probe call. A refactor that flipped the guard order in
/// `kernel_build_pipeline` would not fail this test; the other 4
/// mid_wait tests ground against `inspect_local_source_state` and would
/// catch a probe-arm regression. Only the variant + diagnostic pair is
/// pinned here.
#[test]
fn mid_wait_pre_acquire_dirty_suppresses_wait_diagnostic() {
    // Replays the production dispatch with acquired.is_dirty=true. No
    // probe runs: the probe guard needs `!acquired.is_dirty`, so the
    // dirty arm is the one taken. If the guard structure in
    // `kernel_build_pipeline` changes (e.g. PreAcquireDirty moves inside
    // the probe match), update this mirror.
    let is_local_source = true;
    let acquired_is_dirty = true;
    let state = match (is_local_source, acquired_is_dirty) {
        (true, false) => unreachable!(
            "the guard requires !acquired.is_dirty before the probe \
             branch; acquired_is_dirty=true must skip this arm"
        ),
        (_, true) => MidWaitState::PreAcquireDirty,
        (false, false) => MidWaitState::Clean,
    };
    assert_eq!(
        state,
        MidWaitState::PreAcquireDirty,
        "acquired.is_dirty=true must classify as PreAcquireDirty",
    );
    assert_eq!(
        state.diagnostic(),
        None,
        "PreAcquireDirty must be silent — the wait was not the \
         cause of the dirty state, so a wait-related diagnostic \
         would fabricate attribution",
    );
}
1534
1535 /// Pins the exact diagnostic bodies emitted by each
1536 /// [`MidWaitState`] variant so a future copywriting change to
1537 /// the operator-facing messages is a deliberate, reviewed
1538 /// edit rather than silent drift.
1539 ///
1540 /// Clean and PreAcquireDirty return `None` (silent). DirtyEdit,
1541 /// HashAdvanced, and ProbeFailed return their full body strings
1542 /// without the `{cli_label}: ` prefix — the caller composes the
1543 /// prefix at the eprintln site.
1544 #[test]
1545 fn mid_wait_state_diagnostics_pinned() {
1546 assert_eq!(MidWaitState::Clean.diagnostic(), None);
1547 assert_eq!(MidWaitState::PreAcquireDirty.diagnostic(), None);
1548 assert_eq!(
1549 MidWaitState::DirtyEdit.diagnostic(),
1550 Some(
1551 "source tree changed during peer's build wait \
1552 — rebuilding to capture your local edits"
1553 ),
1554 );
1555 assert_eq!(
1556 MidWaitState::HashAdvanced.diagnostic(),
1557 Some(
1558 "source HEAD advanced during peer's build wait \
1559 — rebuilding for the new commit"
1560 ),
1561 );
1562 assert_eq!(
1563 MidWaitState::ProbeFailed.diagnostic(),
1564 Some(
1565 "source-tree dirty re-check failed during peer's \
1566 build wait — rebuilding conservatively (re-run with \
1567 RUST_LOG=warn for the probe error)"
1568 ),
1569 );
1570 }
1571
1572 /// Pins the exact diagnostic body emitted by the post-mid-wait
1573 /// `cache_lookup` short-circuit so a future copywriting change
1574 /// to the operator-facing message is a deliberate, reviewed
1575 /// edit rather than silent drift — parallel to
1576 /// [`mid_wait_state_diagnostics_pinned`] for the
1577 /// [`MidWaitState`] family.
1578 ///
1579 /// Two assertions: a byte-for-byte match on the formatted body
1580 /// with a representative cache_key, plus an inequality between
1581 /// two distinct keys to prove the `{cache_key}` placeholder is
1582 /// load-bearing (catches a regression that replaces the
1583 /// substitution with a static label and would silently produce
1584 /// a constant string regardless of input).
1585 #[test]
1586 fn cache_hit_diagnostic_pinned() {
1587 let cache_key = "test-cache-key-7f8a9b";
1588 assert_eq!(
1589 cache_hit_diagnostic(cache_key),
1590 "concurrent ktstr build populated cache slot test-cache-key-7f8a9b \
1591 during peer's build wait — skipping redundant rebuild",
1592 );
1593 assert_ne!(
1594 cache_hit_diagnostic(cache_key),
1595 cache_hit_diagnostic("different-key-x86-64"),
1596 "cache_key substitution must be load-bearing, not a no-op",
1597 );
1598 }
1599
1600 /// `kernel build --cpu-cap N` parses through clap into
1601 /// `KernelCommand::Build { cpu_cap: Some(N), .. }`. Pins the
1602 /// flag's wire path: a future rename of the field, a stray
1603 /// `default_value`, or a `value_parser` change that altered
1604 /// rejection semantics would surface as a parse failure or a
1605 /// shape mismatch on the assertion.
1606 #[test]
1607 fn kernel_build_parses_cpu_cap_without_extra_flags() {
1608 use clap::Parser as _;
1609 #[derive(clap::Parser, Debug)]
1610 struct TestCli {
1611 #[command(subcommand)]
1612 cmd: KernelCommand,
1613 }
1614 let parsed =
1615 TestCli::try_parse_from(["prog", "build", "--kernel", "6.14.2", "--cpu-cap", "4"])
1616 .expect("kernel build --cpu-cap N must parse");
1617 match parsed.cmd {
1618 KernelCommand::Build {
1619 cpu_cap, kernel, ..
1620 } => {
1621 assert_eq!(cpu_cap, Some(4));
1622 assert_eq!(kernel.as_deref(), Some("6.14.2"));
1623 }
1624 other => panic!("expected KernelCommand::Build, got {other:?}"),
1625 }
1626 }
1627
1628 /// `kernel build` without `--cpu-cap` parses with `cpu_cap: None`
1629 /// — the "unset" sentinel the downstream planner expands into the
1630 /// 30%-of-allowed default. Pins the no-flag path so a future
1631 /// rename of the clap field or a stray `default_value = "0"`
1632 /// surfaces as a test failure, not a silent runtime behavior change.
1633 #[test]
1634 fn kernel_build_without_cpu_cap_defaults_to_none() {
1635 use clap::Parser as _;
1636 #[derive(clap::Parser, Debug)]
1637 struct TestCli {
1638 #[command(subcommand)]
1639 cmd: KernelCommand,
1640 }
1641 let parsed = TestCli::try_parse_from(["prog", "build", "--kernel", "6.14.2"])
1642 .expect("kernel build without --cpu-cap must parse");
1643 match parsed.cmd {
1644 KernelCommand::Build { cpu_cap, .. } => {
1645 assert_eq!(cpu_cap, None, "no --cpu-cap must produce None, not Some(0)",);
1646 }
1647 other => panic!("expected KernelCommand::Build, got {other:?}"),
1648 }
1649 }
1650
1651 /// `kernel build --cpu-cap 0` parses successfully at clap level
1652 /// — the "must be ≥ 1" check lives in [`CpuCap::new`], not in
1653 /// the clap value parser. Pins the two-layer validation: clap
1654 /// accepts any usize; runtime resolution via `CpuCap::resolve` is
1655 /// responsible for the "0 is rejected" diagnostic.
1656 #[test]
1657 fn kernel_build_cpu_cap_zero_passes_clap() {
1658 use clap::Parser as _;
1659 #[derive(clap::Parser, Debug)]
1660 struct TestCli {
1661 #[command(subcommand)]
1662 cmd: KernelCommand,
1663 }
1664 let parsed =
1665 TestCli::try_parse_from(["prog", "build", "--kernel", "6.14.2", "--cpu-cap", "0"])
1666 .expect("clap-level parse must accept 0; runtime validation rejects");
1667 match parsed.cmd {
1668 KernelCommand::Build { cpu_cap, .. } => {
1669 assert_eq!(
1670 cpu_cap,
1671 Some(0),
1672 "clap parses 0 verbatim; validation is downstream",
1673 );
1674 }
1675 other => panic!("expected KernelCommand::Build, got {other:?}"),
1676 }
1677 }
1678
1679 // ---------------------------------------------------------------
1680 // kernel_build_pipeline reservation phase — factored-out
1681 // `acquire_build_reservation` covers the cpu_cap → acquire →
1682 // sandbox → make_jobs flow without needing a real kernel source.
1683 // ---------------------------------------------------------------
1684
1685 /// Serialize `KTSTR_BYPASS_LLC_LOCKS` env-var mutation across test
1686 /// threads. Delegates to the ONE crate-wide env mutex so it
1687 /// serializes against EVERY env-touching test (process-wide
1688 /// `std::env`), including the builder tests that read
1689 /// `KTSTR_BYPASS_LLC_LOCKS` — a module-local mutex left them racing.
1690 /// `lock_env()` recovers from poison.
1691 fn bypass_env_lock() -> std::sync::MutexGuard<'static, ()> {
1692 crate::test_support::test_helpers::lock_env()
1693 }
1694
1695 /// RAII guard for scoped `KTSTR_BYPASS_LLC_LOCKS` mutation.
1696 /// Caller holds `bypass_env_lock()` before constructing.
1697 struct BypassGuard;
1698 impl BypassGuard {
1699 fn set(value: &str) -> Self {
1700 // SAFETY: env_lock held by caller; serializes with
1701 // every other env-mutating test.
1702 unsafe {
1703 std::env::set_var(crate::KTSTR_BYPASS_LLC_LOCKS_ENV, value);
1704 }
1705 BypassGuard
1706 }
1707 fn remove() -> Self {
1708 // SAFETY: caller holds env_lock.
1709 unsafe {
1710 std::env::remove_var(crate::KTSTR_BYPASS_LLC_LOCKS_ENV);
1711 }
1712 BypassGuard
1713 }
1714 }
1715 impl Drop for BypassGuard {
1716 fn drop(&mut self) {
1717 // SAFETY: guard lifetime bounded by env_lock held by
1718 // caller; Drop runs before the mutex guard releases.
1719 unsafe {
1720 std::env::remove_var(crate::KTSTR_BYPASS_LLC_LOCKS_ENV);
1721 }
1722 }
1723 }
1724
1725 /// `acquire_build_reservation` with `KTSTR_BYPASS_LLC_LOCKS=1`
1726 /// plus `cpu_cap=None` returns a no-reservation `BuildReservation`:
1727 /// plan, sandbox, and make_jobs all None. Pins the "bypass
1728 /// disables both layers" contract.
1729 #[test]
1730 fn acquire_build_reservation_bypass_returns_no_reservation() {
1731 let _lock = bypass_env_lock();
1732 let _env = BypassGuard::set("1");
1733 let r = acquire_build_reservation("test", None).expect("bypass + no cap must succeed");
1734 assert!(r.plan.is_none(), "bypass must produce no LLC plan");
1735 assert!(
1736 r._sandbox.is_none(),
1737 "bypass must produce no cgroup sandbox",
1738 );
1739 assert!(
1740 r.make_jobs.is_none(),
1741 "bypass must fall back to nproc (None signals to caller)",
1742 );
1743 }
1744
1745 /// Regression pin: empty-string-as-unset contract for
1746 /// `KTSTR_BYPASS_LLC_LOCKS`. A bare `KTSTR_BYPASS_LLC_LOCKS=`
1747 /// (CI shells, Docker `--env` pass-through without value) must
1748 /// NOT activate the bypass — the reader at L102 uses
1749 /// `.is_some_and(|v| !v.is_empty())` and that contract is
1750 /// shared by all 7 sibling readers. If a future contributor
1751 /// flips to `.is_some_and(|_| true)` or bare `.is_ok()`, this
1752 /// test catches the regression before it silently disables LLC
1753 /// flock contention enforcement in CI.
1754 #[test]
1755 fn acquire_build_reservation_bypass_empty_string_rejected() {
1756 let _lock = bypass_env_lock();
1757 let _env = BypassGuard::set("");
1758 match acquire_build_reservation("test", None) {
1759 Ok(r) => {
1760 // Empty-as-unset means we take the standard branch,
1761 // not the bypass branch. Standard branch produces a
1762 // BuildReservation with plan / sandbox / make_jobs
1763 // tied together (set-or-unset together per the
1764 // `plan_and_make_jobs_consistent` invariant). If the
1765 // bypass had been (incorrectly) triggered, all 3
1766 // would be None.
1767 assert_eq!(
1768 r.plan.is_some(),
1769 r.make_jobs.is_some(),
1770 "empty-string must NOT activate bypass — plan + make_jobs \
1771 should follow the standard-branch invariant",
1772 );
1773 }
1774 Err(e) => {
1775 // Sysfs-unreadable host: standard branch failed for
1776 // unrelated reasons. The empty-string-as-unset
1777 // contract is still proven because the bypass branch
1778 // would have returned `Ok` with all-None fields (per
1779 // the `bypass_returns_no_reservation` test); reaching
1780 // Err proves the standard branch was taken.
1781 eprintln!("standard-branch error confirms bypass was NOT taken (good): {e:#}");
1782 }
1783 }
1784 }
1785
1786 /// `acquire_build_reservation` with `KTSTR_BYPASS_LLC_LOCKS=1`
1787 /// plus `cpu_cap=Some(_)` must error with the "resource contract"
1788 /// substring. Pins the conflict check at the pipeline's
1789 /// reservation entry point.
1790 #[test]
1791 fn acquire_build_reservation_bypass_with_cap_errors() {
1792 let _lock = bypass_env_lock();
1793 let _env = BypassGuard::set("1");
1794 let cap = crate::vmm::host_topology::CpuCap::new(2).expect("cap=2 valid");
1795 let err =
1796 acquire_build_reservation("test", Some(cap)).expect_err("bypass + cap must error");
1797 let msg = format!("{err:#}");
1798 assert!(
1799 msg.contains("resource contract"),
1800 "err must name the resource contract: {msg}",
1801 );
1802 }
1803
1804 /// `acquire_build_reservation` without bypass on a sysfs-capable
1805 /// host: returns a `BuildReservation` whose fields populate
1806 /// consistently — plan.is_some() iff make_jobs.is_some() iff
1807 /// sandbox.is_some(). Pins the "plan and make_jobs must never
1808 /// diverge" invariant.
1809 #[test]
1810 fn acquire_build_reservation_plan_and_make_jobs_consistent() {
1811 let _lock = bypass_env_lock();
1812 let _env = BypassGuard::remove();
1813 match acquire_build_reservation("test", None) {
1814 Ok(r) => {
1815 assert_eq!(
1816 r.plan.is_some(),
1817 r.make_jobs.is_some(),
1818 "plan and make_jobs must agree on reservation presence",
1819 );
1820 if let (Some(p), Some(jobs)) = (r.plan.as_ref(), r.make_jobs) {
1821 assert_eq!(
1822 jobs,
1823 crate::vmm::host_topology::make_jobs_for_plan(p),
1824 "make_jobs must equal make_jobs_for_plan(&plan)",
1825 );
1826 }
1827 assert_eq!(
1828 r.plan.is_some(),
1829 r._sandbox.is_some(),
1830 "sandbox and plan must agree on reservation presence",
1831 );
1832 }
1833 Err(e) => {
1834 // Sysfs-unreadable host or contested LLCs. Accept
1835 // either outcome; the test's intent is to pin the
1836 // invariant in the success case, not force success.
1837 eprintln!("acquire_build_reservation unavailable on this host: {e:#}");
1838 }
1839 }
1840 }
1841
1842 /// `acquire_build_reservation` plain bypass (no `--cpu-cap`)
1843 /// must NOT touch the sysfs probe. The test sets the bypass and
1844 /// confirms no error escapes, even on a host whose
1845 /// `HostTopology::from_sysfs()` would otherwise fail (the
1846 /// bypass branch is taken FIRST in the function, before the
1847 /// sysfs probe is attempted). Pins the "bypass short-circuits
1848 /// the topology probe" branch shape — a regression that
1849 /// re-ordered the bypass check below the sysfs probe would
1850 /// surface as a sysfs-error escape.
1851 #[test]
1852 fn acquire_build_reservation_bypass_does_not_touch_sysfs() {
1853 let _lock = bypass_env_lock();
1854 let _env = BypassGuard::set("1");
1855 let r = acquire_build_reservation("test", None)
1856 .expect("bypass must succeed regardless of sysfs availability");
1857 // The bypass branch produces (None, None, None) by
1858 // construction — no further state to assert beyond the
1859 // sibling tests that already pin the field shape.
1860 assert!(r.plan.is_none());
1861 assert!(r._sandbox.is_none());
1862 assert!(r.make_jobs.is_none());
1863 }
1864
1865 // ---------------------------------------------------------------
1866 // acquire_source_tree_lock — per-source-tree flock that
1867 // serializes parallel builds against the same on-disk source.
1868 // ---------------------------------------------------------------
1869 //
1870 // Tests use `isolated_cache_dir()` to point `KTSTR_CACHE_DIR` at
1871 // a tempdir for the test's lifetime, so the production
1872 // `CacheDir::new()` resolves into the tempdir without touching
1873 // the operator's real cache directory. The lockfile path is
1874 // deterministic (cache_root/.locks/source-{path_hash}.lock) so
1875 // we can re-derive it from the canonical input path and assert
1876 // its presence.
1877
1878 /// `acquire_source_tree_lock` on a fresh canonical path under
1879 /// an isolated cache root succeeds (no peer holding the lock)
1880 /// and creates the lockfile under `cache_root/.locks/`. Pins
1881 /// the lockfile placement: a regression that moved the lockfile
1882 /// to `/tmp/` (where `tmpwatch` could sweep it under an active
1883 /// holder) would surface here as the assertion failing on
1884 /// "lockfile not found at expected path."
1885 #[test]
1886 fn acquire_source_tree_lock_succeeds_on_fresh_path() {
1887 use crate::test_support::test_helpers::{isolated_cache_dir, lock_env};
1888 let _env_lock = lock_env();
1889 let cache = isolated_cache_dir();
1890 let canonical = std::path::PathBuf::from("/tmp/fake-source-tree-for-test");
1891 let fd = acquire_source_tree_lock(&canonical, "test")
1892 .expect("fresh-path acquire must succeed under isolated cache");
1893 // Lockfile must land under the isolated cache root's
1894 // `.locks/` subdirectory. The naming is `source-{hash}.lock`
1895 // where `{hash}` is `canonical_path_hash(canonical)`.
1896 let path_hash = crate::fetch::canonical_path_hash(&canonical);
1897 let expected = cache
1898 .path()
1899 .join(crate::flock::LOCK_DIR_NAME)
1900 .join(format!("source-{path_hash}.lock"));
1901 assert!(
1902 expected.exists(),
1903 "lockfile must exist at {} after acquire",
1904 expected.display(),
1905 );
1906 // Drop the FD explicitly to release the flock before the
1907 // tempdir cleanup races with it.
1908 drop(fd);
1909 }
1910
1911 /// `acquire_source_tree_lock` returns the SAME lockfile path
1912 /// for two different canonical inputs IFF they share the same
1913 /// `canonical_path_hash`. Two distinct inputs (`/srv/linux-a`
1914 /// and `/srv/linux-b`) must produce DIFFERENT lockfiles so
1915 /// concurrent builds against unrelated source trees don't
1916 /// serialize against each other. Pins the per-tree
1917 /// disambiguation contract.
1918 #[test]
1919 fn acquire_source_tree_lock_distinct_paths_yield_distinct_lockfiles() {
1920 use crate::test_support::test_helpers::{isolated_cache_dir, lock_env};
1921 let _env_lock = lock_env();
1922 let cache = isolated_cache_dir();
1923 let path_a = std::path::PathBuf::from("/tmp/fake-source-a");
1924 let path_b = std::path::PathBuf::from("/tmp/fake-source-b");
1925 let fd_a = acquire_source_tree_lock(&path_a, "test")
1926 .expect("path A acquire must succeed under isolated cache");
1927 // Acquiring path B while path A's lock is still held must
1928 // ALSO succeed — they hash to different lockfiles, so
1929 // there's no contention.
1930 let fd_b = acquire_source_tree_lock(&path_b, "test").expect(
1931 "path B acquire must succeed concurrently with A — \
1932 distinct canonical paths must hash to distinct \
1933 lockfiles so unrelated builds don't serialize",
1934 );
1935 let hash_a = crate::fetch::canonical_path_hash(&path_a);
1936 let hash_b = crate::fetch::canonical_path_hash(&path_b);
1937 assert_ne!(
1938 hash_a, hash_b,
1939 "distinct canonical paths must produce distinct CRC32 hashes",
1940 );
1941 let lock_a = cache
1942 .path()
1943 .join(crate::flock::LOCK_DIR_NAME)
1944 .join(format!("source-{hash_a}.lock"));
1945 let lock_b = cache
1946 .path()
1947 .join(crate::flock::LOCK_DIR_NAME)
1948 .join(format!("source-{hash_b}.lock"));
1949 assert!(lock_a.exists());
1950 assert!(lock_b.exists());
1951 assert_ne!(lock_a, lock_b);
1952 drop(fd_a);
1953 drop(fd_b);
1954 }
1955
    /// `acquire_source_tree_lock` on a path whose lockfile is
    /// already held by a peer parks in a blocking flock(2) until the
    /// holder releases, then succeeds. Pins the try-then-wait
    /// contract: a regression that re-introduced the bail-on-EWOULDBLOCK
    /// behavior, or any other path that returns without ever calling
    /// `flock(LOCK_EX)` blocking, would surface here as either the
    /// `/proc/locks` waiter scan timing out (no `-> FLOCK` line ever
    /// appears against the lockfile inode) or the worker's elapsed
    /// time being below the holder-retention window.
    ///
    /// We simulate "concurrent peer" by holding the first FD on the
    /// main thread, spawn a worker that issues a second acquire (which
    /// blocks in `block_flock`), poll `/proc/locks` until the kernel
    /// records the worker as a waiter against the lockfile inode
    /// (kernel emits blocked flock waiters as lines containing both
    /// `->` and the `{major:02x}:{minor:02x}:{inode}` triple — see
    /// `fs/locks.c::lock_get_status`), retain the holder for a fixed
    /// window after the waiter appears so the worker's blocking call
    /// can be measured, drop the holder, then collect the worker's
    /// `Result` via `recv_timeout` so a real regression that caused
    /// the worker to hang forever surfaces as a bounded test failure
    /// rather than an indefinite test-runner stall.
    ///
    /// Two assertions guard the blocking semantic together:
    ///   1. The `/proc/locks` waiter scan: proves the worker entered
    ///      the kernel's blocked-flock state. A non-blocking
    ///      regression never enters that state.
    ///   2. The worker's measured elapsed time `>= HOLD_WINDOW`:
    ///      proves the worker stayed parked until the holder
    ///      released. A non-blocking regression that eagerly
    ///      returned `Err` would record a near-zero elapsed time
    ///      even if the waiter scan happened to be flaky.
    #[test]
    fn acquire_source_tree_lock_blocks_on_contention_then_succeeds() {
        use crate::test_support::test_helpers::{isolated_cache_dir, lock_env};
        // `_env_lock` and `cache` MUST outlive the spawned worker
        // thread. The worker reads `KTSTR_CACHE_DIR` inside
        // `acquire_source_tree_lock`'s `CacheDir::new()`; if
        // `IsolatedCacheDir`'s drop ran while the worker was still
        // resolving the cache root, the worker would observe a
        // restored / empty env var and either land outside the
        // tempdir or fail with a stale-cache-root error. The bindings
        // below are declared here and dropped at end-of-scope, AFTER
        // the explicit `worker_result` collection point below.
        let _env_lock = lock_env();
        let cache = isolated_cache_dir();
        let canonical = std::path::PathBuf::from("/tmp/fake-source-contention");
        let holder = acquire_source_tree_lock(&canonical, "test")
            .expect("first acquire must succeed under isolated cache");

        // Re-derive the lockfile path so we can needle `/proc/locks`
        // for waiter lines below. The production code constructs the
        // same path via `CacheDir::lock_path(format!("source-{hash}"))`
        // — see [`acquire_source_tree_lock`] above. The lockfile was
        // materialized by the holder's successful `try_flock` open
        // (O_CREAT), so by this point the inode exists on disk and
        // `needle_from_path` can stat it.
        let path_hash = crate::fetch::canonical_path_hash(&canonical);
        let lock_path = cache
            .path()
            .join(crate::flock::LOCK_DIR_NAME)
            .join(format!("source-{path_hash}.lock"));
        let needle = crate::flock::mountinfo::needle_from_path(&lock_path)
            .expect("needle_from_path must resolve the lockfile inode");

        // Spawn a worker that issues the second acquire. The worker's
        // non-blocking `try_flock` will see the held lock and fall
        // through to `block_flock`, which parks the worker thread in
        // `flock(2)` until the holder's FD closes. `OwnedFd` and
        // `anyhow::Error` are both `Send`, so the `Result<OwnedFd>`
        // returns through the channel below. The worker also
        // captures its own elapsed time around the
        // `acquire_source_tree_lock` call so the assertion below can
        // verify the blocking path actually executed for the holder
        // retention window — a regression that returned non-blockingly
        // without parking in the kernel would surface as a near-zero
        // elapsed value even if the `/proc/locks` waiter scan happened
        // to be flaky.
        //
        // `sync_channel(1)`: a single-slot buffered channel lets the
        // worker `send` and exit even if the main thread already
        // panicked from an earlier assertion failure (rendezvous
        // bound-0 would leave the worker parked in `send` forever,
        // a thread leak on top of an already-failed test). A worker
        // that hangs forever before reaching `send` leaves the
        // channel empty and the `recv_timeout` below bails the test
        // within 5s rather than hanging the test runner indefinitely.
        let worker_canonical = canonical.clone();
        let (tx, rx) = std::sync::mpsc::sync_channel::<(
            std::result::Result<std::os::fd::OwnedFd, anyhow::Error>,
            std::time::Duration,
        )>(1);
        // The join handle is intentionally not joined: the result
        // arrives through the channel instead.
        let _worker = std::thread::spawn(move || {
            let started = std::time::Instant::now();
            let result = acquire_source_tree_lock(&worker_canonical, "test");
            let elapsed = started.elapsed();
            // Send result + elapsed through the single-slot buffered
            // channel (NOT a rendezvous channel — see the
            // `sync_channel(1)` note above). If the main thread
            // already abandoned the test (panic) and dropped the
            // receiver before the worker reached this point, the send
            // fails; discarding the failure is correct because the
            // test is already failing for a different reason.
            let _ = tx.send((result, elapsed));
        });

        // Poll `/proc/locks` for a waiter line against the lockfile
        // inode. The kernel emits one `-> FLOCK ... {dev}:{ino}` line
        // per blocked waiter (`fs/locks.c::lock_get_status` — the
        // leading `-> ` distinguishes a waiter from a holder); seeing
        // such a line proves the worker is parked in `flock(2)`.
        // `parse_flock_pids_for_needle` (the production scanner) does
        // NOT match `-> FLOCK` lines because it filters on `FLOCK` in
        // field-2, so the test scans the raw text directly for lines
        // that contain both `->` and the needle.
        //
        // The loop is bounded by wall-clock time, not by an iteration
        // count: it re-reads `/proc/locks` every 10ms until a waiter
        // line appears or the 5s deadline passes. A healthy host
        // enters the waiter state within a single 10ms tick; the 5s
        // ceiling exists only to bail a pathologically-slow CI runner
        // before the test runner's own hang detector fires.
        const POLL_INTERVAL: std::time::Duration = std::time::Duration::from_millis(10);
        const POLL_DEADLINE: std::time::Duration = std::time::Duration::from_secs(5);
        let poll_start = std::time::Instant::now();
        let mut waiter_observed = false;
        while poll_start.elapsed() < POLL_DEADLINE {
            let contents = std::fs::read_to_string("/proc/locks")
                .expect("/proc/locks must be readable on a Linux host");
            if contents
                .lines()
                .any(|line| line.contains("->") && line.contains(&needle))
            {
                waiter_observed = true;
                break;
            }
            std::thread::sleep(POLL_INTERVAL);
        }
        assert!(
            waiter_observed,
            "no `-> FLOCK ... {needle}` waiter line appeared in \
             /proc/locks within {POLL_DEADLINE:?} — worker did not \
             enter the kernel's blocked-flock state, which means \
             `acquire_source_tree_lock` regressed off the blocking path",
        );

        // Hold the lock for `HOLD_WINDOW` AFTER the waiter is
        // observed so the worker's measured elapsed time provably
        // exceeds the window. A regression that returned
        // non-blockingly would still record a sub-window elapsed
        // time even if a waiter line happened to flicker through
        // /proc/locks for unrelated reasons; the elapsed-window
        // assertion catches that. The window is wall-clock from
        // observation, not from worker entry, so the worker's
        // measured elapsed includes its own pre-park work plus the
        // window — `worker_elapsed >= HOLD_WINDOW` is sufficient.
        const HOLD_WINDOW: std::time::Duration = std::time::Duration::from_millis(200);
        std::thread::sleep(HOLD_WINDOW);

        // Drop the holder. The worker's blocking flock(2) returns,
        // it acquires the lock, and the worker thread sends its
        // result through the channel.
        drop(holder);

        // `recv_timeout` bounds the test's worst-case wall time.
        // Healthy worker delivers within microseconds of the
        // holder drop; the 5s ceiling fires only on a true
        // regression (worker stuck, fd not released, etc.).
        let (worker_result, worker_elapsed) =
            rx.recv_timeout(std::time::Duration::from_secs(5)).expect(
                "worker must deliver its acquire result within 5s of \
                 holder release — a regression that caused the worker \
                 to hang forever lands here",
            );
        let acquired = worker_result.expect("worker acquire must succeed once the holder releases");

        // Elapsed-window assertion: the worker's measured time around
        // `acquire_source_tree_lock` must be at least the holder
        // retention window, because the worker was parked in
        // `flock(2)` for at least that long after `/proc/locks`
        // observed the waiter line. A revert to non-blocking
        // EWOULDBLOCK behavior would record a sub-window elapsed
        // value here and fail this assertion even if the
        // `/proc/locks` waiter scan happened to flake-pass.
        assert!(
            worker_elapsed >= HOLD_WINDOW,
            "worker's acquire returned in {worker_elapsed:?}, less than \
             the {HOLD_WINDOW:?} holder-retention window — worker did \
             not actually block on the held flock",
        );

        // Drop the worker's FD explicitly so the lockfile flock
        // releases before the isolated cache dir is torn down.
        // `_env_lock` and `cache` are bound at function-scope above
        // and drop at end-of-scope, AFTER this point.
        drop(acquired);
    }
2151
2152 /// `BuildReservation` field declaration order is load-bearing:
2153 /// `_sandbox` MUST be declared BEFORE `plan` so Rust's
2154 /// in-declaration-order field-drop runs the sandbox cgroup
2155 /// rmdir BEFORE the LLC flock release.
2156 ///
2157 /// A regression that swapped the field order would mean
2158 /// LLC flocks release first, which lets a peer claim the LLC
2159 /// while gcc children are still bound to a cgroup whose rmdir
2160 /// hasn't run yet.
2161 ///
2162 /// We can't assert drop ORDER directly without exotic
2163 /// machinery, but we can assert the field order is what we
2164 /// expect via the `Debug` derive: `_sandbox` appears in the
2165 /// formatted struct BEFORE `plan` IFF the field declaration
2166 /// order matches the Drop-order requirement. The field-name
2167 /// regex is enough to pin the order without depending on the
2168 /// inner field shapes (which evolve as the planner / sandbox
2169 /// types add or rename their own fields).
2170 #[test]
2171 fn build_reservation_field_order_pins_drop_invariant() {
2172 let r = BuildReservation {
2173 _sandbox: None,
2174 plan: None,
2175 make_jobs: None,
2176 };
2177 let dbg = format!("{r:?}");
2178 let sandbox_pos = dbg
2179 .find("_sandbox")
2180 .expect("Debug output must mention _sandbox field");
2181 let plan_pos = dbg
2182 .find("plan")
2183 .expect("Debug output must mention plan field");
2184 assert!(
2185 sandbox_pos < plan_pos,
2186 "_sandbox MUST be declared before plan so cgroup rmdir \
2187 runs BEFORE LLC flock release on Drop. Debug: {dbg}",
2188 );
2189 }
2190
2191 // ---------------------------------------------------------------
2192 // Post-build metadata-derivation arms inside
2193 // `kernel_build_pipeline`. These pure-logic blocks (config_hash
2194 // CRC32, source_vmlinux_stat, the dirty-tree cache-skip hint, the
2195 // post-build dirty re-check store decision) are unreachable through
2196 // the public `kernel_build_pipeline` entry without a real `make`
2197 // invocation, so each was extracted into a `pub(crate)` helper
2198 // (`config_hash_for`, `source_vmlinux_stat_for`,
2199 // `dirty_cache_skip_hint`, `post_build_cache_store_skip`) that both
2200 // the pipeline and these tests call — so the tests exercise the
2201 // real production code, not a copy.
2202 // ---------------------------------------------------------------
2203
/// Drives the production [`config_hash_for`] helper (extracted from
/// `kernel_build_pipeline`). Pins: (1) absent `.config` yields
/// `None`; (2) present `.config` hashes to `crc32fast::hash` of the
/// exact bytes; (3) the `{:08x}` zero-pad is load-bearing; (4) the
/// rendered hash is strictly lowercase hex.
///
/// Property (2) is checked against an INDEPENDENT `crc32fast::hash`
/// so a hasher swap diverges. Property (3) is checked with an input
/// whose CRC32 has a leading-zero nibble: a `{:x}` regression would
/// drop the leading zero, shortening the hash below 8 chars and
/// silently re-keying every cached build. Property (4) rejects
/// `A-F` explicitly: `u8::is_ascii_hexdigit` accepts both cases, so
/// it cannot by itself detect a `{:08X}` regression.
#[test]
fn config_hash_derivation_matches_crc32_and_width() {
    let tmp = tempfile::TempDir::new().expect("config tempdir");
    let source_dir = tmp.path();

    // Absent `.config` arm: the production `else { None }` branch.
    assert_eq!(
        config_hash_for(source_dir).expect("absent-config read"),
        None,
        "no .config must yield None config_hash",
    );

    // Present `.config` arm: hash matches an independent crc32fast.
    let body = b"CONFIG_SCHED_CLASS_EXT=y\nCONFIG_BPF=y\n";
    std::fs::write(source_dir.join(".config"), body).expect("write .config");
    let present = config_hash_for(source_dir).expect("present-config read");
    let expected = format!("{:08x}", crc32fast::hash(body));
    assert_eq!(
        present.as_deref(),
        Some(expected.as_str()),
        "present .config must hash to crc32fast::hash of its bytes",
    );

    // Width guard: find an input whose CRC32 has a leading-zero
    // nibble (< 0x1000_0000) so the `{:08x}` zero-pad is the only
    // thing keeping the rendered hash at 8 chars. A `{:x}`
    // regression would render it as 7 chars and fail the length
    // assert below. The probe sequence is deterministic; ~1 in 16
    // inputs qualifies, so 10k iterations always finds one.
    let probe = (0u32..10_000)
        .map(|n| format!("probe-{n}"))
        .find(|p| crc32fast::hash(p.as_bytes()) < 0x1000_0000)
        .expect("a leading-zero CRC32 within 10k probes");
    std::fs::write(source_dir.join(".config"), probe.as_bytes()).expect("rewrite .config");
    let zh = config_hash_for(source_dir)
        .expect("probe read")
        .expect("present .config must hash");
    assert_eq!(
        zh.len(),
        8,
        "config_hash must always be 8 hex chars ({{:08x}}): {zh}"
    );
    assert!(
        zh.starts_with('0'),
        "leading-zero CRC32 must keep its zero pad: {zh}"
    );
    // Accept only `0-9` and `a-f`: an uppercase digit must fail here,
    // matching what the assertion message promises.
    assert!(
        zh.bytes().all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f')),
        "config_hash must be lowercase hex: {zh}",
    );
}
2265
/// Exercises the production [`source_vmlinux_stat_for`] helper
/// (extracted from `kernel_build_pipeline`) across its three arms:
/// no reference at all, a reference to a file that does not exist,
/// and a reference to a file that is present on disk.
///
/// Regression guarded against: losing the `None` short-circuit (so
/// an absent/missing vmlinux reports `Some((0, _))`) or reporting a
/// wrong size. Either would defeat the
/// `prefer_source_tree_for_dwarf` staleness check, which compares
/// this stat against a later read.
#[test]
fn source_vmlinux_stat_present_and_absent_arms() {
    let scratch = tempfile::TempDir::new().expect("vmlinux tempdir");
    let vmlinux = scratch.path().join("vmlinux");

    // Arm 1: no reference supplied, so the helper returns None.
    let from_none = source_vmlinux_stat_for(None);
    assert_eq!(from_none, None, "a None vmlinux_ref must yield None");

    // Arm 2: the path has not been created yet, so the result must be
    // None rather than a phantom (0, _).
    let from_missing = source_vmlinux_stat_for(Some(vmlinux.as_path()));
    assert_eq!(
        from_missing, None,
        "a vmlinux_ref to a missing file must yield None, not (0, _)",
    );

    // Arm 3: once the file exists the helper reports its real length
    // and a positive post-epoch mtime.
    let payload = b"\x7fELF fake vmlinux payload bytes for stat";
    std::fs::write(&vmlinux, payload).expect("write vmlinux");
    let from_present = source_vmlinux_stat_for(Some(vmlinux.as_path()));
    let (size, mtime_secs) = from_present.expect("present vmlinux must stat to Some");
    let want_size = payload.len() as u64;
    assert_eq!(
        size, want_size,
        "stat size must equal the real source-tree vmlinux length",
    );
    assert!(
        mtime_secs > 0,
        "a freshly-written vmlinux must carry a positive post-epoch \
         mtime in seconds, got {mtime_secs}",
    );
}
2310
/// Exercises the production [`dirty_cache_skip_hint`] helper
/// (extracted from `kernel_build_pipeline`). A git repo carrying
/// uncommitted changes must receive the "commit or stash" hint,
/// while a non-git tree (force-marked dirty because dirty detection
/// is impossible) must receive the "put the source under git" hint:
/// "commit or stash" is advice a non-git operator cannot act on.
///
/// Regression guarded against: collapsing the `is_git` branch so the
/// helper always returns `DIRTY_TREE_CACHE_SKIP_HINT`. The final
/// inequality check shows the two constants really differ, so the
/// branch the helper selects on is load-bearing.
#[test]
fn dirty_tree_cache_skip_hint_branches_on_is_git() {
    let hint_for_git_tree = dirty_cache_skip_hint(true);
    let hint_for_non_git_tree = dirty_cache_skip_hint(false);

    assert_eq!(
        hint_for_git_tree, DIRTY_TREE_CACHE_SKIP_HINT,
        "is_git=true must select the commit/stash hint",
    );
    assert_eq!(
        hint_for_non_git_tree, NON_GIT_TREE_CACHE_SKIP_HINT,
        "is_git=false must select the put-under-git hint",
    );
    assert_ne!(
        DIRTY_TREE_CACHE_SKIP_HINT,
        NON_GIT_TREE_CACHE_SKIP_HINT,
        "the two hints must differ so the is_git branch is \
         load-bearing — a non-git operator must not be told to \
         commit/stash",
    );
}
2342
/// Exercises the production [`post_build_cache_store_skip`]
/// predicate (extracted from `kernel_build_pipeline`, and DISTINCT
/// from the mid-wait dirty classifier) against a real git tree in
/// two situations: nothing changed since acquisition (store
/// proceeds) and a commit landed mid-build (hash advanced, so the
/// store is skipped). Both the probe (`inspect_local_source_state`)
/// and the skip decision are the real production code.
///
/// Regression guarded against: dropping the `hash_changed` disjunct
/// and trusting `post.is_dirty` alone. That would store the build
/// under a cache key derived from the pre-commit HEAD even though
/// the operator committed on top (leaving a clean worktree) during
/// the build, so a later cache hit would serve a build that no
/// longer matches its recorded identity.
#[test]
fn post_build_dirty_recheck_skips_store_on_hash_advance() {
    if !git_available() {
        eprintln!(
            "post_build_dirty_recheck_skips_store_on_hash_advance: \
             git unavailable, skipping"
        );
        return;
    }

    // Throwaway repository holding a single seed commit.
    let scratch = tempfile::TempDir::new().unwrap();
    let repo = scratch.path().to_path_buf();
    run_git(&repo, &["init", "-q", "-b", "main"]);
    std::fs::write(repo.join("seed.txt"), "initial").unwrap();
    run_git(&repo, &["add", "seed.txt"]);
    run_git(&repo, &["commit", "-q", "-m", "initial"]);

    // Every probe goes through the real source-state inspector; only
    // the failure label differs between call sites.
    let probe = |label: &str| crate::fetch::inspect_local_source_state(&repo).expect(label);

    // Identity as frozen at build start (what `local_source` records).
    let at_acquire = probe("acquire-time probe");
    let frozen_hash = at_acquire.short_hash.as_deref();

    // Case 1: nothing moved, so the same hash and a clean worktree
    // mean the store goes ahead.
    let untouched = probe("post-build clean probe");
    let (skip_when_untouched, moved_when_untouched) =
        post_build_cache_store_skip(&untouched, frozen_hash);
    assert!(
        !moved_when_untouched,
        "an unchanged HEAD must not flag hash_changed",
    );
    assert!(
        !skip_when_untouched,
        "an unchanged tree post-build must NOT skip the cache store",
    );

    // Case 2: a commit lands while the build is running. HEAD moves
    // but the worktree stays clean, so the store must be skipped.
    std::fs::write(repo.join("midbuild.txt"), "landed during make").unwrap();
    run_git(&repo, &["add", "midbuild.txt"]);
    run_git(&repo, &["commit", "-q", "-m", "mid-build commit"]);
    let after_commit = probe("post-build advanced probe");
    assert!(
        !after_commit.is_dirty,
        "a committed mid-build change leaves the worktree clean; the \
         hash advance (not is_dirty) must drive the store skip",
    );
    let (skip_after_commit, moved_after_commit) =
        post_build_cache_store_skip(&after_commit, frozen_hash);
    assert!(
        moved_after_commit,
        "the mid-build commit must yield a short_hash distinct from \
         the acquire-time hash",
    );
    assert!(
        skip_after_commit,
        "a HEAD advance during the build must skip the cache store so \
         a stale identity is never recorded",
    );
}
2416}