ktstr/cli/kernel_build/
build.rs

1//! Top-level kernel build orchestration.
2//!
3//! Holds [`kernel_build_pipeline`] (the post-acquisition orchestrator
4//! that runs `clean` → configure → build → validate → cache-store),
5//! the two-phase reservation acquisition
6//! ([`acquire_build_reservation`]) for LLC flock + cgroup sandbox +
7//! `make -jN` hint, and the source-tree flock helper
8//! ([`acquire_source_tree_lock`]) that serializes parallel builds
9//! against the same on-disk source tree.
10
11use std::path::Path;
12
13use anyhow::{Context, Result};
14
15use super::super::kernel_cmd::{
16    DIRTY_TREE_CACHE_SKIP_HINT, EMBEDDED_KCONFIG, NON_GIT_TREE_CACHE_SKIP_HINT,
17    embedded_kconfig_hash,
18};
19use super::super::util::{success, warn};
20use super::kconfig::{
21    all_fragment_lines_present, configure_kernel, validate_kernel_config,
22    warn_dropped_extra_kconfig_lines, warn_extra_kconfig_overrides_baked_in,
23};
24use super::make::{build_make_args, make_kernel_with_output, run_make, run_make_with_output};
25
/// Result of the post-acquisition kernel build pipeline.
///
/// Returned by [`kernel_build_pipeline`] so callers can inspect
/// the cache entry and built image path.
///
/// The pipeline produces this value on every successful exit: the
/// post-wait cache hit (no build ran), the dirty-tree exit (build ran
/// but nothing was cached), and the exits that follow the post-build
/// dirty re-check.
#[non_exhaustive]
pub struct KernelBuildResult {
    /// Cache entry, if the build was cached. `None` for dirty trees
    /// or when cache store fails. On the post-wait cache-hit exit this
    /// is the entry that the lookup returned.
    pub entry: Option<crate::cache::CacheEntry>,
    /// Path to the built kernel image. On the post-wait cache-hit exit
    /// this is the cached entry's image path rather than a path inside
    /// the source tree.
    pub image_path: std::path::PathBuf,
    /// Whether the source tree was dirty as observed by the build
    /// pipeline. `true` if either the acquire-time inspection
    /// reported dirty OR the post-build re-check observed a
    /// mid-build mutation (worktree edit, branch flip, mid-build
    /// commit). The downstream label decoration in cargo-ktstr's
    /// `resolve_one` uses this to append `_dirty` so a
    /// non-reproducible run is distinguishable from a clean rebuild
    /// of the same path.
    ///
    /// Always `false` on the post-wait cache-hit exit, which is only
    /// reachable when the mid-wait classification is `Clean`.
    pub post_build_is_dirty: bool,
}
47
/// Two-phase build reservation handles (LLC flock plan + cgroup v2
/// sandbox + make -jN hint). Consumed by
/// [`kernel_build_pipeline`]; the factored-out
/// [`acquire_build_reservation`] builds it from `cpu_cap` without
/// depending on kernel source, enabling integration tests that
/// exercise the reservation logic against synthetic topologies.
///
/// Drop order is load-bearing: `_sandbox` is declared first and
/// drops first per Rust's declaration-order field-drop rule;
/// this ensures the cgroup sandbox is removed before the LLC
/// flock is released. Otherwise a peer could observe the LLC
/// released before the cgroup is gone and mint a conflicting
/// plan.
///
/// The field-order guarantee only applies while the struct is
/// dropped as a whole. A caller that destructures it moves the
/// fields into local bindings, and locals drop in REVERSE
/// declaration order — such a caller must bind `plan` before
/// `_sandbox` in its pattern to keep the sandbox-first teardown.
#[derive(Debug)]
pub(crate) struct BuildReservation {
    /// cgroup v2 sandbox. `None` when `plan` is `None` (no reservation
    /// to enforce). Drops FIRST per struct field order — cgroup
    /// rmdir runs while LLC flocks are still held. `_` prefix
    /// keeps the binding alive through Drop but marks it as
    /// not-read — the RAII invariant IS the read.
    pub(crate) _sandbox: Option<crate::vmm::cgroup_sandbox::BuildSandbox>,
    /// LLC plan (flock fds + cpus + mems). `None` under
    /// `KTSTR_BYPASS_LLC_LOCKS=1` or sysfs-unreadable host without
    /// `--cpu-cap`. Drops SECOND per struct field order —
    /// flocks release AFTER the sandbox rmdir lands.
    pub(crate) plan: Option<crate::vmm::host_topology::LlcPlan>,
    /// `make -jN` parallelism hint. `Some(N)` under an active
    /// `plan`; `None` when no reservation exists (caller falls
    /// back to `nproc`).
    pub(crate) make_jobs: Option<usize>,
}
79
80/// Acquire the two-phase reservation (LLC flocks + cgroup sandbox)
81/// for a kernel build. Factored out of [`kernel_build_pipeline`]
82/// so integration tests can exercise the cpu_cap → acquire →
83/// sandbox → make_jobs decision tree without requiring a real
84/// kernel source tree.
85///
86/// Returns a `BuildReservation` whose fields are the three values
87/// `kernel_build_pipeline` used to bind inline. `_sandbox` is
88/// declared first and drops first per Rust's declaration-order
89/// field-drop rule; this ensures the cgroup sandbox is removed
90/// before the LLC flock is released.
91///
92/// `cli_label` prefixes operator-facing error text.
93///
94/// `cpu_cap` is the resolved CPU-count cap from
95/// [`CpuCap::resolve`](crate::vmm::host_topology::CpuCap::resolve);
96/// `None` means "reserve 30% of the calling process's allowed-CPU
97/// set", applied inside the planner at acquire time.
98pub(crate) fn acquire_build_reservation(
99    cli_label: &str,
100    cpu_cap: Option<crate::vmm::host_topology::CpuCap>,
101) -> Result<BuildReservation> {
102    let bypass = crate::bypass_llc_locks_active();
103    // INVARIANT: `_sandbox` is declared first and drops first per
104    // Rust's declaration-order field-drop rule; this ensures the
105    // cgroup sandbox is removed before the LLC flock is released.
106    // Reordering either would either
107    // (a) unlock LLCs while the sandbox still enforces the
108    // cpuset — a concurrent peer could claim the LLC and stomp
109    // gcc children that haven't exited — or (b) leave the cgroup
110    // hierarchy non-empty when its parent tries to rmdir.
111    let plan: Option<crate::vmm::host_topology::LlcPlan> = if bypass {
112        if cpu_cap.is_some() {
113            anyhow::bail!(
114                "{cli_label}: --cpu-cap conflicts with KTSTR_BYPASS_LLC_LOCKS=1; \
115                 unset one of them. --cpu-cap is a resource contract; bypass \
116                 disables the contract entirely."
117            );
118        }
119        None
120    } else if let Ok(host_topo) = crate::vmm::host_topology::HostTopology::from_sysfs() {
121        let test_topo = crate::topology::TestTopology::from_system()?;
122        let acquired_plan =
123            crate::vmm::host_topology::acquire_llc_plan(&host_topo, &test_topo, cpu_cap)?;
124        crate::vmm::host_topology::warn_if_cross_node_spill(&acquired_plan, &host_topo);
125        Some(acquired_plan)
126    } else {
127        if cpu_cap.is_some() {
128            anyhow::bail!(
129                "{cli_label}: --cpu-cap set but host LLC topology unreadable \
130                 from sysfs — cannot enforce the resource budget. Run on a \
131                 host with /sys/devices/system/cpu populated, or drop \
132                 --cpu-cap to build without enforcement."
133            );
134        }
135        tracing::warn!(
136            "{cli_label}: could not read host LLC topology from sysfs; \
137             skipping kernel-build LLC reservation. Concurrent perf-mode \
138             runs on this host will NOT be serialized against this build"
139        );
140        None
141    };
142
143    // Phase 2: cgroup v2 sandbox that enforces cpu+mem binding on
144    // make/gcc children. `hard_error_on_degrade` is driven by
145    // whether `--cpu-cap` was set explicitly: degradation is fatal
146    // under the flag (the flag promises enforcement), and warn-only
147    // when the 30%-of-allowed default was expanded (the default
148    // contract is best-effort — a parent cgroup narrowing the
149    // reservation should not fail the build).
150    let sandbox: Option<crate::vmm::cgroup_sandbox::BuildSandbox> = match plan.as_ref() {
151        Some(p) => Some(crate::vmm::cgroup_sandbox::BuildSandbox::try_create(
152            &p.cpus,
153            &p.mems,
154            cpu_cap.is_some(),
155        )?),
156        None => None,
157    };
158
159    // `make -jN` parallelism hint. `N` = `plan.cpus.len()` via
160    // `make_jobs_for_plan` — the reserved CPU count, whether that
161    // came from an explicit `--cpu-cap N` or the 30%-of-allowed
162    // default. See `make_kernel_with_output` for the resolution.
163    let make_jobs = plan
164        .as_ref()
165        .map(crate::vmm::host_topology::make_jobs_for_plan);
166
167    Ok(BuildReservation {
168        plan,
169        _sandbox: sandbox,
170        make_jobs,
171    })
172}
173
174/// Acquire an exclusive flock on a per-source-canonical-path lockfile
175/// so two concurrent `cargo ktstr test --kernel <path>` runs against
176/// the SAME source tree don't race in `make` (defconfig vs
177/// olddefconfig vs compile_commands.json) and stomp each other's
178/// `.config` and build artifacts.
179///
180/// The lockfile lives at
181/// `{KTSTR_CACHE_DIR}/.locks/source-{path_hash}.lock` where
182/// `{path_hash}` is the full 8-char CRC32 hex of the canonical
183/// source-path bytes (same shape and helper the
184/// `local-unknown-{path_hash}` cache key uses, see
185/// [`crate::fetch::canonical_path_hash`] /
186/// [`crate::fetch::compose_local_cache_key`]) — one per-tree
187/// identifier ties the source-tree flock to the cache key it gates.
188///
189/// Lockfile placement piggybacks on the cache root's `.locks/`
190/// subdirectory ([`crate::flock::LOCK_DIR_NAME`]) so source-tree
191/// flocks share the same filesystem-residency story as cache-entry
192/// flocks: never under `/tmp`, where `tmpwatch` (or the equivalent
193/// `systemd-tmpfiles` cleanup) can sweep stale-mtime files out from
194/// under an active flock holder. flock(2) does NOT update the
195/// inode's mtime, so a /tmp-resident lockfile would be a candidate
196/// for sweep on every run, with the resulting `unlink(2)` racing
197/// any peer trying to `open(2)` the same path. The `.locks/`
198/// directory under the user-controlled cache root is exempt from
199/// those sweeps.
200///
201/// Try-then-wait: attempts a non-blocking acquire first. If
202/// contended, logs the holder (pid + cmdline from /proc/locks)
203/// and falls through to a blocking acquire that parks until the
204/// peer releases. When the blocking acquire returns, the peer's
205/// build is done and the cache likely contains the artifact —
206/// the caller checks the cache after we return and skips the
207/// build if the slot is populated.
208///
209/// Distinct from the cache-entry flock acquired inside
210/// [`crate::cache::CacheDir::store`]: that lock serializes the
211/// atomic install of an artifact bundle into a cache slot; this
212/// lock serializes the BUILD itself against the source-tree
213/// `make` invocations.
214pub(crate) fn acquire_source_tree_lock(
215    canonical: &Path,
216    cli_label: &str,
217) -> Result<std::os::fd::OwnedFd> {
218    use anyhow::Context;
219
220    // Share the per-path CRC32 with `local-unknown-{hash}` cache
221    // keys so a single per-tree identifier ties the source-tree
222    // flock to the cache slot it gates.
223    let path_hash = crate::fetch::canonical_path_hash(canonical);
224    let cache = crate::cache::CacheDir::new()
225        .with_context(|| "open cache root for source-tree lockfile placement")?;
226    cache
227        .ensure_lock_dir()
228        .with_context(|| "create cache `.locks/` subdir for source-tree lock")?;
229    let lock_path = cache.lock_path(&format!("source-{path_hash}"));
230
231    match crate::flock::try_flock(&lock_path, crate::flock::FlockMode::Exclusive)
232        .with_context(|| format!("acquire source-tree flock {}", lock_path.display()))?
233    {
234        Some(fd) => Ok(fd),
235        None => {
236            // Non-blocking acquire failed (EWOULDBLOCK) — a live
237            // peer holds the lock. Surface the holder, then block
238            // until they release. When the blocking acquire
239            // returns, the peer's build is done and the cache
240            // likely contains the artifact we need — the caller
241            // checks the cache after we return, so it will skip
242            // the build if the peer populated the slot.
243            let holders = crate::flock::read_holders(&lock_path).unwrap_or_default();
244            let holder_text = if holders.is_empty() {
245                String::from("(holder not identified via /proc/locks)")
246            } else {
247                crate::flock::format_holder_list(&holders)
248            };
249            eprintln!(
250                "{cli_label}: source tree {} is locked by a concurrent ktstr \
251                 build — waiting for it to finish.\n{holder_text}",
252                canonical.display(),
253            );
254            crate::flock::block_flock(&lock_path, crate::flock::FlockMode::Exclusive).with_context(
255                || format!("blocking wait on source-tree flock {}", lock_path.display()),
256            )
257        }
258    }
259}
260
/// How the source tree looks at the post-acquire re-probe inside
/// [`kernel_build_pipeline`].
///
/// After the source-tree EX wait completes the pipeline probes the
/// tree again, so a mutation that landed during the wait (operator
/// edit, branch flip, commit on top) invalidates the cache-skip
/// short-circuit instead of returning a cache slot keyed on the
/// pre-wait identity. Five variants keep cause-attribution honest in
/// the operator message produced by [`MidWaitState::diagnostic`]: a
/// `git commit` during the wait is not "your edits"; a tree that was
/// dirty before the wait was not dirtied by the wait; a failed probe
/// is unknowable state, not a confirmed mutation.
#[derive(Debug, PartialEq, Eq)]
enum MidWaitState {
    /// Nothing changed across the wait (or the source is non-local,
    /// where the wait says nothing about the tree). The pipeline goes
    /// on to the cache_lookup short-circuit.
    Clean,
    /// The tree was already dirty BEFORE the source-tree EX wait was
    /// taken. The wait did not cause it, so no diagnostic is emitted
    /// (`None`) rather than inventing wait-related attribution.
    PreAcquireDirty,
    /// A tracked file was edited DURING the wait (clean at acquire
    /// time, dirty after the wait). Forces a rebuild with a
    /// "your local edits" diagnostic.
    DirtyEdit,
    /// HEAD moved during the wait (commit / branch flip): the
    /// acquire-time short-hash differs from the post-wait one while
    /// the post-wait worktree is clean. Forces a rebuild with a
    /// "HEAD advanced" diagnostic.
    HashAdvanced,
    /// The post-wait probe returned `Err` (corrupt git state, removed
    /// source dir, or a gix internal error). Forces a conservative
    /// rebuild — unknowable state cannot be treated as Clean.
    ProbeFailed,
}

impl MidWaitState {
    /// Body of the operator-facing message for this state, without
    /// the `{cli_label}: ` prefix (the caller composes it via
    /// `eprintln!("{cli_label}: {body}")`).
    ///
    /// [`Self::Clean`] yields `None` because the cache-skip gate
    /// prints its own message; [`Self::PreAcquireDirty`] yields `None`
    /// because the wait did not cause the dirty state and a
    /// wait-related message would fabricate attribution.
    fn diagnostic(&self) -> Option<&'static str> {
        let body = match self {
            // Silent states: nothing about the wait to report.
            Self::Clean | Self::PreAcquireDirty => return None,
            Self::DirtyEdit => {
                "source tree changed during peer's build wait \
                 — rebuilding to capture your local edits"
            }
            Self::HashAdvanced => {
                "source HEAD advanced during peer's build wait \
                 — rebuilding for the new commit"
            }
            Self::ProbeFailed => {
                "source-tree dirty re-check failed during peer's \
                 build wait — rebuilding conservatively (re-run with \
                 RUST_LOG=warn for the probe error)"
            }
        };
        Some(body)
    }
}
326
/// Message body printed when the post-mid-wait `cache_lookup`
/// short-circuit fires. The `{cli_label}: ` prefix is NOT included —
/// the caller composes it via `eprintln!("{cli_label}: {body}")`,
/// the same convention [`MidWaitState::diagnostic`] follows.
///
/// Kept apart from [`MidWaitState::diagnostic`] because a cache hit
/// is decided after the variant classification: the message fires
/// only when `mid_wait_clean`, a populated cache slot, and an extant
/// image file all hold. Attaching it to a single MidWaitState variant
/// would misrepresent that conjunction.
fn cache_hit_diagnostic(cache_key: &str) -> String {
    [
        "concurrent ktstr build populated cache slot ",
        cache_key,
        " during peer's build wait — skipping redundant rebuild",
    ]
    .concat()
}
343
/// Post-acquisition kernel build pipeline.
///
/// Handles: clean, configure, build, validate config, generate
/// compile_commands.json for local trees, find image, strip vmlinux,
/// compute metadata, cache store, and remote cache store (when
/// enabled). Callers handle source acquisition.
///
/// `cli_label` prefixes diagnostic status output (e.g. `"ktstr"` or
/// `"cargo ktstr"`).
///
/// `is_local_source` should be true when the source is a local
/// kernel source tree, regardless of how the caller arrived there
/// (`kernel build --kernel <path>`, `cargo ktstr test --kernel <path>`,
/// or any other Path-spec entry that funnels through
/// [`super::super::resolve_kernel_dir`] /
/// [`super::super::resolve_kernel_dir_to_entry`]). It controls the
/// mrproper warning and `source_tree_path` in metadata. Within this
/// function it also gates the source-tree flock and is forwarded to
/// the mid-wait and post-build dirty re-checks.
///
/// `extra_kconfig` is an optional user-supplied kconfig fragment
/// merged on top of [`EMBEDDED_KCONFIG`] before `configure_kernel`
/// (which runs olddefconfig only when new lines are needed).
/// `Some(content)` appends the fragment AFTER the baked-in fragment
/// so kbuild's last-occurrence-wins semantics
/// (`scripts/kconfig/confdata.c::conf_read_simple`) make user values
/// override baked-in ones on conflict, and forces a re-configure pass
/// even when `.config` already carries `CONFIG_SCHED_CLASS_EXT=y`
/// (the user fragment may add or invert symbols the baked-in pass
/// alone wouldn't have produced).
///
/// Two metadata fields capture the build inputs separately:
/// - `ktstr_kconfig_hash` always holds the bare baked-in hash
///   (`crate::kconfig_hash()` of `EMBEDDED_KCONFIG`) so
///   `KconfigStatus::Matches/Stale/Untracked` keeps comparing
///   against the live baked-in fragment.
/// - `extra_kconfig_hash` holds `Some(crate::extra_kconfig_hash(content))`
///   when extras were supplied, `None` otherwise. Drives the
///   `(extra kconfig)` tag in `kernel list`.
///
/// Callers that don't expose `--extra-kconfig` (test/coverage/
/// shell/verifier) pass `None`.
///
/// `progress` is the caller's progress group; when `None`, a local
/// group is created for the duration of the call.
///
/// # Return paths
///
/// 1. Mid-wait state is `Clean`, the cache lookup hits, and the
///    entry's image file exists: returns the cached entry without
///    building (`post_build_is_dirty: false`).
/// 2. `acquired.is_dirty`: builds, then returns `entry: None` with
///    `post_build_is_dirty: true` — nothing is stored in the cache.
/// 3. `post_build_dirty_skip` yields a result: that result is
///    returned as-is.
/// 4. Otherwise: the result of `build_metadata_and_store`.
///
/// # Errors
///
/// Propagates failures from the reservation acquisition, the
/// source-tree lock, `make mrproper`, the configure/build step,
/// config validation (wrapped with context that names
/// `--extra-kconfig` when one was supplied), compile_commands
/// generation, image discovery, and the metadata/store step.
#[allow(clippy::too_many_arguments)]
pub fn kernel_build_pipeline(
    acquired: &crate::fetch::AcquiredSource,
    cache: &crate::cache::CacheDir,
    cli_label: &str,
    clean: bool,
    is_local_source: bool,
    cpu_cap: Option<crate::vmm::host_topology::CpuCap>,
    extra_kconfig: Option<&str>,
    progress: Option<&crate::cli::FetchProgress>,
) -> Result<KernelBuildResult> {
    let source_dir = &acquired.source_dir;
    let (arch, image_name) = crate::fetch::arch_info();

    // Bind a guaranteed-live progress group for the build phase: the
    // caller's group (the parallel resolve's shared group, or a
    // single-shot caller's local one), or a fresh local group when the
    // caller passed none. The build phase renders through this group and
    // NEVER through a standalone `Spinner`, so concurrent builds in the
    // parallel resolve cannot race the process-global `SPINNER_ACTIVE`
    // guard. Off-TTY the group is hidden and inert.
    //
    // `owned_group` is declared uninitialised outside the `match` so
    // the fallback group outlives the borrow stored in `progress`.
    let owned_group;
    let progress = match progress {
        Some(p) => p,
        None => {
            owned_group = crate::cli::FetchProgress::new();
            &owned_group
        }
    };

    // Two-phase reservation. A concurrent perf-mode test run must
    // not have its measured CPUs stomped by a `make -j$(nproc)`
    // explosion of gcc children, and vice-versa a concurrent
    // kernel build must not have its compile window extended by
    // a test pinning RT-FIFO on shared cores. Phase 1 of the
    // reservation is the LLC-level flock from
    // [`acquire_llc_plan`]: whole-LLC flocks whose count is
    // chosen to cover the CPU budget (either an explicit
    // `--cpu-cap N` or the 30%-of-allowed default). Phase 2 is
    // the cgroup v2 sandbox from
    // [`BuildSandbox::try_create`] that binds make/gcc's
    // cpu+mem sets to the plan's CPUs + NUMA nodes so the
    // parallelism hint is enforced, not just advisory.
    //
    // Binding order is load-bearing. The destructuring `let` below
    // moves the fields out of `BuildReservation` into locals, and
    // locals drop in REVERSE declaration order: `_plan` is bound
    // before `_sandbox`, so `_sandbox` drops first, which migrates
    // the build pid out of the cgroup and rmdirs the child while
    // the LLC flocks are still held. Otherwise a peer could observe
    // the LLC released before the cgroup is gone, mint a new plan
    // against the same LLCs, and see an orphan cgroup lingering for
    // up to the 24h sweep window.
    //
    // Escape hatches:
    //   - `KTSTR_BYPASS_LLC_LOCKS=1`: skip the LLC plan+flock
    //     acquisition entirely; the build proceeds immediately
    //     without coordinating with any concurrent perf-mode run.
    //     Use when the operator explicitly accepts measurement
    //     noise (one shell doing unrelated work, an isolated
    //     developer workstation, or a CI queue that already
    //     serializes jobs at a higher layer). Mutually exclusive
    //     with `--cpu-cap` at CLI parse time — see the CLI
    //     binaries' pre-dispatch conflict check.
    //   - Sysfs-unreadable host (non-Linux, degraded container):
    //     `HostTopology::from_sysfs()` returns `Err`. Without
    //     `--cpu-cap`, we emit a `tracing::warn!` and proceed
    //     without locks. With `--cpu-cap`, the flag cannot be
    //     honoured and we fail hard — cpu_cap is a contract, not
    //     a hint: a silent degrade would let a build exceed the
    //     declared resource budget without surfacing.
    // `_plan` + `_sandbox` are kept alive via RAII — their Drops
    // release the LLC flocks and cgroup on scope exit. Keep `_plan`
    // AHEAD of `_sandbox` in the pattern: once destructured, the
    // field order of `BuildReservation` no longer decides drop
    // order; the reverse-declaration rule for locals does.
    let BuildReservation {
        plan: _plan,
        _sandbox,
        make_jobs,
    } = acquire_build_reservation(cli_label, cpu_cap)?;

    // Source-tree flock for local sources. Two parallel
    // `cargo ktstr test --kernel ./linux` runs would otherwise race
    // in `make` against the same source tree (e.g. one's
    // `make defconfig` racing with another's `make compile_commands.json`)
    // and produce inconsistent .config / build artifacts. The flock is
    // taken on the SOURCE TREE itself (per canonical path), distinct from
    // the cache-entry flock acquired inside `cache.store` (per cache key).
    // The two are complementary: the source-tree flock serializes the
    // build phase; the cache-entry flock serializes the atomic install.
    //
    // Held via `OwnedFd` for the lifetime of `_source_lock` — drops at
    // end of pipeline. Skipped under `KTSTR_BYPASS_LLC_LOCKS` to share
    // the operator's escape hatch with the LLC-flock bypass; that
    // env var already declares "I accept noise from concurrent runs."
    //
    // `acquire_source_tree_lock` does a non-blocking `try_flock`
    // first; on EWOULDBLOCK it surfaces the holder via
    // `/proc/locks` (so the operator's terminal shows which peer is
    // holding the lock) and then parks in a blocking `flock(LOCK_EX)`
    // until the holder releases. The wait is intentional: when the
    // peer's build finishes, the cache slot is likely populated and
    // the post-acquire cache check below short-circuits the
    // redundant rebuild. The pre-wait `eprintln!` inside
    // `acquire_source_tree_lock` ensures the operator sees what
    // they're waiting on rather than a silent stall.
    let _source_lock = if is_local_source && !crate::bypass_llc_locks_active() {
        Some(acquire_source_tree_lock(source_dir, cli_label)?)
    } else {
        None
    };

    // Post-acquire cache re-check. N peers racing on a cold cache all
    // queue on the source-tree EX above. When the first peer's build
    // completes and releases, the cache slot is populated — every
    // subsequent peer should observe the hit and skip a redundant
    // rebuild rather than serially repeat the same work. The
    // pre-acquire `cache_lookup` in `resolve_kernel_dir_to_entry`
    // catches the warm-cache case (no lock taken at all); this check
    // catches the cold-then-warmed-during-wait case.
    let mid_wait_state = compute_mid_wait_state(acquired, source_dir, is_local_source, cli_label);
    let mid_wait_clean = mid_wait_state == MidWaitState::Clean;

    // Only the states that attribute a rebuild to the wait carry a
    // message; `Clean` and `PreAcquireDirty` print nothing here.
    if let Some(body) = mid_wait_state.diagnostic() {
        eprintln!("{cli_label}: {body}");
    }

    // Short-circuit only when all three hold: the tree is unchanged,
    // the cache slot is populated, and its image file is on disk.
    if mid_wait_clean
        && let Some(entry) =
            crate::cli::resolve::cache_lookup(cache, &acquired.cache_key, cli_label)
        && entry.image_path().exists()
    {
        eprintln!("{cli_label}: {}", cache_hit_diagnostic(&acquired.cache_key));
        let image_path = entry.image_path();
        return Ok(KernelBuildResult {
            entry: Some(entry),
            image_path,
            post_build_is_dirty: false,
        });
    }

    // `--clean` runs `make mrproper` on local trees only; for any
    // other source it is reported as a no-op and the build proceeds.
    if clean {
        if !is_local_source {
            eprintln!(
                "{cli_label}: --clean is only meaningful with a --kernel <path> source (downloaded/cloned sources start clean)"
            );
        } else {
            eprintln!("{cli_label}: make mrproper");
            run_make(source_dir, &["mrproper"])?;
        }
    }

    reconfigure_and_build(source_dir, extra_kconfig, cli_label, make_jobs, progress)?;

    // Validate critical config options were not silently disabled.
    // When `--extra-kconfig` is set, attach an actionable hint
    // pointing at the user fragment as a likely cause. The most
    // plausible failure mode is a user override that disables a
    // baked-in invariant (e.g. a fragment containing
    // `# CONFIG_BPF is not set` defeats the BPF dep chain), so
    // name `--extra-kconfig` in the wrap context.
    validate_kernel_config(source_dir).with_context(|| {
        if extra_kconfig.is_some() {
            "post-build kernel config validation failed; check that your \
             --extra-kconfig fragment does not disable a CONFIG_X required by \
             ktstr (e.g. CONFIG_BPF, CONFIG_DEBUG_INFO_BTF, CONFIG_FTRACE, \
             CONFIG_SCHED_CLASS_EXT)"
                .to_string()
        } else {
            "post-build kernel config validation failed".to_string()
        }
    })?;

    // compile_commands.json is generated only when `acquired.is_temp`
    // is false.
    if !acquired.is_temp {
        generate_compile_commands(source_dir, progress)?;
    }

    let (image_path, vmlinux_opt) = find_built_image(source_dir, cli_label)?;
    let vmlinux_ref = vmlinux_opt.as_deref();

    // Cache (skip for dirty local trees).
    if acquired.is_dirty {
        eprintln!("{cli_label}: kernel built at {}", image_path.display());
        // Branch the hint wording: commit/stash is only an actionable
        // remediation for an actual git repo. A non-git source tree
        // is force-marked dirty (see `acquire_local_source` in
        // `fetch.rs`) because dirty detection is impossible, and
        // telling the operator to "commit or stash" leads nowhere.
        let hint = dirty_cache_skip_hint(acquired.is_git);
        eprintln!("{cli_label}: {hint}");
        return Ok(KernelBuildResult {
            entry: None,
            image_path,
            post_build_is_dirty: true,
        });
    }

    // Post-build dirty re-check: when the helper yields a result, it
    // is returned as-is and the store step below is skipped.
    if let Some(skip_result) = post_build_dirty_skip(
        acquired,
        source_dir,
        is_local_source,
        &image_path,
        cli_label,
    ) {
        return Ok(skip_result);
    }

    build_metadata_and_store(
        acquired,
        cache,
        cli_label,
        is_local_source,
        arch,
        image_name,
        extra_kconfig,
        source_dir,
        image_path,
        vmlinux_ref,
    )
}
602
603/// Classify the source tree at the post-acquire re-probe site.
604///
605/// Mid-wait edit guard: the operator may edit a tracked file in
606/// the source tree DURING our EX wait (long peer build = long
607/// window). `acquired.is_dirty` snapshots clean-at-acquire; a fresh
608/// probe via `inspect_local_source_state` catches edits that landed
609/// during the wait. If dirty/hash-changed, the operator's intent
610/// is "build what's on disk" — skip the cache re-check and fall
611/// through to the build branch, where the post-build dirty re-check
612/// at the cache-store site will recognise the mutation and skip
613/// caching. Probe errors are warnings (not fatal) — same Err
614/// disposition as the post-build re-check.
615/// PreAcquireDirty distinguishes "operator started with a dirty
616/// tree" (the wait wasn't the cause) from "operator dirtied the
617/// tree during the wait" (DirtyEdit). The split keeps the enum
618/// variants honest about cause-attribution per the
619/// [`MidWaitState::diagnostic`] dispatch in [`kernel_build_pipeline`].
620///
621/// TOCTOU acceptance: the source tree can mutate between this
622/// probe and the `cache_lookup` call in the caller — a microsecond
623/// window (typically) where an operator edit, background
624/// autoformatter write, `git commit`, or IDE pre-commit hook would
625/// slip through both this guard AND the post-build dirty re-check at
626/// the cache-store site (the cache-hit return in the caller
627/// short-circuits before `make`, so the post-build re-check never
628/// runs for the racing-into-hit path). Publication-side staleness is
629/// gated by the separate post-build dirty re-check at the cache-store
630/// site; this paragraph is about the consumer-side stale-serving
631/// window only. The cache slot is keyed on `acquired.cache_key`
632/// (frozen at acquire time inside `local_source`), so the served
633/// artifact's identity is the acquire-time HEAD — a mid-window
634/// mutation produces a cache hit that serves a slightly-stale
635/// source state without destroying the operator's later state.
636/// Bounded race; the operator's next invocation re-acquires and
637/// observes the new state.
638///
639/// "Next invocation correct" is NOT a remediation for: single-shot
640/// CI pipelines without retry, `git bisect run` invocations (each
641/// commit is independent), or pipelined CI flows where one job
642/// builds the kernel and a downstream job consumes the cached
643/// image without re-probing the source tree. Operators in those
644/// workflows should treat cache hits as acquire-time-correct, not
645/// invocation-time correct. Holding an EX flock across [probe,
646/// cache_lookup] or re-probing after the lookup were considered
647/// and rejected as adding common-path latency for a microsecond-
648/// wide window.
649fn compute_mid_wait_state(
650    acquired: &crate::fetch::AcquiredSource,
651    source_dir: &Path,
652    is_local_source: bool,
653    cli_label: &str,
654) -> MidWaitState {
655    if is_local_source && !acquired.is_dirty {
656        match crate::fetch::inspect_local_source_state(source_dir) {
657            Ok(post) => {
658                let hash_changed = post.short_hash
659                    != acquired
660                        .kernel_source
661                        .as_local_git_hash()
662                        .map(str::to_string);
663                if post.is_dirty {
664                    MidWaitState::DirtyEdit
665                } else if hash_changed {
666                    MidWaitState::HashAdvanced
667                } else {
668                    MidWaitState::Clean
669                }
670            }
671            Err(e) => {
672                tracing::warn!(
673                    cli_label = cli_label,
674                    err = %format!("{e:#}"),
675                    "mid-wait dirty re-check failed; proceeding to build",
676                );
677                MidWaitState::ProbeFailed
678            }
679        }
680    } else if acquired.is_dirty {
681        MidWaitState::PreAcquireDirty
682    } else {
683        MidWaitState::Clean
684    }
685}
686
687/// Merge the kconfig fragments, reconfigure when stale, then build.
688///
689/// Builds the merged fragment ONCE so the configure call observes
690/// the byte layout `{EMBEDDED_KCONFIG}\n{extra}` (with a `\n`
691/// interleave) defined in [`crate::merge_kconfig_fragments`]. The
692/// helper returns a `Cow<'_, str>` so the no-extras path borrows
693/// `EMBEDDED_KCONFIG` without allocating; only the user-fragment
694/// case heaps the merged string. Unit tests pin the exact
695/// ordering kbuild's last-wins rule operates on.
696///
697/// Reconfigures when any merged-fragment line is missing from the
698/// current `.config`. The prior `has_sched_ext` probe was a proxy for
699/// "configured" — but a stale `.config` from an earlier build can carry
700/// sched_ext while MISSING a changed baked-in value (e.g. an edited
701/// CONFIG_NR_CPUS in ktstr.kconfig) or every user `--extra-kconfig` line,
702/// silently ignoring the edit. `all_fragment_lines_present` checks the
703/// actual merged fragment (exact-line) instead, so an edited baked-in
704/// symbol or a user extra both trigger the merged configure.
705fn reconfigure_and_build(
706    source_dir: &Path,
707    extra_kconfig: Option<&str>,
708    cli_label: &str,
709    make_jobs: Option<usize>,
710    progress: &crate::cli::FetchProgress,
711) -> Result<()> {
712    let merged_fragment = crate::merge_kconfig_fragments(EMBEDDED_KCONFIG, extra_kconfig);
713
714    // Surface a `tracing::warn!` for each user fragment line that
715    // overrides a baked-in symbol from `EMBEDDED_KCONFIG`. The build
716    // proceeds with the user value winning (last-wins is the design
717    // intent) — the warning lets the operator see they are shadowing
718    // a baked-in setting before configure_kernel (which runs
719    // olddefconfig only when new lines are needed), which is when
720    // an over-aggressive override can still be addressed by editing
721    // the fragment. A separate post-build `validate_kernel_config`
722    // pass catches critical-baked-in disablement (e.g. CONFIG_BPF).
723    if let Some(extra) = extra_kconfig {
724        warn_extra_kconfig_overrides_baked_in(extra, cli_label);
725    }
726
727    let config_now = std::fs::read_to_string(source_dir.join(".config")).unwrap_or_default();
728    let needs_configure =
729        extra_kconfig.is_some() || !all_fragment_lines_present(&merged_fragment, &config_now);
730    if needs_configure {
731        let bar = progress.step_bar("Configuring kernel...");
732        let configure_result = configure_kernel(source_dir, &merged_fragment, Some(progress));
733        bar.finish();
734        // Wrap configure errors with `--extra-kconfig` context when
735        // extras are present so the user can pinpoint which input is
736        // responsible for an olddefconfig failure (e.g. a malformed
737        // `CONFIG_X=` line in their fragment).
738        configure_result.with_context(|| {
739            if extra_kconfig.is_some() {
740                "kernel configure failed (with --extra-kconfig fragment merged on top of \
741                 baked-in ktstr.kconfig); check the fragment for syntax errors or \
742                 conflicting symbol declarations"
743                    .to_string()
744            } else {
745                "kernel configure failed".to_string()
746            }
747        })?;
748
749        // Post-olddefconfig validation — warn (not error) when a
750        // user-requested option from `--extra-kconfig` did not
751        // survive into the final `.config` (typically because
752        // olddefconfig dropped it for an unmet dependency). Emits
753        // one `tracing::warn!` per dropped line naming the
754        // requested setting and the actual final value.
755        // The hard-fail "user override killed a baked-in invariant"
756        // case (e.g. user disabled `CONFIG_BPF`) is caught at
757        // `validate_kernel_config` post-build with extra context.
758        if let Some(extra) = extra_kconfig {
759            warn_dropped_extra_kconfig_lines(source_dir, extra, cli_label);
760        }
761    }
762
763    let bar = progress.step_bar("Building kernel...");
764    let build_result = make_kernel_with_output(source_dir, Some(progress), make_jobs);
765    bar.finish();
766    build_result?;
767    Ok(())
768}
769
770/// Generate `compile_commands.json` for local trees (LSP support).
771///
772/// The args MUST match `make_kernel_with_output`'s
773/// (`-jN`, `KCFLAGS=-Wno-error`) — otherwise make's "command line
774/// flag changed" detection invalidates the build's object cache
775/// and recompiles every translation unit single-threaded under
776/// the compile_commands.json rule.  Build the same `-jN +
777/// KCFLAGS=-Wno-error` prefix via `build_make_args`, then append
778/// the target.
779fn generate_compile_commands(
780    source_dir: &Path,
781    progress: &crate::cli::FetchProgress,
782) -> Result<()> {
783    let nproc = std::thread::available_parallelism()
784        .map(|n| n.get())
785        .unwrap_or(1);
786    let mut cc_args = build_make_args(nproc);
787    cc_args.push("compile_commands.json".into());
788    let cc_arg_refs: Vec<&str> = cc_args.iter().map(|s| s.as_str()).collect();
789    let bar = progress.step_bar("Generating compile_commands.json...");
790    let cc_result = run_make_with_output(source_dir, &cc_arg_refs, Some(progress));
791    bar.finish();
792    cc_result?;
793    Ok(())
794}
795
796/// Find the built kernel image and vmlinux. Returns `(image_path,
797/// vmlinux)` where `vmlinux` is `Some` only when `<source_dir>/vmlinux`
798/// exists; emits the operator-facing caching/missing diagnostic.
799fn find_built_image(
800    source_dir: &Path,
801    cli_label: &str,
802) -> Result<(std::path::PathBuf, Option<std::path::PathBuf>)> {
803    let image_path = crate::kernel_path::find_image_in_dir(source_dir)
804        .ok_or_else(|| anyhow::anyhow!("no kernel image found in {}", source_dir.display()))?;
805    let vmlinux_path = source_dir.join("vmlinux");
806    let vmlinux_opt = if vmlinux_path.exists() {
807        let orig_mib = std::fs::metadata(&vmlinux_path)
808            .map(|m| m.len() as f64 / (1024.0 * 1024.0))
809            .unwrap_or(0.0);
810        eprintln!("{cli_label}: caching vmlinux ({orig_mib:.0} MiB, will be stripped)");
811        Some(vmlinux_path)
812    } else {
813        eprintln!("{cli_label}: warning: vmlinux not found, BTF will not be cached");
814        None
815    };
816    Ok((image_path, vmlinux_opt))
817}
818
819/// Post-build dirty re-check. Returns `Some(result)` when the cache
820/// store must be skipped because the source tree changed during the
821/// build; `None` (proceed to store) otherwise.
822///
823/// `local_source` captures `is_dirty` ONCE at acquire time. The
824/// operator may then edit a tracked file (`.config` mutation, source
825/// patch) DURING the build window. The acquire-time `is_dirty=false`
826/// would say "safe to cache" but the on-disk content actually built
827/// differs from the HEAD commit recorded in the cache key — a future
828/// cache hit on that key would serve a build that no longer matches
829/// its identity. Re-running the same gix probes catches the race. On
830/// any change (dirty flip OR HEAD-hash shift from a concurrent
831/// commit), skip the cache store and emit a one-liner explaining why
832/// the cache slot was passed over.
833///
834/// Errors from the re-check are surfaced as a warning rather than a
835/// hard fail — the build itself succeeded; refusing to store on a
836/// re-check probe failure would penalize an otherwise-clean run for a
837/// transient gix glitch. The cache store proceeds with the original
838/// key, on the same pessimistic basis as a tree the re-check could not
839/// classify.
840fn post_build_dirty_skip(
841    acquired: &crate::fetch::AcquiredSource,
842    source_dir: &Path,
843    is_local_source: bool,
844    image_path: &Path,
845    cli_label: &str,
846) -> Option<KernelBuildResult> {
847    if is_local_source {
848        match crate::fetch::inspect_local_source_state(source_dir) {
849            Ok(post) => {
850                let (skip, hash_changed) =
851                    post_build_cache_store_skip(&post, acquired.kernel_source.as_local_git_hash());
852                if skip {
853                    eprintln!(
854                        "{cli_label}: source tree changed during build \
855                         (acquire-time dirty={}, post-build dirty={}; \
856                         hash_changed={hash_changed}); skipping cache store \
857                         to avoid recording a stale identity. Re-run after \
858                         the working tree settles to populate the cache.",
859                        acquired.is_dirty, post.is_dirty,
860                    );
861                    return Some(KernelBuildResult {
862                        entry: None,
863                        image_path: image_path.to_path_buf(),
864                        // Mid-build mutation flips the run's
865                        // reproducibility — the cache key recorded at
866                        // acquire time no longer identifies the actual
867                        // build input. Mirror that into the outcome so
868                        // the kernel-label downstream gets the
869                        // `_dirty` suffix.
870                        post_build_is_dirty: true,
871                    });
872                }
873            }
874            Err(e) => {
875                tracing::warn!(
876                    cli_label = cli_label,
877                    err = %format!("{e:#}"),
878                    "post-build dirty re-check failed; proceeding to cache store",
879                );
880            }
881        }
882    }
883    None
884}
885
886/// Build the kernel metadata + artifact set, store to cache, and
887/// return the clean (non-dirty) [`KernelBuildResult`].
888///
889/// `too_many_arguments` allow: the cache-store tail threads the
890/// acquire-time inputs (`acquired`, `arch`, `image_name`,
891/// `extra_kconfig`, `is_local_source`) plus the build outputs
892/// (`source_dir`, `image_path`, `vmlinux_ref`) the metadata records;
893/// bundling them into a struct would add indirection without
894/// changing the data flow.
895#[allow(clippy::too_many_arguments)]
896fn build_metadata_and_store(
897    acquired: &crate::fetch::AcquiredSource,
898    cache: &crate::cache::CacheDir,
899    cli_label: &str,
900    is_local_source: bool,
901    arch: &str,
902    image_name: &str,
903    extra_kconfig: Option<&str>,
904    source_dir: &Path,
905    image_path: std::path::PathBuf,
906    vmlinux_ref: Option<&Path>,
907) -> Result<KernelBuildResult> {
908    let config_hash = config_hash_for(source_dir)?;
909
910    // Two-segment metadata: the bare baked-in hash stays in
911    // `ktstr_kconfig_hash` so `kernel list`'s matches/stale/
912    // untracked verdict (see `CacheEntry::kconfig_status`) keeps
913    // comparing against the live `EMBEDDED_KCONFIG`, and the user
914    // extras hash lives in its own slot. Matches the cache-key
915    // suffix shape `kc{baked}-xkc{extra}` produced by
916    // [`crate::cache_key_suffix_with_extra`].
917    let kconfig_hash = embedded_kconfig_hash();
918    let extra_kconfig_hash_value = extra_kconfig.map(crate::extra_kconfig_hash);
919
920    // Source-tree vmlinux stat (size + mtime seconds) so a later
921    // `prefer_source_tree_for_dwarf` lookup can detect a user
922    // rebuild between cache store and DWARF read. Only meaningful
923    // for local sources whose vmlinux survived the build —
924    // `vmlinux_ref` is `None` if vmlinux wasn't found, in which
925    // case there's nothing to stat. mtime read is best-effort:
926    // failure leaves the validation pair `None` and prefers the
927    // pre-validation behavior for this entry.
928    let source_vmlinux_stat = source_vmlinux_stat_for(vmlinux_ref);
929
930    let mut metadata = crate::cache::KernelMetadata::new(
931        acquired.kernel_source.clone(),
932        arch,
933        image_name,
934        crate::test_support::now_iso8601(),
935    )
936    .with_ktstr_kconfig_hash(kconfig_hash);
937    if let Some(v) = acquired.version.clone() {
938        metadata = metadata.with_version(v);
939    }
940    if let Some(h) = config_hash {
941        metadata = metadata.with_config_hash(h);
942    }
943    if let Some(h) = extra_kconfig_hash_value {
944        metadata = metadata.with_extra_kconfig_hash(h);
945    }
946    if is_local_source && let Some((size, mtime_secs)) = source_vmlinux_stat {
947        metadata = metadata.with_source_vmlinux_stat(size, mtime_secs);
948    }
949
950    let mut artifacts = crate::cache::CacheArtifacts::new(&image_path);
951    if let Some(v) = vmlinux_ref {
952        artifacts = artifacts.with_vmlinux(v);
953    }
954    let entry = match cache.store(&acquired.cache_key, &artifacts, &metadata) {
955        Ok(entry) => {
956            success(&format!("\u{2713} Kernel cached: {}", acquired.cache_key));
957            eprintln!("{cli_label}: image: {}", entry.image_path().display());
958            if crate::remote_cache::is_enabled() {
959                crate::remote_cache::remote_store(&entry, cli_label);
960            }
961            Some(entry)
962        }
963        Err(e) => {
964            warn(&format!("{cli_label}: cache store failed: {e:#}"));
965            None
966        }
967    };
968
969    Ok(KernelBuildResult {
970        entry,
971        image_path,
972        post_build_is_dirty: false,
973    })
974}
975
976/// CRC32 of `<source_dir>/.config` rendered as a fixed 8-hex-digit
977/// lowercase string, or `None` when no `.config` exists. The
978/// zero-padded `{:08x}` width is the cache-key suffix contract — a
979/// `{:x}` (no pad) would drop a leading-zero nibble and silently
980/// re-key every cached build, defeating cache hits. Pulled out of
981/// [`kernel_build_pipeline`] so the derivation is unit-testable
982/// without a real `make`.
983pub(crate) fn config_hash_for(source_dir: &Path) -> std::io::Result<Option<String>> {
984    let config_path = source_dir.join(".config");
985    if config_path.exists() {
986        let data = std::fs::read(&config_path)?;
987        Ok(Some(format!("{:08x}", crc32fast::hash(&data))))
988    } else {
989        Ok(None)
990    }
991}
992
/// Stat the source-tree vmlinux and return `(size, mtime_secs)`.
///
/// Yields `None` when no path is given, when the path cannot be
/// stat'ed, or when the modification time is unavailable. Returning
/// `None` for a missing vmlinux keeps a later
/// `prefer_source_tree_for_dwarf` staleness check from comparing
/// against a phantom `(0, _)`.
///
/// `mtime_secs` is signed whole seconds relative to the Unix epoch: an
/// mtime before the epoch (clock skew) becomes a negative count
/// instead of discarding the stat.
///
/// Kept separate from [`kernel_build_pipeline`] for unit-testability.
pub(crate) fn source_vmlinux_stat_for(vmlinux_ref: Option<&Path>) -> Option<(u64, i64)> {
    let path = vmlinux_ref?;
    let Ok(meta) = std::fs::metadata(path) else {
        return None;
    };
    let Ok(modified) = meta.modified() else {
        return None;
    };
    let mtime_secs = match modified.duration_since(std::time::UNIX_EPOCH) {
        Ok(after_epoch) => after_epoch.as_secs() as i64,
        // `duration_since` only fails when `modified` precedes the
        // epoch; the error carries that distance as a positive
        // duration, which is negated here.
        Err(before_epoch) => -(before_epoch.duration().as_secs() as i64),
    };
    Some((meta.len(), mtime_secs))
}
1016
1017/// The cache-skip hint wording for a dirty local source tree. A git
1018/// repo can be remediated by commit/stash; a non-git tree is
1019/// force-marked dirty (dirty detection is impossible) so commit/stash
1020/// advice is unactionable — it gets the put-under-git wording
1021/// instead. Pulled out of [`kernel_build_pipeline`] for
1022/// unit-testability.
1023pub(crate) fn dirty_cache_skip_hint(is_git: bool) -> &'static str {
1024    if is_git {
1025        DIRTY_TREE_CACHE_SKIP_HINT
1026    } else {
1027        NON_GIT_TREE_CACHE_SKIP_HINT
1028    }
1029}
1030
1031/// The post-build cache-store-skip decision. After `make` returns,
1032/// `inspect_local_source_state` is re-run; the store is skipped when
1033/// the worktree went dirty during the build OR HEAD advanced (a
1034/// concurrent commit), because the acquire-time cache key no longer
1035/// identifies the built input. Returns `(skip, hash_changed)` so the
1036/// caller both decides and reports `hash_changed` in the skip message.
1037/// `acquired_local_git_hash` is the acquire-time hash from the kernel
1038/// source's `as_local_git_hash`. Pulled out of
1039/// [`kernel_build_pipeline`] for unit-testability.
1040pub(crate) fn post_build_cache_store_skip(
1041    post: &crate::fetch::LocalSourceState,
1042    acquired_local_git_hash: Option<&str>,
1043) -> (bool, bool) {
1044    let hash_changed = post.short_hash != acquired_local_git_hash.map(str::to_string);
1045    (post.is_dirty || hash_changed, hash_changed)
1046}
1047
1048#[cfg(test)]
1049mod tests {
1050    use super::super::super::kernel_cmd::KernelCommand;
1051    use super::*;
1052
1053    /// Returns `false` when `git` is not on `PATH`. Tests that drive
1054    /// a real git repo in a tempdir call this first and `return` early
1055    /// when git is unavailable so CI without git silently skips
1056    /// instead of failing on a hard-error.
1057    fn git_available() -> bool {
1058        std::process::Command::new("git")
1059            .arg("--version")
1060            .output()
1061            .is_ok()
1062    }
1063
1064    /// Runs `git` in `canonical` with the sandboxed env that the
1065    /// mid-wait tests share — neutralizes `~/.gitconfig` and
1066    /// `/etc/gitconfig` (so a CI host's git identity can't pollute
1067    /// the test repo) and pins author/committer identity so `commit`
1068    /// succeeds without depending on host config. Asserts the command
1069    /// exited successfully; failure surfaces stderr in the panic
1070    /// message.
1071    fn run_git(canonical: &Path, args: &[&str]) {
1072        let out = std::process::Command::new("git")
1073            .args(args)
1074            .current_dir(canonical)
1075            .env("GIT_CONFIG_GLOBAL", "/dev/null")
1076            .env("GIT_CONFIG_SYSTEM", "/dev/null")
1077            .env("GIT_AUTHOR_NAME", "ktstr-test")
1078            .env("GIT_AUTHOR_EMAIL", "ktstr-test@localhost")
1079            .env("GIT_COMMITTER_NAME", "ktstr-test")
1080            .env("GIT_COMMITTER_EMAIL", "ktstr-test@localhost")
1081            .output()
1082            .expect("git");
1083        assert!(
1084            out.status.success(),
1085            "git {args:?} failed: {}",
1086            String::from_utf8_lossy(&out.stderr)
1087        );
1088    }
1089
1090    /// Pins the post-acquire cache re-check at `kernel_build_pipeline`
1091    /// (the early-return path that fires when a peer publishes the
1092    /// cache slot during our source-tree EX wait).
1093    ///
1094    /// The early-return gate is 3-pronged: `!acquired.is_dirty` AND
1095    /// `cache_lookup(...).is_some()` AND `entry.image_path().exists()`.
1096    /// A regression that drops any prong (e.g. someone "simplifies"
1097    /// out the exists check) would let stale-manifest entries slip
1098    /// through and the runtime would crash later on a phantom image.
1099    ///
1100    /// Single-thread, deterministic — the "after EX wait" semantic
1101    /// reduces to "after the lookup, observe the planted state."
1102    /// Real thread orchestration is covered by
1103    /// `acquire_source_tree_lock_blocks_on_contention_then_succeeds`
1104    /// elsewhere in this module.
1105    #[test]
1106    fn cache_lookup_observes_peer_published_entry_after_ex_wait() {
1107        let _env_lock = crate::test_support::test_helpers::lock_env();
1108        let cache_tmp = tempfile::TempDir::new().expect("cache tempdir");
1109        let _cache_env = crate::test_support::test_helpers::EnvVarGuard::set(
1110            crate::KTSTR_CACHE_DIR_ENV,
1111            cache_tmp.path(),
1112        );
1113        let cache = crate::cache::CacheDir::with_root(cache_tmp.path().to_path_buf());
1114        let cache_key = "test-cache-key-7f8a9b";
1115
1116        // Plant a cache entry via `CacheDir::store` (the production
1117        // helper). Going through `store` rather than hand-writing
1118        // metadata.json keeps the test honest against schema drift.
1119        let (arch, image_name) = crate::fetch::arch_info();
1120        let staging = tempfile::TempDir::new().expect("staging tempdir");
1121        let fake_image = staging.path().join(image_name);
1122        std::fs::write(&fake_image, b"fake kernel image bytes").expect("write fake image");
1123        let metadata = crate::cache::KernelMetadata::new(
1124            crate::cache::KernelSource::Local {
1125                source_tree_path: None,
1126                git_hash: None,
1127            },
1128            arch,
1129            image_name,
1130            "2026-04-12T10:00:00Z",
1131        );
1132        let artifacts = crate::cache::CacheArtifacts::new(&fake_image);
1133        cache
1134            .store(cache_key, &artifacts, &metadata)
1135            .expect("plant cache entry");
1136
1137        // Exercise the 3-condition gate. `cache_lookup` is the same
1138        // helper `kernel_build_pipeline` calls at the post-acquire
1139        // re-check; `image_path().exists()` is the second gate; the
1140        // `is_dirty` gate is upstream (this test assumes a clean
1141        // source by construction since `acquired.is_dirty` is the
1142        // caller's responsibility).
1143        let entry = crate::cli::resolve::cache_lookup(&cache, cache_key, "test")
1144            .expect("cache_lookup must surface the planted entry");
1145        assert!(
1146            entry.image_path().exists(),
1147            "image_path existence check must hold for the planted entry",
1148        );
1149        assert_eq!(entry.metadata.built_at, "2026-04-12T10:00:00Z");
1150    }
1151
1152    /// Pins the HashAdvanced branch of [`MidWaitState`] classification
1153    /// at `kernel_build_pipeline` — operator advanced HEAD
1154    /// (`git commit`/`checkout`) during the peer's build wait, leaving
1155    /// the worktree clean but the short_hash bumped.
1156    ///
1157    /// Failure mode pinned: a future "simplification" that drops the
1158    /// `hash_changed` check and trusts only `post.is_dirty` would
1159    /// silently accept a cache slot keyed on the pre-commit hash even
1160    /// though the operator committed (clean post-state) on top during
1161    /// the wait. The served cache slot would correspond to an older
1162    /// HEAD than the operator's current source tree.
1163    #[test]
1164    fn mid_wait_hash_change_invalidates_cache_hit_skip() {
1165        if !git_available() {
1166            eprintln!(
1167                "mid_wait_hash_change_invalidates_cache_hit_skip: \
1168                 git unavailable, skipping"
1169            );
1170            return;
1171        }
1172
1173        let tmp = tempfile::TempDir::new().unwrap();
1174        let canonical = tmp.path().to_path_buf();
1175        run_git(&canonical, &["init", "-q", "-b", "main"]);
1176        std::fs::write(canonical.join("seed.txt"), "initial").unwrap();
1177        run_git(&canonical, &["add", "seed.txt"]);
1178        run_git(&canonical, &["commit", "-q", "-m", "initial"]);
1179
1180        let pre = crate::fetch::inspect_local_source_state(&canonical).expect("acquire-time probe");
1181        let acquired_hash = pre
1182            .short_hash
1183            .clone()
1184            .expect("clean repo must carry a short_hash");
1185
1186        // Mid-wait commit — different from the acquire-time hash.
1187        std::fs::write(canonical.join("file.txt"), "amended mid-wait").unwrap();
1188        run_git(&canonical, &["add", "file.txt"]);
1189        run_git(&canonical, &["commit", "-q", "-m", "mid-wait commit"]);
1190
1191        let post = crate::fetch::inspect_local_source_state(&canonical).expect("post-wait probe");
1192
1193        assert!(
1194            !post.is_dirty,
1195            "committed changes leave the worktree clean; the hash \
1196             change is what must invalidate the cache hit (not is_dirty)",
1197        );
1198        assert!(
1199            post.short_hash.is_some(),
1200            "clean post-wait state must carry a short_hash",
1201        );
1202        assert_ne!(
1203            post.short_hash.as_ref(),
1204            Some(&acquired_hash),
1205            "the new commit must yield a different short_hash than the \
1206             acquire-time hash",
1207        );
1208
1209        // Mirror the production ternary in `kernel_build_pipeline`'s
1210        // mid_wait_state classification.
1211        let hash_changed = post.short_hash != Some(acquired_hash);
1212        let state = if post.is_dirty {
1213            MidWaitState::DirtyEdit
1214        } else if hash_changed {
1215            MidWaitState::HashAdvanced
1216        } else {
1217            MidWaitState::Clean
1218        };
1219        assert_eq!(
1220            state,
1221            MidWaitState::HashAdvanced,
1222            "clean worktree + advanced HEAD must classify as HashAdvanced",
1223        );
1224        assert!(
1225            state != MidWaitState::Clean,
1226            "hash_changed=true must falsify mid_wait_clean, forcing a \
1227             rebuild for the new cache key",
1228        );
1229    }
1230
1231    /// Pins the Clean branch of [`MidWaitState`] classification at
1232    /// `kernel_build_pipeline` — the positive path where a peer's
1233    /// build wait completes with the source tree unchanged and the
1234    /// `cache_lookup` short-circuit fires.
1235    ///
1236    /// Failure mode pinned: a future refactor that flips the
1237    /// `if post.is_dirty` / `else if hash_changed` order, or one that
1238    /// inverts a `!is_dirty` check, would route a no-mutation
1239    /// post-wait probe into DirtyEdit or HashAdvanced and force a
1240    /// redundant rebuild every time. This test ensures the no-op
1241    /// path keeps returning [`MidWaitState::Clean`] so the cache
1242    /// short-circuit at the consumer site remains reachable.
1243    #[test]
1244    fn mid_wait_clean_path_allows_cache_hit_skip() {
1245        if !git_available() {
1246            eprintln!(
1247                "mid_wait_clean_path_allows_cache_hit_skip: \
1248                 git unavailable, skipping"
1249            );
1250            return;
1251        }
1252
1253        let tmp = tempfile::TempDir::new().unwrap();
1254        let canonical = tmp.path().to_path_buf();
1255        run_git(&canonical, &["init", "-q", "-b", "main"]);
1256        std::fs::write(canonical.join("seed.txt"), "initial").unwrap();
1257        run_git(&canonical, &["add", "seed.txt"]);
1258        run_git(&canonical, &["commit", "-q", "-m", "initial"]);
1259
1260        let pre = crate::fetch::inspect_local_source_state(&canonical).expect("acquire-time probe");
1261        let acquired_hash = pre
1262            .short_hash
1263            .clone()
1264            .expect("clean repo must carry a short_hash");
1265
1266        // No mid-wait mutation. Post-probe must observe the same hash
1267        // and a clean worktree.
1268        let post = crate::fetch::inspect_local_source_state(&canonical).expect("post-wait probe");
1269
1270        assert!(
1271            !post.is_dirty,
1272            "no mid-wait mutation must leave the post-wait probe clean",
1273        );
1274        assert_eq!(
1275            post.short_hash.as_ref(),
1276            Some(&acquired_hash),
1277            "no mid-wait commit must leave the short_hash unchanged",
1278        );
1279
1280        let hash_changed = post.short_hash != Some(acquired_hash);
1281        let state = if post.is_dirty {
1282            MidWaitState::DirtyEdit
1283        } else if hash_changed {
1284            MidWaitState::HashAdvanced
1285        } else {
1286            MidWaitState::Clean
1287        };
1288        assert_eq!(
1289            state,
1290            MidWaitState::Clean,
1291            "no-mutation post-wait state must classify as Clean so the \
1292             cache_lookup short-circuit fires",
1293        );
1294        assert_eq!(
1295            state.diagnostic(),
1296            None,
1297            "Clean must be silent — the cache-skip gate emits its own \
1298             diagnostic when the lookup hits",
1299        );
1300    }
1301
1302    /// Pins the DirtyEdit branch of [`MidWaitState`] classification at
1303    /// `kernel_build_pipeline` — operator edited a tracked file
1304    /// during the peer's build wait, post-wait probe surfaces
1305    /// `is_dirty=true` with no HEAD advance.
1306    ///
1307    /// Failure mode pinned: a future change that elides the
1308    /// `post.is_dirty` arm (e.g. trusting only `hash_changed`) would
1309    /// silently return a cache slot keyed on the pre-edit HEAD even
1310    /// though the operator's worktree no longer matches it — the
1311    /// rebuilt artifact would reflect the operator's local edits and
1312    /// the served cache slot would not.
1313    #[test]
1314    fn mid_wait_dirty_edit_invalidates_cache_hit_skip() {
1315        if !git_available() {
1316            eprintln!(
1317                "mid_wait_dirty_edit_invalidates_cache_hit_skip: \
1318                 git unavailable, skipping"
1319            );
1320            return;
1321        }
1322
1323        let tmp = tempfile::TempDir::new().unwrap();
1324        let canonical = tmp.path().to_path_buf();
1325        run_git(&canonical, &["init", "-q", "-b", "main"]);
1326        std::fs::write(canonical.join("seed.txt"), "initial").unwrap();
1327        run_git(&canonical, &["add", "seed.txt"]);
1328        run_git(&canonical, &["commit", "-q", "-m", "initial"]);
1329
1330        let pre = crate::fetch::inspect_local_source_state(&canonical).expect("acquire-time probe");
1331        let acquired_hash = pre
1332            .short_hash
1333            .clone()
1334            .expect("clean repo must carry a short_hash");
1335
1336        // Mid-wait edit to a tracked file (no commit). The post-wait
1337        // probe must classify this as DirtyEdit — same hash, dirty
1338        // worktree.
1339        std::fs::write(canonical.join("seed.txt"), "operator edit during wait").unwrap();
1340
1341        let post = crate::fetch::inspect_local_source_state(&canonical).expect("post-wait probe");
1342
1343        assert!(
1344            post.is_dirty,
1345            "uncommitted edit to a tracked file must mark the post-wait \
1346             probe dirty",
1347        );
1348
1349        let hash_changed = post.short_hash != Some(acquired_hash);
1350        let state = if post.is_dirty {
1351            MidWaitState::DirtyEdit
1352        } else if hash_changed {
1353            MidWaitState::HashAdvanced
1354        } else {
1355            MidWaitState::Clean
1356        };
1357        assert_eq!(
1358            state,
1359            MidWaitState::DirtyEdit,
1360            "dirty worktree without HEAD advance must classify as DirtyEdit",
1361        );
1362        assert!(
1363            state != MidWaitState::Clean,
1364            "DirtyEdit must falsify mid_wait_clean — the cache slot \
1365             corresponds to pre-edit state",
1366        );
1367    }
1368
1369    /// Pins the ProbeFailed branch of [`MidWaitState`] classification at
1370    /// `kernel_build_pipeline` — the probe used to re-check the source
1371    /// tree returned `Err` and the pipeline conservatively rebuilds.
1372    ///
1373    /// Provoke strategy: init + commit, then truncate `.git/HEAD` to
1374    /// empty so `gix::discover` still succeeds (the `.git` dir
1375    /// exists) but `repo.head_id()` fails on the malformed ref —
1376    /// that error path is `inspect_local_source_state`'s only route
1377    /// to `Result::Err`. The non-git arm of `gix::discover` returns
1378    /// `Ok((None, true, false))`, NOT an `Err`, so simply removing
1379    /// `.git` does not reach ProbeFailed.
1380    ///
1381    /// Failure mode pinned: a future refactor that treats probe
1382    /// errors as Clean would silently return a cache slot keyed on
1383    /// unknowable post-wait state. The conservative-rebuild
1384    /// disposition is correct precisely because the alternative
1385    /// hides genuine corruption from the operator.
1386    #[test]
1387    fn mid_wait_probe_failure_invalidates_cache_hit_skip() {
1388        if !git_available() {
1389            eprintln!(
1390                "mid_wait_probe_failure_invalidates_cache_hit_skip: \
1391                 git unavailable, skipping"
1392            );
1393            return;
1394        }
1395
1396        let tmp = tempfile::TempDir::new().unwrap();
1397        let canonical = tmp.path().to_path_buf();
1398        run_git(&canonical, &["init", "-q", "-b", "main"]);
1399        std::fs::write(canonical.join("seed.txt"), "initial").unwrap();
1400        run_git(&canonical, &["add", "seed.txt"]);
1401        run_git(&canonical, &["commit", "-q", "-m", "initial"]);
1402
1403        let pre = crate::fetch::inspect_local_source_state(&canonical).expect("acquire-time probe");
1404        assert!(
1405            pre.short_hash.is_some(),
1406            "pre-corruption probe must succeed (the corruption happens \
1407             mid-wait, not at acquire time)",
1408        );
1409
1410        // Corrupt HEAD mid-wait. `gix::discover` still sees `.git/`
1411        // and succeeds; the subsequent `head_id()` call fails on the
1412        // empty ref and `inspect_local_source_state` propagates the
1413        // error.
1414        std::fs::write(canonical.join(".git/HEAD"), b"").expect("truncate .git/HEAD");
1415
1416        let post = crate::fetch::inspect_local_source_state(&canonical);
1417        assert!(
1418            post.is_err(),
1419            "truncated .git/HEAD must surface as a probe error, not a \
1420             silent Clean classification — found: {post:?}",
1421        );
1422
1423        // Mirror the production dispatch: probe Err → ProbeFailed,
1424        // which falsifies mid_wait_clean and forces a rebuild.
1425        let state = match post {
1426            Ok(_) => MidWaitState::Clean,
1427            Err(_) => MidWaitState::ProbeFailed,
1428        };
1429        assert_eq!(
1430            state,
1431            MidWaitState::ProbeFailed,
1432            "probe Err must classify as ProbeFailed",
1433        );
1434        assert!(
1435            state != MidWaitState::Clean,
1436            "ProbeFailed must falsify mid_wait_clean — unknowable state \
1437             cannot be assumed Clean",
1438        );
1439    }
1440
1441    /// Pins the non-local-source branch of [`MidWaitState`]
1442    /// classification at `kernel_build_pipeline` — when the source
1443    /// came from a non-local kernel spec (e.g. `Git+ref`,
1444    /// `Tarball`, downloaded archive), the outer
1445    /// `if is_local_source && !acquired.is_dirty` guard short-circuits
1446    /// the probe entirely and the fall-through reaches
1447    /// [`MidWaitState::Clean`] via the `else { Clean }` arm.
1448    ///
1449    /// Failure mode pinned: a future refactor that inverts the outer
1450    /// guard (e.g. mistakenly calls `inspect_local_source_state` on a
1451    /// Git+ref source, which doesn't have a meaningful local probe
1452    /// target) would route a non-local source into the probe branch
1453    /// and likely surface ProbeFailed against a non-git tree — a
1454    /// noisy regression. This test pins the no-probe short-circuit.
1455    #[test]
1456    fn mid_wait_non_local_source_classifies_as_clean() {
1457        // Mirror the outer production switch with is_local_source=false.
1458        // No probe call — the outer `if is_local_source && !acquired.is_dirty`
1459        // guard short-circuits when !is_local_source, falling through
1460        // to the `else if acquired.is_dirty` / else arms.
1461        let is_local_source = false;
1462        let acquired_is_dirty = false;
1463        let state = if is_local_source && !acquired_is_dirty {
1464            unreachable!(
1465                "is_local_source=false must skip the probe branch — the \
1466                 outer guard requires both is_local_source AND \
1467                 !acquired.is_dirty to reach the probe arm"
1468            )
1469        } else if acquired_is_dirty {
1470            MidWaitState::PreAcquireDirty
1471        } else {
1472            MidWaitState::Clean
1473        };
1474        assert_eq!(
1475            state,
1476            MidWaitState::Clean,
1477            "non-local clean source must classify as Clean — the cache \
1478             short-circuit applies to any source whose state we cannot \
1479             probe (or did not need to probe)",
1480        );
1481        assert_eq!(
1482            state.diagnostic(),
1483            None,
1484            "Clean non-local source must be silent",
1485        );
1486    }
1487
1488    /// Pins the PreAcquireDirty variant identity and its silent
1489    /// diagnostic — `MidWaitState::PreAcquireDirty.diagnostic()`
1490    /// returns `None` because the wait was not the cause of the
1491    /// dirty state.
1492    ///
1493    /// SCOPE: does NOT exercise the caller-side dispatch order in
1494    /// `kernel_build_pipeline` — the test reconstructs the
1495    /// `if is_local_source && !acquired.is_dirty / else if
1496    /// acquired.is_dirty / else` chain inline because PreAcquireDirty
1497    /// is constructed without any probe call. A future refactor that
1498    /// flipped the guard order in `kernel_build_pipeline` would not
1499    /// fail this test; the other 4 mid_wait tests ground against
1500    /// `inspect_local_source_state` and would catch a probe-arm
1501    /// regression. This test pins the variant + diagnostic pair only.
1502    #[test]
1503    fn mid_wait_pre_acquire_dirty_suppresses_wait_diagnostic() {
1504        // Mirror the production dispatch with acquired.is_dirty=true.
1505        // No probe call — the `else if acquired.is_dirty` arm fires
1506        // before the probe-bearing branch. If the guard structure in
1507        // `kernel_build_pipeline` changes (e.g. PreAcquireDirty moves
1508        // inside the probe match), update this mirror.
1509        let is_local_source = true;
1510        let acquired_is_dirty = true;
1511        let state = if is_local_source && !acquired_is_dirty {
1512            unreachable!(
1513                "the guard requires !acquired.is_dirty before the probe \
1514                 branch; acquired_is_dirty=true must skip this arm"
1515            )
1516        } else if acquired_is_dirty {
1517            MidWaitState::PreAcquireDirty
1518        } else {
1519            MidWaitState::Clean
1520        };
1521        assert_eq!(
1522            state,
1523            MidWaitState::PreAcquireDirty,
1524            "acquired.is_dirty=true must classify as PreAcquireDirty",
1525        );
1526        assert_eq!(
1527            state.diagnostic(),
1528            None,
1529            "PreAcquireDirty must be silent — the wait was not the \
1530             cause of the dirty state, so a wait-related diagnostic \
1531             would fabricate attribution",
1532        );
1533    }
1534
1535    /// Pins the exact diagnostic bodies emitted by each
1536    /// [`MidWaitState`] variant so a future copywriting change to
1537    /// the operator-facing messages is a deliberate, reviewed
1538    /// edit rather than silent drift.
1539    ///
1540    /// Clean and PreAcquireDirty return `None` (silent). DirtyEdit,
1541    /// HashAdvanced, and ProbeFailed return their full body strings
1542    /// without the `{cli_label}: ` prefix — the caller composes the
1543    /// prefix at the eprintln site.
1544    #[test]
1545    fn mid_wait_state_diagnostics_pinned() {
1546        assert_eq!(MidWaitState::Clean.diagnostic(), None);
1547        assert_eq!(MidWaitState::PreAcquireDirty.diagnostic(), None);
1548        assert_eq!(
1549            MidWaitState::DirtyEdit.diagnostic(),
1550            Some(
1551                "source tree changed during peer's build wait \
1552                 — rebuilding to capture your local edits"
1553            ),
1554        );
1555        assert_eq!(
1556            MidWaitState::HashAdvanced.diagnostic(),
1557            Some(
1558                "source HEAD advanced during peer's build wait \
1559                 — rebuilding for the new commit"
1560            ),
1561        );
1562        assert_eq!(
1563            MidWaitState::ProbeFailed.diagnostic(),
1564            Some(
1565                "source-tree dirty re-check failed during peer's \
1566                 build wait — rebuilding conservatively (re-run with \
1567                 RUST_LOG=warn for the probe error)"
1568            ),
1569        );
1570    }
1571
1572    /// Pins the exact diagnostic body emitted by the post-mid-wait
1573    /// `cache_lookup` short-circuit so a future copywriting change
1574    /// to the operator-facing message is a deliberate, reviewed
1575    /// edit rather than silent drift — parallel to
1576    /// [`mid_wait_state_diagnostics_pinned`] for the
1577    /// [`MidWaitState`] family.
1578    ///
1579    /// Two assertions: a byte-for-byte match on the formatted body
1580    /// with a representative cache_key, plus an inequality between
1581    /// two distinct keys to prove the `{cache_key}` placeholder is
1582    /// load-bearing (catches a regression that replaces the
1583    /// substitution with a static label and would silently produce
1584    /// a constant string regardless of input).
1585    #[test]
1586    fn cache_hit_diagnostic_pinned() {
1587        let cache_key = "test-cache-key-7f8a9b";
1588        assert_eq!(
1589            cache_hit_diagnostic(cache_key),
1590            "concurrent ktstr build populated cache slot test-cache-key-7f8a9b \
1591             during peer's build wait — skipping redundant rebuild",
1592        );
1593        assert_ne!(
1594            cache_hit_diagnostic(cache_key),
1595            cache_hit_diagnostic("different-key-x86-64"),
1596            "cache_key substitution must be load-bearing, not a no-op",
1597        );
1598    }
1599
1600    /// `kernel build --cpu-cap N` parses through clap into
1601    /// `KernelCommand::Build { cpu_cap: Some(N), .. }`. Pins the
1602    /// flag's wire path: a future rename of the field, a stray
1603    /// `default_value`, or a `value_parser` change that altered
1604    /// rejection semantics would surface as a parse failure or a
1605    /// shape mismatch on the assertion.
1606    #[test]
1607    fn kernel_build_parses_cpu_cap_without_extra_flags() {
1608        use clap::Parser as _;
1609        #[derive(clap::Parser, Debug)]
1610        struct TestCli {
1611            #[command(subcommand)]
1612            cmd: KernelCommand,
1613        }
1614        let parsed =
1615            TestCli::try_parse_from(["prog", "build", "--kernel", "6.14.2", "--cpu-cap", "4"])
1616                .expect("kernel build --cpu-cap N must parse");
1617        match parsed.cmd {
1618            KernelCommand::Build {
1619                cpu_cap, kernel, ..
1620            } => {
1621                assert_eq!(cpu_cap, Some(4));
1622                assert_eq!(kernel.as_deref(), Some("6.14.2"));
1623            }
1624            other => panic!("expected KernelCommand::Build, got {other:?}"),
1625        }
1626    }
1627
1628    /// `kernel build` without `--cpu-cap` parses with `cpu_cap: None`
1629    /// — the "unset" sentinel the downstream planner expands into the
1630    /// 30%-of-allowed default. Pins the no-flag path so a future
1631    /// rename of the clap field or a stray `default_value = "0"`
1632    /// surfaces as a test failure, not a silent runtime behavior change.
1633    #[test]
1634    fn kernel_build_without_cpu_cap_defaults_to_none() {
1635        use clap::Parser as _;
1636        #[derive(clap::Parser, Debug)]
1637        struct TestCli {
1638            #[command(subcommand)]
1639            cmd: KernelCommand,
1640        }
1641        let parsed = TestCli::try_parse_from(["prog", "build", "--kernel", "6.14.2"])
1642            .expect("kernel build without --cpu-cap must parse");
1643        match parsed.cmd {
1644            KernelCommand::Build { cpu_cap, .. } => {
1645                assert_eq!(cpu_cap, None, "no --cpu-cap must produce None, not Some(0)",);
1646            }
1647            other => panic!("expected KernelCommand::Build, got {other:?}"),
1648        }
1649    }
1650
1651    /// `kernel build --cpu-cap 0` parses successfully at clap level
1652    /// — the "must be ≥ 1" check lives in [`CpuCap::new`], not in
1653    /// the clap value parser. Pins the two-layer validation: clap
1654    /// accepts any usize; runtime resolution via `CpuCap::resolve` is
1655    /// responsible for the "0 is rejected" diagnostic.
1656    #[test]
1657    fn kernel_build_cpu_cap_zero_passes_clap() {
1658        use clap::Parser as _;
1659        #[derive(clap::Parser, Debug)]
1660        struct TestCli {
1661            #[command(subcommand)]
1662            cmd: KernelCommand,
1663        }
1664        let parsed =
1665            TestCli::try_parse_from(["prog", "build", "--kernel", "6.14.2", "--cpu-cap", "0"])
1666                .expect("clap-level parse must accept 0; runtime validation rejects");
1667        match parsed.cmd {
1668            KernelCommand::Build { cpu_cap, .. } => {
1669                assert_eq!(
1670                    cpu_cap,
1671                    Some(0),
1672                    "clap parses 0 verbatim; validation is downstream",
1673                );
1674            }
1675            other => panic!("expected KernelCommand::Build, got {other:?}"),
1676        }
1677    }
1678
1679    // ---------------------------------------------------------------
1680    // kernel_build_pipeline reservation phase — factored-out
1681    // `acquire_build_reservation` covers the cpu_cap → acquire →
1682    // sandbox → make_jobs flow without needing a real kernel source.
1683    // ---------------------------------------------------------------
1684
1685    /// Serialize `KTSTR_BYPASS_LLC_LOCKS` env-var mutation across test
1686    /// threads. Delegates to the ONE crate-wide env mutex so it
1687    /// serializes against EVERY env-touching test (process-wide
1688    /// `std::env`), including the builder tests that read
1689    /// `KTSTR_BYPASS_LLC_LOCKS` — a module-local mutex left them racing.
1690    /// `lock_env()` recovers from poison.
1691    fn bypass_env_lock() -> std::sync::MutexGuard<'static, ()> {
1692        crate::test_support::test_helpers::lock_env()
1693    }
1694
1695    /// RAII guard for scoped `KTSTR_BYPASS_LLC_LOCKS` mutation.
1696    /// Caller holds `bypass_env_lock()` before constructing.
1697    struct BypassGuard;
1698    impl BypassGuard {
1699        fn set(value: &str) -> Self {
1700            // SAFETY: env_lock held by caller; serializes with
1701            // every other env-mutating test.
1702            unsafe {
1703                std::env::set_var(crate::KTSTR_BYPASS_LLC_LOCKS_ENV, value);
1704            }
1705            BypassGuard
1706        }
1707        fn remove() -> Self {
1708            // SAFETY: caller holds env_lock.
1709            unsafe {
1710                std::env::remove_var(crate::KTSTR_BYPASS_LLC_LOCKS_ENV);
1711            }
1712            BypassGuard
1713        }
1714    }
1715    impl Drop for BypassGuard {
1716        fn drop(&mut self) {
1717            // SAFETY: guard lifetime bounded by env_lock held by
1718            // caller; Drop runs before the mutex guard releases.
1719            unsafe {
1720                std::env::remove_var(crate::KTSTR_BYPASS_LLC_LOCKS_ENV);
1721            }
1722        }
1723    }
1724
1725    /// `acquire_build_reservation` with `KTSTR_BYPASS_LLC_LOCKS=1`
1726    /// plus `cpu_cap=None` returns a no-reservation `BuildReservation`:
1727    /// plan, sandbox, and make_jobs all None. Pins the "bypass
1728    /// disables both layers" contract.
1729    #[test]
1730    fn acquire_build_reservation_bypass_returns_no_reservation() {
1731        let _lock = bypass_env_lock();
1732        let _env = BypassGuard::set("1");
1733        let r = acquire_build_reservation("test", None).expect("bypass + no cap must succeed");
1734        assert!(r.plan.is_none(), "bypass must produce no LLC plan");
1735        assert!(
1736            r._sandbox.is_none(),
1737            "bypass must produce no cgroup sandbox",
1738        );
1739        assert!(
1740            r.make_jobs.is_none(),
1741            "bypass must fall back to nproc (None signals to caller)",
1742        );
1743    }
1744
1745    /// Regression pin: empty-string-as-unset contract for
1746    /// `KTSTR_BYPASS_LLC_LOCKS`. A bare `KTSTR_BYPASS_LLC_LOCKS=`
1747    /// (CI shells, Docker `--env` pass-through without value) must
1748    /// NOT activate the bypass — the reader at L102 uses
1749    /// `.is_some_and(|v| !v.is_empty())` and that contract is
1750    /// shared by all 7 sibling readers. If a future contributor
1751    /// flips to `.is_some_and(|_| true)` or bare `.is_ok()`, this
1752    /// test catches the regression before it silently disables LLC
1753    /// flock contention enforcement in CI.
1754    #[test]
1755    fn acquire_build_reservation_bypass_empty_string_rejected() {
1756        let _lock = bypass_env_lock();
1757        let _env = BypassGuard::set("");
1758        match acquire_build_reservation("test", None) {
1759            Ok(r) => {
1760                // Empty-as-unset means we take the standard branch,
1761                // not the bypass branch. Standard branch produces a
1762                // BuildReservation with plan / sandbox / make_jobs
1763                // tied together (set-or-unset together per the
1764                // `plan_and_make_jobs_consistent` invariant). If the
1765                // bypass had been (incorrectly) triggered, all 3
1766                // would be None.
1767                assert_eq!(
1768                    r.plan.is_some(),
1769                    r.make_jobs.is_some(),
1770                    "empty-string must NOT activate bypass — plan + make_jobs \
1771                     should follow the standard-branch invariant",
1772                );
1773            }
1774            Err(e) => {
1775                // Sysfs-unreadable host: standard branch failed for
1776                // unrelated reasons. The empty-string-as-unset
1777                // contract is still proven because the bypass branch
1778                // would have returned `Ok` with all-None fields (per
1779                // the `bypass_returns_no_reservation` test); reaching
1780                // Err proves the standard branch was taken.
1781                eprintln!("standard-branch error confirms bypass was NOT taken (good): {e:#}");
1782            }
1783        }
1784    }
1785
1786    /// `acquire_build_reservation` with `KTSTR_BYPASS_LLC_LOCKS=1`
1787    /// plus `cpu_cap=Some(_)` must error with the "resource contract"
1788    /// substring. Pins the conflict check at the pipeline's
1789    /// reservation entry point.
1790    #[test]
1791    fn acquire_build_reservation_bypass_with_cap_errors() {
1792        let _lock = bypass_env_lock();
1793        let _env = BypassGuard::set("1");
1794        let cap = crate::vmm::host_topology::CpuCap::new(2).expect("cap=2 valid");
1795        let err =
1796            acquire_build_reservation("test", Some(cap)).expect_err("bypass + cap must error");
1797        let msg = format!("{err:#}");
1798        assert!(
1799            msg.contains("resource contract"),
1800            "err must name the resource contract: {msg}",
1801        );
1802    }
1803
1804    /// `acquire_build_reservation` without bypass on a sysfs-capable
1805    /// host: returns a `BuildReservation` whose fields populate
1806    /// consistently — plan.is_some() iff make_jobs.is_some() iff
1807    /// sandbox.is_some(). Pins the "plan and make_jobs must never
1808    /// diverge" invariant.
1809    #[test]
1810    fn acquire_build_reservation_plan_and_make_jobs_consistent() {
1811        let _lock = bypass_env_lock();
1812        let _env = BypassGuard::remove();
1813        match acquire_build_reservation("test", None) {
1814            Ok(r) => {
1815                assert_eq!(
1816                    r.plan.is_some(),
1817                    r.make_jobs.is_some(),
1818                    "plan and make_jobs must agree on reservation presence",
1819                );
1820                if let (Some(p), Some(jobs)) = (r.plan.as_ref(), r.make_jobs) {
1821                    assert_eq!(
1822                        jobs,
1823                        crate::vmm::host_topology::make_jobs_for_plan(p),
1824                        "make_jobs must equal make_jobs_for_plan(&plan)",
1825                    );
1826                }
1827                assert_eq!(
1828                    r.plan.is_some(),
1829                    r._sandbox.is_some(),
1830                    "sandbox and plan must agree on reservation presence",
1831                );
1832            }
1833            Err(e) => {
1834                // Sysfs-unreadable host or contested LLCs. Accept
1835                // either outcome; the test's intent is to pin the
1836                // invariant in the success case, not force success.
1837                eprintln!("acquire_build_reservation unavailable on this host: {e:#}");
1838            }
1839        }
1840    }
1841
1842    /// `acquire_build_reservation` plain bypass (no `--cpu-cap`)
1843    /// must NOT touch the sysfs probe. The test sets the bypass and
1844    /// confirms no error escapes, even on a host whose
1845    /// `HostTopology::from_sysfs()` would otherwise fail (the
1846    /// bypass branch is taken FIRST in the function, before the
1847    /// sysfs probe is attempted). Pins the "bypass short-circuits
1848    /// the topology probe" branch shape — a regression that
1849    /// re-ordered the bypass check below the sysfs probe would
1850    /// surface as a sysfs-error escape.
1851    #[test]
1852    fn acquire_build_reservation_bypass_does_not_touch_sysfs() {
1853        let _lock = bypass_env_lock();
1854        let _env = BypassGuard::set("1");
1855        let r = acquire_build_reservation("test", None)
1856            .expect("bypass must succeed regardless of sysfs availability");
1857        // The bypass branch produces (None, None, None) by
1858        // construction — no further state to assert beyond the
1859        // sibling tests that already pin the field shape.
1860        assert!(r.plan.is_none());
1861        assert!(r._sandbox.is_none());
1862        assert!(r.make_jobs.is_none());
1863    }
1864
1865    // ---------------------------------------------------------------
1866    // acquire_source_tree_lock — per-source-tree flock that
1867    // serializes parallel builds against the same on-disk source.
1868    // ---------------------------------------------------------------
1869    //
1870    // Tests use `isolated_cache_dir()` to point `KTSTR_CACHE_DIR` at
1871    // a tempdir for the test's lifetime, so the production
1872    // `CacheDir::new()` resolves into the tempdir without touching
1873    // the operator's real cache directory. The lockfile path is
1874    // deterministic (cache_root/.locks/source-{path_hash}.lock) so
1875    // we can re-derive it from the canonical input path and assert
1876    // its presence.
1877
1878    /// `acquire_source_tree_lock` on a fresh canonical path under
1879    /// an isolated cache root succeeds (no peer holding the lock)
1880    /// and creates the lockfile under `cache_root/.locks/`. Pins
1881    /// the lockfile placement: a regression that moved the lockfile
1882    /// to `/tmp/` (where `tmpwatch` could sweep it under an active
1883    /// holder) would surface here as the assertion failing on
1884    /// "lockfile not found at expected path."
1885    #[test]
1886    fn acquire_source_tree_lock_succeeds_on_fresh_path() {
1887        use crate::test_support::test_helpers::{isolated_cache_dir, lock_env};
1888        let _env_lock = lock_env();
1889        let cache = isolated_cache_dir();
1890        let canonical = std::path::PathBuf::from("/tmp/fake-source-tree-for-test");
1891        let fd = acquire_source_tree_lock(&canonical, "test")
1892            .expect("fresh-path acquire must succeed under isolated cache");
1893        // Lockfile must land under the isolated cache root's
1894        // `.locks/` subdirectory. The naming is `source-{hash}.lock`
1895        // where `{hash}` is `canonical_path_hash(canonical)`.
1896        let path_hash = crate::fetch::canonical_path_hash(&canonical);
1897        let expected = cache
1898            .path()
1899            .join(crate::flock::LOCK_DIR_NAME)
1900            .join(format!("source-{path_hash}.lock"));
1901        assert!(
1902            expected.exists(),
1903            "lockfile must exist at {} after acquire",
1904            expected.display(),
1905        );
1906        // Drop the FD explicitly to release the flock before the
1907        // tempdir cleanup races with it.
1908        drop(fd);
1909    }
1910
1911    /// `acquire_source_tree_lock` returns the SAME lockfile path
1912    /// for two different canonical inputs IFF they share the same
1913    /// `canonical_path_hash`. Two distinct inputs (`/srv/linux-a`
1914    /// and `/srv/linux-b`) must produce DIFFERENT lockfiles so
1915    /// concurrent builds against unrelated source trees don't
1916    /// serialize against each other. Pins the per-tree
1917    /// disambiguation contract.
1918    #[test]
1919    fn acquire_source_tree_lock_distinct_paths_yield_distinct_lockfiles() {
1920        use crate::test_support::test_helpers::{isolated_cache_dir, lock_env};
1921        let _env_lock = lock_env();
1922        let cache = isolated_cache_dir();
1923        let path_a = std::path::PathBuf::from("/tmp/fake-source-a");
1924        let path_b = std::path::PathBuf::from("/tmp/fake-source-b");
1925        let fd_a = acquire_source_tree_lock(&path_a, "test")
1926            .expect("path A acquire must succeed under isolated cache");
1927        // Acquiring path B while path A's lock is still held must
1928        // ALSO succeed — they hash to different lockfiles, so
1929        // there's no contention.
1930        let fd_b = acquire_source_tree_lock(&path_b, "test").expect(
1931            "path B acquire must succeed concurrently with A — \
1932                 distinct canonical paths must hash to distinct \
1933                 lockfiles so unrelated builds don't serialize",
1934        );
1935        let hash_a = crate::fetch::canonical_path_hash(&path_a);
1936        let hash_b = crate::fetch::canonical_path_hash(&path_b);
1937        assert_ne!(
1938            hash_a, hash_b,
1939            "distinct canonical paths must produce distinct CRC32 hashes",
1940        );
1941        let lock_a = cache
1942            .path()
1943            .join(crate::flock::LOCK_DIR_NAME)
1944            .join(format!("source-{hash_a}.lock"));
1945        let lock_b = cache
1946            .path()
1947            .join(crate::flock::LOCK_DIR_NAME)
1948            .join(format!("source-{hash_b}.lock"));
1949        assert!(lock_a.exists());
1950        assert!(lock_b.exists());
1951        assert_ne!(lock_a, lock_b);
1952        drop(fd_a);
1953        drop(fd_b);
1954    }
1955
1956    /// `acquire_source_tree_lock` on a path whose lockfile is
1957    /// already held by a peer parks in a blocking flock(2) until the
1958    /// holder releases, then succeeds. Pins the try-then-wait
1959    /// contract: a regression that re-introduced the bail-on-EWOULDBLOCK
1960    /// behavior, or any other path that returns without ever calling
1961    /// `flock(LOCK_EX)` blocking, would surface here as either the
1962    /// `/proc/locks` waiter scan timing out (no `-> FLOCK` line ever
1963    /// appears against the lockfile inode) or the worker's elapsed
1964    /// time being below the holder-retention window.
1965    ///
1966    /// We simulate "concurrent peer" by holding the first FD on the
1967    /// main thread, spawn a worker that issues a second acquire (which
1968    /// blocks in `block_flock`), poll `/proc/locks` until the kernel
1969    /// records the worker as a waiter against the lockfile inode
1970    /// (kernel emits blocked flock waiters as lines containing both
1971    /// `->` and the `{major:02x}:{minor:02x}:{inode}` triple — see
1972    /// `fs/locks.c::lock_get_status`), retain the holder for a fixed
1973    /// window after the waiter appears so the worker's blocking call
1974    /// can be measured, drop the holder, then collect the worker's
1975    /// `Result` via `recv_timeout` so a real regression that caused
1976    /// the worker to hang forever surfaces as a bounded test failure
1977    /// rather than an indefinite test-runner stall.
1978    ///
1979    /// Two assertions guard the blocking semantic together:
1980    ///   1. The `/proc/locks` waiter scan: proves the worker entered
1981    ///      the kernel's blocked-flock state. A non-blocking
1982    ///      regression never enters that state.
1983    ///   2. The worker's measured elapsed time `>= HOLD_WINDOW`:
1984    ///      proves the worker stayed parked until the holder
1985    ///      released. A non-blocking regression that eagerly
1986    ///      returned `Err` would record a near-zero elapsed time
1987    ///      even if the waiter scan happened to be flaky.
#[test]
fn acquire_source_tree_lock_blocks_on_contention_then_succeeds() {
    use crate::test_support::test_helpers::{isolated_cache_dir, lock_env};
    // `_env_lock` and `cache` MUST outlive the spawned worker
    // thread. The worker reads `KTSTR_CACHE_DIR` inside
    // `acquire_source_tree_lock`'s `CacheDir::new()`; if
    // `IsolatedCacheDir`'s drop ran while the worker was still
    // resolving the cache root, the worker would observe a
    // restored / empty env var and either land outside the
    // tempdir or fail with a stale-cache-root error. The bindings
    // below are declared here and dropped at end-of-scope, AFTER
    // the worker's result has been collected and the worker
    // thread has been joined.
    let _env_lock = lock_env();
    let cache = isolated_cache_dir();
    let canonical = std::path::PathBuf::from("/tmp/fake-source-contention");
    let holder = acquire_source_tree_lock(&canonical, "test")
        .expect("first acquire must succeed under isolated cache");

    // Re-derive the lockfile path so we can needle `/proc/locks`
    // for waiter lines below. The production code constructs the
    // same path via `CacheDir::lock_path(format!("source-{hash}"))`
    // — see [`acquire_source_tree_lock`] above. The lockfile was
    // materialized by the holder's successful `try_flock` open
    // (O_CREAT), so by this point the inode exists on disk and
    // `needle_from_path` can stat it.
    let path_hash = crate::fetch::canonical_path_hash(&canonical);
    let lock_path = cache
        .path()
        .join(crate::flock::LOCK_DIR_NAME)
        .join(format!("source-{path_hash}.lock"));
    let needle = crate::flock::mountinfo::needle_from_path(&lock_path)
        .expect("needle_from_path must resolve the lockfile inode");

    // Spawn a worker that issues the second acquire. The worker's
    // non-blocking `try_flock` will see the held lock and fall
    // through to `block_flock`, which parks the worker thread in
    // `flock(2)` until the holder's FD closes. `OwnedFd` and
    // `anyhow::Error` are both `Send`, so the `Result<OwnedFd>`
    // returns through the channel below. The worker also
    // captures its own elapsed time around the
    // `acquire_source_tree_lock` call so the assertion below can
    // verify the blocking path actually executed for the holder
    // retention window.
    //
    // `sync_channel(1)`: a single-slot buffered channel lets the
    // worker `send` and exit even if the main thread already
    // panicked from an earlier assertion failure (a rendezvous
    // bound-0 channel would leave the worker parked in `send`
    // forever, a thread leak on top of an already-failed test).
    // A worker that hangs forever before reaching `send` leaves
    // the channel empty and the `recv_timeout` below bails the
    // test within 5s rather than hanging the test runner.
    let worker_canonical = canonical.clone();
    let (tx, rx) = std::sync::mpsc::sync_channel::<(
        std::result::Result<std::os::fd::OwnedFd, anyhow::Error>,
        std::time::Duration,
    )>(1);
    let worker = std::thread::spawn(move || {
        let started = std::time::Instant::now();
        let result = acquire_source_tree_lock(&worker_canonical, "test");
        let elapsed = started.elapsed();
        // Send result + elapsed through the single-slot buffered
        // channel. If the main thread already abandoned the test
        // (panic) the receiver is gone and the send fails;
        // discarding the failure is correct because the test is
        // already failing for a different reason.
        let _ = tx.send((result, elapsed));
    });

    // Poll `/proc/locks` for a waiter line against the lockfile
    // inode. The kernel emits one `-> FLOCK ... {dev}:{ino}` line
    // per blocked waiter (`fs/locks.c::lock_get_status` — the
    // leading `-> ` distinguishes a waiter from a holder); seeing
    // such a line proves the worker is parked in `flock(2)`.
    // `parse_flock_pids_for_needle` (the production scanner) does
    // NOT match `-> FLOCK` lines because it filters on `FLOCK` in
    // field-2, so the test scans the raw text directly for lines
    // that contain both `->` and the needle.
    //
    // The loop polls every 10ms until a 5s wall-clock deadline. A
    // healthy host enters the waiter state within a single tick;
    // the ceiling exists only to bail a pathologically-slow CI
    // runner before the test runner's own hang detector fires.
    const POLL_INTERVAL: std::time::Duration = std::time::Duration::from_millis(10);
    const POLL_DEADLINE: std::time::Duration = std::time::Duration::from_secs(5);
    let poll_start = std::time::Instant::now();
    let mut waiter_observed = false;
    while poll_start.elapsed() < POLL_DEADLINE {
        let contents = std::fs::read_to_string("/proc/locks")
            .expect("/proc/locks must be readable on a Linux host");
        if contents
            .lines()
            .any(|line| line.contains("->") && line.contains(&needle))
        {
            waiter_observed = true;
            break;
        }
        // Fail fast: the holder is still alive here, so a worker
        // that has already delivered a result (or died without
        // delivering one) can never have blocked. Surfacing that
        // immediately gives an accurate diagnosis instead of
        // burning the full poll deadline and then blaming a
        // missing waiter line.
        match rx.try_recv() {
            Ok((early_result, early_elapsed)) => panic!(
                "worker's acquire returned after {early_elapsed:?} while the \
                 holder still owned the flock (result: {early_result:?}) — \
                 `acquire_source_tree_lock` did not block on contention"
            ),
            Err(std::sync::mpsc::TryRecvError::Disconnected) => panic!(
                "worker thread exited without delivering a result while the \
                 holder still owned the flock — it most likely panicked \
                 inside `acquire_source_tree_lock`"
            ),
            Err(std::sync::mpsc::TryRecvError::Empty) => {}
        }
        std::thread::sleep(POLL_INTERVAL);
    }
    assert!(
        waiter_observed,
        "no `-> FLOCK ... {needle}` waiter line appeared in \
         /proc/locks within {POLL_DEADLINE:?} — worker did not \
         enter the kernel's blocked-flock state, which means \
         `acquire_source_tree_lock` regressed off the blocking path",
    );

    // Hold the lock for `HOLD_WINDOW` AFTER the waiter is
    // observed so the worker's measured elapsed time provably
    // exceeds the window. The window is wall-clock from
    // observation, not from worker entry, so the worker's
    // measured elapsed includes its own pre-park work plus the
    // window — `worker_elapsed >= HOLD_WINDOW` is sufficient.
    const HOLD_WINDOW: std::time::Duration = std::time::Duration::from_millis(200);
    std::thread::sleep(HOLD_WINDOW);

    // Drop the holder. The worker's blocking flock(2) returns,
    // it acquires the lock, and the worker thread sends its
    // result through the channel.
    drop(holder);

    // `recv_timeout` bounds the test's worst-case wall time.
    // A healthy worker delivers promptly after the holder drop;
    // the 5s ceiling fires only on a true regression (worker
    // stuck, fd not released, etc.).
    let (worker_result, worker_elapsed) =
        rx.recv_timeout(std::time::Duration::from_secs(5)).expect(
            "worker must deliver its acquire result within 5s of \
             holder release — a regression that caused the worker \
             to hang forever lands here",
        );

    // The worker has nothing left to do after its `send`, so this
    // join returns promptly. Joining makes the "env lock and cache
    // dir outlive the worker" requirement hold by construction
    // rather than by timing. (On the timeout path above the test
    // panics before reaching this line and the handle is simply
    // dropped, detaching the stuck thread.)
    worker
        .join()
        .expect("worker thread must exit cleanly after delivering its result");

    let acquired = worker_result.expect("worker acquire must succeed once the holder releases");

    // Elapsed-window assertion: the worker's measured time around
    // `acquire_source_tree_lock` must be at least the holder
    // retention window, because the worker was parked in
    // `flock(2)` for at least that long after `/proc/locks`
    // observed the waiter line. A revert to non-blocking
    // EWOULDBLOCK behavior would record a sub-window elapsed
    // value here and fail this assertion even if the
    // `/proc/locks` waiter scan happened to flake-pass.
    assert!(
        worker_elapsed >= HOLD_WINDOW,
        "worker's acquire returned in {worker_elapsed:?}, less than \
         the {HOLD_WINDOW:?} holder-retention window — worker did \
         not actually block on the held flock",
    );

    // Drop the worker's FD explicitly so the lockfile flock
    // releases before the isolated cache dir is torn down.
    // `_env_lock` and `cache` are bound at function-scope above
    // and drop at end-of-scope, AFTER this point.
    drop(acquired);
}
2151
/// Pins the declaration order of `BuildReservation`'s fields,
/// which is also their drop order: Rust drops struct fields in
/// declaration order, so `_sandbox` has to be declared ahead of
/// `plan` for the sandbox cgroup rmdir to run BEFORE the LLC
/// flock release.
///
/// Were the two fields swapped, the LLC flocks would release
/// first, letting a peer claim the LLC while gcc children are
/// still bound to a cgroup whose rmdir hasn't run yet.
///
/// The test does not observe drops themselves. It leans on the
/// struct's `Debug` output listing fields in declaration order
/// (as a derived `Debug` does) and checks that the `_sandbox`
/// label shows up before the `plan` label. Only the field names
/// are searched for, so the inner planner / sandbox types can
/// add or rename their own fields without touching this test.
#[test]
fn build_reservation_field_order_pins_drop_invariant() {
    let reservation = BuildReservation {
        _sandbox: None,
        plan: None,
        make_jobs: None,
    };
    let rendered = format!("{reservation:?}");

    // Byte offset of the first occurrence of a field label in the
    // rendered struct, if any.
    let offset_of = |label: &str| rendered.find(label);

    let sandbox_pos = offset_of("_sandbox").expect("Debug output must mention _sandbox field");
    let plan_pos = offset_of("plan").expect("Debug output must mention plan field");

    assert!(
        sandbox_pos < plan_pos,
        "_sandbox MUST be declared before plan so cgroup rmdir \
         runs BEFORE LLC flock release on Drop. Debug: {dbg}",
        dbg = rendered,
    );
}
2190
2191    // ---------------------------------------------------------------
2192    // Post-build metadata-derivation arms inside
2193    // `kernel_build_pipeline`. These pure-logic blocks (config_hash
2194    // CRC32, source_vmlinux_stat, the dirty-tree cache-skip hint, the
2195    // post-build dirty re-check store decision) are unreachable through
2196    // the public `kernel_build_pipeline` entry without a real `make`
2197    // invocation, so each was extracted into a `pub(crate)` helper
2198    // (`config_hash_for`, `source_vmlinux_stat_for`,
2199    // `dirty_cache_skip_hint`, `post_build_cache_store_skip`) that both
2200    // the pipeline and these tests call — so the tests exercise the
2201    // real production code, not a copy.
2202    // ---------------------------------------------------------------
2203
/// Drives the production [`config_hash_for`] helper (extracted from
/// `kernel_build_pipeline`). Pins: (1) absent `.config` yields
/// `None`; (2) present `.config` hashes to `crc32fast::hash` of the
/// exact bytes; (3) the `{:08x}` zero-pad is load-bearing; (4) the
/// rendered hash is strictly lowercase hex.
///
/// Property (2) is checked against an INDEPENDENT `crc32fast::hash`
/// so a hasher swap diverges. Property (3) is checked with an input
/// whose CRC32 has a leading-zero nibble: a `{:x}` regression would
/// drop the leading zero, shortening the hash below 8 chars and
/// silently re-keying every cached build. Property (4) is checked
/// against the explicit `0-9a-f` alphabet; `is_ascii_hexdigit` would
/// also accept `A-F` and so could not catch a `{:08X}` regression.
#[test]
fn config_hash_derivation_matches_crc32_and_width() {
    let tmp = tempfile::TempDir::new().expect("config tempdir");
    let source_dir = tmp.path();

    // Absent `.config` arm: the production `else { None }` branch.
    assert_eq!(
        config_hash_for(source_dir).expect("absent-config read"),
        None,
        "no .config must yield None config_hash",
    );

    // Present `.config` arm: hash matches an independent crc32fast.
    let body = b"CONFIG_SCHED_CLASS_EXT=y\nCONFIG_BPF=y\n";
    std::fs::write(source_dir.join(".config"), body).expect("write .config");
    let present = config_hash_for(source_dir).expect("present-config read");
    let expected = format!("{:08x}", crc32fast::hash(body));
    assert_eq!(
        present.as_deref(),
        Some(expected.as_str()),
        "present .config must hash to crc32fast::hash of its bytes",
    );

    // Width guard: find an input whose CRC32 has a leading-zero
    // nibble (< 0x1000_0000) so the `{:08x}` zero-pad is the only
    // thing keeping the rendered hash at 8 chars. A `{:x}`
    // regression would render it as 7 chars and fail the length
    // assert below. The probe sequence is deterministic; ~1 in 16
    // inputs qualifies, so 10k iterations always finds one.
    let probe = (0u32..10_000)
        .map(|n| format!("probe-{n}"))
        .find(|p| crc32fast::hash(p.as_bytes()) < 0x1000_0000)
        .expect("a leading-zero CRC32 within 10k probes");
    std::fs::write(source_dir.join(".config"), probe.as_bytes()).expect("rewrite .config");
    let zh = config_hash_for(source_dir)
        .expect("probe read")
        .expect("present .config must hash");
    assert_eq!(
        zh.len(),
        8,
        "config_hash must always be 8 hex chars ({{:08x}}): {zh}"
    );
    assert!(
        zh.starts_with('0'),
        "leading-zero CRC32 must keep its zero pad: {zh}"
    );
    // Explicit lowercase alphabet: uppercase `A-F` must be rejected
    // so the predicate enforces what the message promises.
    assert!(
        zh.bytes().all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f')),
        "config_hash must be lowercase hex: {zh}",
    );
}
2265
/// Exercises the production [`source_vmlinux_stat_for`] helper
/// (extracted from `kernel_build_pipeline`) across its three arms:
/// no reference at all, a reference to a file that does not exist,
/// and a reference to a file that is present on disk.
///
/// Failure mode pinned: a regression that lost the `None`
/// short-circuit (reporting `Some((0, _))` for an absent/missing
/// vmlinux) or reported the wrong size would defeat the
/// `prefer_source_tree_for_dwarf` staleness check, which compares
/// this stat against a later read.
#[test]
fn source_vmlinux_stat_present_and_absent_arms() {
    let scratch = tempfile::TempDir::new().expect("vmlinux tempdir");
    let vmlinux = scratch.path().join("vmlinux");

    // Arm 1 — no reference: the helper short-circuits to None.
    let without_ref = source_vmlinux_stat_for(None);
    assert_eq!(without_ref, None, "a None vmlinux_ref must yield None",);

    // Arm 2 — reference to a file that was never created: the
    // metadata lookup fails and the helper must report None rather
    // than a phantom (0, _).
    let missing_file = source_vmlinux_stat_for(Some(vmlinux.as_path()));
    assert_eq!(
        missing_file, None,
        "a vmlinux_ref to a missing file must yield None, not (0, _)",
    );

    // Arm 3 — file present: (real length, positive post-epoch mtime).
    let payload = b"\x7fELF fake vmlinux payload bytes for stat";
    std::fs::write(&vmlinux, payload).expect("write vmlinux");
    let stat = source_vmlinux_stat_for(Some(vmlinux.as_path()))
        .expect("present vmlinux must stat to Some");
    let (size, mtime_secs) = stat;
    assert_eq!(
        size,
        payload.len() as u64,
        "stat size must equal the real source-tree vmlinux length",
    );
    assert!(
        mtime_secs > 0,
        "a freshly-written vmlinux must carry a positive post-epoch \
         mtime in seconds, got {mtime_secs}",
    );
}
2310
/// Exercises the production [`dirty_cache_skip_hint`] helper
/// (extracted from `kernel_build_pipeline`) for both values of
/// `is_git`. A git repo with uncommitted changes is expected to get
/// the "commit or stash" hint; a non-git tree (force-marked dirty
/// because dirty detection is impossible) is expected to get the
/// "put the source under git" hint, since telling a non-git
/// operator to "commit or stash" leads nowhere.
///
/// Failure mode pinned: a regression that dropped the `is_git`
/// branch and always returned `DIRTY_TREE_CACHE_SKIP_HINT` would
/// hand non-git operators unactionable advice. The final check that
/// the two constants differ proves the branch is load-bearing.
#[test]
fn dirty_tree_cache_skip_hint_branches_on_is_git() {
    // (is_git, expected hint, failure message) — one row per arm.
    let arms = [
        (
            true,
            DIRTY_TREE_CACHE_SKIP_HINT,
            "is_git=true must select the commit/stash hint",
        ),
        (
            false,
            NON_GIT_TREE_CACHE_SKIP_HINT,
            "is_git=false must select the put-under-git hint",
        ),
    ];
    for (is_git, expected_hint, why) in arms {
        assert_eq!(dirty_cache_skip_hint(is_git), expected_hint, "{why}");
    }

    assert_ne!(
        DIRTY_TREE_CACHE_SKIP_HINT, NON_GIT_TREE_CACHE_SKIP_HINT,
        "the two hints must differ so the is_git branch is \
         load-bearing — a non-git operator must not be told to \
         commit/stash",
    );
}
2342
/// Exercises the production [`post_build_cache_store_skip`]
/// predicate (extracted from `kernel_build_pipeline`; a separate
/// decision from the mid-wait dirty classifier) against a real git
/// tree in two states: unchanged since acquire (store proceeds) and
/// HEAD advanced by a mid-build commit (store is skipped). The
/// source state comes from the real `inspect_local_source_state`
/// probe and the skip decision from the real predicate.
///
/// Failure mode pinned: a regression that dropped the
/// `hash_changed` disjunct and trusted only `post.is_dirty` would
/// store a build under a cache key derived from the pre-commit HEAD
/// even though the operator committed (clean worktree) on top
/// during the build — a later cache hit would serve a build that no
/// longer matches its recorded identity.
///
/// The test returns early (with a note on stderr) when git is not
/// available.
#[test]
fn post_build_dirty_recheck_skips_store_on_hash_advance() {
    if !git_available() {
        eprintln!(
            "post_build_dirty_recheck_skips_store_on_hash_advance: \
             git unavailable, skipping"
        );
        return;
    }

    // Throwaway repository with a single seed commit on `main`.
    let scratch = tempfile::TempDir::new().unwrap();
    let repo = scratch.path().to_path_buf();
    run_git(&repo, &["init", "-q", "-b", "main"]);
    std::fs::write(repo.join("seed.txt"), "initial").unwrap();
    run_git(&repo, &["add", "seed.txt"]);
    run_git(&repo, &["commit", "-q", "-m", "initial"]);

    // Identity as observed at acquire time (frozen at build start
    // in `local_source`).
    let acquired_hash = crate::fetch::inspect_local_source_state(&repo)
        .expect("acquire-time probe")
        .short_hash
        .clone();

    // Case 1 — nothing changed during the build: same hash, clean
    // worktree, so the store must proceed.
    let unchanged =
        crate::fetch::inspect_local_source_state(&repo).expect("post-build clean probe");
    let clean_verdict = post_build_cache_store_skip(&unchanged, acquired_hash.as_deref());
    let (skip_clean, hash_changed_clean) = clean_verdict;
    assert!(
        !hash_changed_clean,
        "an unchanged HEAD must not flag hash_changed",
    );
    assert!(
        !skip_clean,
        "an unchanged tree post-build must NOT skip the cache store",
    );

    // Case 2 — a commit lands mid-build: HEAD advances while the
    // worktree stays clean, so the store must be skipped.
    std::fs::write(repo.join("midbuild.txt"), "landed during make").unwrap();
    run_git(&repo, &["add", "midbuild.txt"]);
    run_git(&repo, &["commit", "-q", "-m", "mid-build commit"]);
    let advanced =
        crate::fetch::inspect_local_source_state(&repo).expect("post-build advanced probe");
    assert!(
        !advanced.is_dirty,
        "a committed mid-build change leaves the worktree clean; the \
         hash advance (not is_dirty) must drive the store skip",
    );
    let advanced_verdict = post_build_cache_store_skip(&advanced, acquired_hash.as_deref());
    let (skip_advanced, hash_changed_advanced) = advanced_verdict;
    assert!(
        hash_changed_advanced,
        "the mid-build commit must yield a short_hash distinct from \
         the acquire-time hash",
    );
    assert!(
        skip_advanced,
        "a HEAD advance during the build must skip the cache store so \
         a stale identity is never recorded",
    );
}
2416}
ktstr/cli/kernel_build/build.rs

ktstr/cli/kernel_build/
build.rs