ktstr/cache/
cache_dir.rs

1//! [`CacheDir`] handle, lock guards, and cache-lock timeout policy.
2//!
3//! Public surface: [`CacheDir`] (the operator-facing handle exposed
4//! via `crate::cache::CacheDir`), [`SharedLockGuard`] /
5//! [`ExclusiveLockGuard`] (RAII wrappers around per-key flock
6//! acquisitions), and the [`CacheDir::store`] /
7//! [`CacheDir::lookup`] / [`CacheDir::list`] /
8//! [`CacheDir::clean_all`] / [`CacheDir::clean_keep`] lifecycle methods.
9//! The internal `warn_if_unstripped_vmlinux` and `should_warn_unstripped`
10//! helpers gate a per-lookup warning on entries whose vmlinux
11//! sidecar took the strip-failure fallback in
12//! `super::vmlinux_strip::strip_vmlinux_debug`.
13//!
14//! Sibling modules:
15//! - `super::metadata` — pure types ([`super::KernelSource`],
16//!   [`super::KernelMetadata`], [`super::CacheArtifacts`], [`super::KconfigStatus`],
17//!   [`super::CacheEntry`], [`super::ListedEntry`]) plus the
18//!   `super::metadata::classify_corrupt_reason` dispatcher and
19//!   `super::metadata::format_image_missing_reason` helper that
20//!   `list` uses to emit corrupt-entry reason strings.
21//! - `super::housekeeping` — atomic-rename install primitives
22//!   (`atomic_swap_dirs`, `TmpDirGuard`), cache-key /
23//!   filename validators, the JSON metadata reader
24//!   (`read_metadata`), and the cross-PID
25//!   orphan-tempdir sweep (`clean_orphaned_tmp_dirs`).
26//! - `super::vmlinux_strip` — the ELF strip pipeline
27//!   (`strip_vmlinux_debug`) `store()`
28//!   invokes when an artifact carries a vmlinux sidecar.
29//! - [`super::resolve`] — env-cascade root resolution that
30//!   `CacheDir::new` and `CacheDir::default_root` flow through.
31//!
32//! Reader/writer asymmetry: shared (reader) lock blocks 10 s — the
33//! reader timeout is fixed and not operator-tunable. The exclusive
34//! (writer) lock blocks 5 minutes by default but is the ONLY one
35//! overridable, via the [`STORE_EXCLUSIVE_LOCK_TIMEOUT_ENV`]
36//! environment variable. Writer must outlast every concurrent test
37//! reader; reader bails fast on a stuck writer. See
38//! [`SHARED_LOCK_DEFAULT_TIMEOUT`] and
39//! [`STORE_EXCLUSIVE_LOCK_DEFAULT_TIMEOUT`] for the literal
40//! durations and their rationale.
41//!
42//! Tests live in a sibling file `cache_dir_tests.rs`, pulled in
43//! below via `#[path]` so they remain the `cache_dir::tests`
44//! submodule. That preserves access to private items
45//! (`lookup_silent`, `should_emit_unstripped_warn`,
46//! `store_exclusive_lock_timeout`, the `STORE_EXCLUSIVE_LOCK_*`
47//! constants) and `super::*` resolution; the split is purely a
48//! file-size measure.
49
50use std::collections::HashSet;
51use std::fs;
52use std::path::{Path, PathBuf};
53use std::sync::{Mutex, OnceLock};
54
55use crate::sync::MutexExt;
56use anyhow::Context;
57
58use super::housekeeping::{
59    TmpDirGuard, atomic_swap_dirs, clean_orphaned_tmp_dirs, fsync_parent, fsync_staging_dir,
60    read_metadata, validate_cache_key, validate_filename,
61};
62#[cfg(test)]
63use super::metadata::KconfigStatus;
64use super::metadata::{
65    CacheArtifacts, CacheEntry, KernelMetadata, ListedEntry, format_image_missing_reason,
66};
67use super::resolve::resolve_cache_root;
68use super::vmlinux_strip::strip_vmlinux_debug;
69use super::{LOCK_DIR_NAME, TMP_DIR_PREFIX};
70use crate::flock::{FlockMode, acquire_flock_with_timeout};
71
/// How long [`CacheDir::acquire_shared_lock`] waits for `LOCK_SH`
/// before giving up.
///
/// Fixed at 10 seconds and deliberately not operator-tunable: a
/// reader is expected to bail quickly when a writer is stuck rather
/// than queue behind it.
const SHARED_LOCK_DEFAULT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10);

/// `LOCK_EX` wait that [`CacheDir::store`] falls back to whenever
/// [`STORE_EXCLUSIVE_LOCK_TIMEOUT_ENV`] does not supply a value.
///
/// Five minutes is sized for the worst case of a `store` peer's
/// critical section. Under heavy parallelism N runners can contend
/// on the SAME `cache_key`; the writer at the head of the queue
/// holds `LOCK_EX` while it copies the boot image, runs the
/// two-stage vmlinux strip
/// ([`super::vmlinux_strip::strip_vmlinux_debug`]), writes
/// `metadata.json`, and completes the
/// [`super::housekeeping::atomic_swap_dirs`] swap. Stripping a
/// debug-symbol-rich vmlinux can take tens of seconds by itself, and
/// every peer queued behind that producer adds to the wait linearly.
///
/// The earlier 60 s value was tight enough that 5–10 contending
/// peers reliably timed out before the head writer finished. Five
/// minutes leaves headroom for roughly 50 peers behind a slow strip
/// while staying finite, so a wedged writer still fails loudly
/// instead of blocking forever.
const STORE_EXCLUSIVE_LOCK_DEFAULT_TIMEOUT: std::time::Duration =
    std::time::Duration::from_secs(5 * 60);

/// Name of the environment variable that replaces
/// [`STORE_EXCLUSIVE_LOCK_DEFAULT_TIMEOUT`].
///
/// The value goes through [`humantime::parse_duration`], so
/// operators can write human-readable units (`30s`, `2m`, `10min`,
/// `1h`). A value that fails to parse does not disable the lock: the
/// default is used and a `warn!` is emitted, which makes the typo
/// visible in tracing output so it can be corrected.
const STORE_EXCLUSIVE_LOCK_TIMEOUT_ENV: &str = "KTSTR_CACHE_STORE_LOCK_TIMEOUT";
104
105/// Resolve the per-store `LOCK_EX` acquire timeout, honoring the
106/// [`STORE_EXCLUSIVE_LOCK_TIMEOUT_ENV`] override. Pure function so
107/// tests can exercise the parse/fall-through branches without
108/// driving a full `store()` cycle.
109fn store_exclusive_lock_timeout() -> std::time::Duration {
110    match std::env::var(STORE_EXCLUSIVE_LOCK_TIMEOUT_ENV) {
111        Ok(v) if !v.is_empty() => match humantime::parse_duration(&v) {
112            Ok(d) => d,
113            Err(e) => {
114                tracing::warn!(
115                    env = %STORE_EXCLUSIVE_LOCK_TIMEOUT_ENV,
116                    value = %v,
117                    err = %e,
118                    "invalid cache-store lock timeout env value; \
119                     falling back to default timeout",
120                );
121                STORE_EXCLUSIVE_LOCK_DEFAULT_TIMEOUT
122            }
123        },
124        _ => STORE_EXCLUSIVE_LOCK_DEFAULT_TIMEOUT,
125    }
126}
127
/// Handle to the kernel image cache directory.
///
/// A thin wrapper around the cache root path. Obtain one with
/// [`CacheDir::new`] (resolved root) or [`CacheDir::with_root`]
/// (explicit root); every lookup, listing, store, clean, and lock
/// operation is a method on this handle.
#[derive(Debug)]
#[non_exhaustive]
pub struct CacheDir {
    /// Cache root. Entries live at `<root>/<cache_key>/`, per-key
    /// lockfiles under `<root>/<LOCK_DIR_NAME>/`, and `store()` stages
    /// new entries in `TMP_DIR_PREFIX`-named siblings of the entries.
    root: PathBuf,
}
134
/// Process-wide record of cache keys that have already triggered the
/// unstripped-vmlinux warning.
///
/// `lookup()` is the user-visible entry point and can run many times
/// per CLI invocation against one cache_key — a multi-kernel gauntlet,
/// for instance, looks the same stale entry up once per scenario in
/// its fan-out. Without this set each of those lookups would repeat
/// the strip-fallback warning, and N copies of one line drown out
/// unrelated diagnostics. Once a key is in the set, the warn helper
/// stays silent for it.
///
/// The set sits behind a `OnceLock` (rather than `LazyLock`) so the
/// lazy initialisation stays explicit at the call site. The mutex is
/// only ever held for an O(1) `HashSet` operation, so contention
/// under realistic lookup fan-out is negligible.
fn warned_keys() -> &'static Mutex<HashSet<String>> {
    static WARNED: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
    WARNED.get_or_init(Mutex::default)
}
153
154/// Pure dedup-gate logic for [`warn_if_unstripped_vmlinux`].
155///
156/// Returns `true` iff a fresh `tracing::warn!` should fire for this
157/// entry: `should_warn_unstripped` accepts the entry AND the entry's
158/// cache_key is being recorded in `set` for the first time. Returns
159/// `false` if the entry does not need warning at all OR if the key
160/// was already in the set (already-warned suppression).
161///
162/// Takes `&Mutex<HashSet<String>>` rather than reaching into the
163/// process-wide [`warned_keys`] static so tests can drive the gate
164/// against a fresh per-test mutex without polluting (or being
165/// polluted by) the global set. Production callers pass
166/// `warned_keys()`; the bool return decouples the side effect (the
167/// `tracing::warn!`) from the decision so the latter is unit-testable.
168fn should_emit_unstripped_warn(entry: &CacheEntry, set: &Mutex<HashSet<String>>) -> bool {
169    if !should_warn_unstripped(entry) {
170        return false;
171    }
172    let mut guard = set.lock_unpoisoned();
173    guard.insert(entry.key.clone())
174}
175
176/// Emit a per-lookup warning when a cache entry was created with an
177/// unstripped vmlinux.
178///
179/// **Once per cache_key per process.** A `static` HashSet (see
180/// [`warned_keys`]) records every key for which the warn has already
181/// fired; subsequent calls for the same key are silent. Suppression
182/// covers callers that lookup the same stale entry repeatedly within
183/// one CLI invocation (e.g. multi-kernel gauntlet). The dedup
184/// decision is delegated to [`should_emit_unstripped_warn`], which
185/// is independently unit-tested.
186///
187/// Uses [`tracing::warn!`] so the message routes through the same
188/// observability pipeline as every other cache-layer diagnostic
189/// (the cargo-ktstr binary's `tracing_subscriber::fmt` writes warns
190/// to stderr; library consumers can subscribe a different layer).
191/// `eprintln!` would bypass that pipeline and force every consumer
192/// to live with raw-stderr output regardless of their tracing
193/// configuration.
194///
195/// The mutex is held only across the O(1) HashSet insert inside
196/// `should_emit_unstripped_warn`; the `tracing::warn!` macro fires
197/// AFTER lock release so a slow tracing subscriber cannot serialise
198/// concurrent lookups.
199fn warn_if_unstripped_vmlinux(entry: &CacheEntry) {
200    if should_emit_unstripped_warn(entry, warned_keys()) {
201        tracing::warn!(
202            cache_key = %entry.key,
203            "cache: using unstripped vmlinux (strip failed on a prior build; \
204             re-run with a clean cache to retry)",
205        );
206    }
207}
208
209/// Pure decision logic for [`warn_if_unstripped_vmlinux`].
210pub(crate) fn should_warn_unstripped(entry: &CacheEntry) -> bool {
211    entry.metadata.has_vmlinux() && !entry.metadata.vmlinux_stripped()
212}
213
214/// Whether the existing `cached` cache entry already satisfies a
215/// caller's intent to `store` an artifact under the same cache key.
216///
217/// Pure decision logic for [`CacheDir::store`]'s in-lock re-lookup
218/// (step 3 of the docs). When N concurrent peers race on the same
219/// `cache_key` they all miss the pre-lock cache check, serialise
220/// behind `LOCK_EX`, and would otherwise each repeat the head
221/// writer's copy / strip / atomic-publish work. This predicate
222/// answers the post-lock question: "is the head writer's output
223/// byte-equivalent to what I'd publish?" If yes, the late peers
224/// short-circuit — only the head writer pays the publish cost.
225///
226/// Compares only the metadata fields that drive the on-disk bytes
227/// `store()` would write:
228///
229/// - `config_hash` (CRC32 of the final `.config`) — pins the
230///   kernel image identity.
231/// - `ktstr_kconfig_hash` (CRC32 of `ktstr.kconfig`) — kconfig
232///   fragment that produced the build.
233/// - `extra_kconfig_hash` (CRC32 of the user `--extra-kconfig`
234///   fragment) — same.
235/// - `caller_has_vmlinux` — whether the caller passed a vmlinux
236///   sidecar in `CacheArtifacts`. This is the actual switch
237///   `store()` keys on (it overwrites `metadata.has_vmlinux`
238///   from the artifacts argument), so the predicate compares
239///   against the artifacts shape, not the caller's metadata
240///   field.
241///
242/// Excludes:
243///
244/// - `built_at` — wall-clock timestamp that drifts every build;
245///   pinning it would break the early-return and serialise every
246///   peer through a redundant publish.
247/// - `version` — display-only string, not a byte-difference.
248/// - `source` — acquire-time provenance (Tarball / Git / Local +
249///   payload). Two peers may publish the same image under
250///   different `source` payloads (e.g. one from a tarball mirror,
251///   one from a git checkout) and still produce byte-equivalent
252///   bytes. The kconfig hash is the authoritative content key.
253/// - `arch`, `image_name` — fixed by the cache key shape.
254/// - `vmlinux_stripped` — set by `store()` based on
255///   strip pipeline success/failure, not caller intent. The head
256///   writer either succeeded (stripped) or fell back (unstripped);
257///   late peers would just observe the head writer's outcome.
258/// - `source_vmlinux_size`, `source_vmlinux_mtime_secs` —
259///   DWARF-routing hints, not cached content.
260///
261/// Pure function so a unit test can pin every accept/reject branch
262/// without driving a full `store()` cycle through a temp cache.
263pub(crate) fn cache_content_matches(
264    cached: &KernelMetadata,
265    caller: &KernelMetadata,
266    caller_has_vmlinux: bool,
267) -> bool {
268    cached.config_hash == caller.config_hash
269        && cached.ktstr_kconfig_hash == caller.ktstr_kconfig_hash
270        && cached.extra_kconfig_hash == caller.extra_kconfig_hash
271        && cached.has_vmlinux() == caller_has_vmlinux
272}
273
274impl CacheDir {
275    /// Open a cache directory at the resolved root path.
276    pub fn new() -> anyhow::Result<Self> {
277        let root = resolve_cache_root()?;
278        Ok(CacheDir { root })
279    }
280
281    /// Open a cache directory at a specific path.
282    pub fn with_root(root: PathBuf) -> Self {
283        CacheDir { root }
284    }
285
    /// Resolve the default cache root path without side effects.
    ///
    /// Runs the same `resolve_cache_root` resolution as [`Self::new`]
    /// but hands back the bare path instead of wrapping it in a
    /// `CacheDir`.
    ///
    /// # Errors
    ///
    /// Propagates the resolver's error unchanged.
    pub fn default_root() -> anyhow::Result<PathBuf> {
        resolve_cache_root()
    }
290
291    /// Root directory this `CacheDir` is anchored at.
292    pub fn root(&self) -> &Path {
293        &self.root
294    }
295
296    /// Look up a cached kernel by cache key.
297    ///
298    /// On hit, emits a `tracing::warn!` via
299    /// `warn_if_unstripped_vmlinux` when the cached entry took the
300    /// strip-failure fallback (see `should_warn_unstripped` for the
301    /// exact predicate). Caller-facing call sites want the warning;
302    /// internal call sites that look the entry up only to compare
303    /// against caller intent (notably [`Self::store`]'s in-lock
304    /// recheck) use `Self::lookup_silent` to avoid double-emitting
305    /// the same warning the caller will see on its next `lookup`.
306    pub fn lookup(&self, cache_key: &str) -> Option<CacheEntry> {
307        let entry = self.lookup_silent(cache_key)?;
308        warn_if_unstripped_vmlinux(&entry);
309        Some(entry)
310    }
311
312    /// Look up a cached kernel without emitting the unstripped-vmlinux
313    /// warning. Internal callers that consume the entry's metadata
314    /// without surfacing it to the user — specifically the in-lock
315    /// recheck inside [`Self::store`] — use this variant so a recheck
316    /// hit on a strip-fallback entry does not log a duplicate warning
317    /// that the user-facing [`Self::lookup`] will already log on their
318    /// next call.
319    fn lookup_silent(&self, cache_key: &str) -> Option<CacheEntry> {
320        if let Err(e) = validate_cache_key(cache_key) {
321            tracing::warn!("invalid cache key: {e}");
322            return None;
323        }
324        let entry_dir = self.root.join(cache_key);
325        if !entry_dir.is_dir() {
326            return None;
327        }
328        let metadata = read_metadata(&entry_dir).ok()?;
329        if !entry_dir.join(&metadata.image_name).exists() {
330            return None;
331        }
332        Some(CacheEntry {
333            key: cache_key.to_string(),
334            path: entry_dir,
335            metadata,
336        })
337    }
338
339    /// List all cached kernel entries, sorted by build time (newest
340    /// first).
341    pub fn list(&self) -> anyhow::Result<Vec<ListedEntry>> {
342        let mut entries: Vec<ListedEntry> = Vec::new();
343        let read_dir = match fs::read_dir(&self.root) {
344            Ok(rd) => rd,
345            Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(entries),
346            Err(e) => return Err(e.into()),
347        };
348        for dir_entry in read_dir {
349            let dir_entry = dir_entry?;
350            let path = dir_entry.path();
351            let name = match dir_entry.file_name().into_string() {
352                Ok(n) => n,
353                Err(_) => continue,
354            };
355            // Skip every dotfile child — ktstr reserves all
356            // dot-prefixed names (current uses: `.locks/`, `.tmp-*`).
357            // `validate_cache_key` rejects leading-dot inputs, so a
358            // dotfile in the cache root is either ktstr bookkeeping or
359            // an external artifact; either way `list()` must not
360            // surface it as a cache entry.
361            if name.starts_with('.') {
362                continue;
363            }
364            if !path.is_dir() {
365                continue;
366            }
367            match read_metadata(&path) {
368                Ok(metadata) => {
369                    let image_path = path.join(&metadata.image_name);
370                    if image_path.exists() {
371                        entries.push(ListedEntry::Valid(Box::new(CacheEntry {
372                            key: name,
373                            path,
374                            metadata,
375                        })));
376                    } else {
377                        entries.push(ListedEntry::Corrupt {
378                            key: name,
379                            path,
380                            reason: format_image_missing_reason(&metadata.image_name),
381                        });
382                    }
383                }
384                Err(reason) => {
385                    tracing::info!(
386                        entry = %name,
387                        path = %path.display(),
388                        %reason,
389                        "cache entry corrupt at list-time",
390                    );
391                    entries.push(ListedEntry::Corrupt {
392                        key: name,
393                        path,
394                        reason,
395                    });
396                }
397            }
398        }
399        entries.sort_by(|a, b| {
400            let a_time = a.as_valid().map(|e| e.metadata.built_at.as_str());
401            let b_time = b.as_valid().map(|e| e.metadata.built_at.as_str());
402            b_time.cmp(&a_time)
403        });
404        Ok(entries)
405    }
406
407    /// Store a kernel image (and optional vmlinux sidecar) in the
408    /// cache under `cache_key`. Atomic install via temp directory +
409    /// `renameat2(RENAME_EXCHANGE)`, so a concurrent reader never
410    /// observes a partially-written entry.
411    ///
412    /// # Steps (in order)
413    ///
414    /// 1. **Validate inputs.** `validate_cache_key` rejects
415    ///    `..`, slashes, NUL, leading-dot keys (the `TMP_DIR_PREFIX`
416    ///    reservation plus any other dotfile-shaped key, since
417    ///    `list()` skips every dotfile child);
418    ///    `validate_filename` rejects path-separator characters in
419    ///    the image basename. Invalid input fails before any I/O.
420    /// 2. **Acquire the per-key store lock.** `LOCK_EX` on
421    ///    `<root>/.locks/<cache_key>.lock`. Timeout defaults to
422    ///    `STORE_EXCLUSIVE_LOCK_DEFAULT_TIMEOUT` (5 minutes) and
423    ///    can be overridden via `STORE_EXCLUSIVE_LOCK_TIMEOUT_ENV`
424    ///    for environments where a slow vmlinux strip stacks many
425    ///    contending peers behind the head writer. The lock
426    ///    excludes other writers for the same key while letting
427    ///    readers and writers for unrelated keys proceed. Timeout
428    ///    produces an error rather than blocking forever — a hung
429    ///    writer cannot indefinitely block a fresh rebuild attempt.
430    /// 3. **Double-checked re-lookup inside the lock.** After
431    ///    acquiring `LOCK_EX`, re-run `Self::lookup_silent` for
432    ///    `cache_key`. When N peers race to publish the same key
433    ///    they all miss the pre-lock cache check, queue on
434    ///    `LOCK_EX`, and serialise behind the head writer. Without
435    ///    this recheck, every peer re-runs the full copy + strip +
436    ///    publish steps in series even though the head writer's
437    ///    output already satisfies them. The recheck early-returns
438    ///    when the existing cached entry's content-defining metadata
439    ///    fields (`cache_content_matches` — config_hash,
440    ///    ktstr_kconfig_hash, extra_kconfig_hash, has_vmlinux) match
441    ///    the caller's intent for this publish, so only the head
442    ///    writer pays the strip/copy/rename cost. Cache-relevant
443    ///    differences (a fresh kconfig hash, a different vmlinux
444    ///    presence) bypass the early-return and proceed to a real
445    ///    overwrite-publish. Cache-irrelevant differences (a fresh
446    ///    `built_at` timestamp, a different `version` display
447    ///    string) trigger the early-return — the on-disk bytes the
448    ///    overwrite would write are byte-equivalent to what's
449    ///    already cached, so the publish is redundant.
450    /// 4. **Stage into a temp directory.** `<root>/.tmp-<key>-<pid>`
451    ///    is created (or pruned and recreated if a previous attempt
452    ///    by the same PID exists), with `TmpDirGuard` enrolling the
453    ///    path for cleanup on any subsequent error. A best-effort
454    ///    `clean_orphaned_tmp_dirs` pass also runs here so dead
455    ///    sibling temp directories from crashed PIDs are GC'd before
456    ///    we add another one.
457    /// 5. **Copy the boot image.** `metadata.image_name` lands at
458    ///    `tmp/<image_name>` via `reflink::reflink_or_copy` (copy-on-write
459    ///    when the cache filesystem supports it, else a plain byte copy).
460    /// 6. **Strip and copy vmlinux (if supplied).** When
461    ///    `artifacts.vmlinux` is `Some`, `strip_vmlinux_debug`
462    ///    runs the two-stage strip pipeline and the result is written
463    ///    to `tmp/vmlinux`. **Strip-fallback rationale:** if the
464    ///    strip pipeline returns an error (e.g. an unrecognised ELF
465    ///    layout from a future toolchain or an exotic config), the
466    ///    write does NOT abort — it falls back to copying the raw
467    ///    unstripped vmlinux and records `vmlinux_stripped: false`
468    ///    in metadata. The cache trades a much larger on-disk
469    ///    payload for "still usable for monitoring/probes," and
470    ///    `cargo ktstr kernel list --json` exposes the
471    ///    `vmlinux_stripped` field so operators can spot entries
472    ///    that need rebuilding once the strip-failure root cause is
473    ///    fixed. A hard failure here would be worse: it would
474    ///    effectively brick the cache for that build.
475    /// 7. **Write `metadata.json`.** A pretty-printed serde dump of
476    ///    `KernelMetadata` (with `has_vmlinux` and `vmlinux_stripped`
477    ///    set from step 6) at `tmp/metadata.json`. Pretty-print is
478    ///    intentional — operators inspect this file directly when
479    ///    debugging cache state.
480    /// 8. **Atomic publish.** `fs::rename(tmp → final)` if `final`
481    ///    does not exist; otherwise `atomic_swap_dirs` uses
482    ///    `renameat2(RENAME_EXCHANGE)` to swap the two directories
483    ///    in a single atomic syscall. Either way, no reader observes
484    ///    a partial entry; the swap path also cleans up the
485    ///    now-stale prior version under the temp name.
486    pub fn store(
487        &self,
488        cache_key: &str,
489        artifacts: &CacheArtifacts<'_>,
490        metadata: &KernelMetadata,
491    ) -> anyhow::Result<CacheEntry> {
492        validate_cache_key(cache_key)?;
493        validate_filename(&metadata.image_name)?;
494
495        let _store_lock =
496            self.acquire_exclusive_lock_blocking(cache_key, store_exclusive_lock_timeout())?;
497
498        // Double-checked re-lookup inside LOCK_EX: when N peers race
499        // on the same cache_key they all miss the pre-lock cache
500        // check, queue on the lock, and would otherwise repeat the
501        // head writer's copy/strip/publish work in series. The
502        // recheck early-returns when the existing entry's
503        // content-defining metadata fields match what we'd publish
504        // (see [`cache_content_matches`] for the predicate). The
505        // matched entry is returned to the caller verbatim — its
506        // on-disk bytes are byte-equivalent to what we would write,
507        // so no overwrite-publish is needed.
508        //
509        // The recheck-hit early-return BYPASSES the orphan tempdir
510        // sweep at step 4. That is intentional: every orphan-sweep
511        // call costs an opendir + readdir + N kill(pid, 0) probes,
512        // and the recheck-hit path is the hot path for serialised
513        // peer fan-out — adding the sweep here would charge every
514        // late peer a syscall budget the head writer already paid.
515        // Orphans accumulate only on the cache-miss / overwrite
516        // path, which is also where new tempdirs are created, so
517        // the GC runs proportionally to tempdir creation. Uses the
518        // private `lookup_silent` variant (no warn) so the recheck
519        // does not double-emit the unstripped-vmlinux warn that
520        // store()'s caller would see again on its next lookup().
521        if let Some(existing) = self.lookup_silent(cache_key)
522            && cache_content_matches(&existing.metadata, metadata, artifacts.vmlinux.is_some())
523        {
524            tracing::debug!(
525                cache_key = cache_key,
526                "cache.store: in-lock recheck hit; skipping copy/strip/publish",
527            );
528            return Ok(existing);
529        }
530
531        let final_dir = self.root.join(cache_key);
532        let tmp_dir = self.root.join(format!(
533            "{TMP_DIR_PREFIX}{}-{}",
534            cache_key,
535            std::process::id(),
536        ));
537
538        if tmp_dir.exists() {
539            fs::remove_dir_all(&tmp_dir)?;
540        }
541        if let Err(e) = clean_orphaned_tmp_dirs(&self.root) {
542            tracing::warn!(err = %format!("{e:#}"), "clean_orphaned_tmp_dirs failed; continuing store");
543        }
544        fs::create_dir_all(&tmp_dir)?;
545
546        let _guard = TmpDirGuard(&tmp_dir);
547
548        let image_dest = tmp_dir.join(&metadata.image_name);
549        crate::reflink::reflink_or_copy(artifacts.image, &image_dest)
550            .context("copy kernel image to cache")?;
551
552        let (has_vmlinux, vmlinux_stripped) = if let Some(vmlinux) = artifacts.vmlinux {
553            let vmlinux_dest = tmp_dir.join("vmlinux");
554            match strip_vmlinux_debug(vmlinux) {
555                Ok(stripped) => {
556                    crate::reflink::reflink_or_copy(stripped.path(), &vmlinux_dest)
557                        .context("copy stripped vmlinux to cache")?;
558                    (true, true)
559                }
560                Err(e) => {
561                    tracing::warn!(
562                        cache_key = cache_key,
563                        err = %format!("{e:#}"),
564                        "vmlinux strip failed, caching unstripped \
565                         (larger on-disk payload). See \
566                         `cargo ktstr kernel list --json` \
567                         vmlinux_stripped field.",
568                    );
569                    crate::reflink::reflink_or_copy(vmlinux, &vmlinux_dest)
570                        .context("copy vmlinux to cache")?;
571                    (true, false)
572                }
573            }
574        } else {
575            (false, false)
576        };
577
578        let mut meta = metadata.clone();
579        meta.set_has_vmlinux(has_vmlinux);
580        meta.set_vmlinux_stripped(vmlinux_stripped);
581        let meta_json = serde_json::to_string_pretty(&meta)?;
582        fs::write(tmp_dir.join("metadata.json"), meta_json)
583            .map_err(|e| anyhow::anyhow!("write cache metadata: {e}"))?;
584
585        // Durability: fsync the staged files + tmp dir before the
586        // publish rename so a host crash is less likely to leave the
587        // cache pointing at a torn entry. Best-effort — lookup re-parses
588        // metadata.json on read (and checks the image file exists), so a
589        // torn metadata.json re-detects as a cache miss and rebuilds. A
590        // torn kernel image or vmlinux with intact metadata is not
591        // content-checked here; it surfaces at the consumer (kernel boot
592        // fails, or monitor/probe reads of vmlinux fail) — a loud
593        // failure, never silent corruption of a shipped artifact.
594        if let Err(e) = fsync_staging_dir(&tmp_dir) {
595            tracing::warn!(
596                cache_key = cache_key,
597                err = %e,
598                "fsync of staged cache entry before publish failed; \
599                 publishing without the durability barrier (validate-on-read \
600                 remains the correctness backstop)",
601            );
602        }
603
604        match fs::rename(&tmp_dir, &final_dir) {
605            Ok(()) => {}
606            Err(e)
607                if e.raw_os_error() == Some(libc::ENOTEMPTY)
608                    || e.raw_os_error() == Some(libc::EEXIST) =>
609            {
610                atomic_swap_dirs(&tmp_dir, &final_dir)?;
611            }
612            Err(e) => {
613                return Err(anyhow::anyhow!("atomic rename cache entry: {e}"));
614            }
615        }
616
617        // Persist the publish rename itself: fsync the cache root so the
618        // new entry name survives a crash. Best-effort, same rationale.
619        if let Err(e) = fsync_parent(&final_dir) {
620            tracing::warn!(
621                cache_key = cache_key,
622                err = %e,
623                "fsync of cache root after publish failed; the rename may \
624                 not survive a crash (validate-on-read backstop)",
625            );
626        }
627
628        Ok(CacheEntry {
629            key: cache_key.to_string(),
630            path: final_dir,
631            metadata: meta,
632        })
633    }
634
635    /// Remove every cached entry. Returns the number of entries
636    /// removed. Preserves the `.locks/` subdirectory.
637    pub fn clean_all(&self) -> anyhow::Result<usize> {
638        self.remove_entries(self.list()?)
639    }
640
641    /// Remove every cached entry except the `keep` most recent ones
642    /// (by `built_at` timestamp). Preserves the `.locks/`
643    /// subdirectory.
644    pub fn clean_keep(&self, keep: usize) -> anyhow::Result<usize> {
645        self.remove_entries(self.list()?.into_iter().skip(keep))
646    }
647
648    fn remove_entries<I: IntoIterator<Item = ListedEntry>>(
649        &self,
650        iter: I,
651    ) -> anyhow::Result<usize> {
652        let to_remove: Vec<_> = iter.into_iter().collect();
653        let count = to_remove.len();
654        for entry in &to_remove {
655            fs::remove_dir_all(entry.path())?;
656        }
657        Ok(count)
658    }
659
660    // ---------------- Per-entry coordination locks ----------------
661
662    /// Absolute path to the coordination lockfile for `cache_key`.
663    pub(crate) fn lock_path(&self, cache_key: &str) -> PathBuf {
664        self.root
665            .join(LOCK_DIR_NAME)
666            .join(format!("{cache_key}.lock"))
667    }
668
669    /// Create the `{cache_root}/.locks/` subdirectory if absent.
670    pub(crate) fn ensure_lock_dir(&self) -> anyhow::Result<()> {
671        let dir = self.root.join(LOCK_DIR_NAME);
672        fs::create_dir_all(&dir)
673            .with_context(|| format!("create lock subdirectory {}", dir.display()))
674    }
675
676    /// Acquire `LOCK_SH` on the cache-entry lockfile.
677    pub fn acquire_shared_lock(&self, cache_key: &str) -> anyhow::Result<SharedLockGuard> {
678        validate_cache_key(cache_key)?;
679        let path = self.lock_path(cache_key);
680        let fd = acquire_flock_with_timeout(
681            &path,
682            FlockMode::Shared,
683            SHARED_LOCK_DEFAULT_TIMEOUT,
684            &format!("cache entry {cache_key:?}"),
685            None,
686        )?;
687        Ok(SharedLockGuard { fd })
688    }
689
690    /// Acquire `LOCK_EX` on the cache-entry lockfile, blocking up
691    /// to `timeout`. On timeout, the error message surfaces the
692    /// `STORE_EXCLUSIVE_LOCK_TIMEOUT_ENV` override so an operator
693    /// hitting a contended `store()` discovers the env-var
694    /// remediation without reading the docs.
695    pub fn acquire_exclusive_lock_blocking(
696        &self,
697        cache_key: &str,
698        timeout: std::time::Duration,
699    ) -> anyhow::Result<ExclusiveLockGuard> {
700        validate_cache_key(cache_key)?;
701        let path = self.lock_path(cache_key);
702        let fd = acquire_flock_with_timeout(
703            &path,
704            FlockMode::Exclusive,
705            timeout,
706            &format!("cache entry {cache_key:?}"),
707            Some(
708                "override the timeout via KTSTR_CACHE_STORE_LOCK_TIMEOUT (humantime: 30s, 2m, 1h)",
709            ),
710        )?;
711        Ok(ExclusiveLockGuard { fd })
712    }
713
714    /// Non-blocking `LOCK_EX` attempt on the cache-entry lockfile.
715    pub fn try_acquire_exclusive_lock(
716        &self,
717        cache_key: &str,
718    ) -> anyhow::Result<ExclusiveLockGuard> {
719        validate_cache_key(cache_key)?;
720        // try_flock doesn't lazily create the parent directory like
721        // acquire_flock_with_timeout does — must materialise .locks/
722        // here so the open(O_CREAT) inside try_flock has a parent.
723        self.ensure_lock_dir()?;
724        let path = self.lock_path(cache_key);
725        match crate::flock::try_flock(&path, crate::flock::FlockMode::Exclusive)? {
726            Some(fd) => Ok(ExclusiveLockGuard { fd }),
727            None => {
728                let holders = crate::flock::read_holders(&path).unwrap_or_default();
729                anyhow::bail!(
730                    "cache entry {cache_key:?} is locked by active test runs \
731                     (lockfile {lockfile}, holders: {holders}). Wait for \
732                     those tests to finish, or kill them, then retry.",
733                    lockfile = path.display(),
734                    holders = crate::flock::format_holder_list(&holders),
735                );
736            }
737        }
738    }
739}
740
/// RAII guard for a `LOCK_SH` hold on a cache-entry lockfile.
///
/// Returned by [`CacheDir::acquire_shared_lock`]. It has no methods;
/// its only job is to own the lockfile descriptor for as long as the
/// caller keeps the guard alive.
#[derive(Debug)]
pub struct SharedLockGuard {
    // Never read: the descriptor is owned solely so that dropping the
    // guard closes it, hence the `dead_code` allowance. Closing is
    // presumably what releases the flock — confirm against
    // `crate::flock`.
    #[allow(dead_code)]
    fd: std::os::fd::OwnedFd,
}
747
/// RAII guard for a `LOCK_EX` hold on a cache-entry lockfile.
///
/// Returned by [`CacheDir::acquire_exclusive_lock_blocking`] and
/// [`CacheDir::try_acquire_exclusive_lock`]. It has no methods; its
/// only job is to own the lockfile descriptor for as long as the
/// caller keeps the guard alive.
#[derive(Debug)]
pub struct ExclusiveLockGuard {
    // Never read: the descriptor is owned solely so that dropping the
    // guard closes it, hence the `dead_code` allowance. Closing is
    // presumably what releases the flock — confirm against
    // `crate::flock`.
    #[allow(dead_code)]
    fd: std::os::fd::OwnedFd,
}
754
755#[cfg(test)]
756#[path = "cache_dir_tests.rs"]
757mod tests;