ktstr/
fetch.rs

1//! Kernel source acquisition: tarball download, GitHub codeload
2//! snapshot, git clone, local tree.
3//!
4//! The acquisition entry points each return an [`AcquiredSource`]
5//! carrying the source directory, cache key, and metadata the caller
6//! needs to proceed to configuration and build: [`download_tarball`]
7//! (kernel.org stable/RC), `download_github_archive` (a GitHub codeload
8//! commit snapshot), `git_clone_kinded` (a kind-directed shallow clone
9//! that dispatches to `git_clone_tag` / [`git_clone`]), and
10//! [`local_source`] (an on-disk tree).
11
12use std::io::Read;
13use std::num::NonZeroU32;
14use std::path::{Path, PathBuf};
15use std::sync::OnceLock;
16use std::time::{Duration, Instant};
17
18use anyhow::{Context, Result, anyhow};
19use reqwest::blocking::Client;
20use sha2::{Digest, Sha256};
21
/// Process-wide [`reqwest::blocking::Client`] lazily initialized on
/// first access via [`shared_client`]. Keeping a single `Client`
/// instance across the fetch-family reuses its TCP connection pool
/// and TLS session cache across repeated calls to the same host
/// within a CLI run. Cross-host fetches in the same run still
/// re-handshake because reqwest's connection pool keys on host.
///
/// [`shared_client`] fills the slot through `get_or_init`;
/// [`is_shared_client`] only peeks at it with `get`, which never
/// triggers construction.
static SHARED_CLIENT: OnceLock<Client> = OnceLock::new();
29
/// Connect-phase timeout for [`shared_client`]: bounds the time spent
/// in the TCP + TLS handshake before reqwest gives up on a peer.
/// Bounds the dead-route case — a CDN edge that accepts the SYN but
/// stalls the handshake, or a route that blackholes outright —
/// without putting any ceiling on the response body's streaming
/// duration once the connection is up. Applied by [`shared_client`]
/// through the builder's `connect_timeout` setter.
///
/// No total request `.timeout()` is set on the client builder: the
/// same client serves both short requests (releases.json,
/// sha256sums.asc) and large tarball streams
/// ([`download_stable_tarball`], [`download_rc_tarball`]), where a
/// 130–180 MiB compressed payload over a slow uplink can take minutes
/// of wall-clock to deliver. Capping that with a per-request timeout
/// would abort legitimate downloads; bounding only the connect phase
/// preserves the dead-route guarantee while letting the body stream
/// as long as the upstream is making forward progress.
const SHARED_CLIENT_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
48
49/// Return the process-wide shared [`reqwest::blocking::Client`]. First
50/// call constructs it via `Client::builder()` with
51/// `SHARED_CLIENT_CONNECT_TIMEOUT` applied; every subsequent call
52/// returns a reference to the same instance. This helper is for
53/// top-level CLI entries that want the default client.
54///
55/// Tests that need to verify a network round-trip (rather than a
56/// cache hit) must NOT pass `shared_client()` to a cache-routed
57/// helper (`cached_releases`, `cached_releases_with`,
58/// [`fetch_latest_stable_version`], [`fetch_version_for_prefix`]) —
59/// `RELEASES_CACHE` may already be populated by a peer test, in
60/// which case the helper returns cached data and the network is
61/// never touched. Construct a local `Client` and pass it to the
62/// cache-routed helper to skip the cache; the pointer-equality gate
63/// in `cached_releases_with` routes a non-singleton client to a
64/// direct `fetch_releases` call against `RELEASES_URL` (the
65/// production URL — the bypass skips the cache, NOT the URL). For
66/// full URL injection (e.g. localhost mock server testing), call
67/// either `fetch_releases` directly with the mock URL — see
68/// `fetch_releases_against_localhost_mock_returns_parsed` — or use
69/// the cache-aware seam `cached_releases_with_url`, which routes
70/// the non-singleton bypass branch through the supplied URL while
71/// preserving the singleton/cache routing identical to
72/// `cached_releases_with`.
73///
74/// # Panics
75///
76/// Panics on the first call if `Client::builder().build()` fails to
77/// construct a client. Documented failure modes include TLS backend
78/// initialization (e.g. rustls/native-tls subsystem unreachable) and
79/// system-resolver config load failure; both are treated as setup
80/// bugs rather than runtime errors. The
81/// `expect` here, rather than propagating the error, mirrors the
82/// inherited behavior of `reqwest::blocking::Client::new()` (which
83/// is itself an infallible wrapper around `builder().build().expect`).
84pub fn shared_client() -> &'static Client {
85    SHARED_CLIENT.get_or_init(|| {
86        Client::builder()
87            .connect_timeout(SHARED_CLIENT_CONNECT_TIMEOUT)
88            .build()
89            .expect("build shared reqwest client")
90    })
91}
92
/// Process-wide cache of the parsed `releases.json` payload.
/// Populated on the first successful singleton-path fetch made
/// through [`cached_releases_with`] — the `OnceLock::set` itself
/// lives in [`cached_releases_with_url`], which that wrapper
/// delegates to; every subsequent singleton call returns a
/// clone of the cached vector without re-issuing the HTTP request.
/// Lifetime matches the process — `releases.json` does not change
/// underneath a single CLI invocation, so a per-process cache
/// cannot serve stale data in any way the user would notice.
///
/// Failures are NOT cached: a transient kernel.org outage that
/// errors the first call must allow a later caller to retry, since
/// the underlying network condition may have cleared. Storing
/// `Vec<Release>` rather than `Result<Vec<Release>>` enforces this
/// at the type level — there's no way to populate the cache with
/// a failure.
///
/// Companion to [`SHARED_CLIENT`]: both amortize per-invocation
/// network cost across the resolve pipeline. Without this cache,
/// `cargo ktstr test --kernel 6.10..6.12 --kernel 6.14..6.16`
/// fetches `releases.json` twice — once per Range spec — under
/// the rayon par_iter that drives `resolve_kernel_set`. With
/// the cache the first Range to reach `expand_kernel_range`
/// populates the slot; the second observes the populated slot
/// and skips the network entirely.
static RELEASES_CACHE: OnceLock<Vec<Release>> = OnceLock::new();
117
/// Cache for the gregkh stable-mirror release tags — the `X.Y.Z`
/// version strings parsed from its `refs/tags/vX.Y.Z` advertisement.
/// Companion to [`RELEASES_CACHE`]: `--include-eol` may expand several
/// `A..B` ranges under one `resolve_kernel_set`, and each would
/// otherwise re-ls-remote the mirror. Populated on the first successful
/// enumeration; a failed ls-remote leaves it empty so the next caller
/// retries (`Vec`, not `Result`, mirroring `RELEASES_CACHE`).
// NOTE(review): neither the code that fills this slot nor the code
// that reads it appears in the reviewed portion of the file — confirm
// the populate-on-success contract at the use site.
static STABLE_TAGS_CACHE: OnceLock<Vec<String>> = OnceLock::new();
126
127/// Fetch `releases.json` via the process-wide [`shared_client`],
128/// routing through [`RELEASES_CACHE`].
129///
130/// Thin wrapper for callers that don't already thread a `&Client`
131/// — top-level CLI entries like [`crate::cli::expand_kernel_range`]
132/// (under the rayon-driven `cargo ktstr` resolve pipeline) and
133/// `crate::cli::fetch_active_prefixes` (the EOL-annotation pass).
134/// Caching, race semantics, and fault-injection routing are all
135/// documented on [`cached_releases_with`].
136pub(crate) fn cached_releases() -> Result<Vec<Release>> {
137    cached_releases_with(shared_client())
138}
139
140/// Pointer-equality against the [`OnceLock`]-backed
141/// [`shared_client`] singleton is the correct predicate because
142/// `shared_client()` returns a stable `&'static Client` address.
143/// The [`cached_releases_with`] gate uses this predicate to
144/// decide whether to consult [`RELEASES_CACHE`]: the singleton
145/// hits the cache, every other (test-constructed) `Client`
146/// bypasses it and exercises the underlying [`fetch_releases`]
147/// path.
148///
149/// Caveat: `shared_client().clone()` produces a distinct
150/// `Client` at a different address even though it shares the
151/// singleton's connection pool via the inner `Arc`, so the
152/// clone bypasses the cache. Always pass `shared_client()`
153/// directly — never a clone — when cache routing is desired.
154///
155/// Side-effect-free when [`SHARED_CLIENT`] is uninitialized:
156/// no client can equal a not-yet-allocated singleton, so we
157/// return `false` without triggering `get_or_init` — tests
158/// that pass a local `Client` before any production code path
159/// has touched the singleton skip the construction entirely.
160fn is_shared_client(client: &Client) -> bool {
161    match SHARED_CLIENT.get() {
162        Some(singleton) => std::ptr::eq(client, singleton),
163        None => false,
164    }
165}
166
167/// Unified cache-aware entry point for `releases.json`. Routes
168/// the process-wide [`shared_client`] singleton through
169/// [`RELEASES_CACHE`]; any other (test-constructed) `Client`
170/// bypasses [`RELEASES_CACHE`] and calls [`fetch_releases`] with
171/// [`RELEASES_URL`] directly — the cache is skipped but the
172/// production URL is used.
173///
174/// Used by every in-file caller that already threads a `&Client`
175/// — [`fetch_latest_stable_version`], [`fetch_version_for_prefix`],
176/// [`latest_in_series`] — so production callers reuse
177/// [`RELEASES_CACHE`] and tests still get cache-bypass via the
178/// pointer-equality gate. [`cached_releases`] is the no-`Client`
179/// wrapper for top-level CLI entries.
180///
181/// Tests that need URL injection on the bypass branch (e.g.
182/// localhost mock server testing) call
183/// [`cached_releases_with_url`] directly with their mock URL —
184/// the URL-injectable form preserves identical routing
185/// semantics. This wrapper is the production entry point and
186/// pins the URL to [`RELEASES_URL`]; production code MUST go
187/// through this wrapper. A singleton call with a non-RELEASES_URL
188/// would otherwise populate [`RELEASES_CACHE`] with
189/// non-production data and corrupt every later production
190/// call — the singleton-path branch in
191/// [`cached_releases_with_url`] guards against this in both
192/// dev (`debug_assert!`) and release builds (fall back to
193/// bypass), but routing every production call through this
194/// wrapper makes the misuse impossible by construction.
195/// Caching, race semantics, and the bypass-vs-cache routing
196/// are fully documented on [`cached_releases_with_url`].
197fn cached_releases_with(client: &Client) -> Result<Vec<Release>> {
198    cached_releases_with_url(client, RELEASES_URL)
199}
200
201/// URL-injectable form of [`cached_releases_with`]. Production
202/// always reaches this through the [`cached_releases_with`]
203/// wrapper, which pins `url` to [`RELEASES_URL`]; the explicit
204/// `url` parameter exists so the bypass-branch test can route
205/// the non-singleton path through a localhost
206/// [`std::net::TcpListener`]-backed mock instead of hitting real
207/// kernel.org. Without this seam, the bypass test would either
208/// (a) require a real network round-trip on every run, or
209/// (b) accept a 5s timeout penalty on offline hosts to surface
210/// `Err` as a bypass-confirmation signal — both costs the seam
211/// eliminates.
212///
213/// Cache contract is identical to [`cached_releases_with`]:
214/// non-singleton clients bypass [`RELEASES_CACHE`] and call
215/// [`fetch_releases`] with `url`; the singleton routes through
216/// the cache only when `url == RELEASES_URL` (consulting via
217/// `OnceLock::get`, populating via `OnceLock::set` on miss). A
218/// singleton call with a non-RELEASES_URL trips the
219/// `debug_assert!` in dev builds and falls back to the bypass
220/// behavior in release builds — fetches directly via `url`,
221/// returns the result, never touches [`RELEASES_CACHE`]. The
222/// cache only ever stores data fetched from the singleton +
223/// RELEASES_URL combination, so a test that injects a mock URL
224/// on either branch cannot pollute the production cache.
225///
226/// Failures are propagated without populating [`RELEASES_CACHE`],
227/// so a transient kernel.org outage on the first call lets the
228/// next caller retry. Storing `Vec<Release>` (not
229/// `Result<Vec<Release>>`) enforces this at the type level.
230///
231/// Concurrent population on the singleton path is safe via the
232/// `OnceLock::set` race: the loser's `set` returns `Err(clone)`
233/// (the cloned vector that was passed in is moved back), the
234/// returned `Err` is discarded via `let _ = …`, and the loser
235/// returns its own original `fresh` vector. Both winner and
236/// loser return content-equivalent data since both fetched the
237/// same `releases.json`. Worst case under concurrent first
238/// calls: both callers issue the network round-trip, only one
239/// populates [`RELEASES_CACHE`]; every later call — from any
240/// thread — observes the populated slot via the `get` fast-path
241/// and skips the network.
242fn cached_releases_with_url(client: &Client, url: &str) -> Result<Vec<Release>> {
243    // Non-singleton clients bypass the cache (test fault injection).
244    if !is_shared_client(client) {
245        return fetch_releases(client, url);
246    }
247    // Cache-poison guard: the singleton path populates
248    // RELEASES_CACHE on miss. A test author that mistakenly
249    // passes a non-production URL with shared_client() would
250    // fill the cache with non-production data and corrupt every
251    // later production call (which reaches the cache via
252    // get-fast-path). Catch the misuse at debug-build time —
253    // production callers always thread RELEASES_URL through the
254    // `cached_releases_with` wrapper, so the assertion is a
255    // no-op for them; only a future test author wiring this
256    // function up with shared_client() and a mock URL would trip
257    // it.
258    debug_assert!(
259        url == RELEASES_URL,
260        "cached_releases_with_url: shared_client() must use RELEASES_URL \
261         to avoid RELEASES_CACHE pollution — got url={url:?}, expected \
262         RELEASES_URL ({RELEASES_URL:?}). Tests that need URL injection \
263         must pass a non-singleton Client (which takes the bypass branch \
264         above and never touches the cache).",
265    );
266    // Release-build guard: `debug_assert!` is stripped in
267    // optimized builds, so a non-RELEASES_URL on the singleton
268    // path would otherwise reach the populate-on-miss path below
269    // and persistently poison RELEASES_CACHE for every later
270    // production caller. Mirror the bypass-branch behavior
271    // (fetch directly, do not touch the cache) so the misuse
272    // degrades to a slow per-call fetch instead of a permanently
273    // wrong cache. The debug_assert above still fires loudly in
274    // dev builds; this branch only catches the misuse that
275    // slipped through to release.
276    if url != RELEASES_URL {
277        return fetch_releases(client, url);
278    }
279    if let Some(cached) = RELEASES_CACHE.get() {
280        return Ok(cached.clone());
281    }
282    let fresh = fetch_releases(client, url)?;
283    // Race-loss: `set` returns `Err(clone)` carrying back the
284    // clone we passed in; we discard it and return the original
285    // `fresh` below. See the rustdoc above for full semantics.
286    let _ = RELEASES_CACHE.set(fresh.clone());
287    Ok(fresh)
288}
289
/// Downloaded/cloned kernel source ready for building.
///
/// This is the value the acquisition entry points of this module hand
/// back to their caller. The type is `#[non_exhaustive]`: code outside
/// this crate cannot construct it with a struct literal or destructure
/// it without a trailing `..`, so fields can be added later without
/// breaking downstream users.
#[non_exhaustive]
pub struct AcquiredSource {
    /// Path to the kernel source directory.
    pub source_dir: PathBuf,
    /// Cache key for this source (e.g. "6.14.2-tarball-x86_64-kc{kconfig_hash}").
    pub cache_key: String,
    /// Version string if known (e.g. "6.14.2", "6.15-rc3").
    pub version: Option<String>,
    /// How the source was acquired, with per-variant payload
    /// (git hash/ref for `Git`, source tree path and git hash for
    /// `Local`).
    pub kernel_source: crate::cache::KernelSource,
    /// Whether the source is a temporary directory that should be
    /// cleaned up after building.
    pub is_temp: bool,
    /// For local sources: whether the working tree is dirty.
    /// Dirty trees must not be cached.
    pub is_dirty: bool,
    /// For local sources: whether the source is an actual git
    /// repository. `true` when `gix::discover` succeeded and the
    /// crate could compute index + worktree dirty state; `false`
    /// for non-git source trees (tarball-extracted, rsync'd,
    /// hand-assembled) where dirty detection is impossible and
    /// the source is always cache-skipped pessimistically. Lets
    /// the cache-skip hint branch on whether `commit` / `stash`
    /// are actionable remediations (they aren't for non-git
    /// sources).
    ///
    /// For non-local sources (tarball, git clone) the field is
    /// set to `true` by convention — these paths are always
    /// `is_dirty = false`, so the cache-skip branch that reads
    /// `is_git` is never reached and the value is inert. Pinning
    /// to `true` (rather than leaving the field meaningless)
    /// keeps the invariant "is_git is meaningful only when
    /// is_dirty is true, but always set" so a future code path
    /// that reaches `is_git` outside the cache-skip context does
    /// not trip on an `is_git = false` under a known-good source.
    pub is_git: bool,
}
330
/// Target architecture name paired with the kernel boot image file
/// name for that architecture, selected at compile time.
///
/// Yields `("x86_64", "bzImage")` on x86_64 and `("aarch64", "Image")`
/// on aarch64. No other target has a definition, so the function
/// fails to compile there.
pub fn arch_info() -> (&'static str, &'static str) {
    #[cfg(target_arch = "x86_64")]
    const ARCH_AND_IMAGE: (&str, &str) = ("x86_64", "bzImage");
    #[cfg(target_arch = "aarch64")]
    const ARCH_AND_IMAGE: (&str, &str) = ("aarch64", "Image");

    ARCH_AND_IMAGE
}
342
343/// Parse a version string into its major version for URL construction.
344///
345/// "6.14.2" -> 6, "6.15-rc3" -> 6.
346fn major_version(version: &str) -> Result<u32> {
347    let major_str = version
348        .split('.')
349        .next()
350        .ok_or_else(|| anyhow!("invalid version: {version}"))?;
351    major_str
352        .parse::<u32>()
353        .with_context(|| format!("invalid major version in {version}"))
354}
355
/// Report whether a version string names a release candidate, i.e.
/// whether it contains the `-rc` marker anywhere.
///
/// RC releases use a different URL pattern and gzip compression
/// (vs xz for stable).
fn is_rc(version: &str) -> bool {
    // `split_once` succeeds exactly when the marker occurs.
    version.split_once("-rc").is_some()
}
363
364/// One (`moniker`, `version`) row from kernel.org's `releases.json`.
365///
366/// A named struct instead of a bare `(String, String)` tuple so every
367/// call site reads its field by name (`r.moniker`, `r.version`) rather
368/// than positional destructuring — the two strings are trivially
369/// swappable at a tuple-destructure call site, and a silent swap
370/// would mis-drive `is_skippable_release_moniker` while the
371/// now-misnamed "moniker" string flows into `version_prefix`
372/// downstream. Naming the fields removes that class of bug at the
373/// type-checker level and shows up in IDE hints on every iteration
374/// site.
375///
376/// Both fields are owned `String` (not `&str`) because the values are
377/// parsed out of a `reqwest::Response` body whose lifetime ends when
378/// `fetch_releases` returns; downstream callers iterate the vector
379/// long after that borrow would dangle.
// `Clone` is load-bearing: `cached_releases_with_url` hands every
// caller its own copy of the cached `Vec<Release>` (and clones the
// freshly fetched vector into the cache), which needs `Release: Clone`.
#[derive(Clone, Debug)]
pub(crate) struct Release {
    /// releases.json `moniker` field — stable / longterm / mainline /
    /// linux-next / etc. Consumed by
    /// [`is_skippable_release_moniker`] and by
    /// [`fetch_latest_stable_version`]'s stable/longterm filter.
    pub moniker: String,
    /// releases.json `version` field — e.g. `"6.14.2"`, `"6.15-rc3"`,
    /// `"6.16-rc2-next-20260420"`. Consumed by
    /// [`version_tuple`], [`patch_level`], and
    /// `cli::version_prefix`.
    pub version: String,
}
393
394/// Is this releases.json moniker one that the version-resolution
395/// pipeline should skip?
396///
397/// `linux-next` is a rolling integration branch whose version strings
398/// carry a date suffix rather than a stable tag, so it does not fit
399/// the major.minor.patch resolution model used by `latest_in_series`,
400/// `fetch_version_for_prefix`, and `cli::fetch_active_prefixes`. The
401/// release iteration in all three sites filters it out; this helper
402/// is the single point of truth for that decision so a future moniker
403/// that also warrants skipping can be added in one place.
pub(crate) fn is_skippable_release_moniker(moniker: &str) -> bool {
    // Pattern form: a further skippable moniker becomes one more
    // `| "…"` alternative here.
    matches!(moniker, "linux-next")
}
407
408/// Find the latest version in the same major.minor series from releases.json.
409///
410/// Returns `Some("6.14.10")` for prefix `"6.14"` if that series exists in
411/// releases.json. Returns `None` if the series is not found (EOL or invalid).
412fn latest_in_series(client: &Client, version: &str) -> Option<String> {
413    let prefix = {
414        let parts: Vec<&str> = version.split('.').collect();
415        if parts.len() >= 2 {
416            format!("{}.{}", parts[0], parts[1])
417        } else {
418            return None;
419        }
420    };
421
422    // Routes through [`RELEASES_CACHE`] for the singleton; see
423    // [`cached_releases_with`] for the bypass gate.
424    let releases = cached_releases_with(client).ok()?;
425    let mut best: Option<(String, (u32, u32, u32))> = None;
426    for r in &releases {
427        if is_skippable_release_moniker(&r.moniker) {
428            continue;
429        }
430        if !r.version.starts_with(&prefix) {
431            continue;
432        }
433        if r.version.len() != prefix.len() && r.version.as_bytes()[prefix.len()] != b'.' {
434            continue;
435        }
436        if let Some(tuple) = version_tuple(&r.version)
437            && (best.is_none() || tuple > best.as_ref().unwrap().1)
438        {
439            best = Some((r.version.clone(), tuple));
440        }
441    }
442    best.map(|(v, _)| v)
443}
444
445/// Build a user-facing error message for a version that was not found.
446///
447/// Suggests the latest version in the same major.minor series when
448/// releases.json contains one.
449fn version_not_found_msg(client: &Client, version: &str) -> String {
450    let parts: Vec<&str> = version.split('.').collect();
451    let prefix = if parts.len() >= 2 {
452        format!("{}.{}", parts[0], parts[1])
453    } else {
454        version.to_string()
455    };
456    match latest_in_series(client, version) {
457        Some(latest) if latest != version => {
458            format!("version {version} not found. latest {prefix}.x: {latest}")
459        }
460        _ => format!("version {version} not found"),
461    }
462}
463
464/// Reject responses where the server returned HTML instead of a binary
465/// archive. Some CDN error pages return 200 with text/html.
466fn reject_html_response(response: &reqwest::blocking::Response, url: &str) -> Result<()> {
467    if let Some(ct) = response.headers().get(reqwest::header::CONTENT_TYPE)
468        && let Ok(ct_str) = ct.to_str()
469        && ct_str.contains("text/html")
470    {
471        anyhow::bail!(
472            "download {url}: server returned HTML instead of tarball (URL may be invalid)"
473        );
474    }
475    Ok(())
476}
477
478/// Print download size from Content-Length header if available.
479///
480/// `cli_label` prefixes the diagnostic line so the message matches the
481/// binary the user invoked (`"ktstr"` vs `"cargo ktstr"`).
482fn print_download_size(
483    response: &reqwest::blocking::Response,
484    url: &str,
485    cli_label: &str,
486    mp: Option<&crate::cli::FetchProgress>,
487) {
488    let line = if let Some(len) = response.content_length() {
489        let mib = len as f64 / (1024.0 * 1024.0);
490        format!("{cli_label}: downloading {url} ({mib:.1} MiB)")
491    } else {
492        format!("{cli_label}: downloading {url}")
493    };
494    // Route through the progress group so the line coordinates with
495    // concurrent bars on a TTY (and still reaches piped/CI stderr when
496    // the group is hidden); raw `eprintln!` when no group is present.
497    match mp {
498        Some(fp) => fp.println(&line),
499        None => eprintln!("{line}"),
500    }
501}
502
/// Maximum tolerated stretch of "no body bytes received" before a
/// streaming download is declared stalled. Catches a TCP connection
/// that completed handshake (so connect_timeout doesn't fire) but
/// then silently stops delivering body data — a common CDN failure
/// mode where keepalive holds the socket open while the upstream
/// origin is unreachable. The 60s value is generous enough that a
/// real slow uplink delivering chunks every few seconds never
/// triggers it, but tight enough that a wedged connection surfaces
/// before the run's overall test timeout.
///
/// Copied into each [`DownloadStream`] at construction time by
/// `DownloadStream::with_progress`, which stores it in the
/// `no_progress_timeout` field.
const DOWNLOAD_NO_PROGRESS_TIMEOUT: Duration = Duration::from_secs(60);
513
514/// Streaming `Read` adapter for kernel tarball downloads.
515///
516/// Wraps the [`reqwest::blocking::Response`] body to do two things
517/// the bare response cannot:
518///
519/// 1. **Body-progress watchdog.** Tracks `last_progress` (the
520///    instant of the last successful read with `n > 0`) and errors
521///    when more than [`DOWNLOAD_NO_PROGRESS_TIMEOUT`] elapses
522///    between byte-producing reads. Without this, a CDN edge that
523///    keepalives the socket but stops delivering body bytes would
524///    only surface after reqwest's per-request read timeout
525///    ([`DOWNLOAD_REQUEST_READ_TIMEOUT`], 300s), which bounds a
526///    single stalled `read()`; the watchdog applies the tighter
527///    60s no-progress bound across successive reads. The check fires
528///    BEFORE the inner `read()` so a stalled inner reader cannot
529///    out-block the watchdog.
530///
531/// 2. **Streaming SHA-256.** Updates a [`Sha256`] hasher with every
532///    byte that flows past, so the caller can verify the finalized
533///    digest against an expected value (parsed out of
534///    `sha256sums.asc`) without a second pass over the data. The
535///    hasher only sees bytes that were actually consumed by the
536///    decoder + tar extractor, which is the same set of bytes that
537///    landed on disk — so a partial download that errored midway
538///    produces a hash over only what we successfully streamed,
539///    preventing false-positive verifications on truncated input.
540///
541/// Sits between [`reqwest::blocking::Response`] and the
542/// decompression layer (`XzDecoder` / `GzDecoder`); both
543/// decompressors expose `into_inner()` so the wrapper can be
544/// recovered after extraction completes (see
545/// [`Self::finalize`]).
struct DownloadStream<R: Read> {
    /// Underlying reqwest response body. Owned because `XzDecoder`
    /// and `GzDecoder` take ownership of their inner reader, so
    /// the wrapper must hold the response by value rather than by
    /// reference.
    inner: R,
    /// Running SHA-256 hasher updated on every byte-producing read.
    /// Consumed by [`DownloadStream::finalize`] (which takes `self`
    /// by value); the call site recovers the wrapper from inside
    /// the decoder + tar archive chain via `into_inner` before
    /// finalizing.
    hasher: Sha256,
    /// Total body bytes read so far. Surfaced in the watchdog
    /// error message so an operator triaging "no progress" can see
    /// how many bytes did arrive before the stall — distinguishing
    /// "connection dropped after a few bytes" from "connection
    /// dropped after most of the payload". Incremented only on
    /// byte-producing reads and returned as the second element of
    /// the [`DownloadStream::finalize`] tuple.
    bytes_total: u64,
    /// `Instant` of the last successful read with `n > 0`. Set at
    /// construction (not on first read) so a connection that wins
    /// the handshake but never delivers any body bytes still
    /// trips the watchdog after [`DOWNLOAD_NO_PROGRESS_TIMEOUT`]
    /// rather than waiting for an indeterminate pre-data window.
    last_progress: Instant,
    /// Tolerated stretch of zero-progress time. Pinned at
    /// construction from [`DOWNLOAD_NO_PROGRESS_TIMEOUT`]; held in
    /// the struct rather than read from the constant on every
    /// `read()` so a future per-call override (e.g. shorter
    /// timeouts in tests) lands without touching the watchdog
    /// logic. Compared against `last_progress.elapsed()` at the
    /// top of every `read()`.
    no_progress_timeout: Duration,
    /// Optional indicatif download bar, advanced by `inc(n)` on
    /// every byte-producing read in lockstep with `bytes_total`.
    /// `None` is the no-bar path (non-TTY, or no progress group
    /// threaded in) and carries zero per-read overhead beyond the
    /// `Option` check. Advancing here — the single byte-accounting
    /// site — guarantees `bar.position() == finalize().1`, so the
    /// bar can never drift from the bytes the hasher and watchdog
    /// observed.
    progress: Option<indicatif::ProgressBar>,
}
587
588impl<R: Read> DownloadStream<R> {
589    /// Construct a streaming wrapper around `inner` with the production
590    /// no-progress budget, optionally attaching an indicatif progress
591    /// bar. `last_progress` is set to "now" so the watchdog clock starts
592    /// at construction; the downstream decoder may take an indeterminate
593    /// time before the first `read()`, but any actual progress resets
594    /// the clock. The optional bar is advanced by `inc(n)` on every
595    /// byte-producing read (see the `progress` field); `progress = None`
596    /// is the non-TTY / no-group path (no bar). The bar is a pure
597    /// observer — it never affects the watchdog gate or the streaming
598    /// sha256, so a stalled or truncated download still surfaces its
599    /// error unchanged.
600    fn with_progress(inner: R, progress: Option<indicatif::ProgressBar>) -> Self {
601        Self {
602            inner,
603            hasher: Sha256::new(),
604            bytes_total: 0,
605            last_progress: Instant::now(),
606            no_progress_timeout: DOWNLOAD_NO_PROGRESS_TIMEOUT,
607            progress,
608        }
609    }
610
611    /// Consume the wrapper and return `(hex_digest, bytes_total)`.
612    /// Lowercase hex matches the format kernel.org publishes in
613    /// `sha256sums.asc`, so the caller can do a direct
614    /// `eq_ignore_ascii_case` comparison without re-encoding.
615    fn finalize(self) -> (String, u64) {
616        (hex::encode(self.hasher.finalize()), self.bytes_total)
617    }
618}
619
620impl<R: Read> Read for DownloadStream<R> {
621    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
622        // Watchdog gate: trip BEFORE delegating to the inner reader
623        // so a stalled inner read does not get a fresh chance to
624        // run after the no-progress window has already expired. The
625        // wrapper cannot interrupt a `read()` that is currently
626        // blocked in a syscall — that protection comes from the
627        // per-request timeout configured via
628        // `RequestBuilder::timeout` — but it can refuse to issue
629        // the next call once the cumulative no-progress window
630        // crosses the bound.
631        let elapsed = self.last_progress.elapsed();
632        if elapsed > self.no_progress_timeout {
633            return Err(std::io::Error::new(
634                std::io::ErrorKind::TimedOut,
635                format!(
636                    "download stalled: no body bytes for {}s after {} bytes received",
637                    elapsed.as_secs(),
638                    self.bytes_total,
639                ),
640            ));
641        }
642        match self.inner.read(buf) {
643            Ok(0) => {
644                // EOF: do NOT update last_progress — a 0-byte read
645                // is not progress, and updating here would let a
646                // decoder that polls past EOF reset the watchdog
647                // indefinitely.
648                Ok(0)
649            }
650            Ok(n) => {
651                self.hasher.update(&buf[..n]);
652                self.bytes_total += n as u64;
653                self.last_progress = Instant::now();
654                // Advance the bar in lockstep with `bytes_total` (same
655                // `n`, same reads) so `position()` and `finalize().1`
656                // never diverge. No-op when no bar is attached.
657                if let Some(pb) = &self.progress {
658                    pb.inc(n as u64);
659                }
660                Ok(n)
661            }
662            Err(e) => Err(e),
663        }
664    }
665}
666
/// Per-request body-stream timeout passed to
/// [`reqwest::blocking::RequestBuilder::timeout`] for tarball
/// downloads.
///
/// The blocking client treats this as a per-`read()` deadline (reset
/// on every successful read), so it complements the
/// [`DownloadStream`] watchdog: reqwest's deadline kills a single
/// stalled syscall, and the watchdog observes the cumulative
/// no-progress window across multiple reads.
///
/// Set generously (5 minutes) because a slow but progressing
/// connection can legitimately take that long for a single read on a
/// large CDN chunk; the watchdog provides the tighter no-progress
/// bound (described here as 60s — the authoritative value is
/// [`DOWNLOAD_NO_PROGRESS_TIMEOUT`]).
///
/// NOTE(review): reqwest's own documentation for
/// `RequestBuilder::timeout` describes the timeout as running from
/// the start of connecting until the response body has finished.
/// Confirm against the pinned reqwest version that this value acts
/// only per read and does not also cap the whole download at
/// 5 minutes.
const DOWNLOAD_REQUEST_READ_TIMEOUT: Duration = Duration::from_secs(300);
678
/// Total request timeout for [`fetch_sha256sums_from_url`]: bounds
/// the wall-clock window for the single small-body GET that
/// retrieves the cleartext-signed checksum manifest.
///
/// The body is the `sha256sums.asc` cleartext block — typically a
/// few KiB of `<hash>  <filename>` lines plus a PGP signature
/// trailer — so a tight 30 s ceiling fits the realistic case
/// (sub-second on a healthy CDN edge) while still bounding the
/// failure mode this guards against: a stalled CDN that accepts the
/// connection but never delivers bytes.
///
/// Without a per-request timeout the shared client only carries
/// [`SHARED_CLIENT_CONNECT_TIMEOUT`] (handshake-only), so a stalled
/// body read would hang the build indefinitely.
///
/// The caller treats any error from [`fetch_sha256sums_from_url`] as
/// "no expected hash available" and downgrades verification to a
/// warning, so a 30 s timeout that fires on a hung CDN surfaces as
/// an unverified-but-progressing download rather than a wedged
/// build.
const SHA256SUMS_REQUEST_TIMEOUT: Duration = Duration::from_secs(30);
696
/// Build the cdn.kernel.org checksum-manifest URL for a stable major
/// series, i.e.
/// `https://cdn.kernel.org/pub/linux/kernel/v{major}.x/sha256sums.asc`.
///
/// This is the one place the manifest URL shape is spelled out:
/// [`resolve_expected_sha256`] (production) calls it, and the
/// URL-injection test seam shares it, so the two cannot drift apart.
fn sha256sums_url(major: u32) -> String {
    const KERNEL_ROOT: &str = "https://cdn.kernel.org/pub/linux/kernel";
    const MANIFEST_FILE: &str = "sha256sums.asc";
    format!("{KERNEL_ROOT}/v{major}.x/{MANIFEST_FILE}")
}
706
707/// GET the cleartext SHA-256 manifest at `url` and return its body.
708///
709/// Returns the file body as a `String` on success. Any error
710/// (transport failure, non-2xx status, non-UTF-8 body) is
711/// propagated; the caller treats failure as "no expected hash
712/// available" and downgrades verification to a warning.
713///
714/// Takes the full `url` rather than a `major` so the GET-and-status
715/// mechanics are reachable with an injected URL (a localhost mock)
716/// without a real cdn.kernel.org round-trip — mirrors the
717/// [`fetch_releases`] / [`cached_releases_with_url`] seam. Production
718/// reaches this only via [`resolve_expected_sha256_from_url`], whose
719/// URL is pinned by [`sha256sums_url`].
720fn fetch_sha256sums_from_url(client: &Client, url: &str) -> Result<String> {
721    tracing::info!(%url, "fetching kernel tarball sha256sums (requires network)");
722    let response = client
723        .get(url)
724        .timeout(SHA256SUMS_REQUEST_TIMEOUT)
725        .send()
726        .with_context(|| format!("fetch {url}"))?;
727    if !response.status().is_success() {
728        anyhow::bail!("fetch {url}: HTTP {}", response.status());
729    }
730    response
731        .text()
732        .with_context(|| format!("read body of {url}"))
733}
734
/// Extract the SHA-256 hex digest for `target_filename` from the
/// cleartext-signed `sha256sums.asc` body.
///
/// kernel.org publishes `sha256sums.asc` as a PGP-cleartext-signed
/// document: a `-----BEGIN PGP SIGNED MESSAGE-----` header, an
/// optional `Hash:` line, a blank line, the cleartext body
/// (`<64-hex-chars>  <filename>` per line), then a
/// `-----BEGIN PGP SIGNATURE-----` block. Only the cleartext body is
/// consulted — signature verification is a separate concern.
///
/// A line counts as an entry only when it consists of EXACTLY two
/// whitespace-separated tokens: a 64-character hex digest followed by
/// the filename. Lines with extra tokens are skipped, so an entry for
/// a differently named file whose name merely starts with
/// `target_filename` followed by a space cannot be mistaken for the
/// target. Lines whose filename matches but whose digest is malformed
/// are skipped as well, and the scan continues.
///
/// Returns `Some(lowercase_hex)` for the first matching entry, or
/// `None` when the manifest holds no well-formed entry for
/// `target_filename` (e.g. upstream rotated or removed it).
fn parse_sha256_for_file(manifest: &str, target_filename: &str) -> Option<String> {
    // Drop the PGP signature trailer if present: nothing after the
    // signature marker is a checksum line.
    let body = manifest
        .split_once("-----BEGIN PGP SIGNATURE-----")
        .map_or(manifest, |(before, _)| before);

    body.lines().find_map(|line| {
        // sha256sum format: `<64-hex-chars><whitespace><filename>`.
        // `split_whitespace` also discards leading/trailing padding.
        let mut tokens = line.split_whitespace();
        let (Some(hash), Some(name), None) = (tokens.next(), tokens.next(), tokens.next())
        else {
            // Fewer or more than two tokens: not a checksum entry.
            return None;
        };
        let well_formed = hash.len() == 64 && hash.bytes().all(|b| b.is_ascii_hexdigit());
        (name == target_filename && well_formed).then(|| hash.to_ascii_lowercase())
    })
}
776
777/// Verify `actual_hex` against `expected_hex` (case-insensitive).
778/// Returns `Ok(())` on match, `Err` with a diagnostic message on
779/// mismatch. Pulled out of the call site so the comparison logic
780/// has one home and the diagnostic carries both digests in lowercase
781/// hex for direct copy-paste reuse.
782fn verify_sha256(actual_hex: &str, expected_hex: &str, url: &str) -> Result<()> {
783    if actual_hex.eq_ignore_ascii_case(expected_hex) {
784        Ok(())
785    } else {
786        anyhow::bail!(
787            "sha256 mismatch for {url}: expected {}, got {}. \
788             If cdn.kernel.org updated this tarball in-place, \
789             retry with --skip-sha256 to bypass verification.",
790            expected_hex.to_ascii_lowercase(),
791            actual_hex.to_ascii_lowercase(),
792        );
793    }
794}
795
796/// Resolve the expected SHA-256 digest for a stable tarball from
797/// cdn.kernel.org's `sha256sums.asc` manifest.
798///
799/// Three outcomes:
800/// - `Some(hex)` — manifest fetched and the entry for `tarball_name`
801///   was parsed cleanly.
802/// - `None` with no warning (only when `skip_sha256 = true`) —
803///   operator explicitly opted out of verification; emits a single
804///   security-sensitive bypass warning instead.
805/// - `None` with a per-cause warning (manifest fetch failed, or
806///   manifest fetched but entry missing) — best-effort fallback so
807///   a transient cdn.kernel.org outage / schema drift does not
808///   gate the whole download.
809///
810/// The fallback path is deliberately permissive: we trade strict
811/// authentication for build availability. A network-path attacker
812/// who can deny `sha256sums.asc` while serving a poisoned
813/// `linux-{version}.tar.xz` could exploit this; operators who
814/// require strict verification should pin the source via a
815/// `--kernel <path>` or `--kernel git+…` source rather than the
816/// download path. The bypass warnings
817/// surface on the operator's diagnostic stream so the lost
818/// guarantee is visible to ops triage.
819///
820/// Extracted from [`download_stable_tarball`] so the gate is
821/// directly unit-testable without mocking network calls — the
822/// caller-supplied `client` reaches a `Client::get` only when
823/// `skip_sha256 == false`, so a `skip_sha256 = true` test does not
824/// need a configured `Client`.
825fn resolve_expected_sha256(
826    client: &Client,
827    major: u32,
828    tarball_name: &str,
829    skip_sha256: bool,
830) -> Option<String> {
831    resolve_expected_sha256_from_url(client, &sha256sums_url(major), tarball_name, skip_sha256)
832}
833
834/// URL-injectable core of [`resolve_expected_sha256`]: the skip-gate,
835/// fetch-then-parse, and per-cause warn-and-downgrade logic, against
836/// an arbitrary `sha256sums_url`. Production reaches this only via
837/// [`resolve_expected_sha256`], which pins the URL to
838/// [`sha256sums_url`]; the seam exists so the no-skip arm's
839/// fetch-and-parse path is testable against a localhost mock without a
840/// real cdn.kernel.org round-trip — mirrors [`cached_releases_with_url`].
841fn resolve_expected_sha256_from_url(
842    client: &Client,
843    sha256sums_url: &str,
844    tarball_name: &str,
845    skip_sha256: bool,
846) -> Option<String> {
847    if skip_sha256 {
848        tracing::warn!(
849            tarball = %tarball_name,
850            "--skip-sha256: bypassing checksum verification — the \
851             downloaded tarball will not be authenticated against \
852             cdn.kernel.org's sha256sums.asc manifest. Use only when \
853             upstream has updated a tarball in-place and the manifest \
854             is mismatched.",
855        );
856        return None;
857    }
858    // Best-effort expected-hash lookup: any failure (network,
859    // status, parse, missing entry) downgrades to a warning so the
860    // download still proceeds. The warning surfaces the cause so an
861    // operator triaging "kernel build went weird" can spot that
862    // verification was skipped.
863    match fetch_sha256sums_from_url(client, sha256sums_url) {
864        Ok(manifest) => match parse_sha256_for_file(&manifest, tarball_name) {
865            Some(hex) => Some(hex),
866            None => {
867                tracing::warn!(
868                    tarball = %tarball_name,
869                    "sha256sums.asc fetched but no entry for {tarball_name}; \
870                     download will proceed without checksum verification. \
871                     Pass --skip-sha256 to bypass the manifest fetch when \
872                     the entry is known to be absent.",
873                );
874                None
875            }
876        },
877        Err(err) => {
878            tracing::warn!(
879                error = %format!("{err:#}"),
880                "failed to fetch sha256sums.asc; download will proceed \
881                 without checksum verification. Pass --skip-sha256 to \
882                 bypass the manifest fetch when the manifest is known \
883                 to be unavailable.",
884            );
885            None
886        }
887    }
888}
889
/// GitHub mirror of the linux-stable tree — comprehensive (stable +
/// base-release `vX.Y.Z` tags back to v2.6) and the authoritative
/// source for tags whose `.tar.xz` is no longer on cdn.kernel.org.
///
/// Why a mirror is needed: cdn.kernel.org keeps only the LATEST
/// tarball of each series currently in `releases.json`; every
/// superseded point release AND every tag of an EOL series is pruned
/// (a GET for the tarball 404s, verified empirically — and HEAD is
/// not a dependable existence probe on the CDN).
///
/// What the mirror provides: the gregkh mirror still carries every
/// `vX.Y.Z` tag, and codeload serves each tag's snapshot as a
/// `tar.gz`, so a codeload download recovers the source a pruned
/// tarball would have provided — no clone. Its `ls-refs` advertises
/// every release tag, which `--include-eol` enumerates to surface
/// EOL series absent from `releases.json` (see
/// [`cached_stable_tags`]) and which [`fetch_version_for_prefix`]
/// resolves for an EOL/unreleased series.
///
/// Why GitHub rather than git.kernel.org: github.com advertises
/// allow-sha, supports a ref-prefix filter, and has a codeload CDN;
/// git.kernel.org offers neither.
///
/// Used by [`download_tarball`]'s [`TarballNotFound`] fallback and
/// the prefix resolver.
const STABLE_MIRROR_URL: &str = "https://github.com/gregkh/linux";
909
/// Marker error returned by the stable-tarball download when
/// cdn.kernel.org answers HTTP 404.
///
/// A 404 means the tarball has been pruned: either the series is EOL
/// (absent from `releases.json`) or the point release was superseded
/// (the CDN retains only each maintained series' latest).
/// [`download_tarball`] detects the marker with `downcast_ref` — the
/// context-aware anyhow accessor; a `chain().any(..is::<T>())` walk
/// would MISS a context-wrapped marker — and falls back to a codeload
/// snapshot of the tag from the gregkh mirror
/// ([`STABLE_MIRROR_URL`]). Any other HTTP status is a hard error
/// with no fallback.
#[derive(Debug)]
struct TarballNotFound;

impl std::fmt::Display for TarballNotFound {
    /// Writes the fixed human-readable description of the marker.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        const MESSAGE: &str =
            "stable tarball pruned from cdn.kernel.org (EOL or superseded point release)";
        write!(f, "{MESSAGE}")
    }
}

// No `source()`: the marker is a leaf error.
impl std::error::Error for TarballNotFound {}
931
932/// Download a stable kernel tarball (.tar.xz) from cdn.kernel.org.
933///
934/// Returns a [`TarballNotFound`] error (downcast-detectable) when the
935/// CDN 404s the tarball — see that type for the pruning semantics and
936/// [`download_tarball`] for the git-tag fallback it triggers.
937///
938/// Streams the body through a [`DownloadStream`] watchdog so a
939/// stalled connection (no body bytes for
940/// [`DOWNLOAD_NO_PROGRESS_TIMEOUT`]) surfaces as an error rather
941/// than blocking indefinitely. Computes SHA-256 over the streamed
942/// bytes and verifies against the digest in
943/// `sha256sums.asc` for the matching `linux-{version}.tar.xz`
944/// entry; if the manifest fetch / parse fails (transient outage,
945/// schema drift, missing entry), logs a warning and continues
946/// without verification rather than failing the whole download.
947///
948/// `skip_sha256 = true` bypasses the manifest fetch entirely and
949/// emits a single bypass warning. Intended for the case where
950/// cdn.kernel.org has updated a tarball in-place (a new point
951/// release reusing the same URL) and the manifest is stale or
952/// mismatched. Unverified downloads are a security-sensitive
953/// fallback — the bypass warning surfaces the lost guarantee on
954/// the operator's diagnostic stream.
955fn download_stable_tarball(
956    client: &Client,
957    version: &str,
958    dest_dir: &Path,
959    cli_label: &str,
960    skip_sha256: bool,
961    mp: Option<&crate::cli::FetchProgress>,
962) -> Result<PathBuf> {
963    let major = major_version(version)?;
964    let url = format!("https://cdn.kernel.org/pub/linux/kernel/v{major}.x/linux-{version}.tar.xz");
965    download_stable_tarball_from_url(client, &url, version, dest_dir, cli_label, skip_sha256, mp)
966}
967
968/// URL-injectable core of [`download_stable_tarball`]: the GET, the
969/// 404→[`TarballNotFound`] / other-status→hard-error status gate, and
970/// the stream→verify→extract pipeline, against an arbitrary tarball
971/// `url`. Production reaches this only via [`download_stable_tarball`],
972/// which pins the cdn.kernel.org URL; the seam exists so the status
973/// routing (404 marker vs hard error) is unit-testable against a
974/// localhost mock without a real cdn round-trip — mirrors
975/// [`resolve_expected_sha256_from_url`] / [`fetch_releases`].
976fn download_stable_tarball_from_url(
977    client: &Client,
978    url: &str,
979    version: &str,
980    dest_dir: &Path,
981    cli_label: &str,
982    skip_sha256: bool,
983    mp: Option<&crate::cli::FetchProgress>,
984) -> Result<PathBuf> {
985    let major = major_version(version)?;
986    let tarball_name = format!("linux-{version}.tar.xz");
987
988    let expected_sha256 = resolve_expected_sha256(client, major, &tarball_name, skip_sha256);
989
990    tracing::info!(%url, "downloading stable kernel tarball (requires network)");
991    let response = client
992        .get(url)
993        .timeout(DOWNLOAD_REQUEST_READ_TIMEOUT)
994        .send()
995        .with_context(|| format!("download {url}"))?;
996    if !response.status().is_success() {
997        if response.status() == reqwest::StatusCode::NOT_FOUND {
998            // Pruned tarball (EOL series or superseded point release).
999            // Return the downcast-detectable marker so `download_tarball`
1000            // falls back to a codeload snapshot of the tag from the
1001            // gregkh mirror (`STABLE_MIRROR_URL`) rather than failing
1002            // outright.
1003            return Err(anyhow::Error::new(TarballNotFound));
1004        }
1005        anyhow::bail!("download {url}: HTTP {}", response.status());
1006    }
1007    reject_html_response(&response, url)?;
1008    print_download_size(&response, url, cli_label, mp);
1009    // Capture the total before `response` is moved into the stream so a
1010    // determinate (percent + ETA) bar can be built; `None` when the
1011    // server sent no Content-Length, in which case the bar degrades to
1012    // a live byte counter.
1013    let total = response.content_length();
1014
1015    // Route status lines through the progress group (see
1016    // `print_download_size`); `eprintln!` when no group is threaded in.
1017    let status = |line: &str| match mp {
1018        Some(fp) => fp.println(line),
1019        None => eprintln!("{line}"),
1020    };
1021    status(&format!("{cli_label}: extracting tarball (xz)"));
1022    // Stage extraction inside `dest_dir` (same filesystem) so the
1023    // final `fs::rename` into place is atomic and a verification
1024    // failure leaves `dest_dir` untouched. A bad mirror that serves
1025    // a wrong-version archive — or sneaks stray top-level entries
1026    // alongside `linux-{version}/` — gets caught after extraction
1027    // but before anything lands in `dest_dir`. The TempDir's Drop
1028    // sweeps every entry the malicious archive deposited.
1029    let staging =
1030        tempfile::TempDir::new_in(dest_dir).with_context(|| "create extraction staging dir")?;
1031    let download_bar = mp.map(|fp| fp.download_bar(version, total));
1032    let stream = DownloadStream::with_progress(response, download_bar.as_ref().map(|b| b.bar()));
1033    let decoder = xz2::read::XzDecoder::new(stream);
1034    let mut archive = tar::Archive::new(decoder);
1035    archive
1036        .unpack(staging.path())
1037        .with_context(|| "extract tarball")?;
1038
1039    // Recover the watchdog wrapper from inside the decoder/archive
1040    // chain to read the streaming digest. `into_inner` on tar +
1041    // xz2 each peel one layer of the chain. Done after a successful
1042    // unpack so we don't compute over a partial stream.
1043    let stream = archive.into_inner().into_inner();
1044    let (actual_hex, bytes_total) = stream.finalize();
1045    // Download is complete (every byte streamed) — clear the bar
1046    // before emitting the verification status so the two don't overlap.
1047    if let Some(bar) = &download_bar {
1048        bar.finish();
1049    }
1050    if let Some(expected) = expected_sha256.as_deref() {
1051        verify_sha256(&actual_hex, expected, url)?;
1052        status(&format!(
1053            "{cli_label}: sha256 verified ({bytes_total} bytes, hash {actual_hex})"
1054        ));
1055    } else if !skip_sha256 {
1056        // Skip path already emitted its bespoke bypass warning
1057        // before the download; firing again here under "no
1058        // expected sha256 available" would mislead — that wording
1059        // implies a fallback, not an explicit operator opt-out.
1060        tracing::warn!(
1061            url = %url,
1062            bytes = bytes_total,
1063            sha256 = %actual_hex,
1064            "no expected sha256 available for {url}; computed digest \
1065             {actual_hex} over {bytes_total} bytes is unverified",
1066        );
1067    }
1068
1069    let source_dir = promote_staged_kernel_tree(&staging, dest_dir, version)?;
1070    Ok(source_dir)
1071}
1072
1073/// Verify a kernel tarball's staged extraction contains exactly one
1074/// top-level entry named `linux-{version}/` and atomically rename it
1075/// into `dest_dir/linux-{version}`. Bails — leaving `dest_dir`
1076/// untouched — when the staging dir holds a stray entry, when the
1077/// expected inner directory is missing, or when the rename fails.
1078/// The caller's `TempDir` outlives this helper, so its Drop sweeps
1079/// any residual staging contents whether this returns Ok or Err.
1080fn promote_staged_kernel_tree(
1081    staging: &tempfile::TempDir,
1082    dest_dir: &Path,
1083    version: &str,
1084) -> Result<PathBuf> {
1085    let expected_name = format!("linux-{version}");
1086    let mut found_inner = false;
1087    for entry in std::fs::read_dir(staging.path()).with_context(|| "read staging dir entries")? {
1088        let entry = entry.with_context(|| "iterate staging dir entry")?;
1089        let name = entry.file_name();
1090        if name == std::ffi::OsStr::new(&expected_name) {
1091            found_inner = true;
1092        } else {
1093            anyhow::bail!(
1094                "tarball contains unexpected top-level entry {name:?}; \
1095                 expected only {expected_name}/"
1096            );
1097        }
1098    }
1099    if !found_inner {
1100        anyhow::bail!("expected directory {expected_name} after extraction");
1101    }
1102    let inner = staging.path().join(&expected_name);
1103    let source_dir = dest_dir.join(&expected_name);
1104    std::fs::rename(&inner, &source_dir)
1105        .with_context(|| format!("rename {} -> {}", inner.display(), source_dir.display()))?;
1106    Ok(source_dir)
1107}
1108
1109/// Promote the single top-level directory a codeload archive extracts
1110/// out of `staging` into `dest_dir/{canonical}`, so it survives
1111/// `staging`'s `Drop`.
1112///
1113/// Unlike [`promote_staged_kernel_tree`], the top-dir name is not
1114/// `linux-{version}` — GitHub derives it from the ref (`linux-6.11.11`
1115/// for a tag, `linux-{sha}` for a commit, `linux-{branch}` for a
1116/// branch), so this promotes the SOLE entry by structure rather than by
1117/// a fixed name, renaming it to a caller-supplied `canonical` name that
1118/// keys off the resolved commit (collision-free across refs). A hostile
1119/// or malformed snapshot that deposits zero or several top-level
1120/// entries — or a top-level entry that is not a plain directory (a
1121/// regular file, or a symlink, which the directory-entry file-type
1122/// check rejects rather than following) — is rejected before anything
1123/// lands in `dest_dir`; the `TempDir`'s `Drop` sweeps every entry the
1124/// archive left.
1125fn promote_single_kernel_tree(
1126    staging: &tempfile::TempDir,
1127    dest_dir: &Path,
1128    canonical: &str,
1129) -> Result<PathBuf> {
1130    let mut entries = Vec::new();
1131    for entry in std::fs::read_dir(staging.path()).with_context(|| "read staging dir entries")? {
1132        entries.push(entry.with_context(|| "iterate staging dir entry")?);
1133    }
1134    if entries.len() != 1 {
1135        anyhow::bail!(
1136            "codeload archive must contain exactly one top-level entry; found {}",
1137            entries.len()
1138        );
1139    }
1140    let inner = entries[0].path();
1141    // Use the DIRECTORY-ENTRY file type (does NOT follow symlinks) so a
1142    // top-level symlink-to-directory is rejected rather than promoted:
1143    // `Path::is_dir()` would follow the link and accept an
1144    // attacker-chosen target, and `fs::rename` moves the symlink itself
1145    // (it never dereferences), leaving the build reading through it.
1146    let entry_type = entries[0]
1147        .file_type()
1148        .with_context(|| "stat codeload top-level entry")?;
1149    if !entry_type.is_dir() {
1150        anyhow::bail!(
1151            "codeload archive top-level entry is not a plain directory: {}",
1152            inner.display()
1153        );
1154    }
1155    let source_dir = dest_dir.join(canonical);
1156    std::fs::rename(&inner, &source_dir)
1157        .with_context(|| format!("rename {} -> {}", inner.display(), source_dir.display()))?;
1158    Ok(source_dir)
1159}
1160
1161/// Download an RC kernel tarball (.tar.gz) from git.kernel.org.
1162///
1163/// Streams the body through a [`DownloadStream`] watchdog so a
1164/// stalled connection surfaces as an error rather than blocking
1165/// indefinitely. RC tarballs are dynamically generated by gitweb
1166/// at request time and have no published `sha256sums` manifest, so
1167/// this path always logs a warning that the digest is unverified —
1168/// it is computed and surfaced for diagnostic value (operators can
1169/// pin it manually) but never compared to an authoritative source.
1170fn download_rc_tarball(
1171    client: &Client,
1172    version: &str,
1173    dest_dir: &Path,
1174    cli_label: &str,
1175    mp: Option<&crate::cli::FetchProgress>,
1176) -> Result<PathBuf> {
1177    let url = format!("https://git.kernel.org/torvalds/t/linux-{version}.tar.gz");
1178    tracing::info!(%url, "downloading RC kernel tarball (requires network)");
1179
1180    let response = client
1181        .get(&url)
1182        .timeout(DOWNLOAD_REQUEST_READ_TIMEOUT)
1183        .send()
1184        .with_context(|| format!("download {url}"))?;
1185    if response.status() == reqwest::StatusCode::NOT_FOUND {
1186        anyhow::bail!(
1187            "RC tarball not found: {url}\n  \
1188             RC releases are removed from git.kernel.org after the stable version ships."
1189        );
1190    }
1191    if !response.status().is_success() {
1192        anyhow::bail!("download {url}: HTTP {}", response.status());
1193    }
1194    reject_html_response(&response, &url)?;
1195    print_download_size(&response, &url, cli_label, mp);
1196    // RC tarballs are gitweb-generated and often arrive without a
1197    // Content-Length, so `total` is frequently `None` and the bar
1198    // degrades to a live byte counter (rate, no ETA).
1199    let total = response.content_length();
1200
1201    let status = |line: &str| match mp {
1202        Some(fp) => fp.println(line),
1203        None => eprintln!("{line}"),
1204    };
1205    status(&format!("{cli_label}: extracting tarball (gzip)"));
1206    // Stage extraction inside `dest_dir` (same filesystem) so the
1207    // final atomic rename keeps `dest_dir` clean when a bad mirror
1208    // serves a wrong-version archive or sneaks stray top-level
1209    // entries past the archive boundary. RC tarballs have no
1210    // upstream sha256 manifest, so structural verification is the
1211    // only defence against a hostile gitweb response.
1212    let staging =
1213        tempfile::TempDir::new_in(dest_dir).with_context(|| "create extraction staging dir")?;
1214    let download_bar = mp.map(|fp| fp.download_bar(version, total));
1215    let stream = DownloadStream::with_progress(response, download_bar.as_ref().map(|b| b.bar()));
1216    let decoder = flate2::read::GzDecoder::new(stream);
1217    let mut archive = tar::Archive::new(decoder);
1218    archive
1219        .unpack(staging.path())
1220        .with_context(|| "extract tarball")?;
1221
1222    // Surface the streamed digest as a warning. RC tarballs have
1223    // no upstream manifest, so verification is impossible — but
1224    // emitting the hash gives an operator a value they can
1225    // capture for offline pinning if they want to detect drift on
1226    // re-fetch.
1227    let stream = archive.into_inner().into_inner();
1228    let (actual_hex, bytes_total) = stream.finalize();
1229    if let Some(bar) = &download_bar {
1230        bar.finish();
1231    }
1232    tracing::warn!(
1233        url = %url,
1234        bytes = bytes_total,
1235        sha256 = %actual_hex,
1236        "no expected sha256 available for {url} (RC tarballs are \
1237         dynamically generated by git.kernel.org and have no \
1238         published manifest); computed digest {actual_hex} over \
1239         {bytes_total} bytes is unverified",
1240    );
1241
1242    let source_dir = promote_staged_kernel_tree(&staging, dest_dir, version)?;
1243    Ok(source_dir)
1244}
1245
1246/// Download a GitHub source snapshot for `git_ref` as a codeload
1247/// `tar.gz` and extract it, returning an [`AcquiredSource`] keyed
1248/// identically to the clone path ([`git_cache_key`] over the resolved
1249/// `commit_hash`) so a codeload-acquired kernel and a clone-acquired
1250/// one of the same commit share the cache entry.
1251///
1252/// GitHub serves a gzip snapshot for any tag/branch/commit via
1253/// codeload; the caller supplies the `archive_url`
1254/// ([`github_archive_url`]) and the pre-resolved `commit_hash`
1255/// ([`resolve_ref_commit`]) — the snapshot has no `.git`, so the
1256/// commit cannot be read back from the tree. Modeled on
1257/// [`download_rc_tarball`] (gzip decode; codeload carries no sha256
1258/// manifest, so extraction is structurally verified —
1259/// [`promote_single_kernel_tree`] rejects any top level that is not a
1260/// single plain directory (multi-entry, a file, or a symlink) — and
1261/// the streamed digest is logged, not compared).
1262pub(crate) fn download_github_archive(
1263    client: &Client,
1264    archive_url: &str,
1265    git_ref: &str,
1266    commit_hash: &str,
1267    dest_dir: &Path,
1268    cli_label: &str,
1269    mp: Option<&crate::cli::FetchProgress>,
1270) -> Result<AcquiredSource> {
1271    tracing::info!(%archive_url, "downloading GitHub codeload snapshot (requires network)");
1272    let response = client
1273        .get(archive_url)
1274        .timeout(DOWNLOAD_REQUEST_READ_TIMEOUT)
1275        .send()
1276        .with_context(|| format!("download {archive_url}"))?;
1277    if response.status() == reqwest::StatusCode::NOT_FOUND {
1278        anyhow::bail!(
1279            "codeload snapshot not found: {archive_url}\n  \
1280             the ref may not exist on the remote, or the repo is private"
1281        );
1282    }
1283    if !response.status().is_success() {
1284        anyhow::bail!("download {archive_url}: HTTP {}", response.status());
1285    }
1286    reject_html_response(&response, archive_url)?;
1287    print_download_size(&response, archive_url, cli_label, mp);
1288    // codeload responses are dynamically generated and often arrive
1289    // without a Content-Length, so `total` is frequently `None` and the
1290    // bar degrades to a live byte counter.
1291    let total = response.content_length();
1292
1293    let status = |line: &str| match mp {
1294        Some(fp) => fp.println(line),
1295        None => eprintln!("{line}"),
1296    };
1297    status(&format!("{cli_label}: extracting snapshot (gzip)"));
1298    // Stage extraction inside `dest_dir` (same filesystem) so the final
1299    // atomic rename keeps `dest_dir` clean when a bad response serves a
1300    // malformed archive or sneaks stray top-level entries. codeload
1301    // snapshots have no upstream sha256 manifest, so structural
1302    // verification (single top-level dir) is the only defence against a
1303    // hostile response.
1304    let staging =
1305        tempfile::TempDir::new_in(dest_dir).with_context(|| "create extraction staging dir")?;
1306    let short_hash: String = commit_hash.chars().take(7).collect();
1307    let download_bar = mp.map(|fp| fp.download_bar(git_ref, total));
1308    let stream = DownloadStream::with_progress(response, download_bar.as_ref().map(|b| b.bar()));
1309    let decoder = flate2::read::GzDecoder::new(stream);
1310    let mut archive = tar::Archive::new(decoder);
1311    archive
1312        .unpack(staging.path())
1313        .with_context(|| "extract snapshot")?;
1314
1315    // Drain the watchdog to read the streamed digest. codeload has no
1316    // published manifest, so the digest cannot be verified — log it so
1317    // an operator can capture it for offline pinning. `into_inner` peels
1318    // the tar then the gz layer, recovering the `DownloadStream`.
1319    let stream = archive.into_inner().into_inner();
1320    let (actual_hex, bytes_total) = stream.finalize();
1321    if let Some(bar) = &download_bar {
1322        bar.finish();
1323    }
1324    tracing::info!(
1325        url = %archive_url,
1326        bytes = bytes_total,
1327        sha256 = %actual_hex,
1328        "codeload snapshot extracted (unverified: codeload archives have \
1329         no published sha256 manifest)",
1330    );
1331
1332    // Name the promoted tree by the resolved commit so distinct refs
1333    // never collide in `dest_dir` (the tree is temporary — `is_temp`).
1334    let canonical = format!("linux-git-{short_hash}");
1335    let source_dir = promote_single_kernel_tree(&staging, dest_dir, &canonical)?;
1336    let version = read_makefile_version(&source_dir);
1337
1338    Ok(AcquiredSource {
1339        source_dir,
1340        cache_key: git_cache_key(git_ref, commit_hash),
1341        version,
1342        kernel_source: crate::cache::KernelSource::git(short_hash, git_ref),
1343        is_temp: true,
1344        is_dirty: false,
1345        is_git: true,
1346    })
1347}
1348
1349/// Download a kernel tarball (stable or RC) and extract it.
1350///
1351/// `cli_label` prefixes diagnostic status output (e.g. `"ktstr"` or
1352/// `"cargo ktstr"`).
1353///
1354/// `skip_sha256` propagates to `download_stable_tarball` only —
1355/// stable tarballs publish a `sha256sums.asc` manifest the flag
1356/// bypasses. RC tarballs (`download_rc_tarball`) have no published
1357/// manifest so verification is impossible regardless of the flag;
1358/// the RC path always runs unverified and emits its own warning,
1359/// so `skip_sha256` is a no-op on the RC arm. `--kernel <path>` and
1360/// `--kernel git+…` sources do not reach this function at all.
1361///
1362/// `mp` is the progress group the determinate download bar is added
1363/// to; `None` disables the bar (the single-shot `kernel build` paths
1364/// and unit tests pass `None`).
1365pub fn download_tarball(
1366    client: &Client,
1367    version: &str,
1368    dest_dir: &Path,
1369    cli_label: &str,
1370    skip_sha256: bool,
1371    mp: Option<&crate::cli::FetchProgress>,
1372) -> Result<AcquiredSource> {
1373    let (arch, _) = arch_info();
1374    let source_dir = if is_rc(version) {
1375        download_rc_tarball(client, version, dest_dir, cli_label, mp)?
1376    } else {
1377        match download_stable_tarball(client, version, dest_dir, cli_label, skip_sha256, mp) {
1378            Ok(dir) => dir,
1379            // Pruned tarball (EOL series or superseded point release):
1380            // cdn.kernel.org keeps only each maintained series' latest
1381            // .tar.xz. Recover the source from the stable tree's
1382            // `v{version}` tag via a shallow (depth-1) clone. The kernel
1383            // built from this source is cached by the caller under the
1384            // SAME `{version}-tarball-...` key returned below, so a
1385            // re-run hits that cache and never re-clones.
1386            Err(e) if e.downcast_ref::<TarballNotFound>().is_some() => {
1387                let tag = format!("v{version}");
1388                // A 404 says the tarball is gone, not why. cdn.kernel.org
1389                // keeps only the latest tarball per series, but the gregkh
1390                // GitHub mirror carries every `vX.Y.Z` release tag and
1391                // codeload serves the tag's snapshot as a tar.gz — no
1392                // clone, and a commit-pinned snapshot. Resolve the tag to
1393                // its commit first (kind-directed, so a tag never aliases
1394                // a same-named branch); a tag absent there means the
1395                // version simply does not exist — surface the friendly
1396                // "not found" suggestion (with the latest in-series patch)
1397                // instead of a cryptic fetch failure.
1398                let Some(commit_hash) = resolve_ref_commit(
1399                    STABLE_MIRROR_URL,
1400                    &tag,
1401                    crate::kernel_path::GitRefKind::Tag,
1402                ) else {
1403                    anyhow::bail!("{}", version_not_found_msg(client, version));
1404                };
1405                let archive_url = github_archive_url(STABLE_MIRROR_URL, &commit_hash)
1406                    .expect("STABLE_MIRROR_URL is a github.com URL");
1407                let msg = format!(
1408                    "{cli_label}: {version} not on cdn.kernel.org (pruned/EOL); \
1409                     fetching gregkh mirror tag {tag}"
1410                );
1411                match mp {
1412                    Some(fp) => fp.println(&msg),
1413                    None => eprintln!("{msg}"),
1414                }
1415                download_github_archive(
1416                    client,
1417                    &archive_url,
1418                    &tag,
1419                    &commit_hash,
1420                    dest_dir,
1421                    cli_label,
1422                    mp,
1423                )?
1424                .source_dir
1425            }
1426            Err(e) => return Err(e),
1427        }
1428    };
1429
1430    Ok(AcquiredSource {
1431        source_dir,
1432        cache_key: format!("{version}-tarball-{arch}-kc{}", crate::cache_key_suffix()),
1433        version: Some(version.to_string()),
1434        kernel_source: crate::cache::KernelSource::Tarball,
1435        is_temp: true,
1436        is_dirty: false,
1437        is_git: true,
1438    })
1439}
1440
/// Extract the patch level from a dotted kernel version string.
///
/// A two-component version has an implicit patch level of 0; a
/// three-component version yields its third component when that parses
/// as a `u32`. Any other component count yields `None`.
///
/// "6.12.8" → Some(8), "7.0" → Some(0), "abc" → None.
fn patch_level(version: &str) -> Option<u32> {
    let mut fields = version.split('.');
    // Pull at most four components; `Split` is fused, so trailing
    // `next()` calls after exhaustion keep returning `None`.
    match (fields.next(), fields.next(), fields.next(), fields.next()) {
        // Exactly two components, e.g. "7.0": patch level 0.
        (Some(_), Some(_), None, _) => Some(0),
        // Exactly three components: the third one is the patch.
        (Some(_), Some(_), Some(patch), None) => patch.parse().ok(),
        _ => None,
    }
}
1451
/// Production location of kernel.org's `releases.json`.
///
/// Tests do not use this constant: they call [`fetch_releases`]
/// directly with a localhost mock URL.
pub(crate) const RELEASES_URL: &str = "https://www.kernel.org/releases.json";
1454
1455/// Fetch `releases.json` from `url` and return a vector of
1456/// [`Release`] records. Issues an HTTP GET unconditionally — no
1457/// cache consultation.
1458///
1459/// Production callers reach this function via
1460/// [`cached_releases_with`] (or [`cached_releases`]) which pass
1461/// [`RELEASES_URL`]; the cache helper only invokes
1462/// `fetch_releases` on a cache miss for the singleton path or on
1463/// the bypass branch for non-singleton clients. Tests that need
1464/// to exercise the underlying GET directly — without the cache
1465/// layer — call this function with a locally-constructed `Client`
1466/// and a localhost URL pointed at a TcpListener-backed mock that
1467/// returns canned `releases.json` content.
1468pub(crate) fn fetch_releases(client: &Client, url: &str) -> Result<Vec<Release>> {
1469    tracing::info!(%url, "fetching kernel.org releases index (requires network)");
1470    let response = client
1471        .get(url)
1472        .send()
1473        .with_context(|| format!("fetch {url}"))?;
1474    if !response.status().is_success() {
1475        anyhow::bail!("fetch {url}: HTTP {}", response.status());
1476    }
1477    let body = response.text().with_context(|| "read response body")?;
1478    parse_releases_body(&body)
1479}
1480
1481fn parse_releases_body(body: &str) -> Result<Vec<Release>> {
1482    let json: serde_json::Value =
1483        serde_json::from_str(body).with_context(|| "parse releases.json")?;
1484    let releases = json
1485        .get("releases")
1486        .and_then(|r| r.as_array())
1487        .ok_or_else(|| anyhow!("releases.json: missing releases array"))?;
1488    let input_rows = releases.len();
1489    let parsed: Vec<Release> = releases
1490        .iter()
1491        .filter_map(|r| {
1492            let moniker = r.get("moniker")?.as_str()?;
1493            let version = r.get("version")?.as_str()?;
1494            Some(Release {
1495                moniker: moniker.to_string(),
1496                version: version.to_string(),
1497            })
1498        })
1499        .collect();
1500    // Per-row tolerance: a corrupt row is silently dropped via the
1501    // filter_map `?` chain so a single bad entry does not abort the
1502    // whole fetch (see `fetch_releases_row_missing_moniker_drops_row`
1503    // and siblings). The drop is also a hazard: the truncated vector
1504    // gets cached in [`RELEASES_CACHE`] for the rest of the process
1505    // lifetime via the singleton path, so a transient malformed row
1506    // at fetch time persists as a partial snapshot for every later
1507    // cache-hit caller. Surface the drop count so an operator
1508    // tailing logs sees that releases.json arrived partial — without
1509    // this, the symptom (a missing version on resolve) is invisible
1510    // until it propagates as "version not found" elsewhere.
1511    let dropped = input_rows - parsed.len();
1512    if dropped > 0 {
1513        tracing::warn!(
1514            input_rows,
1515            parsed_rows = parsed.len(),
1516            dropped,
1517            "releases.json: dropped {dropped} of {input_rows} row(s) \
1518             missing moniker/version (or non-string values); cached \
1519             snapshot will reflect this for the process lifetime"
1520        );
1521    }
1522    Ok(parsed)
1523}
1524
1525/// Fetch the latest stable kernel version from kernel.org.
1526///
1527/// Selects from the `releases` array (moniker "stable" or "longterm"),
1528/// requiring patch version >= 8 to avoid brand-new major versions
1529/// that may have build issues on CI runners.
1530///
1531/// When `client` is the process-wide [`shared_client`] singleton,
1532/// routes through `RELEASES_CACHE`; other clients bypass the
1533/// cache via pointer-equality and exercise `fetch_releases`
1534/// directly — see `cached_releases_with` for details.
1535///
1536/// `cli_label` prefixes diagnostic status output (e.g. `"ktstr"` or
1537/// `"cargo ktstr"`).
1538pub fn fetch_latest_stable_version(client: &Client, cli_label: &str) -> Result<String> {
1539    eprintln!("{cli_label}: fetching latest kernel version");
1540    let releases = cached_releases_with(client)?;
1541
1542    let mut best: Option<&str> = None;
1543    for r in &releases {
1544        if r.moniker != "stable" && r.moniker != "longterm" {
1545            continue;
1546        }
1547        if patch_level(&r.version).unwrap_or(0) < 8 {
1548            continue;
1549        }
1550        // Pick the first matching release — releases.json is ordered
1551        // newest first, so the first stable with patch >= 8 is the best.
1552        best = Some(r.version.as_str());
1553        break;
1554    }
1555
1556    let version =
1557        best.ok_or_else(|| anyhow!("no stable kernel with patch >= 8 found in releases.json"))?;
1558    eprintln!("{cli_label}: latest stable kernel: {version}");
1559    Ok(version.to_string())
1560}
1561
/// Turn a dotted version string into a `(major, minor, patch)` tuple
/// suitable for ordering comparisons.
///
/// "6.14.2" → Some((6, 14, 2)), "6.14" → Some((6, 14, 0)),
/// "7.0" → Some((7, 0, 0)). A missing patch component counts as 0.
/// Returns `None` when there are fewer than two or more than three
/// components, or when any component does not parse as a `u32`.
fn version_tuple(version: &str) -> Option<(u32, u32, u32)> {
    let mut fields = version.split('.');
    let major: u32 = fields.next()?.parse().ok()?;
    let minor: u32 = fields.next()?.parse().ok()?;
    let patch: u32 = match fields.next() {
        Some(text) => text.parse().ok()?,
        None => 0,
    };
    // A fourth component means this is not an X.Y or X.Y.Z version.
    if fields.next().is_some() {
        return None;
    }
    Some((major, minor, patch))
}
1582
/// Report whether `s` looks like a kernel major.minor prefix such as
/// `"6.14"`, rather than a full patch version (`"6.14.2"`) or an rc tag
/// (`"6.15-rc3"`). Callers use the answer to decide whether the input
/// must go through prefix resolution via [`fetch_version_for_prefix`].
///
/// The test is deliberately loose: any string with at most one dot and
/// no `-rc` substring qualifies, so `"7"` (single segment) and `""`
/// both return true. This matches the historical inline check used by
/// kernel-build dispatchers.
pub fn is_major_minor_prefix(s: &str) -> bool {
    if s.contains("-rc") {
        return false;
    }
    // N dots split the string into N + 1 pieces, so "fewer than two
    // dots" is the same as "at most two pieces".
    s.split('.').count() <= 2
}
1594
1595/// Resolve the highest version matching a prefix.
1596///
1597/// E.g., "6.12" → "6.12.81", "6" → "6.19.12" (highest 6.x.y).
1598///
1599/// Scans all monikers in releases.json except linux-next. On no active
1600/// match (an EOL or unreleased series, absent from releases.json),
1601/// resolves the highest `vX.Y.z` stable patch from the gregkh mirror's
1602/// git tags; if the series has NO stable point release yet (only a base
1603/// tag), falls back to the bare `{prefix}` mainline base — see
1604/// `latest_patch_from_git_tags`.
1605///
1606/// When `client` is the process-wide [`shared_client`] singleton,
1607/// routes through `RELEASES_CACHE`; other clients bypass the
1608/// cache via pointer-equality and exercise `fetch_releases`
1609/// directly — see `cached_releases_with` for details. Cache
1610/// scope is releases.json only; the EOL-series git-tag fallback in
1611/// `latest_patch_from_git_tags` always hits the network.
1612///
1613/// `cli_label` prefixes diagnostic status output (e.g. `"ktstr"` or
1614/// `"cargo ktstr"`).
1615pub fn fetch_version_for_prefix(client: &Client, prefix: &str, cli_label: &str) -> Result<String> {
1616    eprintln!("{cli_label}: fetching latest {prefix}.x kernel version");
1617    let releases = cached_releases_with(client)?;
1618
1619    let mut best: Option<(&str, (u32, u32, u32))> = None;
1620    for r in &releases {
1621        if is_skippable_release_moniker(&r.moniker) {
1622            continue;
1623        }
1624        if !r.version.starts_with(prefix) {
1625            continue;
1626        }
1627        if r.version.len() != prefix.len() && r.version.as_bytes()[prefix.len()] != b'.' {
1628            continue;
1629        }
1630        let Some(tuple) = version_tuple(&r.version) else {
1631            continue;
1632        };
1633        if best.is_none() || tuple > best.unwrap().1 {
1634            best = Some((r.version.as_str(), tuple));
1635        }
1636    }
1637
1638    if let Some((version, _)) = best {
1639        eprintln!("{cli_label}: latest {prefix}.x kernel: {version}");
1640        return Ok(version.to_string());
1641    }
1642
1643    eprintln!(
1644        "{cli_label}: {prefix}.x not in releases.json (EOL or unreleased series); \
1645         resolving latest patch via the gregkh mirror tags"
1646    );
1647    match latest_patch_from_git_tags(STABLE_MIRROR_URL, prefix, cli_label)? {
1648        Some(version) => Ok(version),
1649        None => {
1650            // No stable point release for this series — fall back to the
1651            // mainline base (the `{prefix}` release itself, e.g. a series
1652            // just cut with no `.1` yet, per the "only if there is no
1653            // X.Y.z stable use X.Y mainline" rule). The base tarball is
1654            // fetched by the normal download path (cdn.kernel.org,
1655            // falling back to the gregkh mirror snapshot); torvalds is
1656            // the mainline authority the gregkh mirror tracks.
1657            eprintln!(
1658                "{cli_label}: no {prefix}.x stable point release; using {prefix} mainline base"
1659            );
1660            Ok(prefix.to_string())
1661        }
1662    }
1663}
1664
1665/// Resolve a series' latest stable patch by ls-remote-ing the gregkh
1666/// GitHub mirror's `refs/tags/v{prefix}.{patch}` tags and taking the
1667/// highest patch. Returns `Ok(None)` when the series has NO stable
1668/// point release (no `v{prefix}.N` tag) — the caller then falls back to
1669/// the mainline base.
1670///
1671/// The gregkh mirror is the RELIABLE EOL-resolution source: it carries
1672/// every `vX.Y.Z` release tag (back to v2.6) and its codeload CDN
1673/// serves each tag's tarball, so resolution and the pruned-tarball
1674/// download (see [`download_tarball`]'s fallback) share ONE
1675/// comprehensive mirror. cdn.kernel.org cannot be used here: its
1676/// `v{major}.x/` directory index 404s, and its `sha256sums.asc` is
1677/// served inconsistently per CDN edge (200 from some nodes, 404 from
1678/// others — the 404 nodes break CI runners while the tarball fetch on
1679/// those same nodes still succeeds).
1680fn latest_patch_from_git_tags(url: &str, prefix: &str, cli_label: &str) -> Result<Option<String>> {
1681    eprintln!("{cli_label}: resolving {prefix}.x release tags via {url}");
1682    let refs = ls_remote_refs(url)
1683        .with_context(|| format!("ls-remote {url} for {prefix}.x release tags"))?;
1684    match max_tag_patch(refs.iter().map(ref_full_name), prefix) {
1685        Some(patch) => {
1686            let version = format!("{prefix}.{patch}");
1687            eprintln!("{cli_label}: latest {prefix}.x kernel (from git tags): {version}");
1688            Ok(Some(version))
1689        }
1690        None => Ok(None),
1691    }
1692}
1693
1694/// The advertised full ref name (`refs/...`), as raw bytes, of a
1695/// protocol handshake ref.
1696fn ref_full_name(r: &gix::protocol::handshake::Ref) -> &[u8] {
1697    use gix::protocol::handshake::Ref::{Direct, Peeled, Symbolic, Unborn};
1698    match r {
1699        Peeled { full_ref_name, .. }
1700        | Direct { full_ref_name, .. }
1701        | Symbolic { full_ref_name, .. }
1702        | Unborn { full_ref_name, .. } => full_ref_name.as_ref(),
1703    }
1704}
1705
1706/// Highest `{patch}` among `refs/tags/v{prefix}.{patch}` ref names.
1707///
1708/// gix folds an annotated tag's peeled entry into a single
1709/// `Ref::Peeled` whose `full_ref_name` is the BASE name — no `^{}`
1710/// suffix — and a lightweight tag arrives as a `Ref::Direct` with the
1711/// base name too, so every tag advertises its base
1712/// `refs/tags/v{prefix}.{patch}` name for the needle to match. The
1713/// `^{}` strip below is therefore a defensive no-op on real gix output
1714/// (it only affects a raw wire ref name gix never emits; the base
1715/// entry supplies the patch regardless). Pure (no network) so it is
1716/// unit-testable with synthetic ref names.
1717///
1718/// The trailing `.` in the `refs/tags/v{prefix}.` needle keeps a
1719/// `6.14` prefix from matching a `6.140` series, and the numeric-only
1720/// patch tail rejects `-rc` and other non-release tags.
1721fn max_tag_patch<'a>(ref_names: impl Iterator<Item = &'a [u8]>, prefix: &str) -> Option<u32> {
1722    let needle = format!("refs/tags/v{prefix}.");
1723    let mut best: Option<u32> = None;
1724    for name in ref_names {
1725        let Some(rest) = name.strip_prefix(needle.as_bytes()) else {
1726            continue;
1727        };
1728        let rest = rest.strip_suffix(b"^{}").unwrap_or(rest);
1729        if let Ok(s) = std::str::from_utf8(rest)
1730            && let Ok(patch) = s.parse::<u32>()
1731        {
1732            best = Some(best.map_or(patch, |b| b.max(patch)));
1733        }
1734    }
1735    best
1736}
1737
1738/// ls-remote the gregkh stable mirror ([`STABLE_MIRROR_URL`]) once and
1739/// cache the release version strings (`X.Y.Z`) parsed from its
1740/// `refs/tags/vX.Y.Z` advertisement, for `--include-eol` range
1741/// expansion. Returns EVERY release-tag version verbatim (including
1742/// `-rc*` and old series); the caller
1743/// (`crate::cli::select_series_latest_in_range`) does the
1744/// range / rc / per-series filtering. `None` on ls-remote failure —
1745/// not cached, so the next caller retries. gregkh/linux mirrors
1746/// linux-stable comprehensively (tags back to v2.6), so this surfaces
1747/// EOL series that `releases.json` has dropped.
1748pub(crate) fn cached_stable_tags() -> Option<&'static [String]> {
1749    if let Some(tags) = STABLE_TAGS_CACHE.get() {
1750        return Some(tags.as_slice());
1751    }
1752    let refs = ls_remote_refs(STABLE_MIRROR_URL)?;
1753    let tags: Vec<String> = refs
1754        .iter()
1755        .filter_map(|r| {
1756            // Base tag name only: gix folds an annotated tag's peeled
1757            // entry into one `Ref::Peeled` carrying the base name, and a
1758            // lightweight tag is a `Ref::Direct` with the base name, so
1759            // `^{}` never appears on real gix output — the strip is a
1760            // defensive no-op. Non-`refs/tags/v*` refs are skipped.
1761            let name = ref_full_name(r);
1762            let v = name.strip_prefix(b"refs/tags/v")?;
1763            let v = v.strip_suffix(b"^{}").unwrap_or(v);
1764            std::str::from_utf8(v).ok().map(|s| s.to_string())
1765        })
1766        .collect();
1767    // Loser of a concurrent race discards its clone (both fetched the
1768    // same advertisement, so the cached content is equivalent).
1769    let _ = STABLE_TAGS_CACHE.set(tags);
1770    STABLE_TAGS_CACHE.get().map(|v| v.as_slice())
1771}
1772
1773/// Cache key for a git-cloned kernel: the raw user ref verbatim, the
1774/// resolved commit's FULL hash, the target arch, and the
1775/// kconfig-fragment suffix. The SINGLE construction site, shared by all
1776/// three sharers of a commit's cache entry: [`git_clone`] (post-clone,
1777/// from `head_id`), `download_github_archive` (post-download, keyed on
1778/// the resolved commit), and the pre-fetch ls-remote cache probe in
1779/// `resolve_git_kernel` — a drift between any of them would make the
1780/// probe miss the entry the fetch wrote and defeat the fetch-skip, and
1781/// split the codeload and clone paths onto separate entries for one
1782/// commit.
1783///
1784/// The FULL 40-hex commit hash keys the entry (not a 7-hex prefix): a
1785/// branch/tag tip moves over time, so the `{git_ref}` segment alone
1786/// cannot distinguish successive commits — only the hash does. A 7-hex
1787/// (28-bit) prefix would let a moved tip whose new commit shares the
1788/// first 7 hex with the cached old commit hit the stale entry and serve
1789/// the wrong kernel build under the new ref. The full id removes that
1790/// collision class; the probe and clone both render full lowercase hex
1791/// before any truncation, so keying on it is drift-free.
1792pub(crate) fn git_cache_key(git_ref: &str, commit_hash: &str) -> String {
1793    let (arch, _) = arch_info();
1794    // Sanitize the ref segment so no ref can produce a key
1795    // validate_cache_key (cache::housekeeping) rejects: it rejects `/`,
1796    // `\`, `..`, a NUL byte, and a leading `.`. A slashed branch ref
1797    // (e.g. `for-next/core`) or a dot-prefixed ref (`.foo`) would
1798    // otherwise be uncacheable verbatim and break both the pre-fetch
1799    // probe lookup and the store. The full commit_hash already makes
1800    // the key unique, so collapsing several refs onto one sanitized
1801    // prefix is safe — two refs at the same commit want the same build;
1802    // two at different commits differ in the hash segment.
1803    let safe_ref: String = git_ref
1804        .chars()
1805        .map(|c| {
1806            if c == '/' || c == '\\' || c == '\0' {
1807                '_'
1808            } else {
1809                c
1810            }
1811        })
1812        .collect();
1813    let safe_ref = safe_ref.replace("..", "__");
1814    // A leading `.` (hidden entry, `.` / `..`) is rejected by
1815    // validate_cache_key; prefix `_` so a `.foo` ref stays cacheable.
1816    let safe_ref = if safe_ref.starts_with('.') {
1817        format!("_{safe_ref}")
1818    } else {
1819        safe_ref
1820    };
1821    format!(
1822        "{safe_ref}-git-{commit_hash}-{arch}-kc{}",
1823        crate::cache_key_suffix()
1824    )
1825}
1826
/// Map a GitHub remote `url` plus a resolved `commit_hash` to the
/// archive URL `https://github.com/OWNER/REPO/archive/<commit>.tar.gz`
/// (302 → codeload.github.com, its CDN), which serves a gzip source
/// snapshot for any commit — verified empirically. That lets a GitHub
/// source be fetched over HTTP (no clone, no server-side allow-sha
/// requirement). Returns `None` for anything that is not a GitHub
/// OWNER/REPO remote (self-hosted / GitLab / …); those take the gix
/// clone path.
///
/// The caller resolves the ref to `commit_hash` FIRST (a kind-directed
/// ls-remote; a sha is already the commit), so the download fetches
/// the EXACT commit the cache entry is keyed on: a branch tip that
/// advances between the ls-remote probe and the GET cannot mislabel
/// the entry the way a ref-name snapshot would. `commit_hash` is
/// lowercased to line up with `git_cache_key`'s hash segment.
///
/// Recognized remotes are the https/http/ssh/git schemes (ssh with or
/// without the `git@` userinfo) and scp-style `git@github.com:`, each
/// with an optional trailing `/` and `.git`. The scheme and host are
/// matched case-insensitively (DNS hostnames are case-insensitive);
/// the OWNER/REPO path is kept verbatim.
pub(crate) fn github_archive_url(url: &str, commit_hash: &str) -> Option<String> {
    const GITHUB_PREFIXES: [&str; 6] = [
        "https://github.com/",
        "http://github.com/",
        "ssh://git@github.com/",
        "ssh://github.com/",
        "git://github.com/",
        "git@github.com:",
    ];

    // First prefix that matches case-insensitively wins; `get` returns
    // `None` (no match) when the URL is shorter than the prefix or the
    // cut would not fall on a char boundary.
    let path = GITHUB_PREFIXES.iter().find_map(|prefix| {
        let head = url.get(..prefix.len())?;
        head.eq_ignore_ascii_case(prefix)
            .then(|| &url[prefix.len()..])
    })?;

    // Drop trailing slashes (a common copy-paste artifact) BEFORE the
    // `.git` strip, so `OWNER/REPO/` and `OWNER/REPO.git/` still
    // resolve to codeload instead of misrouting to the clone path.
    let path = path.trim_end_matches('/');
    let path = path.strip_suffix(".git").unwrap_or(path);

    // Exactly OWNER/REPO with both parts non-empty. A deeper path is
    // not a repo root, so it falls through to the clone path.
    let (owner, repo) = path.split_once('/')?;
    if owner.is_empty() || repo.is_empty() || repo.contains('/') {
        return None;
    }

    // Always the resolved COMMIT (lowercased), never a ref-name
    // snapshot, so the extracted tree matches git_cache_key's commit
    // exactly even if the branch tip moves concurrently. codeload
    // serves any commit case-insensitively.
    Some(format!(
        "https://github.com/{owner}/{repo}/archive/{}.tar.gz",
        commit_hash.to_ascii_lowercase()
    ))
}
1891
1892/// The object id the advertised ref named exactly `target` points at,
1893/// or `None` if no ref matches. For an annotated tag (`Ref::Peeled`)
1894/// this is the PEELED commit (`object`), never the tag object;
1895/// `Ref::Unborn` carries no commit and never matches. Used by the
1896/// kind-directed [`resolve_ref_commit`] so tag-peeling and
1897/// unborn-skipping stay consistent.
1898fn pick_ref_object(
1899    refs: &[gix::protocol::handshake::Ref],
1900    target: &str,
1901) -> Option<gix::hash::ObjectId> {
1902    refs.iter().find_map(|r| {
1903        let (name, object) = match r {
1904            gix::protocol::handshake::Ref::Peeled {
1905                full_ref_name,
1906                object,
1907                ..
1908            }
1909            | gix::protocol::handshake::Ref::Direct {
1910                full_ref_name,
1911                object,
1912            }
1913            | gix::protocol::handshake::Ref::Symbolic {
1914                full_ref_name,
1915                object,
1916                ..
1917            } => (full_ref_name, object),
1918            gix::protocol::handshake::Ref::Unborn { .. } => return None,
1919        };
1920        (*name == target).then_some(*object)
1921    })
1922}
1923
1924/// Resolve `git_ref` to its full commit hash under `ref_kind`, via a
1925/// kind-directed ls-remote. Unlike the clone path, the codeload
1926/// download has no checked-out `.git` to read `head_id` from, so it
1927/// resolves the commit here — [`git_cache_key`] needs it to key the
1928/// entry a clone of the same ref would write (shared cache).
1929///
1930/// A `Sha` ref IS the commit (lowercased to match `git_clone`'s
1931/// rendering) and resolves offline — no handshake. `Tag`/`Branch`
1932/// match ONLY the fully-qualified `refs/tags/{ref}` / `refs/heads/{ref}`
1933/// so a tag never aliases a same-named branch (a bare-name DWIM lookup
1934/// would resolve either). `None` on
1935/// ls-remote failure, no match, or `Unknown` (rejected by
1936/// [`crate::kernel_path::KernelId::validate`] upstream, so it is never
1937/// resolved).
1938pub(crate) fn resolve_ref_commit(
1939    url: &str,
1940    git_ref: &str,
1941    ref_kind: crate::kernel_path::GitRefKind,
1942) -> Option<String> {
1943    use crate::kernel_path::GitRefKind;
1944    let target = match ref_kind {
1945        GitRefKind::Sha => return Some(git_ref.to_ascii_lowercase()),
1946        GitRefKind::Tag => format!("refs/tags/{git_ref}"),
1947        GitRefKind::Branch => format!("refs/heads/{git_ref}"),
1948        GitRefKind::Unknown => return None,
1949    };
1950    pick_ref_object(&ls_remote_refs(url)?, &target).map(|object| format!("{object}"))
1951}
1952
/// Whether `git_ref` is a full commit id: exactly 40 ASCII hex digits,
/// which makes it recognizable as a sha without a remote handshake. A
/// 39- or 41-char ref, or a 40-char ref containing any non-hex byte, is
/// a name (branch/tag) and falls through to ls-remote. Upper- and
/// lowercase digits are both accepted and nothing is normalized here
/// (the caller lowercases the full hash to match `git_clone`'s
/// rendering).
fn is_full_sha(git_ref: &str) -> bool {
    let bytes = git_ref.as_bytes();
    bytes.len() == 40 && bytes.iter().all(u8::is_ascii_hexdigit)
}
1961
1962/// ls-remote `url` and return EVERY advertised ref WITHOUT fetching a
1963/// pack. Best-effort: `None` on any failure (network, auth). Shared by
1964/// [`resolve_ref_commit`] (resolve one kind-directed ref → commit),
1965/// [`cached_stable_tags`], and [`latest_patch_from_git_tags`] (highest
1966/// `v{prefix}.{patch}` tag).
1967///
1968/// The ad-hoc repo (`init_opts` on a tempdir, with repo-local git config
1969/// only — see `anon_open_opts`) carries no working tree and fetches no
1970/// pack. Remote-side ref-prefix filtering is
1971/// DISABLED: gix's default (`prefix_from_spec_as_filter_on_remote =
1972/// true`) derives protocol-v2 `ls-refs` `ref-prefix` filters from the
1973/// remote's fetch refspecs; an anonymous `remote_at` has none, and
1974/// `fetch_tags = Included` injects only `refs/tags/*`, so the server
1975/// would return TAGS ONLY and `refs/heads/*` would never arrive.
1976/// Disabling the filter returns all refs, so a branch, tag, or HEAD
1977/// all resolve.
1978fn ls_remote_refs(url: &str) -> Option<Vec<gix::protocol::handshake::Ref>> {
1979    let tmp = tempfile::TempDir::new().ok()?;
1980    let repo = gix::ThreadSafeRepository::init_opts(
1981        tmp.path(),
1982        gix::create::Kind::WithWorktree,
1983        gix::create::Options::default(),
1984        anon_open_opts(),
1985    )
1986    .ok()?
1987    .to_thread_local();
1988    let remote = repo.remote_at(url).ok()?;
1989    let conn = remote.connect(gix::remote::Direction::Fetch).ok()?;
1990    let (refmap, _handshake) = conn
1991        .ref_map(
1992            gix::progress::Discard,
1993            gix::remote::ref_map::Options {
1994                prefix_from_spec_as_filter_on_remote: false,
1995                ..Default::default()
1996            },
1997        )
1998        .ok()?;
1999    Some(refmap.remote_refs)
2000}
2001
2002/// Open options for ktstr's git fetches: load ONLY repo-local git
2003/// config, never the user (`~/.gitconfig`), XDG, system
2004/// (`/etc/gitconfig`), or `GIT_CONFIG_*` env sources. This neutralizes a
2005/// `url.<base>.insteadOf` rewrite (e.g. a developer rule mapping
2006/// `https://github.com/` to `git@github.com:`) that would otherwise
2007/// route an anonymous public fetch through SSH and prompt for the key
2008/// passphrase once per operation — several at once under the concurrent
2009/// intra-range kernel resolution. Environment permissions stay at the
2010/// Full-trust default so an `http(s)_proxy` env var still applies.
2011///
2012/// SCOPE: these opts govern EVERY gix remote path — the internal version
2013/// resolution (`ls_remote_refs` and its callers) AND every user-supplied
2014/// `git+URL` clone via `git_clone_inner`, including a self-hosted
2015/// `git+https://...` source. The tradeoff is deliberate: a PUBLIC source
2016/// (the common case — kernel.org / gregkh / torvalds mirrors) fetches
2017/// anonymously with no credential prompt, and a PRIVATE source must use a
2018/// `git+ssh://user@host/repo` URL (SSH authenticates via `~/.ssh`,
2019/// independent of gitconfig). gitconfig-driven auth (an `insteadOf`
2020/// HTTPS->SSH rewrite plus credential/`git_binary` config) is
2021/// intentionally NOT honored, so it can never silently reroute an
2022/// anonymous fetch through SSH. The ad-hoc temp repos carry no local
2023/// config, so the effective URL-rewrite set is empty: the passed URL is
2024/// used verbatim.
2025fn anon_open_opts() -> gix::open::Options {
2026    use gix::sec::trust::DefaultForLevel;
2027    let mut opts = gix::open::Options::default_for_level(gix::sec::Trust::Full);
2028    opts.permissions.config.system = false;
2029    opts.permissions.config.git = false;
2030    opts.permissions.config.user = false;
2031    opts.permissions.config.env = false;
2032    opts.permissions.config.git_binary = false;
2033    opts
2034}
2035
2036/// Shallow-clone a git repository at a BRANCH ref.
2037///
2038/// `cli_label` prefixes diagnostic status output (e.g. `"ktstr"` or
2039/// `"cargo ktstr"`).
2040///
2041/// `mp` is the progress group a determinate clone bar is added to;
2042/// `None` disables the bar and passes `gix::progress::Discard` to gix
2043/// exactly as before (the single-shot `kernel build` paths and unit
2044/// tests pass `None`). The bar shows real object/file counts + ETA
2045/// during the receiving / resolving / checkout phases that gix reports
2046/// a bounded total for; see the `crate::cli::progress` module.
2047///
2048/// For a TAG ref use `git_clone_tag`: gix's shallow clone only
2049/// resolves branches via `with_ref_name` — see `git_clone_inner`.
2050pub fn git_clone(
2051    url: &str,
2052    git_ref: &str,
2053    dest_dir: &Path,
2054    cli_label: &str,
2055    mp: Option<&crate::cli::FetchProgress>,
2056) -> Result<AcquiredSource> {
2057    git_clone_inner(url, git_ref, dest_dir, cli_label, mp, None)
2058}
2059
2060/// Shallow-clone a git repository at a TAG ref (e.g. `v6.14.11`).
2061///
2062/// gix's shallow clone routes the ref through `Category::LocalBranch`
2063/// (`refs/heads/`) in its single-branch-shallow path
2064/// (`gix::clone::fetch`), so a tag never matches on the remote and the
2065/// fetch fails with "None of the refspec(s) matched". This appends a
2066/// `+refs/tags/{tag}:refs/heads/{tag}` refspec so the tag is fetched
2067/// into the local branch ref the checkout resolves. The `#tag=` git
2068/// source (via [`git_clone_kinded`]) uses this for a non-GitHub remote;
2069/// a GitHub remote takes the codeload path instead. (The pruned/EOL
2070/// tarball recovery no longer clones — [`download_tarball`]'s
2071/// `TarballNotFound` fallback fetches a gregkh codeload snapshot.)
2072pub(crate) fn git_clone_tag(
2073    url: &str,
2074    tag: &str,
2075    dest_dir: &Path,
2076    cli_label: &str,
2077    mp: Option<&crate::cli::FetchProgress>,
2078) -> Result<AcquiredSource> {
2079    let extra_refspec = format!("+refs/tags/{tag}:refs/heads/{tag}");
2080    git_clone_inner(url, tag, dest_dir, cli_label, mp, Some(extra_refspec))
2081}
2082
2083/// Clone a git source at `git_ref`, dispatching on `ref_kind` to the
2084/// correct clone path. A well-formed `github.com/OWNER/REPO` source
2085/// normally takes the codeload path ([`download_github_archive`], via
2086/// [`crate::cli::resolve_git_kernel`]) and reaches here only as a
2087/// fallback when the pre-fetch ls-remote resolution failed (no commit →
2088/// no codeload URL). A `github.com` URL whose path is not exactly
2089/// `OWNER/REPO` (so `github_archive_url` returns `None`) can still reach
2090/// the `Sha` arm below.
2091///
2092/// - `Tag` → [`git_clone_tag`] (adds the `refs/tags/*` refspec gix's
2093///   shallow path omits).
2094/// - `Branch` → [`git_clone`] (the plain shallow single-branch clone).
2095/// - `Sha` → a hard error: gix cannot fetch a bare commit, and a
2096///   self-hosted server generally lacks allow-sha-in-want. The
2097///   actionable message points at GitHub (codeload serves any sha) or a
2098///   tag/branch.
2099/// - `Unknown` → a hard error; [`crate::kernel_path::KernelId::validate`]
2100///   rejects it upstream, so this is a defensive backstop.
2101pub(crate) fn git_clone_kinded(
2102    url: &str,
2103    git_ref: &str,
2104    ref_kind: crate::kernel_path::GitRefKind,
2105    dest_dir: &Path,
2106    cli_label: &str,
2107    mp: Option<&crate::cli::FetchProgress>,
2108) -> Result<AcquiredSource> {
2109    use crate::kernel_path::GitRefKind;
2110    match ref_kind {
2111        GitRefKind::Tag => git_clone_tag(url, git_ref, dest_dir, cli_label, mp),
2112        GitRefKind::Branch => git_clone(url, git_ref, dest_dir, cli_label, mp),
2113        GitRefKind::Sha => anyhow::bail!(
2114            "git+{url}#sha={git_ref}: fetching this source by commit sha is \
2115             not supported — gix cannot fetch a bare commit and the remote \
2116             lacks allow-sha-in-want. Use a github.com/OWNER/REPO URL \
2117             (codeload serves any commit) or pin a #tag= / #branch= instead."
2118        ),
2119        GitRefKind::Unknown => anyhow::bail!(
2120            "git+{url}: ref kind could not be determined; use #tag=NAME, \
2121             #branch=NAME, or #sha=<40-hex>"
2122        ),
2123    }
2124}
2125
/// Shared shallow-clone implementation for [`git_clone`] (branch) and
/// [`git_clone_tag`] (tag).
///
/// Performs a depth-1 clone of `url` at `git_ref` into
/// `dest_dir/linux`, checks out the main worktree, then reads HEAD to
/// build the returned [`AcquiredSource`].
///
/// # Parameters
///
/// - `url` — remote to clone; opened with `anon_open_opts()` so only
///   repo-local git config applies.
/// - `git_ref` — ref name handed to gix's `with_ref_name`; also used in
///   the status line, the progress bar label, the cache key and the
///   recorded kernel source.
/// - `dest_dir` — parent directory; the clone lands in its `linux`
///   subdirectory.
/// - `cli_label` — prefix of the "cloning …" status line.
/// - `mp` — optional progress group. When `Some`, the status line goes
///   through `fp.println` and a clone progress item drives both the
///   fetch and the checkout; when `None`, the status line goes to
///   stderr and gix receives `gix::progress::Discard`.
/// - `extra_refspec` — when `Some`, appended to the remote's fetch
///   refspecs via `configure_remote` before the fetch (the tag path
///   uses it to fetch `refs/tags/*`). `None` leaves the branch clone
///   byte-identical to the historical behavior.
///
/// # Returns
///
/// An [`AcquiredSource`] whose `source_dir` is the clone directory,
/// whose `cache_key` comes from `git_cache_key(git_ref, <full HEAD
/// hash>)`, whose `version` is whatever `read_makefile_version` finds
/// in the checked-out tree, and which is flagged `is_temp = true`,
/// `is_dirty = false`, `is_git = true`.
///
/// # Errors
///
/// - `git_ref` is 40 hex characters (see the guard below).
/// - Any gix step fails; each is wrapped with a short context string
///   (`"prepare clone"`, `"set ref name"`, `"clone fetch"`,
///   `"checkout"`, `"open cloned repo"`, `"read HEAD"`).
fn git_clone_inner(
    url: &str,
    git_ref: &str,
    dest_dir: &Path,
    cli_label: &str,
    mp: Option<&crate::cli::FetchProgress>,
    extra_refspec: Option<String>,
) -> Result<AcquiredSource> {
    // Any 40-hex `git_ref` cannot be cloned here, whatever kind the
    // operator meant it as: gix's `with_ref_name(<40-hex>)` treats it as
    // an object-id (its own `# Panics` doc: "an object-id as hex-hash"
    // panics at `fetch_then_checkout`, gix `clone/access.rs`), and
    // fetching a bare commit needs server-side allow-sha-in-want this
    // path does not implement. Reject with an actionable error rather
    // than panic. Placed at the single clone entry so every caller is
    // covered — including a `#branch=`/`#tag=` whose NAME is 40 hex.
    if is_full_sha(git_ref) {
        anyhow::bail!(
            "git+{url}#{git_ref}: cannot fetch a kernel by a raw commit SHA — \
             gix's shallow clone treats any 40-hex ref as a commit id (even a \
             branch/tag named 40 hex chars). Use a branch or tag name that is \
             not 40 hex chars, or on github.com `#sha=<40-hex>` (codeload \
             fetches the commit)."
        );
    }
    // Announce the clone: through the progress group when one is
    // threaded in, otherwise straight to stderr.
    let cloning = format!("{cli_label}: cloning {url} (ref: {git_ref}, depth: 1)");
    match mp {
        Some(fp) => fp.println(&cloning),
        None => eprintln!("{cloning}"),
    }

    // The worktree always lands in a fixed `linux` subdirectory.
    let clone_dir = dest_dir.join("linux");

    // Build the clone with anon_open_opts() (repo-local config only)
    // rather than gix::prepare_clone, whose open opts load the user's
    // gitconfig and would apply an `insteadOf` HTTPS->SSH rewrite,
    // prompting for a key passphrase. Mirrors gix::prepare_clone's
    // (WithWorktree, default create opts) otherwise.
    let mut prep = gix::clone::PrepareFetch::new(
        url,
        &clone_dir,
        gix::create::Kind::WithWorktree,
        gix::create::Options::default(),
        anon_open_opts(),
    )
    .with_context(|| "prepare clone")?
    .with_shallow(gix::remote::fetch::Shallow::DepthAtRemote(
        NonZeroU32::new(1).expect("1 is nonzero"),
    ))
    .with_ref_name(Some(git_ref))
    .with_context(|| "set ref name")?;

    // Tag path only: gix's single-branch-shallow fetch derives its
    // refspec from `with_ref_name` via Category::LocalBranch
    // (`refs/heads/{ref}`), which never matches a `refs/tags/*` ref.
    // Append the caller's `+refs/tags/{tag}:refs/heads/{tag}` so the
    // tag is fetched into the branch ref the checkout resolves.
    // `with_refspecs` APPENDS (keeping gix's own single-branch spec),
    // so a branch clone that reaches here would still match its spec —
    // but the branch path passes `None` and skips this entirely.
    if let Some(spec) = extra_refspec {
        prep = prep.configure_remote(move |remote| {
            Ok(remote.with_refspecs(Some(spec.as_str()), gix::remote::Direction::Fetch)?)
        });
    }

    // Drive a determinate clone bar from gix's progress tree (see
    // [`crate::cli::progress::CloneProgress`]). `None` when no progress
    // group is threaded in; the gix calls then pass `Discard` exactly
    // as before. One interrupt flag (never set) is shared by both
    // phases, matching the prior per-call `AtomicBool::new(false)`.
    let clone_progress = mp.map(|fp| fp.clone_progress(git_ref));
    let interrupt = std::sync::atomic::AtomicBool::new(false);

    // Phase 1: fetch. The two arms differ only in the progress sink
    // handed to gix.
    let (mut checkout, _outcome) = match &clone_progress {
        Some(cp) => prep
            .fetch_then_checkout(cp.item(), &interrupt)
            .with_context(|| "clone fetch")?,
        None => prep
            .fetch_then_checkout(gix::progress::Discard, &interrupt)
            .with_context(|| "clone fetch")?,
    };

    // Phase 2: check out the main worktree, with the same progress
    // split as the fetch.
    let (_repo, _outcome) = match &clone_progress {
        Some(cp) => checkout
            .main_worktree(cp.item(), &interrupt)
            .with_context(|| "checkout")?,
        None => checkout
            .main_worktree(gix::progress::Discard, &interrupt)
            .with_context(|| "checkout")?,
    };

    // Clone + checkout done — stop the poll thread, join it, clear the
    // bar. On any error path above, `clone_progress` is dropped
    // instead, and its `Drop` performs the same shutdown (leak-proof).
    if let Some(cp) = clone_progress {
        cp.finish();
    }

    // Re-open the finished clone and read the commit HEAD points at.
    let repo = gix::open(&clone_dir).with_context(|| "open cloned repo")?;
    let head = repo.head_id().with_context(|| "read HEAD")?;
    // FULL commit hash keys the cache (see `git_cache_key` — a 7-hex
    // prefix risks a moved-tip collision serving a stale build); the
    // 7-hex `short_hash` is kept only for the human-facing source record.
    let commit_hash = format!("{head}");
    let short_hash = commit_hash.chars().take(7).collect::<String>();

    let cache_key = git_cache_key(git_ref, &commit_hash);

    // Record the kernel version from the checked-out source Makefile, as
    // local_source does — the worktree is fully checked out here, so a
    // git-clone-acquired honoring kernel also earns the 90% tmpfs reclaim
    // via the metadata.json sidecar. None on an unreadable/unparsable
    // Makefile, which keeps the conservative 50% default.
    let version = read_makefile_version(&clone_dir);

    Ok(AcquiredSource {
        source_dir: clone_dir,
        cache_key,
        version,
        kernel_source: crate::cache::KernelSource::git(short_hash, git_ref),
        is_temp: true,
        is_dirty: false,
        is_git: true,
    })
}
2259
2260/// Use a local kernel source tree.
2261///
2262/// Dirty detection uses gix `tree_index_status` (HEAD-vs-index) and
2263/// `status().into_index_worktree_iter()` (index-vs-worktree) to check
2264/// for modifications to tracked files. Submodule checks are skipped
2265/// entirely. Untracked files do not affect the dirty flag.
2266///
2267/// When the tree is dirty, the HEAD commit does not describe the
2268/// source actually being built, so `git_hash` is dropped — no
2269/// commit identifies a dirty worktree. `is_dirty=true` carries that
2270/// fact forward; callers (see [`crate::cli`]) use it to bypass the
2271/// kernel cache entirely.
2272///
2273/// No diagnostic output: all operator-visible messaging for a
2274/// local source is routed through `kernel_build_pipeline`'s
2275/// cache-skip hint (`DIRTY_TREE_CACHE_SKIP_HINT` /
2276/// `NON_GIT_TREE_CACHE_SKIP_HINT`), which has the full context
2277/// to emit a single informational line rather than two redundant
2278/// warnings. Sibling entries (`download_tarball`, `git_clone`)
2279/// still take a `cli_label` because they genuinely print
2280/// progress lines — `local_source` does not.
2281pub fn local_source(source_path: &Path) -> Result<AcquiredSource> {
2282    let (arch, _) = arch_info();
2283
2284    if !source_path.is_dir() {
2285        anyhow::bail!("{}: not a directory", source_path.display());
2286    }
2287
2288    let canonical = source_path
2289        .canonicalize()
2290        .with_context(|| format!("canonicalize {}", source_path.display()))?;
2291
2292    let LocalSourceState {
2293        short_hash,
2294        is_dirty,
2295        is_git,
2296    } = inspect_local_source_state(&canonical)?;
2297
2298    // User .config is folded into the cache key so two builds of the
2299    // same HEAD with different `.config` files do NOT collide on the
2300    // same key — see [`config_hash_for_key`] for the encoding.
2301    // Read at `local_source` time (rather than at the post-build
2302    // store site) so cache LOOKUP and cache STORE see the same key.
2303    let user_config_hash = config_hash_for_key(&canonical);
2304
2305    let cache_key =
2306        compose_local_cache_key(arch, &short_hash, &canonical, user_config_hash.as_deref());
2307
2308    // Record the kernel version from the source-tree Makefile so the
2309    // tmpfs-fraction gate (TmpfsFraction::for_kernel_version, via the
2310    // cache metadata.json sidecar) recognizes a locally-built honoring
2311    // kernel — symmetric with the tarball path. None when the Makefile
2312    // is unreadable/unparsable, which keeps the conservative 50% default.
2313    let version = read_makefile_version(&canonical);
2314
2315    Ok(AcquiredSource {
2316        source_dir: canonical.clone(),
2317        cache_key,
2318        version,
2319        kernel_source: crate::cache::KernelSource::Local {
2320            source_tree_path: Some(canonical),
2321            git_hash: short_hash,
2322        },
2323        is_temp: false,
2324        is_dirty,
2325        is_git,
2326    })
2327}
2328
/// Parse the kernel `MAJOR.MINOR.PATCH` version from a source tree's
/// top-level `Makefile` (`VERSION` / `PATCHLEVEL` / `SUBLEVEL`) — the
/// authoritative version of a locally-built kernel, mirroring the
/// version a tarball acquisition records.
///
/// For each field, the FIRST line of the form `NAME = value` is
/// authoritative and its value must be a bare unsigned integer. A
/// first assignment that is non-numeric or carries a trailing comment
/// makes the whole result `None`; later assignments of the same name
/// are never consulted.
///
/// Returns `None` if the `Makefile` is unreadable or any of the three
/// fields is absent or non-numeric, so the caller records no version
/// and the rootfs tmpfs fraction conservatively defaults to 50% (the
/// honoring gate `TmpfsFraction::for_kernel_version` keys on a
/// positively-known version). `EXTRAVERSION` (e.g. `-rc7`) is
/// intentionally ignored: the gate keys on `MAJOR.MINOR.PATCH` only.
fn read_makefile_version(source_dir: &Path) -> Option<String> {
    let text = std::fs::read_to_string(source_dir.join("Makefile")).ok()?;
    let field = |name: &str| -> Option<u16> {
        // Step 1: locate the first `NAME = ...` assignment. A longer
        // identifier that merely starts with `name` fails the `=`
        // check and is skipped.
        let raw_value = text.lines().find_map(|line| {
            line.trim_start()
                .strip_prefix(name)?
                .trim_start()
                .strip_prefix('=')
        })?;
        // Step 2: parse THAT line only. Parsing inside the search would
        // let a malformed first assignment be silently skipped in
        // favour of a later one.
        raw_value.trim().parse::<u16>().ok()
    };
    Some(format!(
        "{}.{}.{}",
        field("VERSION")?,
        field("PATCHLEVEL")?,
        field("SUBLEVEL")?
    ))
}
2362
/// Result of [`inspect_local_source_state`] — git hash and dirty/git
/// classification of a canonical source-tree path. Pulled out of
/// [`local_source`] so the post-build dirty re-check (a second call
/// from [`crate::cli::kernel_build_pipeline`]) reuses the exact same
/// gix path.
///
/// The non-git case is reported as `short_hash: None`,
/// `is_dirty: true`, `is_git: false`.
#[derive(Debug, Clone)]
pub struct LocalSourceState {
    /// First 7 characters of the HEAD commit hash. `None` when the
    /// tree is dirty (HEAD doesn't describe the actual source) or
    /// non-git (no HEAD at all). Mirrors the `git_hash` field on
    /// [`AcquiredSource::kernel_source`] for [`crate::cache::KernelSource::Local`].
    pub short_hash: Option<String>,
    /// Tracked-file dirt: HEAD-vs-index disagreement OR
    /// index-vs-worktree disagreement. Always `true` for non-git
    /// trees (dirty detection is impossible without git, so the
    /// pessimistic stance is dirty).
    pub is_dirty: bool,
    /// `true` when `gix::discover` succeeded (the tree is a git
    /// repo); `false` otherwise. Lets the cache-skip hint branch
    /// on whether `commit` / `stash` is actionable.
    pub is_git: bool,
}
2385
2386/// Inspect a canonical source-tree path for git hash + dirty state.
2387///
2388/// Submodule checks are skipped (false positives on kernel trees
2389/// with uninitialized submodules). The non-git arm returns
2390/// `(None, true, false)` so the caller's cache-skip hint can
2391/// distinguish "dirty git repo" from "not a git repo at all".
2392///
2393/// Called twice per build by [`crate::cli::kernel_build_pipeline`]:
2394/// once at acquire time (via [`local_source`]) and again after
2395/// `make` returns to detect mid-build worktree edits, branch flips,
2396/// or commits that would otherwise let a racing-write build land in
2397/// the cache under a stale identity. Both calls share the same gix
2398/// path so the post-build comparison is apples-to-apples.
2399///
2400/// Non-atomic against concurrent git operations: the probe runs
2401/// six sequential gix calls (`discover` → `head_id` → `head_tree`
2402/// → `index_or_empty` → `tree_index_status` → `status`), each a
2403/// separate filesystem read with no transactional bracket. A
2404/// concurrent `git commit`, `git add`, or worktree write between
2405/// any two calls can produce internally-inconsistent results —
2406/// e.g. `head_id` reads commit C0, a peer commit lands C1, then
2407/// `head_tree` reads C1's root tree and the diff against the
2408/// post-add index reports unexpected dirt. Git itself serializes
2409/// its own writes via per-resource lockfiles under `.git/`
2410/// (`index.lock` for staging operations, `HEAD.lock` and
2411/// `refs/heads/<branch>.lock` for ref updates), so peer `git`
2412/// processes wait on whichever lockfile their operation touches;
2413/// the genuinely-unsynchronized class is worktree-only writes
2414/// (autoformatter, IDE-on-save) which the index-worktree status
2415/// step catches regardless of timing.
2416///
2417/// The disposition is intentionally pessimistic so inconsistency is
2418/// safe: any `Err` propagates to the caller, which treats it as a
2419/// rebuild signal (`MidWaitState::ProbeFailed` in the mid-wait
2420/// caller); any spurious dirty signal falls into DirtyEdit /
2421/// HashAdvanced, both forcing a rebuild. The cost of a false-
2422/// positive rebuild is one extra `make`; the cost of a false-
2423/// negative would be a cache slot keyed on a HEAD that no longer
2424/// describes the source — the asymmetry is the reason for the
2425/// pessimistic disposition. Callers should treat the returned
2426/// state as a best-effort approximation of probe-time, not an
2427/// instantaneous snapshot.
2428pub fn inspect_local_source_state(canonical: &Path) -> Result<LocalSourceState> {
2429    let (short_hash, is_dirty, is_git) = match gix::discover(canonical) {
2430        Ok(repo) => {
2431            let head = repo.head_id().with_context(|| "read HEAD")?;
2432            let short_hash = format!("{}", head).chars().take(7).collect::<String>();
2433
2434            // tree_index_status compares a TREE id against the index;
2435            // the HEAD commit id is not itself a tree, so peel HEAD
2436            // to its root tree before diffing or the diff silently
2437            // returns an error and index dirt goes undetected.
2438            let head_tree = repo.head_tree().with_context(|| "read HEAD tree")?;
2439            let head_tree_id = head_tree.id;
2440
2441            // Check HEAD-vs-index for tracked file changes.
2442            let mut index_dirty = false;
2443            let index = repo.index_or_empty().with_context(|| "open index")?;
2444            let _ = repo.tree_index_status(
2445                &head_tree_id,
2446                &index,
2447                None,
2448                gix::status::tree_index::TrackRenames::Disabled,
2449                |_, _, _| {
2450                    index_dirty = true;
2451                    Ok::<_, std::convert::Infallible>(std::ops::ControlFlow::Break(()))
2452                },
2453            );
2454
2455            // Check index-vs-worktree for modified tracked files,
2456            // skipping submodules entirely (Ignore::All).
2457            let worktree_dirty = if !index_dirty {
2458                repo.status(gix::progress::Discard)
2459                    .with_context(|| "status")?
2460                    .index_worktree_rewrites(None)
2461                    .index_worktree_submodules(gix::status::Submodule::Given {
2462                        ignore: gix::submodule::config::Ignore::All,
2463                        check_dirty: false,
2464                    })
2465                    .index_worktree_options_mut(|opts| {
2466                        opts.dirwalk_options = None;
2467                    })
2468                    .into_index_worktree_iter(Vec::new())
2469                    .map(|mut iter| iter.next().is_some())
2470                    .unwrap_or(false)
2471            } else {
2472                false
2473            };
2474
2475            let is_dirty = index_dirty || worktree_dirty;
2476            // Drop the HEAD hash when dirty — the commit does not
2477            // describe the actual source being built, so publishing
2478            // it via git_hash / cache_key would misidentify the
2479            // build input.
2480            let hash = if is_dirty { None } else { Some(short_hash) };
2481            (hash, is_dirty, true)
2482        }
2483        Err(_) => {
2484            // The downstream kernel_build_pipeline (cli::kernel_build_pipeline)
2485            // emits `NON_GIT_TREE_CACHE_SKIP_HINT` — a single
2486            // informational line that names both the cause and the
2487            // remediation paths — once the is_dirty=true branch
2488            // decides to skip the cache. Emitting a second
2489            // "not a git repository" warning here duplicated that
2490            // content for every non-git `--kernel <path>` run. The
2491            // `(None, true, false)` tuple silently communicates
2492            // the non-git state to the cache-skip decision site;
2493            // no separate stderr line is needed on this path.
2494            (None, true, false)
2495        }
2496    };
2497    Ok(LocalSourceState {
2498        short_hash,
2499        is_dirty,
2500        is_git,
2501    })
2502}
2503
2504/// Compose the cache key for a local source given its arch, optional
2505/// HEAD short hash, canonical source path, and optional user
2506/// `.config` hash.
2507///
2508/// Three shapes:
2509/// - `local-{hash7}-{arch}-kc{suffix}` — clean git tree, no user
2510///   `.config` (plain `make defconfig` path or no config file yet)
2511/// - `local-{hash7}-{arch}-cfg{user_config}-kc{suffix}` — clean git
2512///   tree with a user `.config` whose hash differs from `defconfig`
2513/// - `local-unknown-{path_hash}-{arch}-kc{suffix}` — dirty / non-git
2514///   tree (HEAD does not describe the source; the path-derived
2515///   crc32 salt keeps two distinct dirty trees from colliding on the
2516///   same `local-unknown-...` slot)
2517///
2518/// `path_hash` is the full 8-char (32-bit) lowercase-hex CRC32 of
2519/// the canonical source-path bytes. CRC32 keeps the per-path
2520/// disambiguator stable across runs without pulling in a
2521/// crypto-grade hash for what is fundamentally a slot disambiguator.
2522///
2523/// `user_config_hash` is `None` whenever the source tree has no
2524/// `.config` file yet (the build will run `make defconfig` and
2525/// produce one). This collapses the user-config branch back into the
2526/// hash-only key so a fresh checkout's first build still hits a
2527/// later cache lookup keyed without the cfg segment.
2528pub fn compose_local_cache_key(
2529    arch: &str,
2530    short_hash: &Option<String>,
2531    canonical: &Path,
2532    user_config_hash: Option<&str>,
2533) -> String {
2534    let suffix = crate::cache_key_suffix();
2535    match short_hash {
2536        Some(hash) => match user_config_hash {
2537            Some(cfg) => format!("local-{hash}-{arch}-cfg{cfg}-kc{suffix}"),
2538            None => format!("local-{hash}-{arch}-kc{suffix}"),
2539        },
2540        None => {
2541            let path_hash = canonical_path_hash(canonical);
2542            format!("local-unknown-{path_hash}-{arch}-kc{suffix}")
2543        }
2544    }
2545}
2546
2547/// CRC32 of the canonical source-path bytes, lowercase hex
2548/// (full 8-char width — the entire 32-bit value). Disambiguates
2549/// `local-unknown-...` cache keys and per-source-tree lockfile
2550/// names across distinct dirty / non-git source trees so two
2551/// parallel `cargo ktstr test --kernel ./linux-a` and
2552/// `--kernel ./linux-b` runs can't write each other's vmlinux into
2553/// the same cache slot or share a single source-tree flock.
2554///
2555/// Full 32 bits (8 hex chars) of CRC32 keep collision risk
2556/// negligible against the practical population (handful of source
2557/// trees per host) while staying human-readable. The earlier
2558/// 6-char (24-bit) form left ~6× the collision surface for the
2559/// same key shape; truncation served no purpose other than visual
2560/// brevity. Path bytes are taken via `OsStr::as_encoded_bytes` so
2561/// a non-UTF-8 component (rare on Linux but possible) doesn't lose
2562/// entropy through a UTF-8 lossy conversion.
2563pub(crate) fn canonical_path_hash(canonical: &Path) -> String {
2564    let bytes = canonical.as_os_str().as_encoded_bytes();
2565    format!("{:08x}", crc32fast::hash(bytes))
2566}
2567
2568/// Read `<canonical>/.config` and return its CRC32 as a lowercase
2569/// hex string suitable for embedding in the cache key. Returns
2570/// `None` when no `.config` exists (a fresh tree before the build
2571/// runs `make defconfig`).
2572///
2573/// Distinct from the `config_hash` written into [`crate::cache::KernelMetadata`]
2574/// at store time — that records the FINAL `.config` after
2575/// configuration runs, for diagnostic display in `kernel list`.
2576/// This helper records the PRE-BUILD `.config` so the cache key
2577/// reflects what the operator's tree currently has on disk; the
2578/// same `.config` content always maps to the same key, even if the
2579/// downstream `make olddefconfig` step elaborates additional
2580/// defaults.
2581fn config_hash_for_key(canonical: &Path) -> Option<String> {
2582    let config_path = canonical.join(".config");
2583    let data = std::fs::read(&config_path).ok()?;
2584    Some(format!("{:08x}", crc32fast::hash(&data)))
2585}
2586
// Unit tests for this module live in the sibling file `fetch_tests.rs`
// and are compiled only for test builds.
#[cfg(test)]
#[path = "fetch_tests.rs"]
mod tests;