ktstr/monitor/
arena.rs

1//! Host-side BPF arena page enumeration.
2//!
3//! `BPF_MAP_TYPE_ARENA` (kernel uapi value [`BPF_MAP_TYPE_ARENA`]) is
4//! a sparse, page-granular memory region shared between BPF programs
5//! and userspace. The kernel allocates a 4 GiB-plus-guard
6//! (`KERN_VM_SZ`) `vm_struct` and lazily maps order-0 pages into it
7//! on demand (see `kernel/bpf/arena.c::arena_alloc_pages` and
8//! `arena_vm_fault`); the user-visible window is at
9//! `[arena.user_vm_start .. arena.user_vm_end)`, a 32-bit-addressable
10//! range whose lower 32 bits the BPF JIT uses as the arena pointer
11//! payload. Translation kernel-side is:
12//!
13//! ```text
14//! kern_vm_start = arena->kern_vm->addr + GUARD_SZ/2
15//! kaddr         = kern_vm_start + (u32)user_addr
16//! page          = vmalloc_to_page(kaddr)   // PTE walk on init_mm
17//! ```
18//!
19//! The host-side walker mirrors this: read the arena's `kern_vm`
20//! pointer, dereference to get `vm_struct.addr`, add `GUARD_SZ/2`,
21//! then for each pgoff in `0..max_entries` compute `kaddr` and run
22//! `GuestMem::translate_kva` (the existing PTE walker against
23//! `init_mm`'s page table). `max_entries` is the BPF map's declared
24//! page capacity from `bpf_map_create()` — it is the source of truth
25//! for "how many pages this arena could hold", regardless of whether
26//! the scheduler exposes a userspace mmap (some don't, leaving
27//! `user_vm_start == user_vm_end == 0`). Pages whose translate fails
28//! are simply "not faulted in" — arena maps are sparse by design.
29//!
//! The walker does NOT consult `arena->rt` (the range_tree of free
//! pgoffs). `range_tree` polarity is "set = free" / "clear =
//! allocated"; reading it from a frozen snapshot would only tell
//! the host which pages the kernel *intended* to be allocated, not
//! which are actually mapped. The PTE walk is the source of truth.
35//!
36//! [`BPF_MAP_TYPE_ARENA`]: BPF_MAP_TYPE_ARENA
37
38use anyhow::{Context, Result};
39use serde::{Deserialize, Serialize};
40use std::path::Path;
41
42use btf_rs::Btf;
43
44use super::Kva;
45use super::bpf_map::{BPF_MAP_TYPE_ARENA, BpfMapInfo};
46use super::btf_offsets::{find_struct, load_btf_from_path, member_byte_offset};
47use super::guest::GuestKernel;
48
/// Resolve the page size the arena walker must use, as seen by the
/// GUEST kernel's MMU.
///
/// The kernel's arena code (`arena_alloc_pages`, `arena_vm_fault`)
/// maps memory in `PAGE_SIZE`-granular ranges, where `PAGE_SIZE` is
/// the guest kernel's own MMU granule. The host's page size plays no
/// role: the arena layout has to be interpreted with the guest's
/// granule.
///
/// * Every target other than aarch64 (x86_64 included) uses a fixed
///   4 KiB granule; `tcr_el1` is ignored there.
/// * On aarch64 the granule is decoded from `TCR_EL1.TG1`
///   (bits `\[31:30\]`):
///   - `0b10` → 4 KiB
///   - `0b01` → 16 KiB
///   - `0b11` → 64 KiB
///   - `0b00` (reserved, e.g. an uninitialized `tcr_el1 == 0`) →
///     4 KiB as a conservative fallback.
///
/// The function never returns 0, so callers may divide by the result.
fn guest_page_size(tcr_el1: u64) -> u64 {
    const SZ_4K: u64 = 4 << 10;
    const SZ_16K: u64 = 16 << 10;
    const SZ_64K: u64 = 64 << 10;

    // Only aarch64 has a configurable granule; everything else is 4 KiB.
    if !cfg!(target_arch = "aarch64") {
        return SZ_4K;
    }

    let tg1 = (tcr_el1 >> 30) & 0b11;
    match tg1 {
        0b01 => SZ_16K,
        0b11 => SZ_64K,
        0b10 => SZ_4K,
        // 0b00 is reserved; fall back to the smallest granule.
        _ => SZ_4K,
    }
}
93
/// Half of the arena guard region (`GUARD_SZ / 2` in
/// `kernel/bpf/arena.c`) for a given guest page size.
///
/// The kernel defines
///   `GUARD_SZ = round_up(1ull << sizeof_field(struct bpf_insn, off) * 8,
///                        PAGE_SIZE << 1)`
/// and `sizeof_field(struct bpf_insn, off) * 8` is 16, so the value
/// being rounded is `1 << 16 = 65536`. The result therefore depends
/// only on the page granule:
///   - 4 KiB pages:  `round_up(65536, 8192)`   = 65536  → half = 32768
///   - 16 KiB pages: `round_up(65536, 32768)`  = 65536  → half = 32768
///   - 64 KiB pages: `round_up(65536, 131072)` = 131072 → half = 65536
///
/// `bpf_arena_get_kern_vm_start` returns `arena->kern_vm->addr +
/// GUARD_SZ/2`, i.e. the kernel-side accessible region begins this
/// many bytes past the raw `vm_struct.addr`. The walker adds the
/// returned offset when converting a user VA to a kernel VA.
///
/// Panics if `page_size << 1` is zero (division by zero).
fn guard_half(page_size: u64) -> u64 {
    // Span addressable by the 16-bit `off` field of a BPF instruction.
    let insn_off_span: u64 = 1 << 16;
    let alignment = page_size << 1;
    // Round `insn_off_span` up to the next multiple of `alignment`.
    let guard_sz = insn_off_span.div_ceil(alignment) * alignment;
    guard_sz / 2
}
113
/// Upper bound on the number of pgoffs the walker translates
/// back-to-back (the "sequential prefix") for a single arena.
///
/// The kernel reserves `KERN_VM_SZ = SZ_4G + GUARD_SZ` of vmalloc
/// space per arena (~1M pages at a 4 KiB granule), but most arenas
/// populate only a small fraction of it. Walking the whole range at
/// ~1 µs per `translate_kva` would cost ~1 s on the freeze hot path,
/// so the exhaustive walk stops after 4096 pages (16 MiB at a 4 KiB
/// granule). This bounds both report size and freeze-path latency.
///
/// When the cap cuts the walk short, [`ArenaSnapshot::truncated`] is
/// set and a sparse stride sweep (see [`MAX_ARENA_STRIDE_PROBES`])
/// samples the pgoffs beyond the cap.
const MAX_ARENA_PAGES: u64 = 1 << 12;
126
/// Budget of evenly spaced probes for the tail sweep, i.e. for pgoffs
/// in `MAX_ARENA_PAGES..declared_pages` when `declared_pages` is
/// larger than the sequential cap.
///
/// The sweep lets the walker find mapped pages in sparse arenas (for
/// example a scheduler that allocated near the 4 GiB end of its
/// user_vm window) without translating every one of the ~1M pgoffs.
///
/// Cost: 256 probes × ~1 µs per translate ≈ 0.25 ms, which is
/// negligible on the freeze hot path. Every hit is appended to
/// [`ArenaSnapshot::pages`] after the sequential prefix, so consumers
/// see both sets of pages.
const MAX_ARENA_STRIDE_PROBES: u64 = 1 << 8;
138
/// Hard ceiling, in bytes, on the address span the walker will plan
/// for one arena.
///
/// The span is derived from `info.max_entries * page_size` (the BPF
/// map's declared page capacity, see [`snapshot_arena`]). A genuine
/// arena can never exceed 4 GiB: the BPF JIT addresses arena pointers
/// through the low 32 bits of the user address, and `arena_map_alloc`
/// in `kernel/bpf/arena.c` rejects `vm_range > SZ_4G`. Anything wider
/// than `0x1_0000_0000` therefore indicates a torn / corrupt
/// `bpf_map.max_entries` or a freeze-time race against
/// `arena_map_alloc`.
///
/// Clamping the span here keeps a wild value from turning into
/// billions of pgoffs to walk (a live-lock on the freeze path).
const MAX_VM_RANGE_BYTES: u64 = 1 << 32;
154
155/// Byte offsets within `struct bpf_arena` and `struct vm_struct`
156/// needed for the host-side arena walker.
157///
158/// Resolved from BTF at startup so the walker doesn't hardcode kernel
159/// layout. Mirrors the [`super::btf_offsets::BpfMapOffsets`] pattern.
160#[derive(Debug, Clone)]
161pub struct BpfArenaOffsets {
162    /// Offset of `kern_vm` (`struct vm_struct *`) within `struct bpf_arena`.
163    pub arena_kern_vm: usize,
164    /// Offset of `user_vm_start` (u64) within `struct bpf_arena`.
165    pub arena_user_vm_start: usize,
166    /// Offset of `addr` (`void *`) within `struct vm_struct`.
167    pub vm_struct_addr: usize,
168}
169
170impl BpfArenaOffsets {
171    /// Parse BTF from a vmlinux ELF and resolve arena field offsets.
172    ///
173    /// Returns Err on kernels whose BTF lacks `bpf_arena` (i.e. arena
174    /// support is not built in) — the caller can treat the absent
175    /// offsets as a signal to skip arena enumeration.
176    ///
177    /// Production callers (the freeze coordinator) reach this code
178    /// via [`Self::from_btf`] on a pre-parsed `&Btf` to amortize the
179    /// ELF parse — `from_vmlinux` stays public as the convenience
180    /// entry point for direct-from-vmlinux callers (CLI tools, unit
181    /// tests against a vmlinux on disk).
182    #[allow(dead_code)]
183    pub fn from_vmlinux(path: &Path) -> Result<Self> {
184        let btf = load_btf_from_path(path).context("btf: open vmlinux")?;
185        Self::from_btf(&btf)
186    }
187
188    /// Resolve arena struct offsets from a pre-loaded BTF object.
189    pub fn from_btf(btf: &Btf) -> Result<Self> {
190        let (bpf_arena, _) = find_struct(btf, "bpf_arena")
191            .context("btf: struct bpf_arena not found (arena unsupported on this kernel?)")?;
192        let arena_kern_vm = member_byte_offset(btf, &bpf_arena, "kern_vm")?;
193        let arena_user_vm_start = member_byte_offset(btf, &bpf_arena, "user_vm_start")?;
194
195        let (vm_struct, _) =
196            find_struct(btf, "vm_struct").context("btf: struct vm_struct not found")?;
197        let vm_struct_addr = member_byte_offset(btf, &vm_struct, "addr")?;
198
199        Ok(Self {
200            arena_kern_vm,
201            arena_user_vm_start,
202            vm_struct_addr,
203        })
204    }
205}
206
207/// One mapped arena page captured from guest memory.
208#[derive(Debug, Clone, Default, Serialize, Deserialize)]
209#[non_exhaustive]
210pub struct ArenaPage {
211    /// User-side virtual address (32-bit window starting at
212    /// `arena.user_vm_start`). Operators correlate this with the
213    /// pointer values they see in BPF program output.
214    pub user_addr: u64,
215    /// One arena page's worth of bytes read from the guest. Length
216    /// matches the guest kernel's MMU page size: 4 KiB on x86_64
217    /// and on aarch64 with `TCR_EL1.TG1=0b10`; 16 KiB on aarch64
218    /// 16 KiB-granule kernels (Apple Silicon style); 64 KiB on
219    /// aarch64 64 KiB-granule kernels. The resolution lives in
220    /// `guest_page_size` — the snapshot stamps every captured
221    /// page at that size.
222    pub bytes: Vec<u8>,
223}
224
225/// Snapshot of one arena map's mapped pages.
226#[derive(Debug, Clone, Default, Serialize, Deserialize)]
227#[non_exhaustive]
228pub struct ArenaSnapshot {
229    /// Mapped pages, in pgoff order (skipped over unmapped pgoffs).
230    /// Sequential prefix (pgoffs `0..MAX_ARENA_PAGES`) followed by any
231    /// stride-probe hits in the sparse tail (pgoffs sampled across
232    /// `MAX_ARENA_PAGES..declared_pages`).
233    #[serde(default, skip_serializing_if = "Vec::is_empty")]
234    pub pages: Vec<ArenaPage>,
235    /// True when the walker stopped sequential enumeration at
236    /// `MAX_ARENA_PAGES` before finishing the user_vm window. The
237    /// stride sweep that follows samples the tail at coarse intervals,
238    /// so a hit reaches `pages` even when this flag is set; pgoffs
239    /// between sampled positions are still silently skipped.
240    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
241    pub truncated: bool,
242    /// Total declared page count. Derived from
243    /// `max_entries * page_size` (the BPF map's declared page
244    /// capacity, with `page_size` resolved from the guest's
245    /// TCR_EL1 via `guest_page_size`), not the user_vm window.
246    /// Reflects any `MAX_VM_RANGE_BYTES` cap. Surfaced alongside
247    /// `pages.len()` so consumers can see the
248    /// allocated-vs-declared ratio.
249    pub declared_pages: u64,
250    /// True when `max_entries * page_size` exceeded
251    /// `MAX_VM_RANGE_BYTES` (4 GiB) and the walker capped the span
252    /// before computing `declared_pages`. Indicates a torn / corrupt
253    /// `bpf_arena` struct or a freeze-time race against initialization;
254    /// the rendered pages still come from valid translates, so the
255    /// snapshot is usable.
256    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
257    pub span_capped: bool,
258    /// Kernel-side base of the arena's user_vm window:
259    /// `bpf_arena.kern_vm->addr + GUARD_HALF`. Surfaces here so
260    /// downstream consumers (notably the `super::sdt_alloc` tree
261    /// walker) can translate `__arena` pointers without re-reading
262    /// `struct bpf_arena` themselves. `0` when the snapshot bailed
263    /// before computing the value (kern_vm_kva NULL, vm_addr NULL,
264    /// or any of the upstream translates failed).
265    ///
266    /// Always serialized — the zero value carries diagnostic
267    /// information ("walker reached this point but couldn't compute
268    /// the base"), so suppressing it would mask the failure. Mirrors
269    /// the policy used for the sibling `declared_pages` field.
270    pub kern_vm_start: u64,
271    /// User-side base of the arena window: the value of
272    /// `bpf_arena.user_vm_start`, the address space the BPF program
273    /// (and any captured `__arena` pointer) sees. `[user_vm_start ..
274    /// user_vm_start + 4 GiB)` is the kernel-enforced upper bound
275    /// (`bpf_arena_alloc_pages` clamps to `SZ_4G`). Consumers use it
276    /// to classify a pointer as "lives in this arena" before chasing
277    /// into [`Self::pages`].
278    ///
279    /// `0` when the snapshot bailed before reading
280    /// `arena.user_vm_start` (e.g. `arena_pa` translate failed). On
281    /// the syscall backend this comes from `bpf_map.map_extra` which
282    /// the kernel pins at create time (`lib/arena_map.h` hardcodes
283    /// `1<<44` on x86, `1<<32` on aarch64). On the guest-memory
284    /// backend it's read directly from
285    /// `bpf_arena.user_vm_start` via the resolved offset.
286    ///
287    /// Always serialized for the same diagnostic reason as
288    /// [`Self::kern_vm_start`].
289    pub user_vm_start: u64,
290}
291
292/// Walk the arena's mapped pages and return a snapshot.
293///
294/// Reads `kern_vm` from `struct bpf_arena` at `info.map_kva`,
295/// dereferences to `vm_struct.addr`, computes
296/// `kern_vm_start = addr + GUARD_HALF`, and for each pgoff in
297/// `0..N` translates `kern_vm_start + (u32)user_addr` via
298/// `GuestMem::translate_kva`. Pages that fail to translate are
299/// "not faulted in" and silently skipped.
300///
301/// The walker is best-effort: any read failure on `bpf_arena` /
302/// `vm_struct` itself yields an empty snapshot rather than an error,
303/// so a corrupt arena can't break the broader failure dump.
304pub fn snapshot_arena(
305    kernel: &GuestKernel,
306    info: &BpfMapInfo,
307    offsets: &BpfArenaOffsets,
308) -> ArenaSnapshot {
309    if info.map_type != BPF_MAP_TYPE_ARENA {
310        return ArenaSnapshot::default();
311    }
312
313    let mem = kernel.mem();
314    let walk = kernel.walk_context();
315    let page_size = guest_page_size(walk.tcr_el1);
316    let guard_half_bytes = guard_half(page_size);
317
318    // bpf_arena embeds bpf_map at offset 0, so map_kva == arena_kva.
319    let arena_kva = info.map_kva;
320    // Translate the arena struct itself — it may be kmalloc'd
321    // (direct map) or vmalloc'd (`bpf_map_area_alloc`).
322    let Some(arena_pa) = super::idr::translate_any_kva(
323        mem,
324        walk.cr3_pa,
325        walk.page_offset,
326        arena_kva,
327        walk.l5,
328        walk.tcr_el1,
329    ) else {
330        return ArenaSnapshot::default();
331    };
332
333    let user_vm_start = mem.read_u64(arena_pa, offsets.arena_user_vm_start);
334    let kern_vm_kva = mem.read_u64(arena_pa, offsets.arena_kern_vm);
335    // Preserve `user_vm_start` even when the kern-side walk fails:
336    // the `MemReader::is_arena_addr` consumer needs it to classify
337    // an `__arena` pointer as in-window (vs. a kernel kptr) so the
338    // Ptr-deref path returns `None` cleanly instead of falling
339    // through to the kernel-kptr cpumask probe. Without the anchor,
340    // an arena pointer would be misread as a slab address — at best
341    // garbage hex, at worst a translate against an unmapped page.
342    if kern_vm_kva == 0 {
343        return ArenaSnapshot {
344            user_vm_start,
345            ..ArenaSnapshot::default()
346        };
347    }
348
349    // vm_struct lives in the kernel's slab/kmalloc area; direct or
350    // vmalloc, so use translate_any_kva.
351    let Some(vm_struct_pa) = super::idr::translate_any_kva(
352        mem,
353        walk.cr3_pa,
354        walk.page_offset,
355        kern_vm_kva,
356        walk.l5,
357        walk.tcr_el1,
358    ) else {
359        return ArenaSnapshot {
360            user_vm_start,
361            ..ArenaSnapshot::default()
362        };
363    };
364    let vm_addr = mem.read_u64(vm_struct_pa, offsets.vm_struct_addr);
365    if vm_addr == 0 {
366        return ArenaSnapshot {
367            user_vm_start,
368            ..ArenaSnapshot::default()
369        };
370    }
371    let kern_vm_start = vm_addr.wrapping_add(guard_half_bytes);
372
373    // max_entries is the create-time page capacity; user_vm_end may
374    // be 0 for arenas without userspace mmap.
375    let plan = ArenaWalkPlan::new((info.max_entries as u64) * page_size, page_size);
376
377    let mut snapshot = ArenaSnapshot {
378        pages: Vec::new(),
379        truncated: plan.truncated,
380        declared_pages: plan.declared_pages,
381        span_capped: plan.span_capped,
382        kern_vm_start,
383        user_vm_start,
384    };
385
386    // Reusable scratch buffer for the per-page read. Sized once at
387    // `page_size` and reused across every captured page: on success
388    // the buffer is moved into `ArenaPage` (one allocation per
389    // captured page is unavoidable since each page owns its bytes),
390    // then a fresh allocation refills the scratch on the next
391    // `resize`. The win is the SKIP path — every translate-failure
392    // or short-read pgoff used to allocate-and-discard a page-sized
393    // zero-initialised buffer; now those paths reuse the existing
394    // scratch capacity. On a sparse arena window (most pgoffs
395    // unmapped) this collapses thousands of doomed allocations into
396    // one. The hot path (freeze coordinator's dump pipeline) used
397    // to dominate freeze-time wallclock on arenas with declared
398    // pages > captured pages.
399    let mut scratch: Vec<u8> = Vec::with_capacity(page_size as usize);
400
401    // Closure: translate one pgoff to a page-content read; push
402    // onto `snapshot.pages` if the translate + read succeed.
403    // Captures `mem`, `walk`, `kern_vm_start`, `user_vm_start`,
404    // `page_size`, and `scratch` (mutable — drained into the
405    // captured page on success).
406    let mut try_capture_page = |pgoff: u64, pages: &mut Vec<ArenaPage>| {
407        // user_vm_start + pgoff*page_size is a 64-bit value, but the
408        // kernel composes the kern-VA from the LOW 32 bits only —
409        // `uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE)`
410        // in arena_alloc_pages — since the user_vm window is capped
411        // at SZ_4G and aligned so the low 32 bits cover the whole
412        // span uniquely. Match the same truncation here.
413        //
414        // pgoff and page_size both originate from BPF map metadata
415        // and the guest TCR_EL1; pgoff*page_size in u64 can overflow
416        // when a corrupt map advertises a huge declared_pages count.
417        // Skip the page on multiplication overflow — wrapping_add on
418        // user_vm_start is intentional (matches kernel truncation),
419        // but only when the multiplicand was correctly computed.
420        let Some(byte_off) = pgoff.checked_mul(page_size) else {
421            return;
422        };
423        let user_addr = user_vm_start.wrapping_add(byte_off);
424        let kaddr = kern_vm_start.wrapping_add(user_addr & 0xFFFF_FFFF);
425        let Some(pa) = mem.translate_kva(walk.cr3_pa, Kva(kaddr), walk.l5, walk.tcr_el1) else {
426            return;
427        };
428        // Translate guarantees a page-aligned PA; bound-check
429        // against guest DRAM size in case a corrupt PTE points
430        // past end-of-DRAM.
431        if pa + page_size > mem.size() {
432            return;
433        }
434        // Resize the reusable scratch to `page_size` and zero-fill.
435        // After a previous capture moved the inner Vec out via
436        // `mem::take`, `scratch` is empty with `page_size` capacity;
437        // resize allocates exactly the new buffer's bytes, but
438        // skipping iterations that hit the early returns above
439        // never reach this line so their alloc is avoided entirely.
440        scratch.clear();
441        scratch.resize(page_size as usize, 0);
442        // `GuestMem::read_bytes` returns the actual byte count copied
443        // (may be short when the PA crosses end-of-DRAM, even after
444        // the bounds check above — DRAM can have non-contiguous
445        // regions). Truncate the buffer to that count so consumers
446        // never see the zero-init tail of an unwritten range as
447        // legitimate page bytes.
448        let n = mem.read_bytes(pa, &mut scratch);
449        scratch.truncate(n);
450        if scratch.is_empty() {
451            return;
452        }
453        // Move the populated buffer into the captured page; the
454        // scratch falls back to empty (capacity preserved) for the
455        // next iteration.
456        pages.push(ArenaPage {
457            user_addr,
458            bytes: std::mem::take(&mut scratch),
459        });
460    };
461
462    // Phase 1: sequential walk of the first MAX_ARENA_PAGES (16 MiB
463    // window) — covers every scheduler today, where allocations cluster
464    // near pgoff 0.
465    for pgoff in 0..plan.sequential_to {
466        try_capture_page(pgoff, &mut snapshot.pages);
467    }
468
469    // Phase 2: stride sweep over the sparse tail. Without this, a
470    // scheduler that allocated even one page near the 4 GiB end of
471    // its user_vm window would be invisible to the dump despite the
472    // truncation flag. Mapped pages discovered here append to
473    // `snapshot.pages` after the sequential prefix and are
474    // discoverable by `user_addr` (the consumer correlates by user
475    // pointer, not pgoff index, so out-of-order pgoffs are fine).
476    if let Some(stride) = plan.stride {
477        let mut pgoff = plan.sequential_to;
478        while pgoff < plan.declared_pages {
479            try_capture_page(pgoff, &mut snapshot.pages);
480            // Saturate at declared_pages on the last step; without
481            // this `pgoff += stride` could skip past the final page
482            // when stride > 1.
483            pgoff = pgoff.saturating_add(stride);
484        }
485    }
486
487    snapshot
488}
489
490/// Pure computation that decides how many pgoffs the walker must
491/// translate (sequential prefix + stride sweep). Extracted so the
492/// span-cap, declared-page, and stride-derivation logic is unit-
493/// testable without mocking a [`super::guest::GuestKernel`].
494#[derive(Debug, Clone, Copy, PartialEq, Eq)]
495struct ArenaWalkPlan {
496    /// Page count the snapshot reports as "declared". Reflects any
497    /// `MAX_VM_RANGE_BYTES` cap.
498    declared_pages: u64,
499    /// True when `MAX_VM_RANGE_BYTES` capped the raw span.
500    span_capped: bool,
501    /// True when `declared_pages > MAX_ARENA_PAGES` and the walker
502    /// will skip pgoffs in the sparse tail.
503    truncated: bool,
504    /// Sequential-walk endpoint: the walker enumerates
505    /// `0..sequential_to` exhaustively.
506    sequential_to: u64,
507    /// Stride for the post-sequential sweep, or `None` when no tail
508    /// remains. `Some(stride)` walks pgoffs
509    /// `sequential_to, sequential_to + stride, ...` until
510    /// `declared_pages`.
511    stride: Option<u64>,
512}
513
514impl ArenaWalkPlan {
515    fn new(raw_span: u64, page_size: u64) -> Self {
516        let span_capped = raw_span > MAX_VM_RANGE_BYTES;
517        let span = raw_span.min(MAX_VM_RANGE_BYTES);
518        let declared_pages = span / page_size;
519        let sequential_to = declared_pages.min(MAX_ARENA_PAGES);
520        let truncated = declared_pages > sequential_to;
521        let stride = if declared_pages > MAX_ARENA_PAGES {
522            let tail_pages = declared_pages - MAX_ARENA_PAGES;
523            // div_ceil so stride * MAX_ARENA_STRIDE_PROBES covers
524            // the whole tail; .max(1) so a tail smaller than
525            // MAX_ARENA_STRIDE_PROBES still walks every remaining
526            // page sequentially.
527            Some(tail_pages.div_ceil(MAX_ARENA_STRIDE_PROBES).max(1))
528        } else {
529            None
530        };
531        Self {
532            declared_pages,
533            span_capped,
534            truncated,
535            sequential_to,
536            stride,
537        }
538    }
539}
540
541#[cfg(test)]
542mod tests {
543    use super::*;
544
545    #[test]
546    fn parse_arena_offsets_from_vmlinux() {
547        let path = match crate::monitor::find_test_vmlinux() {
548            Some(p) => p,
549            None => return,
550        };
551        // Skip when find_test_vmlinux returns the raw BTF blob — the
552        // vmlinux-ELF parse path inside `from_vmlinux` would fail on
553        // it, but `from_btf` works directly. Tests in btf_offsets/tests.rs
554        // skip the same way for the same reason.
555        if path.starts_with("/sys/") {
556            crate::report::test_skip("vmlinux is raw BTF (skipping ELF-only path)");
557            return;
558        }
559        let offsets = match BpfArenaOffsets::from_vmlinux(&path) {
560            Ok(o) => o,
561            Err(e) => {
562                // Older kernels without arena support: BTF lacks
563                // `struct bpf_arena`. That's a valid configuration —
564                // skip rather than fail.
565                crate::report::test_skip(format!("arena BTF missing: {e}"));
566                return;
567            }
568        };
569        // bpf_arena starts with `struct bpf_map map`, so user_vm_*
570        // come AFTER the embedded bpf_map; both must be at nonzero
571        // offsets. kern_vm follows them in the kernel layout.
572        assert!(
573            offsets.arena_user_vm_start > 0,
574            "user_vm_start follows embedded bpf_map"
575        );
576        assert_ne!(
577            offsets.arena_kern_vm, offsets.arena_user_vm_start,
578            "kern_vm distinct from user_vm_start"
579        );
580        // vm_struct.addr lives after the 8-byte union (next/llnode)
581        // on 64-bit kernels.
582        assert!(
583            offsets.vm_struct_addr > 0,
584            "vm_struct.addr follows the next/llnode union"
585        );
586    }
587
588    // ---- ArenaWalkPlan: span cap + stride sweep -------------------
589    //
590    // The plan is a pure function of the raw user_vm span. Pin its
591    // outputs against representative shapes so the snapshot_arena
592    // call site stays tight against:
593    //   - tiny arena (single page) — no truncation, no stride
594    //   - mid arena (just under 16 MiB) — sequential only
595    //   - large arena (declared > MAX_ARENA_PAGES) — sequential
596    //     prefix + stride sweep
597    //   - 4 GiB-cap (raw_span > MAX_VM_RANGE_BYTES) — span_capped flag,
598    //     declared_pages clamped to MAX_VM_RANGE_BYTES / page_size
599    //   - corrupt span (raw_span = u64::MAX) — capped, flag set,
600    //     no overflow
601
602    /// Page size used for ArenaWalkPlan unit tests. Production code
603    /// resolves the page size from `guest_page_size` (which decodes
604    /// the guest's `TCR_EL1.TG1`); the plan tests pin their math
605    /// against an explicit 4 KiB so they exercise the same shapes
606    /// regardless of the host the test runs on. Granule-specific
607    /// shapes have their own dedicated tests
608    /// (`arena_walk_plan_16k_granule_*`).
609    const TEST_PAGE_SIZE: u64 = 4096;
610
611    #[test]
612    fn arena_walk_plan_constants_sane() {
613        // The plan-derivation invariants depend on these constants.
614        // Pin them so a future tightening surfaces here, not in
615        // snapshot_arena's runtime behavior.
616        assert_eq!(MAX_VM_RANGE_BYTES, 0x1_0000_0000);
617        assert_eq!(MAX_ARENA_PAGES, 4096);
618        assert_eq!(MAX_ARENA_STRIDE_PROBES, 256);
619    }
620
621    #[test]
622    fn arena_walk_plan_single_page() {
623        // Smallest non-empty arena: one page. Sequential walk covers
624        // it; no stride needed; no truncation.
625        let plan = ArenaWalkPlan::new(TEST_PAGE_SIZE, TEST_PAGE_SIZE);
626        assert_eq!(plan.declared_pages, 1);
627        assert!(!plan.span_capped);
628        assert!(!plan.truncated);
629        assert_eq!(plan.sequential_to, 1);
630        assert_eq!(plan.stride, None);
631    }
632
633    #[test]
634    fn arena_walk_plan_exactly_max_arena_pages() {
635        // declared == MAX_ARENA_PAGES: still no stride, no truncation.
636        // Boundary case: MAX_ARENA_PAGES walks sequentially.
637        let plan = ArenaWalkPlan::new(MAX_ARENA_PAGES * TEST_PAGE_SIZE, TEST_PAGE_SIZE);
638        assert_eq!(plan.declared_pages, MAX_ARENA_PAGES);
639        assert!(!plan.truncated);
640        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
641        assert_eq!(plan.stride, None);
642    }
643
644    #[test]
645    fn arena_walk_plan_one_page_past_max() {
646        // declared = MAX_ARENA_PAGES + 1: stride mode kicks in for
647        // the single tail page; stride must be 1 (every page).
648        let plan = ArenaWalkPlan::new((MAX_ARENA_PAGES + 1) * TEST_PAGE_SIZE, TEST_PAGE_SIZE);
649        assert_eq!(plan.declared_pages, MAX_ARENA_PAGES + 1);
650        assert!(plan.truncated);
651        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
652        assert_eq!(plan.stride, Some(1));
653    }
654
655    #[test]
656    fn arena_walk_plan_full_4gib() {
657        // Largest legitimate arena: full 4 GiB user_vm window (1M pages).
658        // Sequential covers first 16 MiB; stride sweeps the remaining
659        // ~1M-4096 pages with 256 probes -> stride = ceil((1M - 4096) / 256).
660        let raw = MAX_VM_RANGE_BYTES;
661        let plan = ArenaWalkPlan::new(raw, TEST_PAGE_SIZE);
662        assert_eq!(plan.declared_pages, raw / TEST_PAGE_SIZE);
663        assert!(!plan.span_capped, "exactly 4 GiB is at the cap, not above");
664        assert!(plan.truncated);
665        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
666        let stride = plan.stride.expect("stride mode for >MAX_ARENA_PAGES");
667        let tail = plan.declared_pages - MAX_ARENA_PAGES;
668        // Verify stride covers the tail: stride * MAX_ARENA_STRIDE_PROBES
669        // must reach `tail` with at most one slot of overshoot.
670        assert!(stride * MAX_ARENA_STRIDE_PROBES >= tail);
671        assert!((stride - 1) * MAX_ARENA_STRIDE_PROBES < tail);
672    }
673
674    #[test]
675    fn arena_walk_plan_caps_at_4gib() {
676        // Raw span 8 GiB (corrupt struct): span_capped flag set,
677        // declared_pages clamped to MAX_VM_RANGE_BYTES / page_size.
678        let plan = ArenaWalkPlan::new(2 * MAX_VM_RANGE_BYTES, TEST_PAGE_SIZE);
679        assert!(plan.span_capped);
680        assert_eq!(plan.declared_pages, MAX_VM_RANGE_BYTES / TEST_PAGE_SIZE);
681        assert!(plan.truncated);
682        assert!(plan.stride.is_some());
683    }
684
685    #[test]
686    fn arena_walk_plan_caps_corrupt_u64_max_span() {
687        // Pathological: raw_span = u64::MAX. The cap must apply
688        // BEFORE the span-to-pages division; without the cap,
689        // u64::MAX / page_size = ~4.5 quadrillion pages and the
690        // pgoff loop would live-lock.
691        let plan = ArenaWalkPlan::new(u64::MAX, TEST_PAGE_SIZE);
692        assert!(plan.span_capped);
693        assert_eq!(plan.declared_pages, MAX_VM_RANGE_BYTES / TEST_PAGE_SIZE);
694        assert!(plan.truncated);
695    }
696
697    #[test]
698    fn arena_walk_plan_zero_span() {
699        // Edge: zero span. snapshot_arena can reach this with
700        // max_entries=0; the plan must handle zero spans without
701        // panicking or computing nonsense bounds.
702        let plan = ArenaWalkPlan::new(0, TEST_PAGE_SIZE);
703        assert_eq!(plan.declared_pages, 0);
704        assert!(!plan.span_capped);
705        assert!(!plan.truncated);
706        assert_eq!(plan.sequential_to, 0);
707        assert_eq!(plan.stride, None);
708    }
709
710    #[test]
711    fn arena_walk_plan_stride_visits_every_pgoff_when_short_tail() {
712        // tail < MAX_ARENA_STRIDE_PROBES: stride saturates at 1, so
713        // the sweep walks every remaining page. Verify by simulating
714        // the walk and counting positions.
715        // declared = MAX_ARENA_PAGES + 50 -> tail = 50 -> stride = 1.
716        let plan = ArenaWalkPlan::new((MAX_ARENA_PAGES + 50) * TEST_PAGE_SIZE, TEST_PAGE_SIZE);
717        assert_eq!(plan.stride, Some(1));
718        let mut pgoff = plan.sequential_to;
719        let mut visited = 0u64;
720        while pgoff < plan.declared_pages {
721            visited += 1;
722            pgoff = pgoff.saturating_add(plan.stride.unwrap());
723        }
724        assert_eq!(visited, 50, "every tail page should be visited");
725    }
726
727    #[test]
728    fn arena_walk_plan_stride_distributes_probes_in_long_tail() {
729        // tail >> MAX_ARENA_STRIDE_PROBES: stride > 1, fewer probes
730        // than tail pages. Verify the sweep visits exactly
731        // approximately MAX_ARENA_STRIDE_PROBES positions.
732        let plan = ArenaWalkPlan::new(MAX_VM_RANGE_BYTES, TEST_PAGE_SIZE); // full 4 GiB
733        let mut pgoff = plan.sequential_to;
734        let mut visited = 0u64;
735        while pgoff < plan.declared_pages {
736            visited += 1;
737            pgoff = pgoff.saturating_add(plan.stride.unwrap());
738        }
739        // The sweep visits ceil(tail / stride) positions; for the
740        // 4 GiB case `stride * MAX_ARENA_STRIDE_PROBES >= tail` so
741        // visited <= MAX_ARENA_STRIDE_PROBES, and `>= tail / stride`
742        // ensures it's not zero.
743        assert!(
744            visited <= MAX_ARENA_STRIDE_PROBES + 1,
745            "visited {visited}, expected ≤ {} probes",
746            MAX_ARENA_STRIDE_PROBES + 1
747        );
748        assert!(
749            visited >= MAX_ARENA_STRIDE_PROBES - 1,
750            "visited {visited}, expected ≥ {}-ish probes",
751            MAX_ARENA_STRIDE_PROBES - 1
752        );
753    }
754
755    /// `guard_half` mirrors the kernel's `bpf_arena_get_kern_vm_start`
756    /// `GUARD_SZ/2` formula. Pin the three legitimate page granules
757    /// (4 KiB, 16 KiB, 64 KiB) against the hand-computed values from
758    /// the doc comment so a regression in the
759    /// `next_multiple_of(page_size << 1)` math surfaces here.
760    #[test]
761    fn guard_half_matches_kernel_formula() {
762        // 4 KiB granule: round_up(65536, 8192) = 65536, /2 = 32768.
763        assert_eq!(guard_half(4096), 32768);
764        // 16 KiB granule: round_up(65536, 32768) = 65536, /2 = 32768.
765        assert_eq!(guard_half(16384), 32768);
766        // 64 KiB granule: round_up(65536, 131072) = 131072, /2 = 65536.
767        assert_eq!(guard_half(65536), 65536);
768    }
769
770    /// `guest_page_size` decodes `TCR_EL1.TG1` (bits [31:30]) into
771    /// the granule size on aarch64; on x86_64 it is fixed at 4 KiB
772    /// regardless of the input. Pin the four encodings + the
773    /// reserved fallback path so a regression in the bit math
774    /// surfaces here.
775    #[test]
776    fn guest_page_size_decodes_tg1() {
777        #[cfg(target_arch = "x86_64")]
778        {
779            // x86_64: page size is always 4 KiB, regardless of the
780            // (ignored) `tcr_el1` argument.
781            assert_eq!(guest_page_size(0), 4096);
782            assert_eq!(guest_page_size(0b01u64 << 30), 4096);
783            assert_eq!(guest_page_size(0b10u64 << 30), 4096);
784            assert_eq!(guest_page_size(0b11u64 << 30), 4096);
785        }
786        #[cfg(target_arch = "aarch64")]
787        {
788            // TG1=0b10 → 4 KiB
789            assert_eq!(guest_page_size(0b10u64 << 30), 4096);
790            // TG1=0b01 → 16 KiB (Apple Silicon style)
791            assert_eq!(guest_page_size(0b01u64 << 30), 16384);
792            // TG1=0b11 → 64 KiB
793            assert_eq!(guest_page_size(0b11u64 << 30), 65536);
794            // TG1=0b00 (reserved) → conservative 4 KiB fallback
795            assert_eq!(guest_page_size(0), 4096);
796        }
797    }
798
799    /// 16 KiB-granule arena (Apple Silicon kernel build): a single
800    /// declared page is 16 KiB. With raw_span = 16384 the plan must
801    /// report `declared_pages = 1`, no stride. Pre-fix, `PAGE_SIZE`
802    /// was hardcoded to 4096 so 16384 / 4096 = 4 pages — wrong.
803    #[test]
804    fn arena_walk_plan_16k_granule_single_page() {
805        let plan = ArenaWalkPlan::new(16384, 16384);
806        assert_eq!(plan.declared_pages, 1);
807        assert!(!plan.span_capped);
808        assert!(!plan.truncated);
809        assert_eq!(plan.sequential_to, 1);
810        assert_eq!(plan.stride, None);
811    }
812
813    /// 16 KiB-granule arena at the 4 GiB cap: `declared_pages` =
814    /// 4 GiB / 16 KiB = 256 K. Pre-fix, the divisor was 4 KiB so
815    /// the count would have been 4x too large.
816    #[test]
817    fn arena_walk_plan_16k_granule_full_cap() {
818        let plan = ArenaWalkPlan::new(MAX_VM_RANGE_BYTES, 16384);
819        assert_eq!(plan.declared_pages, MAX_VM_RANGE_BYTES / 16384);
820        assert!(!plan.span_capped);
821        assert!(plan.truncated);
822        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
823    }
824}