ktstr/monitor/arena.rs
1//! Host-side BPF arena page enumeration.
2//!
3//! `BPF_MAP_TYPE_ARENA` (kernel uapi value [`BPF_MAP_TYPE_ARENA`]) is
4//! a sparse, page-granular memory region shared between BPF programs
5//! and userspace. The kernel allocates a 4 GiB-plus-guard
6//! (`KERN_VM_SZ`) `vm_struct` and lazily maps order-0 pages into it
7//! on demand (see `kernel/bpf/arena.c::arena_alloc_pages` and
8//! `arena_vm_fault`); the user-visible window is at
9//! `[arena.user_vm_start .. arena.user_vm_end)`, a 32-bit-addressable
10//! range whose lower 32 bits the BPF JIT uses as the arena pointer
11//! payload. Translation kernel-side is:
12//!
13//! ```text
14//! kern_vm_start = arena->kern_vm->addr + GUARD_SZ/2
15//! kaddr = kern_vm_start + (u32)user_addr
16//! page = vmalloc_to_page(kaddr) // PTE walk on init_mm
17//! ```
18//!
19//! The host-side walker mirrors this: read the arena's `kern_vm`
20//! pointer, dereference to get `vm_struct.addr`, add `GUARD_SZ/2`,
21//! then for each pgoff in `0..max_entries` compute `kaddr` and run
22//! `GuestMem::translate_kva` (the existing PTE walker against
23//! `init_mm`'s page table). `max_entries` is the BPF map's declared
24//! page capacity from `bpf_map_create()` — it is the source of truth
25//! for "how many pages this arena could hold", regardless of whether
26//! the scheduler exposes a userspace mmap (some don't, leaving
27//! `user_vm_start == user_vm_end == 0`). Pages whose translate fails
28//! are simply "not faulted in" — arena maps are sparse by design.
29//!
//! The walker does NOT consult `arena->rt` (the range_tree of free
//! pgoffs). `range_tree` polarity is "set = free" / "clear =
//! allocated", so reading it from a frozen snapshot would only tell
//! the host which pages the kernel *intended* to be allocated, not
//! which are actually mapped. The PTE walk is the source of truth.
35//!
36//! [`BPF_MAP_TYPE_ARENA`]: BPF_MAP_TYPE_ARENA
37
38use anyhow::{Context, Result};
39use serde::{Deserialize, Serialize};
40use std::path::Path;
41
42use btf_rs::Btf;
43
44use super::Kva;
45use super::bpf_map::{BPF_MAP_TYPE_ARENA, BpfMapInfo};
46use super::btf_offsets::{find_struct, load_btf_from_path, member_byte_offset};
47use super::guest::GuestKernel;
48
/// Guest MMU page size, in bytes, that the arena walker must use.
///
/// The kernel's arena code (`arena_alloc_pages`, `arena_vm_fault`)
/// maps pages through `apply_to_page_range` at `PAGE_SIZE`
/// granularity, where `PAGE_SIZE` is the GUEST kernel's granule. The
/// host granule plays no part: ktstr can run a 16 KiB-granule guest
/// on a 4 KiB-granule host (and vice versa), and the arena layout
/// has to follow the guest's view.
///
/// * x86_64 builds (and every other non-aarch64 target): always
///   4 KiB; `tcr_el1` is ignored.
/// * aarch64 builds: decoded from `TCR_EL1.TG1` (bits `\[31:30\]`):
///   - `0b10` → 4 KiB
///   - `0b01` → 16 KiB
///   - `0b11` → 64 KiB
///
/// The reserved encoding `0b00` — which is what an unpopulated
/// `tcr_el1 == 0` decodes to — falls back to 4 KiB. The fallback is
/// conservative: at worst the walker overscans a small arena and
/// visits extra `pgoff` slots that translate to `None`. A non-4 KiB
/// guest whose `tcr_el1` reads zero would be a freeze-path bug
/// elsewhere (the freeze coordinator polls until `tcr_el1` is
/// populated before snapshotting).
fn guest_page_size(tcr_el1: u64) -> u64 {
    const SZ_4K: u64 = 4 << 10;
    const SZ_16K: u64 = 16 << 10;
    const SZ_64K: u64 = 64 << 10;

    // `cfg!` is resolved at compile time, so non-aarch64 builds fold
    // this down to the constant 4 KiB and never look at `tcr_el1`.
    if cfg!(target_arch = "aarch64") {
        let tg1 = (tcr_el1 >> 30) & 0b11;
        match tg1 {
            0b01 => SZ_16K,
            0b11 => SZ_64K,
            // 0b10 is the 4 KiB encoding; 0b00 is reserved and takes
            // the same conservative fallback.
            _ => SZ_4K,
        }
    } else {
        SZ_4K
    }
}
93
/// Half of the arena guard region (`GUARD_SZ / 2` in
/// `kernel/bpf/arena.c`), in bytes, for the given guest page size.
///
/// The kernel defines
/// `GUARD_SZ = round_up(1ull << sizeof_field(struct bpf_insn, off) * 8,
///                      PAGE_SIZE << 1)`.
/// `sizeof_field(struct bpf_insn, off) * 8` is 16, so the value being
/// rounded is `1 << 16 = 65536`, and the result depends only on the
/// page granule:
///
/// | page size | `round_up(65536, PAGE_SIZE << 1)` | half   |
/// |-----------|-----------------------------------|--------|
/// | 4 KiB     | 65536                             | 32768  |
/// | 16 KiB    | 65536                             | 32768  |
/// | 64 KiB    | 131072                            | 65536  |
///
/// `bpf_arena_get_kern_vm_start` returns
/// `arena->kern_vm->addr + GUARD_SZ/2`, so the kernel-side accessible
/// region begins this many bytes past the raw `vm_struct.addr`; the
/// walker adds the same offset when turning a user VA into a kernel VA.
fn guard_half(page_size: u64) -> u64 {
    // Span reachable by the 16-bit `off` field of a BPF instruction.
    const INSN_OFF_SPAN: u64 = 1 << 16;

    let alignment = page_size << 1;
    let guard_sz = INSN_OFF_SPAN.next_multiple_of(alignment);
    guard_sz / 2
}
113
/// Upper bound on the number of pages the walker translates
/// back-to-back (pgoffs `0..MAX_ARENA_PAGES`) for a single arena.
///
/// The kernel reserves `KERN_VM_SZ = SZ_4G + GUARD_SZ` of vmalloc
/// space per arena (~1M pages at a 4 KiB granule), but most arenas
/// touch only a small fraction of it. Limiting the sequential pass to
/// 4096 pages (16 MiB at 4 KiB pages) bounds both report size and
/// freeze-path latency: a full 1M-page walk at ~1 µs per
/// `translate_kva` would burn ~1 s on the freeze hot path.
///
/// When the cap cuts the walk short, [`ArenaSnapshot::truncated`] is
/// set and a sparse stride sweep (see [`MAX_ARENA_STRIDE_PROBES`])
/// samples the remaining pgoffs for mapped pages.
const MAX_ARENA_PAGES: u64 = 1 << 12;
126
/// Target number of evenly spaced probes the walker spends on pgoffs
/// `MAX_ARENA_PAGES..declared_pages` once `declared_pages` exceeds
/// the sequential cap.
///
/// The sweep lets the walker find mapped pages in sparse arenas (for
/// example a scheduler that allocated near the 4 GiB end of its
/// user_vm window) without paying for a full 1M-page `translate_kva`
/// pass. 256 probes × ~1 µs per translate ≈ 0.25 ms, which is
/// negligible on the freeze hot path. Every hit is appended to
/// [`ArenaSnapshot::pages`] after the sequential prefix, so consumers
/// see both.
const MAX_ARENA_STRIDE_PROBES: u64 = 1 << 8;
138
/// Defensive ceiling, in bytes, on the address span the walker will
/// consider for one arena (4 GiB).
///
/// The span is computed as `info.max_entries * page_size` — the BPF
/// map's declared page capacity (see [`snapshot_arena`]).
/// `arena_map_alloc` permits at most 4 GiB worth of pages by design:
/// the BPF JIT addresses arena pointers through the low 32 bits of
/// the user address, so anything wider than `0x1_0000_0000` cannot be
/// a real arena layout (see the `vm_range > SZ_4G` check in
/// `arena_map_alloc`, `kernel/bpf/arena.c`).
///
/// A torn / corrupt `bpf_map.max_entries`, or a freeze-time race
/// against `arena_map_alloc`, could still produce a wild value.
/// Clamping here keeps the walker from deriving an absurd page count
/// and from trying to walk billions of pgoffs (a live-lock on the
/// freeze path).
const MAX_VM_RANGE_BYTES: u64 = 1 << 32;
154
155/// Byte offsets within `struct bpf_arena` and `struct vm_struct`
156/// needed for the host-side arena walker.
157///
158/// Resolved from BTF at startup so the walker doesn't hardcode kernel
159/// layout. Mirrors the [`super::btf_offsets::BpfMapOffsets`] pattern.
160#[derive(Debug, Clone)]
161pub struct BpfArenaOffsets {
162 /// Offset of `kern_vm` (`struct vm_struct *`) within `struct bpf_arena`.
163 pub arena_kern_vm: usize,
164 /// Offset of `user_vm_start` (u64) within `struct bpf_arena`.
165 pub arena_user_vm_start: usize,
166 /// Offset of `addr` (`void *`) within `struct vm_struct`.
167 pub vm_struct_addr: usize,
168}
169
170impl BpfArenaOffsets {
171 /// Parse BTF from a vmlinux ELF and resolve arena field offsets.
172 ///
173 /// Returns Err on kernels whose BTF lacks `bpf_arena` (i.e. arena
174 /// support is not built in) — the caller can treat the absent
175 /// offsets as a signal to skip arena enumeration.
176 ///
177 /// Production callers (the freeze coordinator) reach this code
178 /// via [`Self::from_btf`] on a pre-parsed `&Btf` to amortize the
179 /// ELF parse — `from_vmlinux` stays public as the convenience
180 /// entry point for direct-from-vmlinux callers (CLI tools, unit
181 /// tests against a vmlinux on disk).
182 #[allow(dead_code)]
183 pub fn from_vmlinux(path: &Path) -> Result<Self> {
184 let btf = load_btf_from_path(path).context("btf: open vmlinux")?;
185 Self::from_btf(&btf)
186 }
187
188 /// Resolve arena struct offsets from a pre-loaded BTF object.
189 pub fn from_btf(btf: &Btf) -> Result<Self> {
190 let (bpf_arena, _) = find_struct(btf, "bpf_arena")
191 .context("btf: struct bpf_arena not found (arena unsupported on this kernel?)")?;
192 let arena_kern_vm = member_byte_offset(btf, &bpf_arena, "kern_vm")?;
193 let arena_user_vm_start = member_byte_offset(btf, &bpf_arena, "user_vm_start")?;
194
195 let (vm_struct, _) =
196 find_struct(btf, "vm_struct").context("btf: struct vm_struct not found")?;
197 let vm_struct_addr = member_byte_offset(btf, &vm_struct, "addr")?;
198
199 Ok(Self {
200 arena_kern_vm,
201 arena_user_vm_start,
202 vm_struct_addr,
203 })
204 }
205}
206
/// One mapped arena page captured from guest memory.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ArenaPage {
    /// User-side virtual address (32-bit window starting at
    /// `arena.user_vm_start`). Operators correlate this with the
    /// pointer values they see in BPF program output.
    pub user_addr: u64,
    /// Bytes read from the guest for this page. The walker requests
    /// one guest MMU page: 4 KiB on x86_64 and on aarch64 with
    /// `TCR_EL1.TG1=0b10`; 16 KiB on aarch64 16 KiB-granule kernels
    /// (Apple Silicon style); 64 KiB on aarch64 64 KiB-granule
    /// kernels. The page size is resolved by `guest_page_size`.
    ///
    /// The length can be SHORTER than the page size:
    /// `snapshot_arena` truncates the buffer to the byte count the
    /// guest-memory read actually returned, and drops the page
    /// entirely when that count is zero. Consumers must use
    /// `bytes.len()` rather than assuming a full page.
    pub bytes: Vec<u8>,
}
224
/// Snapshot of one arena map's mapped pages.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ArenaSnapshot {
    /// Mapped pages, in pgoff order (skipped over unmapped pgoffs).
    /// Sequential prefix (pgoffs `0..MAX_ARENA_PAGES`) followed by any
    /// stride-probe hits in the sparse tail (pgoffs sampled across
    /// `MAX_ARENA_PAGES..declared_pages`).
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub pages: Vec<ArenaPage>,
    /// True when `declared_pages` exceeds `MAX_ARENA_PAGES`, i.e. the
    /// walker stopped sequential enumeration at `MAX_ARENA_PAGES`
    /// before covering every declared pgoff. The stride sweep that
    /// follows samples the tail at coarse intervals, so a hit reaches
    /// `pages` even when this flag is set; pgoffs between sampled
    /// positions are still silently skipped.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub truncated: bool,
    /// Total declared page count. Computed from the byte span
    /// `max_entries * page_size` (the BPF map's declared page
    /// capacity, with `page_size` resolved from the guest's
    /// TCR_EL1 via `guest_page_size`), capped at
    /// `MAX_VM_RANGE_BYTES` and divided by `page_size` — not from
    /// the user_vm window. Surfaced alongside `pages.len()` so
    /// consumers can see the allocated-vs-declared ratio.
    pub declared_pages: u64,
    /// True when `max_entries * page_size` exceeded
    /// `MAX_VM_RANGE_BYTES` (4 GiB) and the walker capped the span
    /// before computing `declared_pages`. Indicates a torn / corrupt
    /// `max_entries` or a freeze-time race against initialization;
    /// the rendered pages still come from valid translates, so the
    /// snapshot is usable.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub span_capped: bool,
    /// Kernel-side base of the arena's user_vm window:
    /// `bpf_arena.kern_vm->addr + GUARD_HALF`. Surfaces here so
    /// downstream consumers (notably the `super::sdt_alloc` tree
    /// walker) can translate `__arena` pointers without re-reading
    /// `struct bpf_arena` themselves. `0` when the snapshot bailed
    /// before computing the value (kern_vm_kva NULL, vm_addr NULL,
    /// or any of the upstream translates failed).
    ///
    /// Always serialized — the zero value carries diagnostic
    /// information ("walker reached this point but couldn't compute
    /// the base"), so suppressing it would mask the failure. Mirrors
    /// the policy used for the sibling `declared_pages` field.
    pub kern_vm_start: u64,
    /// User-side base of the arena window: the value of
    /// `bpf_arena.user_vm_start`, the address space the BPF program
    /// (and any captured `__arena` pointer) sees. `[user_vm_start ..
    /// user_vm_start + 4 GiB)` is the kernel-enforced upper bound
    /// (`bpf_arena_alloc_pages` clamps to `SZ_4G`). Consumers use it
    /// to classify a pointer as "lives in this arena" before chasing
    /// into [`Self::pages`].
    ///
    /// `0` when the snapshot bailed before reading
    /// `arena.user_vm_start` (e.g. `arena_pa` translate failed). On
    /// the syscall backend this comes from `bpf_map.map_extra` which
    /// the kernel pins at create time (`lib/arena_map.h` hardcodes
    /// `1<<44` on x86, `1<<32` on aarch64). On the guest-memory
    /// backend it's read directly from
    /// `bpf_arena.user_vm_start` via the resolved offset.
    ///
    /// Always serialized for the same diagnostic reason as
    /// [`Self::kern_vm_start`].
    pub user_vm_start: u64,
}
291
292/// Walk the arena's mapped pages and return a snapshot.
293///
294/// Reads `kern_vm` from `struct bpf_arena` at `info.map_kva`,
295/// dereferences to `vm_struct.addr`, computes
296/// `kern_vm_start = addr + GUARD_HALF`, and for each pgoff in
297/// `0..N` translates `kern_vm_start + (u32)user_addr` via
298/// `GuestMem::translate_kva`. Pages that fail to translate are
299/// "not faulted in" and silently skipped.
300///
301/// The walker is best-effort: any read failure on `bpf_arena` /
302/// `vm_struct` itself yields an empty snapshot rather than an error,
303/// so a corrupt arena can't break the broader failure dump.
304pub fn snapshot_arena(
305 kernel: &GuestKernel,
306 info: &BpfMapInfo,
307 offsets: &BpfArenaOffsets,
308) -> ArenaSnapshot {
309 if info.map_type != BPF_MAP_TYPE_ARENA {
310 return ArenaSnapshot::default();
311 }
312
313 let mem = kernel.mem();
314 let walk = kernel.walk_context();
315 let page_size = guest_page_size(walk.tcr_el1);
316 let guard_half_bytes = guard_half(page_size);
317
318 // bpf_arena embeds bpf_map at offset 0, so map_kva == arena_kva.
319 let arena_kva = info.map_kva;
320 // Translate the arena struct itself — it may be kmalloc'd
321 // (direct map) or vmalloc'd (`bpf_map_area_alloc`).
322 let Some(arena_pa) = super::idr::translate_any_kva(
323 mem,
324 walk.cr3_pa,
325 walk.page_offset,
326 arena_kva,
327 walk.l5,
328 walk.tcr_el1,
329 ) else {
330 return ArenaSnapshot::default();
331 };
332
333 let user_vm_start = mem.read_u64(arena_pa, offsets.arena_user_vm_start);
334 let kern_vm_kva = mem.read_u64(arena_pa, offsets.arena_kern_vm);
335 // Preserve `user_vm_start` even when the kern-side walk fails:
336 // the `MemReader::is_arena_addr` consumer needs it to classify
337 // an `__arena` pointer as in-window (vs. a kernel kptr) so the
338 // Ptr-deref path returns `None` cleanly instead of falling
339 // through to the kernel-kptr cpumask probe. Without the anchor,
340 // an arena pointer would be misread as a slab address — at best
341 // garbage hex, at worst a translate against an unmapped page.
342 if kern_vm_kva == 0 {
343 return ArenaSnapshot {
344 user_vm_start,
345 ..ArenaSnapshot::default()
346 };
347 }
348
349 // vm_struct lives in the kernel's slab/kmalloc area; direct or
350 // vmalloc, so use translate_any_kva.
351 let Some(vm_struct_pa) = super::idr::translate_any_kva(
352 mem,
353 walk.cr3_pa,
354 walk.page_offset,
355 kern_vm_kva,
356 walk.l5,
357 walk.tcr_el1,
358 ) else {
359 return ArenaSnapshot {
360 user_vm_start,
361 ..ArenaSnapshot::default()
362 };
363 };
364 let vm_addr = mem.read_u64(vm_struct_pa, offsets.vm_struct_addr);
365 if vm_addr == 0 {
366 return ArenaSnapshot {
367 user_vm_start,
368 ..ArenaSnapshot::default()
369 };
370 }
371 let kern_vm_start = vm_addr.wrapping_add(guard_half_bytes);
372
373 // max_entries is the create-time page capacity; user_vm_end may
374 // be 0 for arenas without userspace mmap.
375 let plan = ArenaWalkPlan::new((info.max_entries as u64) * page_size, page_size);
376
377 let mut snapshot = ArenaSnapshot {
378 pages: Vec::new(),
379 truncated: plan.truncated,
380 declared_pages: plan.declared_pages,
381 span_capped: plan.span_capped,
382 kern_vm_start,
383 user_vm_start,
384 };
385
386 // Reusable scratch buffer for the per-page read. Sized once at
387 // `page_size` and reused across every captured page: on success
388 // the buffer is moved into `ArenaPage` (one allocation per
389 // captured page is unavoidable since each page owns its bytes),
390 // then a fresh allocation refills the scratch on the next
391 // `resize`. The win is the SKIP path — every translate-failure
392 // or short-read pgoff used to allocate-and-discard a page-sized
393 // zero-initialised buffer; now those paths reuse the existing
394 // scratch capacity. On a sparse arena window (most pgoffs
395 // unmapped) this collapses thousands of doomed allocations into
396 // one. The hot path (freeze coordinator's dump pipeline) used
397 // to dominate freeze-time wallclock on arenas with declared
398 // pages > captured pages.
399 let mut scratch: Vec<u8> = Vec::with_capacity(page_size as usize);
400
401 // Closure: translate one pgoff to a page-content read; push
402 // onto `snapshot.pages` if the translate + read succeed.
403 // Captures `mem`, `walk`, `kern_vm_start`, `user_vm_start`,
404 // `page_size`, and `scratch` (mutable — drained into the
405 // captured page on success).
406 let mut try_capture_page = |pgoff: u64, pages: &mut Vec<ArenaPage>| {
407 // user_vm_start + pgoff*page_size is a 64-bit value, but the
408 // kernel composes the kern-VA from the LOW 32 bits only —
409 // `uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE)`
410 // in arena_alloc_pages — since the user_vm window is capped
411 // at SZ_4G and aligned so the low 32 bits cover the whole
412 // span uniquely. Match the same truncation here.
413 //
414 // pgoff and page_size both originate from BPF map metadata
415 // and the guest TCR_EL1; pgoff*page_size in u64 can overflow
416 // when a corrupt map advertises a huge declared_pages count.
417 // Skip the page on multiplication overflow — wrapping_add on
418 // user_vm_start is intentional (matches kernel truncation),
419 // but only when the multiplicand was correctly computed.
420 let Some(byte_off) = pgoff.checked_mul(page_size) else {
421 return;
422 };
423 let user_addr = user_vm_start.wrapping_add(byte_off);
424 let kaddr = kern_vm_start.wrapping_add(user_addr & 0xFFFF_FFFF);
425 let Some(pa) = mem.translate_kva(walk.cr3_pa, Kva(kaddr), walk.l5, walk.tcr_el1) else {
426 return;
427 };
428 // Translate guarantees a page-aligned PA; bound-check
429 // against guest DRAM size in case a corrupt PTE points
430 // past end-of-DRAM.
431 if pa + page_size > mem.size() {
432 return;
433 }
434 // Resize the reusable scratch to `page_size` and zero-fill.
435 // After a previous capture moved the inner Vec out via
436 // `mem::take`, `scratch` is empty with `page_size` capacity;
437 // resize allocates exactly the new buffer's bytes, but
438 // skipping iterations that hit the early returns above
439 // never reach this line so their alloc is avoided entirely.
440 scratch.clear();
441 scratch.resize(page_size as usize, 0);
442 // `GuestMem::read_bytes` returns the actual byte count copied
443 // (may be short when the PA crosses end-of-DRAM, even after
444 // the bounds check above — DRAM can have non-contiguous
445 // regions). Truncate the buffer to that count so consumers
446 // never see the zero-init tail of an unwritten range as
447 // legitimate page bytes.
448 let n = mem.read_bytes(pa, &mut scratch);
449 scratch.truncate(n);
450 if scratch.is_empty() {
451 return;
452 }
453 // Move the populated buffer into the captured page; the
454 // scratch falls back to empty (capacity preserved) for the
455 // next iteration.
456 pages.push(ArenaPage {
457 user_addr,
458 bytes: std::mem::take(&mut scratch),
459 });
460 };
461
462 // Phase 1: sequential walk of the first MAX_ARENA_PAGES (16 MiB
463 // window) — covers every scheduler today, where allocations cluster
464 // near pgoff 0.
465 for pgoff in 0..plan.sequential_to {
466 try_capture_page(pgoff, &mut snapshot.pages);
467 }
468
469 // Phase 2: stride sweep over the sparse tail. Without this, a
470 // scheduler that allocated even one page near the 4 GiB end of
471 // its user_vm window would be invisible to the dump despite the
472 // truncation flag. Mapped pages discovered here append to
473 // `snapshot.pages` after the sequential prefix and are
474 // discoverable by `user_addr` (the consumer correlates by user
475 // pointer, not pgoff index, so out-of-order pgoffs are fine).
476 if let Some(stride) = plan.stride {
477 let mut pgoff = plan.sequential_to;
478 while pgoff < plan.declared_pages {
479 try_capture_page(pgoff, &mut snapshot.pages);
480 // Saturate at declared_pages on the last step; without
481 // this `pgoff += stride` could skip past the final page
482 // when stride > 1.
483 pgoff = pgoff.saturating_add(stride);
484 }
485 }
486
487 snapshot
488}
489
490/// Pure computation that decides how many pgoffs the walker must
491/// translate (sequential prefix + stride sweep). Extracted so the
492/// span-cap, declared-page, and stride-derivation logic is unit-
493/// testable without mocking a [`super::guest::GuestKernel`].
494#[derive(Debug, Clone, Copy, PartialEq, Eq)]
495struct ArenaWalkPlan {
496 /// Page count the snapshot reports as "declared". Reflects any
497 /// `MAX_VM_RANGE_BYTES` cap.
498 declared_pages: u64,
499 /// True when `MAX_VM_RANGE_BYTES` capped the raw span.
500 span_capped: bool,
501 /// True when `declared_pages > MAX_ARENA_PAGES` and the walker
502 /// will skip pgoffs in the sparse tail.
503 truncated: bool,
504 /// Sequential-walk endpoint: the walker enumerates
505 /// `0..sequential_to` exhaustively.
506 sequential_to: u64,
507 /// Stride for the post-sequential sweep, or `None` when no tail
508 /// remains. `Some(stride)` walks pgoffs
509 /// `sequential_to, sequential_to + stride, ...` until
510 /// `declared_pages`.
511 stride: Option<u64>,
512}
513
514impl ArenaWalkPlan {
515 fn new(raw_span: u64, page_size: u64) -> Self {
516 let span_capped = raw_span > MAX_VM_RANGE_BYTES;
517 let span = raw_span.min(MAX_VM_RANGE_BYTES);
518 let declared_pages = span / page_size;
519 let sequential_to = declared_pages.min(MAX_ARENA_PAGES);
520 let truncated = declared_pages > sequential_to;
521 let stride = if declared_pages > MAX_ARENA_PAGES {
522 let tail_pages = declared_pages - MAX_ARENA_PAGES;
523 // div_ceil so stride * MAX_ARENA_STRIDE_PROBES covers
524 // the whole tail; .max(1) so a tail smaller than
525 // MAX_ARENA_STRIDE_PROBES still walks every remaining
526 // page sequentially.
527 Some(tail_pages.div_ceil(MAX_ARENA_STRIDE_PROBES).max(1))
528 } else {
529 None
530 };
531 Self {
532 declared_pages,
533 span_capped,
534 truncated,
535 sequential_to,
536 stride,
537 }
538 }
539}
540
#[cfg(test)]
mod tests {
    use super::*;

    /// Resolves the arena offsets from a real vmlinux when one is
    /// available and sanity-checks them. Returns early (or reports a
    /// skip) when no vmlinux is found, when only raw BTF is
    /// available, or when the kernel has no arena support.
    #[test]
    fn parse_arena_offsets_from_vmlinux() {
        let path = match crate::monitor::find_test_vmlinux() {
            Some(p) => p,
            None => return,
        };
        // Skip when find_test_vmlinux returns the raw BTF blob — the
        // vmlinux-ELF parse path inside `from_vmlinux` would fail on
        // it, but `from_btf` works directly. Tests in btf_offsets/tests.rs
        // skip the same way for the same reason.
        if path.starts_with("/sys/") {
            crate::report::test_skip("vmlinux is raw BTF (skipping ELF-only path)");
            return;
        }
        let offsets = match BpfArenaOffsets::from_vmlinux(&path) {
            Ok(o) => o,
            Err(e) => {
                // Older kernels without arena support: BTF lacks
                // `struct bpf_arena`. That's a valid configuration —
                // skip rather than fail.
                crate::report::test_skip(format!("arena BTF missing: {e}"));
                return;
            }
        };
        // bpf_arena starts with `struct bpf_map map`, so user_vm_*
        // come AFTER the embedded bpf_map; both must be at nonzero
        // offsets. kern_vm follows them in the kernel layout.
        assert!(
            offsets.arena_user_vm_start > 0,
            "user_vm_start follows embedded bpf_map"
        );
        assert_ne!(
            offsets.arena_kern_vm, offsets.arena_user_vm_start,
            "kern_vm distinct from user_vm_start"
        );
        // vm_struct.addr lives after the 8-byte union (next/llnode)
        // on 64-bit kernels.
        assert!(
            offsets.vm_struct_addr > 0,
            "vm_struct.addr follows the next/llnode union"
        );
    }

    // ---- ArenaWalkPlan: span cap + stride sweep -------------------
    //
    // The plan is a pure function of the raw byte span and the page
    // size. Pin its outputs against representative shapes so the
    // snapshot_arena call site stays tight against:
    //   - tiny arena (single page) — no truncation, no stride
    //   - mid arena (exactly MAX_ARENA_PAGES) — sequential only
    //   - large arena (declared > MAX_ARENA_PAGES) — sequential
    //     prefix + stride sweep
    //   - 4 GiB-cap (raw_span > MAX_VM_RANGE_BYTES) — span_capped flag,
    //     declared_pages clamped to MAX_VM_RANGE_BYTES / page_size
    //   - corrupt span (raw_span = u64::MAX) — capped, flag set,
    //     no overflow

    /// Page size used for ArenaWalkPlan unit tests. Production code
    /// resolves the page size from `guest_page_size` (which decodes
    /// the guest's `TCR_EL1.TG1`); the plan tests pin their math
    /// against an explicit 4 KiB so they exercise the same shapes
    /// regardless of the host the test runs on. Granule-specific
    /// shapes have their own dedicated tests
    /// (`arena_walk_plan_16k_granule_*`).
    const TEST_PAGE_SIZE: u64 = 4096;

    #[test]
    fn arena_walk_plan_constants_sane() {
        // The plan-derivation invariants depend on these constants.
        // Pin them so a future tightening surfaces here, not in
        // snapshot_arena's runtime behavior.
        assert_eq!(MAX_VM_RANGE_BYTES, 0x1_0000_0000);
        assert_eq!(MAX_ARENA_PAGES, 4096);
        assert_eq!(MAX_ARENA_STRIDE_PROBES, 256);
    }

    #[test]
    fn arena_walk_plan_single_page() {
        // Smallest non-empty arena: one page. Sequential walk covers
        // it; no stride needed; no truncation.
        let plan = ArenaWalkPlan::new(TEST_PAGE_SIZE, TEST_PAGE_SIZE);
        assert_eq!(plan.declared_pages, 1);
        assert!(!plan.span_capped);
        assert!(!plan.truncated);
        assert_eq!(plan.sequential_to, 1);
        assert_eq!(plan.stride, None);
    }

    #[test]
    fn arena_walk_plan_exactly_max_arena_pages() {
        // declared == MAX_ARENA_PAGES: still no stride, no truncation.
        // Boundary case: MAX_ARENA_PAGES walks sequentially.
        let plan = ArenaWalkPlan::new(MAX_ARENA_PAGES * TEST_PAGE_SIZE, TEST_PAGE_SIZE);
        assert_eq!(plan.declared_pages, MAX_ARENA_PAGES);
        assert!(!plan.truncated);
        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
        assert_eq!(plan.stride, None);
    }

    #[test]
    fn arena_walk_plan_one_page_past_max() {
        // declared = MAX_ARENA_PAGES + 1: stride mode kicks in for
        // the single tail page; stride must be 1 (every page).
        let plan = ArenaWalkPlan::new((MAX_ARENA_PAGES + 1) * TEST_PAGE_SIZE, TEST_PAGE_SIZE);
        assert_eq!(plan.declared_pages, MAX_ARENA_PAGES + 1);
        assert!(plan.truncated);
        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
        assert_eq!(plan.stride, Some(1));
    }

    #[test]
    fn arena_walk_plan_full_4gib() {
        // Largest legitimate arena: full 4 GiB user_vm window (1M pages).
        // Sequential covers the first MAX_ARENA_PAGES pages; stride
        // sweeps the remaining 1M - 4096 pages with 256 probes ->
        // stride = ceil((1M - 4096) / 256).
        let raw = MAX_VM_RANGE_BYTES;
        let plan = ArenaWalkPlan::new(raw, TEST_PAGE_SIZE);
        assert_eq!(plan.declared_pages, raw / TEST_PAGE_SIZE);
        assert!(!plan.span_capped, "exactly 4 GiB is at the cap, not above");
        assert!(plan.truncated);
        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
        let stride = plan.stride.expect("stride mode for >MAX_ARENA_PAGES");
        let tail = plan.declared_pages - MAX_ARENA_PAGES;
        // Verify stride covers the tail: stride * MAX_ARENA_STRIDE_PROBES
        // must reach `tail`, and the next-smaller stride must not
        // (i.e. stride is the ceiling, not larger).
        assert!(stride * MAX_ARENA_STRIDE_PROBES >= tail);
        assert!((stride - 1) * MAX_ARENA_STRIDE_PROBES < tail);
    }

    #[test]
    fn arena_walk_plan_caps_at_4gib() {
        // Raw span 8 GiB (corrupt struct): span_capped flag set,
        // declared_pages clamped to MAX_VM_RANGE_BYTES / page_size.
        let plan = ArenaWalkPlan::new(2 * MAX_VM_RANGE_BYTES, TEST_PAGE_SIZE);
        assert!(plan.span_capped);
        assert_eq!(plan.declared_pages, MAX_VM_RANGE_BYTES / TEST_PAGE_SIZE);
        assert!(plan.truncated);
        assert!(plan.stride.is_some());
    }

    #[test]
    fn arena_walk_plan_caps_corrupt_u64_max_span() {
        // Pathological: raw_span = u64::MAX. The cap must apply
        // BEFORE the span-to-pages division; without the cap,
        // u64::MAX / page_size = ~4.5 quadrillion pages and the
        // pgoff loop would live-lock.
        let plan = ArenaWalkPlan::new(u64::MAX, TEST_PAGE_SIZE);
        assert!(plan.span_capped);
        assert_eq!(plan.declared_pages, MAX_VM_RANGE_BYTES / TEST_PAGE_SIZE);
        assert!(plan.truncated);
    }

    #[test]
    fn arena_walk_plan_zero_span() {
        // Edge: zero span. snapshot_arena can reach this with
        // max_entries=0; the plan must handle zero spans without
        // panicking or computing nonsense bounds.
        let plan = ArenaWalkPlan::new(0, TEST_PAGE_SIZE);
        assert_eq!(plan.declared_pages, 0);
        assert!(!plan.span_capped);
        assert!(!plan.truncated);
        assert_eq!(plan.sequential_to, 0);
        assert_eq!(plan.stride, None);
    }

    #[test]
    fn arena_walk_plan_stride_visits_every_pgoff_when_short_tail() {
        // tail < MAX_ARENA_STRIDE_PROBES: stride bottoms out at 1, so
        // the sweep walks every remaining page. Verify by simulating
        // the walk and counting positions.
        // declared = MAX_ARENA_PAGES + 50 -> tail = 50 -> stride = 1.
        let plan = ArenaWalkPlan::new((MAX_ARENA_PAGES + 50) * TEST_PAGE_SIZE, TEST_PAGE_SIZE);
        assert_eq!(plan.stride, Some(1));
        let mut pgoff = plan.sequential_to;
        let mut visited = 0u64;
        while pgoff < plan.declared_pages {
            visited += 1;
            pgoff = pgoff.saturating_add(plan.stride.unwrap());
        }
        assert_eq!(visited, 50, "every tail page should be visited");
    }

    #[test]
    fn arena_walk_plan_stride_distributes_probes_in_long_tail() {
        // tail >> MAX_ARENA_STRIDE_PROBES: stride > 1, fewer probes
        // than tail pages. Verify the sweep visits approximately
        // MAX_ARENA_STRIDE_PROBES positions.
        let plan = ArenaWalkPlan::new(MAX_VM_RANGE_BYTES, TEST_PAGE_SIZE); // full 4 GiB
        let mut pgoff = plan.sequential_to;
        let mut visited = 0u64;
        while pgoff < plan.declared_pages {
            visited += 1;
            pgoff = pgoff.saturating_add(plan.stride.unwrap());
        }
        // The sweep visits ceil(tail / stride) positions. Because
        // `stride * MAX_ARENA_STRIDE_PROBES >= tail`, that count
        // cannot exceed MAX_ARENA_STRIDE_PROBES; the assertions
        // below allow one probe of slack on either side.
        assert!(
            visited <= MAX_ARENA_STRIDE_PROBES + 1,
            "visited {visited}, expected ≤ {} probes",
            MAX_ARENA_STRIDE_PROBES + 1
        );
        assert!(
            visited >= MAX_ARENA_STRIDE_PROBES - 1,
            "visited {visited}, expected ≥ {}-ish probes",
            MAX_ARENA_STRIDE_PROBES - 1
        );
    }

    /// `guard_half` mirrors the kernel's `bpf_arena_get_kern_vm_start`
    /// `GUARD_SZ/2` formula. Pin the three legitimate page granules
    /// (4 KiB, 16 KiB, 64 KiB) against the hand-computed values from
    /// the doc comment so a regression in the
    /// `next_multiple_of(page_size << 1)` math surfaces here.
    #[test]
    fn guard_half_matches_kernel_formula() {
        // 4 KiB granule: round_up(65536, 8192) = 65536, /2 = 32768.
        assert_eq!(guard_half(4096), 32768);
        // 16 KiB granule: round_up(65536, 32768) = 65536, /2 = 32768.
        assert_eq!(guard_half(16384), 32768);
        // 64 KiB granule: round_up(65536, 131072) = 131072, /2 = 65536.
        assert_eq!(guard_half(65536), 65536);
    }

    /// `guest_page_size` decodes `TCR_EL1.TG1` (bits [31:30]) into
    /// the granule size on aarch64; on x86_64 it is fixed at 4 KiB
    /// regardless of the input. Pin the three granule encodings plus
    /// the reserved `0b00` fallback so a regression in the bit math
    /// surfaces here.
    #[test]
    fn guest_page_size_decodes_tg1() {
        #[cfg(target_arch = "x86_64")]
        {
            // x86_64: page size is always 4 KiB, regardless of the
            // (ignored) `tcr_el1` argument.
            assert_eq!(guest_page_size(0), 4096);
            assert_eq!(guest_page_size(0b01u64 << 30), 4096);
            assert_eq!(guest_page_size(0b10u64 << 30), 4096);
            assert_eq!(guest_page_size(0b11u64 << 30), 4096);
        }
        #[cfg(target_arch = "aarch64")]
        {
            // TG1=0b10 → 4 KiB
            assert_eq!(guest_page_size(0b10u64 << 30), 4096);
            // TG1=0b01 → 16 KiB (Apple Silicon style)
            assert_eq!(guest_page_size(0b01u64 << 30), 16384);
            // TG1=0b11 → 64 KiB
            assert_eq!(guest_page_size(0b11u64 << 30), 65536);
            // TG1=0b00 (reserved) → conservative 4 KiB fallback
            assert_eq!(guest_page_size(0), 4096);
        }
    }

    /// 16 KiB-granule arena (Apple Silicon kernel build): a single
    /// declared page is 16 KiB. With raw_span = 16384 the plan must
    /// report `declared_pages = 1`, no stride. Pre-fix, `PAGE_SIZE`
    /// was hardcoded to 4096 so 16384 / 4096 = 4 pages — wrong.
    #[test]
    fn arena_walk_plan_16k_granule_single_page() {
        let plan = ArenaWalkPlan::new(16384, 16384);
        assert_eq!(plan.declared_pages, 1);
        assert!(!plan.span_capped);
        assert!(!plan.truncated);
        assert_eq!(plan.sequential_to, 1);
        assert_eq!(plan.stride, None);
    }

    /// 16 KiB-granule arena at the 4 GiB cap: `declared_pages` =
    /// 4 GiB / 16 KiB = 256 K. Pre-fix, the divisor was 4 KiB so
    /// the count would have been 4x too large.
    #[test]
    fn arena_walk_plan_16k_granule_full_cap() {
        let plan = ArenaWalkPlan::new(MAX_VM_RANGE_BYTES, 16384);
        assert_eq!(plan.declared_pages, MAX_VM_RANGE_BYTES / 16384);
        assert!(!plan.span_capped);
        assert!(plan.truncated);
        assert_eq!(plan.sequential_to, MAX_ARENA_PAGES);
    }
}
824}