ktstr/probe/process.rs
1//! Probe skeleton lifecycle: load, attach, run, collect.
2//!
3//! When the pipeline is split into two phases (see [`PhaseBInput`]):
4//! - **Phase A** attaches kprobes + the trigger + kernel fexit before
5//! the scheduler starts; it runs under the initial skeleton load.
6//! - **Phase B** attaches fentry/fexit to the scheduler's BPF
7//! struct_ops callbacks after the scheduler is up and BPF programs
8//! are discoverable.
9//!
10//! In the single-phase path, all probes attach after the scheduler is
11//! up.
12//!
13//! ## Two-phase sync mechanism
14//!
15//! Phase A runs on a probe worker thread. Caller and worker
16//! synchronize via two `Latch`es and one mpsc channel:
17//!
18//! 1. Caller spawns the probe worker, which loads the skeleton and
19//! attaches kprobes + trigger + kernel fexit, then signals the
20//! `probes_ready` latch (see `ready.set()` below). The worker
21//! then enters the ringbuf poll loop.
22//! 2. Caller waits on `probes_ready`. After it fires, the caller
23//! starts the scheduler — the scheduler launches with Phase A
24//! probes already attached, so the trigger and any kprobes that
25//! fire during scheduler init are observed.
26//! 3. After the scheduler is up, the caller discovers BPF programs
27//! by scheduler pid (see `discover_bpf_symbols` /
28//! `expand_bpf_to_kernel_callers`) and sends a [`PhaseBInput`]
29//! over the channel. The Phase B input includes BPF program FDs
30//! held open while the scheduler is alive.
31//! 4. The probe worker's poll loop calls `try_recv` on the channel
32//! every 100 ms; on receipt it attaches BPF fentry/fexit + extra
33//! kprobes for kernel callers, then signals the `done` latch on
34//! the [`PhaseBInput`].
35//! 5. Caller waits on `done` and proceeds to the test scenario with
36//! full instrumentation in place.
37
38use std::sync::Arc;
39use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
40use std::time::Duration;
41
42use super::btf::{BtfFunc, RenderHint, STRUCT_FIELDS};
43use super::stack::StackFunction;
44
45use crate::bpf_skel::types;
46use crate::sync::{Latch, RwLockExt};
47
48/// Cross-thread mirror of the BPF probe's `ktstr_err_exit_detected`
49/// BSS latch. The probe-poll thread reads the BSS slot via volatile
50/// load each iteration and `store`s the observed value here; scenario
51/// code calls [`sched_exit_kind`] from any thread to read the current
52/// classification synchronously without owning the skeleton.
53///
54/// Encoded as `u32` for `AtomicU32`-friendliness:
55/// - `0` (initial / probe-not-yet-armed / scheduler still alive):
56/// [`SchedExitKind::Unknown`].
57/// - `1` ([`SchedExitKind::Clean`]) — probe armed, BSS latch read as
58/// `0` after at least one poll iteration. Means the scheduler is
59/// either still alive (latch never set) or exited via the clean
60/// `SCX_EXIT_NONE` path.
61/// - `2` ([`SchedExitKind::Crashed`]) — probe armed, BSS latch
62/// observed at `!= 0`. Set by the BPF trace_sched_ext_exit handler
63/// under any non-clean kernel exit (`SCX_EXIT_ERROR`,
64/// `SCX_EXIT_ERROR_STALL`, watchdog kick, BPF-side error).
65///
66/// 0 → 1/2 transitions are monotonic for a single test run; once the
67/// kernel emits the exit trace, the latch never clears. Cross-thread
68/// observation uses Release / Acquire so the scenario-side load sees
69/// every prior probe-side store.
70pub(crate) static PROBE_SCHED_EXIT_STATE: AtomicU32 = AtomicU32::new(0);
71
72const PROBE_EXIT_STATE_CLEAN: u32 = 1;
73const PROBE_EXIT_STATE_CRASHED: u32 = 2;
74
75/// Classification of the scheduler's exit observed by the probe
76/// poll thread. Mirrors the BSS latch in
77/// [`PROBE_SCHED_EXIT_STATE`].
78#[derive(Debug, Clone, Copy, PartialEq, Eq)]
79pub enum SchedExitKind {
80 /// Probe has not observed the scheduler's exit yet — either the
81 /// scheduler is still alive, the probe never armed for this run,
82 /// or the poll thread has not completed a single iteration since
83 /// the prior reset. Callers gating clean-vs-crash classification
84 /// on a known-dead scheduler treat `Unknown` as a probe-coverage
85 /// gap, not as evidence of either outcome.
86 Unknown,
87 /// Probe armed AND completed at least one observation with the
88 /// BSS latch unset. Scheduler is either still alive or exited
89 /// cleanly via the `SCX_EXIT_NONE` path that does not latch the
90 /// error flag.
91 Clean,
92 /// Probe armed AND observed the BSS latch set. Kernel emitted a
93 /// non-clean `trace_sched_ext_exit` event (error, stall,
94 /// watchdog kick, BPF-side error) before this read.
95 Crashed,
96}
97
98/// Read the current probe-observed scheduler exit classification.
99/// Synchronous, cross-thread-safe (Acquire load mirrors the probe's
100/// Release stores). Returns [`SchedExitKind::Unknown`] when the
101/// probe has not yet completed its first poll iteration.
102pub fn sched_exit_kind() -> SchedExitKind {
103 match PROBE_SCHED_EXIT_STATE.load(Ordering::Acquire) {
104 PROBE_EXIT_STATE_CLEAN => SchedExitKind::Clean,
105 PROBE_EXIT_STATE_CRASHED => SchedExitKind::Crashed,
106 _ => SchedExitKind::Unknown,
107 }
108}
109
110/// Override the cross-thread exit-state mirror. The probe poll thread
111/// is the production writer (it mirrors the BSS `ktstr_err_exit_detected`
112/// latch into [`PROBE_SCHED_EXIT_STATE`]); this lets host-side tests,
113/// where no BPF probe runs, simulate an observed crash latch. Pair a
114/// `Crashed` set with a reset to [`SchedExitKind::Unknown`] so neighbor
115/// tests start from a clean mirror.
116#[cfg(test)]
117pub(crate) fn set_probe_sched_exit_state(kind: SchedExitKind) {
118 let encoded = match kind {
119 SchedExitKind::Unknown => 0,
120 SchedExitKind::Clean => PROBE_EXIT_STATE_CLEAN,
121 SchedExitKind::Crashed => PROBE_EXIT_STATE_CRASHED,
122 };
123 PROBE_SCHED_EXIT_STATE.store(encoded, Ordering::Release);
124}
125
126/// Input for Phase B probe attachment (BPF fentry/fexit).
127///
128/// Sent via mpsc channel after the scheduler starts and BPF programs
129/// are discoverable. Phase A (kprobes + trigger + kernel fexit) runs
130/// before the scheduler; Phase B attaches fentry/fexit to the
131/// scheduler's BPF struct_ops callbacks.
132pub struct PhaseBInput {
133 /// BPF functions and kernel callers discovered from the running
134 /// scheduler. Includes both BPF callbacks (for fentry) and their
135 /// kernel-side callers from `expand_bpf_to_kernel_callers` (for
136 /// additional kprobes).
137 pub functions: Vec<StackFunction>,
138 /// Pre-opened BPF program FDs keyed by prog_id.
139 pub bpf_prog_fds: std::collections::HashMap<u32, i32>,
140 /// BTF-resolved function signatures for BPF callbacks and kernel callers.
141 pub btf_funcs: Vec<BtfFunc>,
142 /// Signaled by the probe worker thread once Phase B attachment
143 /// completes so the dispatch path can proceed past its wait.
144 pub done: Arc<Latch>,
145 /// Starting func_idx for Phase B functions. Must equal the number
146 /// of functions in Phase A to avoid index collisions in the shared
147 /// `func_meta_map` and `probe_data` maps.
148 pub func_idx_offset: u32,
149}
150
151/// Ring buffer event type for the trigger (matches `EVENT_TRIGGER`
152/// in `intf.h`). Currently the only record type emitted on the
153/// `ktstr_events` ringbuf — `EVENT_SCX_EVENT` was removed alongside
154/// the `tp_btf/sched_ext_event` BPF handler.
155const EVENT_TRIGGER: u32 = 2;
156
157/// Maximum string length carried in a probe_event entry (matches
158/// `MAX_STR_LEN` in `intf.h`). Used to size the `RbEvent.str_val`
159/// field for byte-level wire compatibility with `struct probe_event`
160/// in `intf.h`; the Rust dispatch path leaves it zeroed because
161/// the only producer that populated it (the removed
162/// `tp_btf/sched_ext_event` handler) is gone.
163const MAX_STR_LEN: usize = 64;
164
165/// Pipeline diagnostics from a probe run.
166///
167/// Tracks how many functions/events survived each stage so users can
168/// see WHERE data is being lost (filter, attach, capture, stitch).
169#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
170pub struct ProbeDiagnostics {
171 /// Kernel functions resolved to IPs.
172 pub kprobe_resolved: u32,
173 /// Kernel functions that failed IP resolution.
174 pub kprobe_resolve_failed: Vec<String>,
175 /// Kprobes successfully attached.
176 pub kprobe_attached: u32,
177 /// Kprobes that failed to attach (name, error).
178 pub kprobe_attach_failed: Vec<(String, String)>,
179 /// BPF functions with valid prog IDs for fentry.
180 pub fentry_candidates: u32,
181 /// Fentry probes successfully attached.
182 pub fentry_attached: u32,
183 /// Fentry probes that failed (name, error).
184 pub fentry_attach_failed: Vec<(String, String)>,
185 /// Total keys in probe_data map at readout.
186 pub probe_data_keys: u32,
187 /// Keys with unmatched IPs (no func_meta entry).
188 pub probe_data_unmatched_ips: u32,
189 /// Events read from probe_data before stitching.
190 pub events_before_stitch: u32,
191 /// Events surviving tptr+time stitching.
192 pub events_after_stitch: u32,
193 /// Whether the trigger fired.
194 pub trigger_fired: bool,
195 /// Which trigger mechanism attached ("tp_btf").
196 pub trigger_type: String,
197 /// Error from tp_btf/sched_ext_exit attach failure.
198 #[serde(default, skip_serializing_if = "Option::is_none")]
199 pub trigger_attach_error: Option<String>,
200 /// Panic payload from the guest-side probe-collection thread
201 /// when its `JoinHandle::join()` returned `Err`. `None` on a
202 /// clean run (thread exited normally — events may still be
203 /// empty if the trigger never fired). `Some(payload)`
204 /// distinguishes "the probe thread crashed before producing
205 /// events" from "the probe thread ran cleanly and observed no
206 /// trigger" — the COM2 payload's `events: []` is otherwise
207 /// indistinguishable between those two cases. Any consumer of
208 /// the payload (host harness, render layer, downstream test
209 /// verdict) MUST treat `Some(_)` as a failure even when
210 /// `trigger.fired == false` and `events` is empty.
211 #[serde(default, skip_serializing_if = "Option::is_none")]
212 pub host_thread_panic: Option<String>,
213 /// BPF-side kprobe fire count (cross-CPU sum of the
214 /// `KTSTR_PCPU_PROBE_COUNT` slot in `ktstr_pcpu_counters`).
215 pub bpf_kprobe_fires: u64,
216 /// BPF-side kprobe commit count (cross-CPU sum of the
217 /// `KTSTR_PCPU_KPROBE_RETURNS` slot).
218 /// `bpf_kprobe_fires - bpf_kprobe_returns` is the number of
219 /// kprobe fires that bailed before pushing into `probe_data`
220 /// (meta-map miss or scratch-slot miss).
221 pub bpf_kprobe_returns: u64,
222 /// BPF-side trigger fire count (cross-CPU sum of the
223 /// `KTSTR_PCPU_TRIGGER_COUNT` slot).
224 pub bpf_trigger_fires: u64,
225 /// BPF-side func_meta_map misses (cross-CPU sum of the
226 /// `KTSTR_PCPU_META_MISS` slot — IP not found in map).
227 pub bpf_meta_misses: u64,
228 /// IPs that missed func_meta_map lookup (from BSS ktstr_miss_log).
229 pub bpf_miss_ips: Vec<u64>,
230 /// BPF-side `bpf_ringbuf_reserve` failures inside the trigger
231 /// handler (cross-CPU sum of `KTSTR_PCPU_RINGBUF_DROPS`).
232 /// Non-zero means the userspace consumer fell behind on the
233 /// events ringbuf, so auto-repro will see a missing trigger
234 /// event even though the scheduler did fire.
235 pub bpf_ringbuf_drops: u64,
236 /// Nanosecond timestamp captured by the BPF trigger handler on
237 /// the first error-class `sched_ext_exit` (from BSS
238 /// ktstr_last_trigger_ts). 0 when no error-class exit fired.
239 pub bpf_first_trigger_ns: u64,
240 /// `kind` argument captured by the BPF trigger handler on the
241 /// first error-class `sched_ext_exit` (from BSS
242 /// `ktstr_exit_kind_snap`). 0 when no error-class exit fired,
243 /// otherwise one of the [`SCX_EXIT_*`](super::scx_defs) values.
244 /// Used by the host renderer to disambiguate "trigger fired with
245 /// kind=STALL/ERROR (no causal task; events suppressed)" from
246 /// "trigger never fired" when the post-stitch event count is 0.
247 pub bpf_exit_kind_snap: u32,
248 /// `true` when the readout phase reached the no-causal-tptr
249 /// branch and emitted events grouped by frequency rather than
250 /// stitched against a real trigger task pointer. Set in
251 /// [`run_probe_skeleton`] when the trigger fired with
252 /// `args[0] == 0` (kind=STALL or generic ERROR) or never fired
253 /// at all but at least one captured kprobe event had a non-zero
254 /// task pointer. Surfaced by the host renderer as
255 /// `events: ... — trigger absent, grouped by frequency` so the
256 /// operator does not misread the candidate chain as a verified
257 /// stitch.
258 pub stitch_fallback_used: bool,
259 /// Cumulative count of `tp_btf/sched_switch +
260 /// sched_migrate_task + sched_wakeup` records committed into
261 /// the dedicated `timeline_events` ringbuf by the timeline
262 /// handlers (cross-CPU sum of the
263 /// `KTSTR_PCPU_TIMELINE_COUNT` slot). Zero before any of
264 /// those tracepoints fire; otherwise grows continuously
265 /// while the probe runs. Combined with `bpf_timeline_drops`
266 /// it lets an operator tell whether a failure-time drain
267 /// saw the full window or only the tail.
268 pub bpf_timeline_count: u64,
269 /// `bpf_ringbuf_reserve` failures across the three timeline
270 /// tracepoint handlers (sched_switch / sched_migrate_task /
271 /// sched_wakeup), aggregated as cross-CPU sum of the
272 /// `KTSTR_PCPU_TIMELINE_DROPS` slot. Each drop is one new
273 /// event lost — the ring's existing contents are NOT evicted
274 /// on overflow, so the drain on failure recovers the OLDEST
275 /// captured events first.
276 pub bpf_timeline_drops: u64,
277}
278
279/// Structured probe event captured by the BPF skeleton.
280///
281/// One per (function, task_ptr) combination. `fields` contains BTF-resolved
282/// struct field values keyed as `"param:struct.field"` (from
283/// [`build_field_keys`]). Events are sorted by `ts` and stitched by
284/// `task_struct` pointer before output.
285#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
286pub struct ProbeEvent {
287 /// Index into the run's function list (matches the slot used in
288 /// the shared `func_meta_map`).
289 pub func_idx: u32,
290 /// `task_struct` pointer for the thread that triggered the probe;
291 /// used to stitch entry/exit pairs.
292 pub task_ptr: u64,
293 /// Nanosecond timestamp captured at entry.
294 pub ts: u64,
295 /// First six callee arguments captured verbatim from the call
296 /// site (after convention lowering).
297 pub args: [u64; 6],
298 /// BTF-resolved struct field values decoded post-hoc by the
299 /// caller; each entry is `(field_key, raw_value)`.
300 pub fields: Vec<(String, u64)>,
301 /// Kernel stack trace captured at entry, most recent frame first.
302 pub kstack: Vec<u64>,
303 /// Optional UTF-8 string associated with the event (e.g. a
304 /// scheduler-provided exit reason).
305 #[serde(default, skip_serializing_if = "Option::is_none")]
306 pub str_val: Option<String>,
307 /// Post-mutation field values captured by fexit.
308 /// Same field keys as `fields`, paired by index.
309 #[serde(default, skip_serializing_if = "Vec::is_empty")]
310 pub exit_fields: Vec<(String, u64)>,
311 /// Timestamp when fexit fired.
312 #[serde(default, skip_serializing_if = "Option::is_none")]
313 pub exit_ts: Option<u64>,
314}
315
316/// Parse `/proc/kallsyms` into a `name -> address` map. Returns `None`
317/// when the file is unreadable (expected outside a privileged context)
318/// OR when every parsed entry has a zero address (see
319/// [`accept_kallsyms_map`] for the kptr_restrict rationale).
320fn load_kallsyms() -> Option<std::collections::HashMap<String, u64>> {
321 let raw = std::fs::read_to_string("/proc/kallsyms").ok()?;
322 accept_kallsyms_map(parse_kallsyms(&raw))
323}
324
325/// Return `Some(map)` when at least one entry has a non-zero address,
326/// otherwise `None`. The all-zero case is what the kernel emits under
327/// `kernel.kptr_restrict=2` for non-CAP_SYSLOG callers — the file is
328/// readable, all symbol names are present, but every line carries
329/// `0000000000000000` for its address. Caching such a map would
330/// poison every later [`resolve_func_ip`] lookup with `Some(0)`,
331/// masking the unprivileged state from the retry-after-sudo path;
332/// treating it as a load failure instead lets the next caller (after
333/// [`RETRY_MIN_INTERVAL`]) try again under the new privilege level.
334fn accept_kallsyms_map(
335 map: std::collections::HashMap<String, u64>,
336) -> Option<std::collections::HashMap<String, u64>> {
337 if !map.values().any(|&a| a != 0) {
338 tracing::warn!(
339 entries = map.len(),
340 "/proc/kallsyms parsed with zero addresses only — kptr_restrict \
341 likely active; declining to cache",
342 );
343 return None;
344 }
345 Some(map)
346}
347
348/// Parse kallsyms-format text (one `HEX TYPE NAME ...` line per
349/// symbol) into a `name -> address` map. Extracted from
350/// [`load_kallsyms`] so unit tests can exercise the parser without
351/// touching `/proc/kallsyms`, which is usually unreadable in the
352/// unprivileged contexts the crate runs under.
353///
354/// Skipped lines (silently, without affecting other symbols):
355/// - lines with fewer than 3 whitespace-separated tokens (addr,
356/// type, name — all three are required; the type column is
357/// accepted but ignored)
358/// - lines whose first token is not a hex-parseable `u64`
359///
360/// A permanently-empty map is a valid return value — callers treat
361/// it as "no symbols found" rather than an error.
362fn parse_kallsyms(raw: &str) -> std::collections::HashMap<String, u64> {
363 // No pre-scan — HashMap grows from empty in a single pass over
364 // `raw`.
365 let mut map = std::collections::HashMap::new();
366 for line in raw.lines() {
367 let mut parts = line.split_whitespace();
368 let Some(addr) = parts.next() else { continue };
369 let _ty = parts.next();
370 let Some(sym) = parts.next() else { continue };
371 let Ok(addr) = u64::from_str_radix(addr, 16) else {
372 continue;
373 };
374 map.insert(sym.to_string(), addr);
375 }
376 map
377}
378
379/// Resolve a kernel function name to its address via /proc/kallsyms.
380///
381/// The parsed `name -> address` map is cached on first successful load
382/// so later lookups avoid re-reading and re-splitting ~200k lines.
383/// Callers that resolve many functions in a batch (auto-probe attach,
384/// probe-stack load) drop from O(N\*M) line scans to O(N) hash lookups.
385///
386/// A failed load (unreadable `/proc/kallsyms` — typical for
387/// unprivileged processes where the file is either missing or
388/// returns zeroed addresses) is rate-limited to one retry per
389/// `RETRY_MIN_INTERVAL` (1 s); calls within that window return
390/// `None` immediately. This matters both for performance — a caller
391/// resolving N symbols under a permanently unreadable
392/// `/proc/kallsyms` pays one load attempt, not N — and for
393/// privilege-escalation correctness, where a test harness that
394/// re-execs under `sudo` after the first miss still sees a retry
395/// within seconds.
396pub fn resolve_func_ip(name: &str) -> Option<u64> {
397 use std::sync::{OnceLock, RwLock};
398 use std::time::Instant;
399 static CACHE: OnceLock<RwLock<Option<std::collections::HashMap<String, u64>>>> =
400 OnceLock::new();
401 // Rate-limit the load retry on chronic failure. Without a
402 // floor, a caller that resolves N symbols under a permanently
403 // unreadable /proc/kallsyms triggers N * (read + parse) of a
404 // ~10 MB file. The floor turns that into one retry per window
405 // and returns `None` fast in between.
406 static LAST_LOAD_ATTEMPT: OnceLock<RwLock<Option<Instant>>> = OnceLock::new();
407
408 let slot = CACHE.get_or_init(|| RwLock::new(None));
409 // Fast path: take the read lock when the cache is populated.
410 // Post-load the lookup is read-only and batches resolving many
411 // symbols contend only on the shared read lock.
412 {
413 let read = slot.read_unpoisoned();
414 if let Some(map) = read.as_ref() {
415 return map.get(name).copied();
416 }
417 }
418 // Optional fast-decline: when the retry clock rules the caller
419 // out, avoid the write-lock acquire entirely. This is a
420 // performance hint only — correctness is enforced by the
421 // re-check below under the write lock, so a concurrent racer
422 // that slips past this gate still gets serialized.
423 let last_slot = LAST_LOAD_ATTEMPT.get_or_init(|| RwLock::new(None));
424 {
425 let last = last_slot.read_unpoisoned();
426 if let Some(t) = *last
427 && t.elapsed() < RETRY_MIN_INTERVAL
428 {
429 return None;
430 }
431 }
432 // Slow path: escalate to write lock to populate. Re-check both
433 // the cache and the retry clock under the write lock so N
434 // concurrent first-callers don't stampede into N serialized
435 // loads: only the winner gets past the timestamp gate, everyone
436 // else observes `*last = Some(now)` and bails.
437 let mut write = slot.write_unpoisoned();
438 if write.is_none() {
439 let mut last = last_slot.write_unpoisoned();
440 if last.is_none_or(|t| t.elapsed() >= RETRY_MIN_INTERVAL) {
441 *write = load_kallsyms();
442 *last = Some(Instant::now());
443 }
444 }
445 write.as_ref()?.get(name).copied()
446}
447
448/// Minimum interval between retry attempts when `/proc/kallsyms` is
449/// unreadable; see [`resolve_func_ip`] for the rationale.
450const RETRY_MIN_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1);
451
452/// Build a `func_idx -> task_struct_param_idx` map for stitching by
453/// task pointer. Resolves each function's task-struct argument
454/// position from [`super::stack::BPF_OP_CALLERS`] first, falling back
455/// to the BTF param list (Phase A `btf_funcs` chained with Phase B
456/// `phase_b_btf`).
457///
458/// Entries with `pidx >= 6` are dropped with a warn rather than
459/// stored: the stitch code reads `ProbeEvent::args[pidx]` against a
460/// fixed-size `[u64; 6]` (matching the BPF-side capture limit), so
461/// any larger index would panic. A function with its task_struct
462/// past arg-6 simply cannot be stitched here — the BPF probe never
463/// captured that arg.
464fn build_task_param_idx(
465 func_ips: &[(u32, u64, String)],
466 btf_funcs: &[BtfFunc],
467 phase_b_btf: &[BtfFunc],
468) -> std::collections::HashMap<u32, usize> {
469 func_ips
470 .iter()
471 .filter_map(|(idx, _, name)| {
472 // BPF_OP_CALLERS: (op_fragment, kernel_caller, task_arg_idx)
473 let pidx = if let Some((_, _, tidx)) = super::stack::BPF_OP_CALLERS
474 .iter()
475 .find(|(_, caller, _)| *caller == name.as_str())
476 {
477 *tidx as usize
478 } else {
479 // Fallback: BTF params with task_struct
480 let btf = btf_funcs
481 .iter()
482 .chain(phase_b_btf.iter())
483 .find(|f| f.name == *name)?;
484 btf.params
485 .iter()
486 .position(|p| p.struct_name.as_deref() == Some("task_struct"))?
487 };
488 if pidx >= 6 {
489 tracing::warn!(
490 func = %name,
491 pidx,
492 "task_struct param index out of args[6] bounds — \
493 skipping stitch entry",
494 );
495 return None;
496 }
497 Some((*idx, pidx))
498 })
499 .collect()
500}
501
502/// Populate a `func_meta` with field specs from BTF-resolved offsets.
503/// Shared between kprobe and fentry paths.
504///
505/// Invariant: `meta.nr_field_specs = max(field_idx) + 1`, NOT the count
506/// of specs. The BPF side writes `entry.fields[field_idx]` positionally,
507/// and the Rust side reads `entry.fields[..nr_field_specs]` positionally
508/// against [`build_field_keys`] (which includes skipped fields). A
509/// reorder of either loop that turns this into a count would silently
510/// mismatch keys to values.
511fn populate_field_specs(meta: &mut types::func_meta, field_specs: &[super::btf::FieldSpec]) {
512 let n = field_specs.len().min(16);
513 let max_fidx = field_specs
514 .iter()
515 .take(n)
516 .map(|fs| fs.field_idx)
517 .max()
518 .map(|m| m + 1)
519 .unwrap_or(0);
520 meta.nr_field_specs = max_fidx.min(16);
521 for fs in field_specs.iter().take(n) {
522 let slot = fs.field_idx as usize;
523 if slot < 16 {
524 meta.specs[slot] = types::field_spec {
525 param_idx: fs.param_idx,
526 offset: fs.offset,
527 size: fs.size,
528 field_idx: fs.field_idx,
529 ptr_offset: fs.ptr_offset,
530 };
531 }
532 }
533}
534
535/// Build field key names for a function based on its BTF info.
536///
537/// Returns a vec mapping `field_idx` to an output key name. Format:
538/// - Known struct param: `"p:task_struct.pid"`
539/// - Auto-discovered BPF struct: `"ctx:task_ctx.field_a"`
540/// - Scalar param: `"flags:val.flags"`
541///
542/// Processes at most 6 params (fentry/kprobe register limit) and
543/// at most 16 fields total (matching `MAX_FIELDS` in intf.h).
544///
545/// Invariant: keys are emitted in the same order [`populate_field_specs`]
546/// consumes `field_specs`, so `entry.fields[i]` maps to the i-th key.
547fn build_field_keys(btf_func: &BtfFunc) -> Vec<(String, RenderHint)> {
548 let mut keys = Vec::new();
549 let mut field_idx: u32 = 0;
550
551 let max_params = btf_func.params.len().min(6);
552 for param in &btf_func.params[..max_params] {
553 if let Some(ref sname) = param.struct_name {
554 if let Some((_, fields)) = STRUCT_FIELDS.iter().find(|(s, _)| *s == sname) {
555 for (_, key) in *fields {
556 // Known struct fields use dedicated decoders in
557 // decode.rs — hint is irrelevant (Default/Hex).
558 keys.push((format!("{}:{}.{}", param.name, sname, key), RenderHint::Hex));
559 field_idx += 1;
560 if field_idx >= 16 {
561 break;
562 }
563 }
564 }
565 } else if !param.auto_fields.is_empty() {
566 let tname = param.type_name.as_deref().unwrap_or("void");
567 for (fname, _, hint) in ¶m.auto_fields {
568 keys.push((format!("{}:{}.{}", param.name, tname, fname), *hint));
569 field_idx += 1;
570 if field_idx >= 16 {
571 break;
572 }
573 }
574 } else if !param.is_ptr {
575 keys.push((
576 format!("{}:val.{}", param.name, param.name),
577 RenderHint::Hex,
578 ));
579 field_idx += 1;
580 }
581 }
582
583 keys
584}
585
586/// Detect which param (if any) is a char * string.
587/// Uses BTF type detection first, then name heuristic as fallback.
588/// Returns 0xff if none found.
589fn detect_str_param(btf_func: &BtfFunc) -> u8 {
590 let max = btf_func.params.len().min(6);
591 // BTF-based: check is_string_ptr flag set by parse_btf_functions.
592 for (i, p) in btf_func.params[..max].iter().enumerate() {
593 if p.is_string_ptr {
594 return i as u8;
595 }
596 }
597 // Name heuristic fallback.
598 for (i, p) in btf_func.params[..max].iter().enumerate() {
599 if !p.is_ptr || p.struct_name.is_some() {
600 continue;
601 }
602 let n = p.name.as_str();
603 if matches!(n, "fmt" | "msg" | "str" | "reason" | "buf" | "s")
604 || n.contains("str")
605 || n.contains("msg")
606 || n.contains("fmt")
607 {
608 return i as u8;
609 }
610 }
611 0xff
612}
613
614/// Pre-open BPF program FDs while the scheduler is alive.
615///
616/// Returns a map from `bpf_prog_id` to owned fd. Holding these FDs
617/// keeps the BPF programs alive via kernel refcounting even after the
618/// scheduler exits. Must be called before the test function runs
619/// (which may crash the scheduler).
620pub fn open_bpf_prog_fds(functions: &[StackFunction]) -> std::collections::HashMap<u32, i32> {
621 let mut fds = std::collections::HashMap::new();
622 for f in functions {
623 if let Some(prog_id) = f.bpf_prog_id {
624 let fd = unsafe { libbpf_rs::libbpf_sys::bpf_prog_get_fd_by_id(prog_id) };
625 if fd >= 0 {
626 fds.insert(prog_id, fd);
627 }
628 }
629 }
630 fds
631}
632
633/// `&ProgramMut<'_>` newtype that asserts thread-shared access is
634/// safe for [`libbpf_rs::ProgramMut::attach_kprobe`].
635///
636/// `ProgramMut` holds a `NonNull<bpf_program>` which keeps it
637/// `!Sync` at the Rust type-system level. Concurrent attach is
638/// nonetheless sound:
639/// - `bpf_program__attach_kprobe_opts` (the libbpf C path
640/// `attach_kprobe` calls) takes `const struct bpf_program *prog` —
641/// it does not mutate the program through that pointer.
642/// - Each attach call creates an independent `perf_event_open` fd
643/// plus an independent `BPF_LINK_CREATE` syscall; those resources
644/// are not shared with the program object.
645/// - The only program-adjacent state the path touches is the global
646/// feature-detection cache via `kernel_supports` →
647/// `feat_supported`, which uses `READ_ONCE` / `WRITE_ONCE` and is
648/// idempotent across racing readers (see
649/// `libbpf-sys/libbpf/src/features.c::feat_supported`).
650/// All threads call only `attach_kprobe` on the inner reference; no
651/// mutating method is invoked here.
652///
653/// `Send` is paired with `Sync` because `std::thread::scope` requires
654/// the captured reference's referent to be `Sync` for `&_` to be
655/// `Send`.
656struct AttachableProgRef<'a, 'obj> {
657 inner: &'a libbpf_rs::ProgramMut<'obj>,
658}
659// SAFETY: see the type-level doc.
660unsafe impl Send for AttachableProgRef<'_, '_> {}
661// SAFETY: see the type-level doc.
662unsafe impl Sync for AttachableProgRef<'_, '_> {}
663
664/// Attach `prog` as a kprobe to every name in `func_names`,
665/// parallelising across worker threads.
666///
667/// Each attach is an independent `perf_event_open` + `BPF_LINK_CREATE`
668/// syscall pair. Sequentialised, the per-attach syscall round-trip
669/// dominates the auto-repro setup phase when the crash backtrace
670/// names many kernel functions. Spawning a small worker pool lets
671/// the kernel run the attaches concurrently and shrinks the total
672/// wall-clock cost of the loop to roughly `total / parallelism`
673/// (bounded by kernel-side serialisation inside `perf_event_open` for
674/// kprobe registration).
675///
676/// Worker count is `min(func_names.len(), 8)` — a fixed cap that
677/// matches the typical small ktstr backtrace width while avoiding
678/// thread-spawn overhead when there are only a handful of probes.
679/// Returns one `(name, Result<Link, libbpf_rs::Error>)` entry per
680/// input in input order, so the caller can populate
681/// [`ProbeDiagnostics::kprobe_attach_failed`] / `links` with the
682/// same shape as the prior sequential loop.
683fn parallel_attach_kprobes<'a, 'obj>(
684 prog: &'a libbpf_rs::ProgramMut<'obj>,
685 func_names: &[String],
686) -> Vec<(String, libbpf_rs::Result<libbpf_rs::Link>)> {
687 if func_names.is_empty() {
688 return Vec::new();
689 }
690
691 let prog_ref = AttachableProgRef { inner: prog };
692 // `min(N, 8)` balances syscall parallelism against thread-spawn
693 // overhead. Empirically the kernel kprobe registration path
694 // serialises on `kprobe_mutex` so going wider yields diminishing
695 // returns; the cap keeps us comfortably under that wall.
696 const MAX_WORKERS: usize = 8;
697 let workers = func_names.len().min(MAX_WORKERS);
698
699 // Slot the function names into round-robin per-worker buckets
700 // so each thread owns a disjoint subset. Index-tagged so the
701 // output preserves input order regardless of which worker
702 // finishes first.
703 let mut buckets: Vec<Vec<(usize, String)>> = (0..workers).map(|_| Vec::new()).collect();
704 for (i, name) in func_names.iter().enumerate() {
705 buckets[i % workers].push((i, name.clone()));
706 }
707
708 let mut results: Vec<Option<(String, libbpf_rs::Result<libbpf_rs::Link>)>> =
709 (0..func_names.len()).map(|_| None).collect();
710
711 std::thread::scope(|s| {
712 let mut handles = Vec::with_capacity(workers);
713 for bucket in buckets {
714 if bucket.is_empty() {
715 continue;
716 }
717 let prog_ref = &prog_ref;
718 handles.push(s.spawn(move || {
719 let mut out: Vec<(usize, String, libbpf_rs::Result<libbpf_rs::Link>)> =
720 Vec::with_capacity(bucket.len());
721 for (i, name) in bucket {
722 let r = prog_ref.inner.attach_kprobe(false, &name);
723 out.push((i, name, r));
724 }
725 out
726 }));
727 }
728 for h in handles {
729 // Worker panics propagate as the join Err — re-panic on
730 // the main thread to surface bugs rather than silently
731 // dropping attach results.
732 let out = h.join().expect("kprobe attach worker panicked");
733 for (i, name, r) in out {
734 results[i] = Some((name, r));
735 }
736 }
737 });
738
739 results
740 .into_iter()
741 .map(|o| o.expect("every input slot must be filled by exactly one worker"))
742 .collect()
743}
744
745/// Attach the fentry program in slot 0..=3 on the fentry skeleton.
746///
747/// The fentry skeleton exposes four indexed programs
748/// (`ktstr_fentry_0`..`ktstr_fentry_3`) matching the 4-slot batch
749/// model in `src/bpf/fentry_probe.bpf.c`. Call sites previously spelled the
750/// full 4-arm `match slot { ... }` inline; routing through this
751/// family of helpers keeps the dispatch in one place so a future
752/// slot addition is a one-line change per helper instead of
753/// scattered across every batch.
754///
755/// Returns `None` for slot indices outside 0..=3, matching the
756/// existing `continue;` behaviour at call sites.
757fn attach_fentry_by_slot(
758 skel: &crate::bpf_skel::fentry::FentryProbeSkel<'_>,
759 slot: usize,
760) -> Option<libbpf_rs::Result<libbpf_rs::Link>> {
761 Some(match slot {
762 0 => skel.progs.ktstr_fentry_0.attach_trace(),
763 1 => skel.progs.ktstr_fentry_1.attach_trace(),
764 2 => skel.progs.ktstr_fentry_2.attach_trace(),
765 3 => skel.progs.ktstr_fentry_3.attach_trace(),
766 _ => return None,
767 })
768}
769
770/// Attach the fexit program in slot 0..=3 on the fentry skeleton.
771/// Sibling of [`attach_fentry_by_slot`]; see its doc for the routing
772/// rationale.
773fn attach_fexit_by_slot(
774 skel: &crate::bpf_skel::fentry::FentryProbeSkel<'_>,
775 slot: usize,
776) -> Option<libbpf_rs::Result<libbpf_rs::Link>> {
777 Some(match slot {
778 0 => skel.progs.ktstr_fexit_0.attach_trace(),
779 1 => skel.progs.ktstr_fexit_1.attach_trace(),
780 2 => skel.progs.ktstr_fexit_2.attach_trace(),
781 3 => skel.progs.ktstr_fexit_3.attach_trace(),
782 _ => return None,
783 })
784}
785
786/// Borrow the open fentry program in slot 0..=3 for pre-load
787/// configuration (`set_attach_target`, `set_autoload`).
788///
789/// Pre-load sibling of [`attach_fentry_by_slot`]: operates on
790/// [`OpenFentryProbeSkel`] before [`OpenSkel::load`] consumes it.
791/// Returns `None` for slot indices outside 0..=3, matching the
792/// existing `continue;` behaviour at call sites.
793///
794/// [`OpenFentryProbeSkel`]: crate::bpf_skel::fentry::OpenFentryProbeSkel
795/// [`OpenSkel::load`]: libbpf_rs::skel::OpenSkel::load
796fn fentry_prog_mut_by_slot<'a, 'obj>(
797 open_skel: &'a mut crate::bpf_skel::fentry::OpenFentryProbeSkel<'obj>,
798 slot: usize,
799) -> Option<&'a mut libbpf_rs::OpenProgramMut<'obj>> {
800 Some(match slot {
801 0 => &mut open_skel.progs.ktstr_fentry_0,
802 1 => &mut open_skel.progs.ktstr_fentry_1,
803 2 => &mut open_skel.progs.ktstr_fentry_2,
804 3 => &mut open_skel.progs.ktstr_fentry_3,
805 _ => return None,
806 })
807}
808
809/// Borrow the open fexit program in slot 0..=3 for pre-load
810/// configuration. Pre-load sibling of [`attach_fexit_by_slot`];
811/// see [`fentry_prog_mut_by_slot`] for the routing rationale.
812fn fexit_prog_mut_by_slot<'a, 'obj>(
813 open_skel: &'a mut crate::bpf_skel::fentry::OpenFentryProbeSkel<'obj>,
814 slot: usize,
815) -> Option<&'a mut libbpf_rs::OpenProgramMut<'obj>> {
816 Some(match slot {
817 0 => &mut open_skel.progs.ktstr_fexit_0,
818 1 => &mut open_skel.progs.ktstr_fexit_1,
819 2 => &mut open_skel.progs.ktstr_fexit_2,
820 3 => &mut open_skel.progs.ktstr_fexit_3,
821 _ => return None,
822 })
823}
824
825/// Disable autoload on both the fentry and fexit programs for
826/// slot 0..=3 so the verifier skips them at
827/// [`OpenSkel::load`][libbpf_rs::skel::OpenSkel::load]. Used for
828/// unused batch slots and slots whose
829/// [`set_attach_target`][libbpf_rs::OpenProgramMut::set_attach_target]
830/// call failed — either leaves the skeleton with placeholder
831/// targets the verifier would reject.
832///
833/// No-op for slot indices outside 0..=3.
834fn disable_slot_programs(
835 open_skel: &mut crate::bpf_skel::fentry::OpenFentryProbeSkel<'_>,
836 slot: usize,
837) {
838 if let Some(p) = fentry_prog_mut_by_slot(open_skel, slot) {
839 p.set_autoload(false);
840 }
841 if let Some(p) = fexit_prog_mut_by_slot(open_skel, slot) {
842 p.set_autoload(false);
843 }
844}
845
846/// Write the per-slot rodata fields (`ktstr_fentry_func_idx_N`,
847/// `ktstr_fentry_is_kernel_N`) for slot 0..=3. Mirrors the BPF
848/// side's positional `rodata` layout in `src/bpf/fentry_probe.bpf.c`.
849///
850/// No-op for slot indices outside 0..=3.
851fn set_rodata_slot(
852 rodata: &mut crate::bpf_skel::fentry::types::rodata,
853 slot: usize,
854 idx: u32,
855 is_kernel: bool,
856) {
857 let k = is_kernel as u8;
858 match slot {
859 0 => {
860 rodata.ktstr_fentry_func_idx_0 = idx;
861 rodata.ktstr_fentry_is_kernel_0 = k;
862 }
863 1 => {
864 rodata.ktstr_fentry_func_idx_1 = idx;
865 rodata.ktstr_fentry_is_kernel_1 = k;
866 }
867 2 => {
868 rodata.ktstr_fentry_func_idx_2 = idx;
869 rodata.ktstr_fentry_is_kernel_2 = k;
870 }
871 3 => {
872 rodata.ktstr_fentry_func_idx_3 = idx;
873 rodata.ktstr_fentry_is_kernel_3 = k;
874 }
875 _ => {}
876 }
877}
878
879/// Causal-task filter for the trigger event's `task_ptr` (sourced
880/// from `args[0]`).
881///
882/// The BPF `ktstr_trigger_tp` handler sets `args[0]` to
883/// `bpf_get_current_task()` only for `SCX_EXIT_ERROR_BPF` (where
884/// `current` IS the causal task); for `SCX_EXIT_ERROR` (kworker /
885/// sysrq context) it emits `args[0] == 0`. A zero `task_ptr` means
886/// "no causal task" and must suppress probe stitching, so this
887/// returns `None` for 0 and `Some(p)` otherwise. `run_probe_skeleton`
888/// calls this on the last trigger event; the `args0_*` tests pin both
889/// sides of the contract against this exact predicate.
890fn causal_tptr(task_ptr: u64) -> Option<u64> {
891 Some(task_ptr).filter(|&p| p != 0)
892}
893
894/// Run the BPF probe skeleton for auto-repro.
895///
896/// Operates in two modes depending on `phase_b_rx`:
897///
898/// **Single-phase (`phase_b_rx = None`):** loads the kprobe skeleton
899/// and fentry/fexit skeleton together, attaches all probes, then
900/// polls until the trigger fires.
901///
902/// **Two-phase (`phase_b_rx = Some`):** Phase A attaches kprobes +
903/// kernel fexit + the `tp_btf/sched_ext_exit` trigger before the
904/// scheduler starts, signals `ready`, then polls the ring buffer
905/// while waiting for Phase B input via the channel. When Phase B
906/// input arrives, attaches fentry/fexit to BPF struct_ops callbacks
907/// and additional kprobes for kernel callers. Signals `done` on the
908/// `PhaseBInput` when Phase B attachment completes. If the trigger
909/// fires before Phase B input arrives, fentry is skipped — the
910/// crash happened before BPF programs could be probed.
911///
912/// The trigger fires on `sched_ext_exit` inside `scx_claim_exit()`
913/// — exactly once per scheduler lifetime. If the tracepoint is
914/// unavailable, auto-repro is skipped.
915///
916/// Returns accumulated func_names from both phases as the third
917/// tuple element.
918pub fn run_probe_skeleton(
919 functions: &[StackFunction],
920 btf_funcs: &[BtfFunc],
921 stop: &AtomicBool,
922 bpf_prog_fds: &std::collections::HashMap<u32, i32>,
923 ready: &Latch,
924 phase_b_rx: Option<std::sync::mpsc::Receiver<PhaseBInput>>,
925) -> (
926 Option<Vec<ProbeEvent>>,
927 ProbeDiagnostics,
928 Vec<(u32, String)>,
929) {
930 use crate::bpf_skel::*;
931 use libbpf_rs::skel::{OpenSkel, SkelBuilder};
932 use libbpf_rs::{Link, MapCore, MapFlags, RingBufferBuilder};
933
934 tracing::debug!(n = functions.len(), "run_probe_skeleton");
935
936 let mut diag = ProbeDiagnostics::default();
937
938 // Open skeleton. Two MaybeUninit slots: the first backs the
939 // initial load attempt; the second backs the fallback retry when
940 // optional programs cause ESRCH. Both must outlive `skel`.
941 let mut open_object = std::mem::MaybeUninit::uninit();
942 let mut open_object_fallback = std::mem::MaybeUninit::uninit();
943 let builder = ProbeSkelBuilder::default();
944 let mut open_skel = match builder.open(&mut open_object) {
945 Ok(s) => s,
946 Err(e) => {
947 tracing::error!(%e, "probe skeleton open failed");
948 diag.trigger_attach_error = Some(format!("skeleton open: {e}"));
949 ready.set();
950 return (None, diag, Vec::new());
951 }
952 };
953
954 // Enable probes (must set before load — rodata is immutable after)
955 if let Some(rodata) = open_skel.maps.rodata_data.as_mut() {
956 rodata.ktstr_enabled = true;
957 }
958
959 // Load skeleton. Try with all programs first; if a missing tp_btf
960 // target causes ESRCH, re-open with optional programs disabled.
961 // The fallback unconditionally fires on any load error, not
962 // strictly ESRCH — libbpf doesn't surface a stable errno through
963 // its Error type, so an exact ESRCH match is brittle. The retry
964 // is cheap (re-open + load with autoload disabled on the optional
965 // set), so the broader gate is acceptable. When the retry ALSO
966 // fails, we surface BOTH errors so the operator sees the original
967 // root cause alongside the retry failure — a verifier rejection
968 // on a non-optional program would otherwise be masked by the
969 // retry's unrelated error.
970 let (skel, optional_programs_loaded) = match open_skel.load() {
971 Ok(s) => (s, true),
972 Err(first_err) => {
973 tracing::debug!(
974 %first_err,
975 "probe skeleton load failed with all programs; \
976 retrying with optional programs disabled"
977 );
978 let builder2 = ProbeSkelBuilder::default();
979 let mut open_skel2 = match builder2.open(&mut open_object_fallback) {
980 Ok(s) => s,
981 Err(e) => {
982 tracing::error!(%e, "probe skeleton re-open failed");
983 diag.trigger_attach_error = Some(format!(
984 "skeleton open (retry): {e}; original load error: {first_err}"
985 ));
986 ready.set();
987 return (None, diag, Vec::new());
988 }
989 };
990 if let Some(rodata) = open_skel2.maps.rodata_data.as_mut() {
991 rodata.ktstr_enabled = true;
992 }
993 open_skel2.progs.ktstr_pi_fentry.set_autoload(false);
994 open_skel2.progs.ktstr_pi_fexit.set_autoload(false);
995 open_skel2.progs.ktstr_lock_contend.set_autoload(false);
996 open_skel2
997 .progs
998 .ktstr_preempt_disable_tp
999 .set_autoload(false);
1000 open_skel2.progs.ktstr_preempt_enable_tp.set_autoload(false);
1001 match open_skel2.load() {
1002 Ok(s) => (s, false),
1003 Err(e) => {
1004 tracing::error!(
1005 %e, %first_err,
1006 "probe skeleton load failed (retry); \
1007 surfacing both original and retry errors"
1008 );
1009 diag.trigger_attach_error = Some(format!(
1010 "skeleton load (retry): {e}; original error before retry: {first_err}"
1011 ));
1012 ready.set();
1013 return (None, diag, Vec::new());
1014 }
1015 }
1016 }
1017 };
1018
1019 // Populate func_meta_map with function IPs and metadata
1020 let mut func_ips: Vec<(u32, u64, String)> = Vec::new(); // (idx, ip, display_name)
1021 let mut bpf_funcs: Vec<(u32, &StackFunction)> = Vec::new(); // BPF functions for fentry
1022
1023 // Load vmlinux BTF once and reuse across every kprobe meta
1024 // population in the loop below. The previous code called
1025 // `resolve_field_specs(_, None)` per function, which re-parsed
1026 // the multi-MB vmlinux BTF on every iteration (>1 s per kernel
1027 // with thousands of kprobed functions). Loading once turns the
1028 // hot path into pure type lookups against a borrowed handle.
1029 // `None` (load failure) leaves `cached_btf` empty and downstream
1030 // call sites fall back to the no-BTF path — same behaviour as
1031 // the previous per-call `Err(...) -> Vec::new()` branch.
1032 //
1033 // `cached_vmlinux_btf` memoises the first successful parse
1034 // process-wide so repeated auto-repro cycles in the same nextest
1035 // process share one `Arc<Btf>` instead of re-reading and
1036 // re-parsing the multi-MB blob each time.
1037 let cached_btf = crate::monitor::btf_offsets::cached_vmlinux_btf();
1038
1039 for (idx, func) in functions.iter().enumerate() {
1040 if func.is_bpf {
1041 bpf_funcs.push((idx as u32, func));
1042 continue;
1043 }
1044 let ip = match resolve_func_ip(&func.raw_name) {
1045 Some(ip) => ip,
1046 None => {
1047 tracing::warn!(func = %func.raw_name, "could not resolve function IP");
1048 diag.kprobe_resolve_failed.push(func.raw_name.clone());
1049 continue;
1050 }
1051 };
1052
1053 let mut meta = types::func_meta {
1054 func_idx: idx as u32,
1055 ..Default::default()
1056 };
1057
1058 // Populate field specs from BTF-resolved offsets.
1059 if let Some(btf_func) = btf_funcs.iter().find(|f| f.name == func.raw_name) {
1060 let field_specs = match cached_btf.as_ref() {
1061 Some(btf) => super::btf::resolve_field_specs_with_btf(btf_func, btf),
1062 None => Vec::new(),
1063 };
1064 populate_field_specs(&mut meta, &field_specs);
1065 // Detect char * params for string capture.
1066 meta.str_param_idx = detect_str_param(btf_func);
1067 }
1068
1069 let key_bytes = ip.to_ne_bytes();
1070 let meta_bytes = unsafe {
1071 std::slice::from_raw_parts(
1072 &meta as *const _ as *const u8,
1073 std::mem::size_of::<types::func_meta>(),
1074 )
1075 };
1076
1077 if let Err(e) = skel
1078 .maps
1079 .func_meta_map
1080 .update(&key_bytes, meta_bytes, MapFlags::ANY)
1081 {
1082 tracing::warn!(%e, func = %func.raw_name, "failed to update func_meta_map");
1083 continue;
1084 }
1085
1086 tracing::debug!(func = %func.raw_name, ip, nr = meta.nr_field_specs, "kprobe meta");
1087 diag.kprobe_resolved += 1;
1088 func_ips.push((idx as u32, ip, func.display_name.clone()));
1089 }
1090
1091 if func_ips.is_empty() && bpf_funcs.is_empty() && phase_b_rx.is_none() {
1092 tracing::warn!("no kprobe IPs resolved and no BPF functions for fentry");
1093 diag.trigger_attach_error =
1094 Some("no functions resolved — kprobes and trigger skipped".to_string());
1095 ready.set();
1096 return (None, diag, Vec::new());
1097 }
1098 if func_ips.is_empty() && (phase_b_rx.is_some() || !bpf_funcs.is_empty()) {
1099 tracing::debug!("no kernel functions resolved to IPs, proceeding with fentry only");
1100 }
1101
1102 // Attach kprobes to each function for entry capture. Exit capture
1103 // for kernel functions uses fexit via the fentry skeleton (batched
1104 // separately below with fd=0 for vmlinux BTF).
1105 //
1106 // Parallelised via [`parallel_attach_kprobes`]: each attach is an
1107 // independent `perf_event_open` + `BPF_LINK_CREATE` syscall pair,
1108 // and the sequential loop's round-trip cost dominated the auto-
1109 // repro Phase A setup time when the crash backtrace named many
1110 // functions. Worker pool runs them concurrently; results land
1111 // back in the original input order so the post-attach
1112 // `kprobe_attach_failed` / `links` shape matches the prior loop.
1113 let mut links: Vec<(Link, String)> = Vec::new();
1114 let attach_input: Vec<String> = func_ips
1115 .iter()
1116 .map(|(idx, _, _)| functions[*idx as usize].raw_name.clone())
1117 .collect();
1118 for (raw, result) in parallel_attach_kprobes(&skel.progs.ktstr_probe, &attach_input) {
1119 match result {
1120 Ok(link) => {
1121 links.push((link, raw));
1122 }
1123 Err(e) => {
1124 tracing::warn!(%e, func = %raw, "kprobe attach failed");
1125 diag.kprobe_attach_failed.push((raw, e.to_string()));
1126 }
1127 }
1128 }
1129 diag.kprobe_attached = links.len() as u32;
1130 tracing::debug!(attached = links.len(), total = func_ips.len(), "kprobes");
1131
1132 // Attach fentry+fexit for BPF callbacks and kernel functions.
1133 // Batched in groups of FENTRY_BATCH per skeleton load to reduce
1134 // verifier passes. BPF callbacks use prog FD + sentinel IP.
1135 // Kernel functions use fd=0 (vmlinux BTF) + real IP.
1136 const FENTRY_BATCH: usize = 4;
1137 let mut fentry_links: Vec<Link> = Vec::new();
1138 let mut fexit_links: Vec<Link> = Vec::new();
1139
1140 struct FentryTarget<'a> {
1141 slot: usize,
1142 fd: i32,
1143 idx: u32,
1144 name: &'a str,
1145 ok: bool,
1146 is_kernel: bool,
1147 }
1148
1149 // Build combined list of targets: BPF callbacks + kernel functions.
1150 let valid_bpf: Vec<_> = bpf_funcs
1151 .iter()
1152 .filter(|(_, f)| f.bpf_prog_id.is_some())
1153 .collect();
1154 diag.fentry_candidates = valid_bpf.len() as u32;
1155
1156 // Kernel functions that were attached via kprobe also get fentry+fexit
1157 // for exit capture. fd=0 targets vmlinux BTF.
1158 struct KernelFentryTarget {
1159 idx: u32,
1160 name: String,
1161 }
1162 let kernel_fexit_targets: Vec<KernelFentryTarget> = func_ips
1163 .iter()
1164 .map(|(idx, _, name)| KernelFentryTarget {
1165 idx: *idx,
1166 name: name.clone(),
1167 })
1168 .collect();
1169
1170 // Interleave: process BPF targets first, then kernel targets.
1171 // Each gets batched into the fentry skeleton in groups of 4.
1172 //
1173 // When phase_b_rx is Some (Phase A/B split), BPF callback fentry
1174 // attachment is deferred to Phase B after the scheduler starts.
1175 // Only kernel fexit (fd=0) and kprobes run in Phase A.
1176
1177 // --- BPF callback batches (skip in Phase A when split is active) ---
1178 if phase_b_rx.is_none() {
1179 for chunk in valid_bpf.chunks(FENTRY_BATCH) {
1180 let mut targets: Vec<FentryTarget<'_>> = Vec::new();
1181 for (slot, (idx, func)) in chunk.iter().enumerate() {
1182 let prog_id = func.bpf_prog_id.unwrap();
1183 let fd = if let Some(&pre_fd) = bpf_prog_fds.get(&prog_id) {
1184 let dup_fd = unsafe { libc::dup(pre_fd) };
1185 if dup_fd < 0 {
1186 tracing::warn!(prog_id, func = %func.display_name, "fentry: dup failed");
1187 diag.fentry_attach_failed.push((
1188 func.display_name.clone(),
1189 format!("dup(pre_fd={pre_fd}) failed"),
1190 ));
1191 continue;
1192 }
1193 dup_fd
1194 } else {
1195 let fd = unsafe { libbpf_rs::libbpf_sys::bpf_prog_get_fd_by_id(prog_id) };
1196 if fd < 0 {
1197 tracing::warn!(prog_id, func = %func.display_name, "fentry: failed to get fd");
1198 diag.fentry_attach_failed.push((
1199 func.display_name.clone(),
1200 format!("bpf_prog_get_fd_by_id({prog_id}) returned {fd}"),
1201 ));
1202 continue;
1203 }
1204 fd
1205 };
1206 targets.push(FentryTarget {
1207 slot,
1208 fd,
1209 idx: *idx,
1210 name: &func.display_name,
1211 ok: false,
1212 is_kernel: false,
1213 });
1214 }
1215 if targets.is_empty() {
1216 continue;
1217 }
1218
1219 use crate::bpf_skel::fentry::*;
1220 let mut fentry_open_obj = std::mem::MaybeUninit::uninit();
1221 let fentry_builder = FentryProbeSkelBuilder::default();
1222 let mut fentry_open = match fentry_builder.open(&mut fentry_open_obj) {
1223 Ok(s) => s,
1224 Err(e) => {
1225 tracing::warn!(%e, "fentry skeleton open failed");
1226 for t in &targets {
1227 unsafe { libc::close(t.fd) };
1228 }
1229 continue;
1230 }
1231 };
1232
1233 // Set rodata: func_idx and is_kernel per slot.
1234 if let Some(rodata) = fentry_open.maps.rodata_data.as_mut() {
1235 rodata.ktstr_enabled = true;
1236 for t in &targets {
1237 set_rodata_slot(rodata, t.slot, t.idx, t.is_kernel);
1238 }
1239 }
1240
1241 for t in targets.iter_mut() {
1242 // Set fentry attach target.
1243 let Some(fentry_prog) = fentry_prog_mut_by_slot(&mut fentry_open, t.slot) else {
1244 continue;
1245 };
1246 match fentry_prog.set_attach_target(t.fd, Some(t.name.to_string())) {
1247 Ok(()) => {
1248 t.ok = true;
1249 tracing::debug!(
1250 slot = t.slot,
1251 func = t.name,
1252 "fentry: set_attach_target ok"
1253 );
1254 }
1255 Err(e) => {
1256 tracing::warn!(slot = t.slot, func = t.name, %e, "fentry: set_attach_target failed");
1257 diag.fentry_attach_failed
1258 .push((t.name.to_string(), format!("set_attach_target: {e}")));
1259 continue;
1260 }
1261 }
1262 // Set fexit attach target on the same function.
1263 let Some(fexit_prog) = fexit_prog_mut_by_slot(&mut fentry_open, t.slot) else {
1264 continue;
1265 };
1266 if let Err(e) = fexit_prog.set_attach_target(t.fd, Some(t.name.to_string())) {
1267 tracing::debug!(slot = t.slot, func = t.name, %e, "fexit: set_attach_target failed (entry-only)");
1268 // Disable autoload so the verifier doesn't reject the
1269 // skeleton due to a stale placeholder target.
1270 fexit_prog.set_autoload(false);
1271 }
1272 }
1273
1274 if !targets.iter().any(|t| t.ok) {
1275 for t in &targets {
1276 unsafe { libc::close(t.fd) };
1277 }
1278 continue;
1279 }
1280
1281 // Disable autoload on unused or failed fentry/fexit slots so the
1282 // verifier doesn't reject the placeholder target.
1283 let used_slots: std::collections::HashSet<usize> =
1284 targets.iter().filter(|t| t.ok).map(|t| t.slot).collect();
1285 for slot in 0..FENTRY_BATCH {
1286 if !used_slots.contains(&slot) {
1287 disable_slot_programs(&mut fentry_open, slot);
1288 }
1289 }
1290 tracing::debug!(
1291 active = used_slots.len(),
1292 disabled = FENTRY_BATCH - used_slots.len(),
1293 "fentry: loading batch",
1294 );
1295 // Reuse the main skeleton's maps so fentry events land in the
1296 // same probe_data map that the Rust side reads.
1297 use std::os::unix::io::AsFd;
1298 if let Err(e) = fentry_open
1299 .maps
1300 .probe_data
1301 .reuse_fd(skel.maps.probe_data.as_fd())
1302 {
1303 tracing::warn!(%e, "fentry: probe_data reuse_fd failed");
1304 }
1305 if let Err(e) = fentry_open
1306 .maps
1307 .func_meta_map
1308 .reuse_fd(skel.maps.func_meta_map.as_fd())
1309 {
1310 tracing::warn!(%e, "fentry: func_meta_map reuse_fd failed");
1311 }
1312
1313 let fentry_skel = match fentry_open.load() {
1314 Ok(s) => {
1315 tracing::debug!("fentry: batch load success");
1316 for t in &targets {
1317 unsafe { libc::close(t.fd) };
1318 }
1319 s
1320 }
1321 Err(e) => {
1322 tracing::warn!(%e, "fentry: batch load failed");
1323 for t in &targets {
1324 if t.ok {
1325 diag.fentry_attach_failed
1326 .push((t.name.to_string(), format!("batch load: {e}")));
1327 }
1328 unsafe { libc::close(t.fd) };
1329 }
1330 continue;
1331 }
1332 };
1333
1334 // Populate func_meta and attach each slot.
1335 for t in &targets {
1336 if !t.ok {
1337 continue;
1338 }
1339
1340 let sentinel_ip = (t.idx as u64) | (1u64 << 63);
1341 let mut meta = crate::bpf_skel::types::func_meta {
1342 func_idx: t.idx,
1343 ..Default::default()
1344 };
1345
1346 if let Some(btf_func) = btf_funcs.iter().find(|f| f.name == t.name) {
1347 // Try vmlinux BTF first (for known struct params like
1348 // task_struct and auto-discovered vmlinux fields),
1349 // then BPF program BTF (for BPF-local types like task_ctx).
1350 let mut field_specs = match cached_btf.as_ref() {
1351 Some(btf) => super::btf::resolve_field_specs_with_btf(btf_func, btf),
1352 None => Vec::new(),
1353 };
1354 if field_specs.is_empty()
1355 && let Some(prog_id) = functions
1356 .iter()
1357 .find(|f| f.display_name == t.name)
1358 .and_then(|f| f.bpf_prog_id)
1359 {
1360 field_specs = super::btf::resolve_bpf_field_specs(btf_func, prog_id);
1361 }
1362 populate_field_specs(&mut meta, &field_specs);
1363 meta.str_param_idx = detect_str_param(btf_func);
1364 }
1365
1366 let Some(result) = attach_fentry_by_slot(&fentry_skel, t.slot) else {
1367 continue;
1368 };
1369 let link = match result {
1370 Ok(link) => {
1371 tracing::debug!(func = t.name, "fentry attached");
1372 link
1373 }
1374 Err(e) => {
1375 tracing::warn!(%e, func = t.name, "fentry attach failed");
1376 diag.fentry_attach_failed
1377 .push((t.name.to_string(), e.to_string()));
1378 continue;
1379 }
1380 };
1381
1382 // func_meta_map.update + func_ips.push run AFTER the
1383 // fentry attach succeeds. Reversing the order would
1384 // orphan map entries and func_ip tuples on attach
1385 // failure — downstream reporting ("successfully
1386 // probed N funcs") would then show false-positive
1387 // successes for probes that never fired. If the
1388 // map update fails after the attach succeeded, drop
1389 // the Link (which detaches the program) so the
1390 // post-attach state matches what func_ips reports.
1391 let key_bytes = sentinel_ip.to_ne_bytes();
1392 let meta_bytes = unsafe {
1393 std::slice::from_raw_parts(
1394 &meta as *const _ as *const u8,
1395 std::mem::size_of::<crate::bpf_skel::types::func_meta>(),
1396 )
1397 };
1398 if let Err(e) =
1399 skel.maps
1400 .func_meta_map
1401 .update(&key_bytes, meta_bytes, MapFlags::ANY)
1402 {
1403 tracing::warn!(%e, func = t.name, "fentry: failed to update func_meta_map; dropping attached link");
1404 drop(link);
1405 continue;
1406 }
1407 fentry_links.push(link);
1408 func_ips.push((t.idx, sentinel_ip, t.name.to_string()));
1409 // Attach fexit for exit-side capture.
1410 let Some(fexit_result) = attach_fexit_by_slot(&fentry_skel, t.slot) else {
1411 continue;
1412 };
1413 match fexit_result {
1414 Ok(link) => {
1415 tracing::debug!(func = t.name, "fexit attached");
1416 fexit_links.push(link);
1417 }
1418 Err(e) => {
1419 tracing::debug!(%e, func = t.name, "fexit attach failed (entry-only)");
1420 }
1421 }
1422 }
1423
1424 drop(fentry_skel);
1425 }
1426 diag.fentry_attached = fentry_links.len() as u32;
1427 if !valid_bpf.is_empty() {
1428 tracing::debug!(
1429 fentry = fentry_links.len(),
1430 fexit = fexit_links.len(),
1431 total = valid_bpf.len(),
1432 "BPF probes",
1433 );
1434 }
1435 } // end if phase_b_rx.is_none() — BPF callback batches
1436
1437 // --- Kernel function fexit batches (fd=0 = vmlinux BTF) ---
1438 for chunk in kernel_fexit_targets.chunks(FENTRY_BATCH) {
1439 let mut targets: Vec<FentryTarget<'_>> = Vec::new();
1440 for (slot, kt) in chunk.iter().enumerate() {
1441 targets.push(FentryTarget {
1442 slot,
1443 fd: 0, // vmlinux BTF
1444 idx: kt.idx,
1445 name: &kt.name,
1446 ok: false,
1447 is_kernel: true,
1448 });
1449 }
1450
1451 use crate::bpf_skel::fentry::*;
1452 let mut fentry_open_obj = std::mem::MaybeUninit::uninit();
1453 let fentry_builder = FentryProbeSkelBuilder::default();
1454 let mut fentry_open = match fentry_builder.open(&mut fentry_open_obj) {
1455 Ok(s) => s,
1456 Err(e) => {
1457 tracing::warn!(%e, "kernel fexit skeleton open failed");
1458 continue;
1459 }
1460 };
1461
1462 if let Some(rodata) = fentry_open.maps.rodata_data.as_mut() {
1463 rodata.ktstr_enabled = true;
1464 for t in &targets {
1465 set_rodata_slot(rodata, t.slot, t.idx, t.is_kernel);
1466 }
1467 }
1468
1469 // For kernel fexit, we only need fexit programs — disable fentry
1470 // (entry capture is handled by the kprobe skeleton).
1471 for t in targets.iter_mut() {
1472 // Disable fentry for kernel functions (kprobe handles entry).
1473 let Some(fentry_prog) = fentry_prog_mut_by_slot(&mut fentry_open, t.slot) else {
1474 continue;
1475 };
1476 fentry_prog.set_autoload(false);
1477
1478 // Set fexit attach target with fd=0 (vmlinux BTF).
1479 let Some(fexit_prog) = fexit_prog_mut_by_slot(&mut fentry_open, t.slot) else {
1480 continue;
1481 };
1482 match fexit_prog.set_attach_target(0, Some(t.name.to_string())) {
1483 Ok(()) => {
1484 t.ok = true;
1485 tracing::debug!(
1486 slot = t.slot,
1487 func = t.name,
1488 "kernel fexit: set_attach_target ok"
1489 );
1490 }
1491 Err(e) => {
1492 tracing::debug!(slot = t.slot, func = t.name, %e, "kernel fexit: set_attach_target failed");
1493 fexit_prog.set_autoload(false);
1494 }
1495 }
1496 }
1497
1498 if !targets.iter().any(|t| t.ok) {
1499 continue;
1500 }
1501
1502 // Disable fexit for unused slots. Fentry for these slots was
1503 // left at its default by the `targets` loop above (which
1504 // disables fentry only for slots that have a target); no
1505 // attach_target was set for them either, so libbpf loads
1506 // them with a NULL target. Disabling fexit here keeps the
1507 // behaviour as it was before the slot helpers were
1508 // introduced.
1509 let used_slots: std::collections::HashSet<usize> =
1510 targets.iter().filter(|t| t.ok).map(|t| t.slot).collect();
1511 for slot in 0..FENTRY_BATCH {
1512 if !used_slots.contains(&slot)
1513 && let Some(p) = fexit_prog_mut_by_slot(&mut fentry_open, slot)
1514 {
1515 p.set_autoload(false);
1516 }
1517 }
1518
1519 // Reuse probe_data and func_meta_map from the main skeleton.
1520 use std::os::unix::io::AsFd;
1521 if let Err(e) = fentry_open
1522 .maps
1523 .probe_data
1524 .reuse_fd(skel.maps.probe_data.as_fd())
1525 {
1526 tracing::warn!(%e, "kernel fexit: probe_data reuse_fd failed");
1527 }
1528 if let Err(e) = fentry_open
1529 .maps
1530 .func_meta_map
1531 .reuse_fd(skel.maps.func_meta_map.as_fd())
1532 {
1533 tracing::warn!(%e, "kernel fexit: func_meta_map reuse_fd failed");
1534 }
1535
1536 let fentry_skel = match fentry_open.load() {
1537 Ok(s) => s,
1538 Err(e) => {
1539 tracing::warn!(%e, "kernel fexit: batch load failed");
1540 continue;
1541 }
1542 };
1543
1544 for t in &targets {
1545 if !t.ok {
1546 continue;
1547 }
1548 let Some(result) = attach_fexit_by_slot(&fentry_skel, t.slot) else {
1549 continue;
1550 };
1551 match result {
1552 Ok(link) => {
1553 tracing::debug!(func = t.name, "kernel fexit attached");
1554 fexit_links.push(link);
1555 }
1556 Err(e) => {
1557 tracing::debug!(%e, func = t.name, "kernel fexit attach failed");
1558 }
1559 }
1560 }
1561
1562 drop(fentry_skel);
1563 }
1564 if !kernel_fexit_targets.is_empty() {
1565 tracing::debug!(
1566 fexit = fexit_links.len(),
1567 total = kernel_fexit_targets.len(),
1568 "kernel fexit probes",
1569 );
1570 }
1571
1572 // Attach trigger: tp_btf/sched_ext_exit fires inside
1573 // scx_claim_exit() in the context of the current task at exit time.
1574 match skel.progs.ktstr_trigger_tp.attach_trace() {
1575 Ok(link) => {
1576 tracing::debug!("trigger attached via tp_btf/sched_ext_exit");
1577 diag.trigger_type = "tp_btf".to_string();
1578 links.push((link, "tp_btf/sched_ext_exit".to_string()));
1579 }
1580 Err(e) => {
1581 let msg = format!("auto-repro requires kernel with sched_ext_exit tracepoint: {e}");
1582 tracing::error!(%msg, "trigger attach failed");
1583 diag.trigger_attach_error = Some(msg);
1584 ready.set();
1585 return (None, diag, Vec::new());
1586 }
1587 }
1588
1589 // Attach timeline programs (loaded by the skeleton but not
1590 // auto-attached — they need explicit attach_trace calls).
1591 match skel.progs.ktstr_tl_switch.attach_trace() {
1592 Ok(link) => links.push((link, "tp_btf/sched_switch".to_string())),
1593 Err(e) => tracing::warn!(%e, "timeline sched_switch attach failed"),
1594 }
1595 match skel.progs.ktstr_tl_migrate.attach_trace() {
1596 Ok(link) => links.push((link, "tp_btf/sched_migrate_task".to_string())),
1597 Err(e) => tracing::warn!(%e, "timeline sched_migrate_task attach failed"),
1598 }
1599 match skel.progs.ktstr_tl_wakeup.attach_trace() {
1600 Ok(link) => links.push((link, "tp_btf/sched_wakeup".to_string())),
1601 Err(e) => tracing::warn!(%e, "timeline sched_wakeup attach failed"),
1602 }
1603
1604 // Attach optional programs. When the first load succeeded (all
1605 // targets present), they were loaded and just need attachment.
1606 // When the fallback path ran, they were not loaded — attach
1607 // returns an error that we silently absorb.
1608 if optional_programs_loaded {
1609 if let Ok(link) = skel.progs.ktstr_pi_fentry.attach_trace() {
1610 links.push((link, "fentry/rt_mutex_setprio".to_string()));
1611 }
1612 if let Ok(link) = skel.progs.ktstr_pi_fexit.attach_trace() {
1613 links.push((link, "fexit/rt_mutex_setprio".to_string()));
1614 }
1615 if let Ok(link) = skel.progs.ktstr_lock_contend.attach_trace() {
1616 links.push((link, "tp_btf/contention_begin".to_string()));
1617 }
1618 if let Ok(link) = skel.progs.ktstr_preempt_disable_tp.attach_trace() {
1619 links.push((link, "tp_btf/preempt_disable".to_string()));
1620 }
1621 if let Ok(link) = skel.progs.ktstr_preempt_enable_tp.attach_trace() {
1622 links.push((link, "tp_btf/preempt_enable".to_string()));
1623 }
1624 }
1625
1626 // Set up ring buffer
1627 let events: std::sync::Arc<std::sync::Mutex<Vec<ProbeEvent>>> =
1628 std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
1629 let events_clone = events.clone();
1630 let triggered = std::sync::Arc::new(AtomicBool::new(false));
1631 let triggered_clone = triggered.clone();
1632
1633 // Ring buffer event layout matching probe_event in intf.h.
1634 // `str_val` + `has_str` + `str_param_idx` are kept in the wire
1635 // layout for ABI symmetry with `struct probe_entry` (the
1636 // kprobe-side hash map shares the field names) — the
1637 // EVENT_TRIGGER producer leaves them zeroed.
1638 #[repr(C)]
1639 struct RbEvent {
1640 type_: u32,
1641 tid: u32,
1642 func_idx: u32,
1643 ts: u64,
1644 args: [u64; 6],
1645 fields: [u64; 16],
1646 nr_fields: u32,
1647 kstack: [u64; 32],
1648 kstack_sz: u32,
1649 str_val: [u8; MAX_STR_LEN],
1650 has_str: u8,
1651 str_param_idx: u8,
1652 }
1653
1654 let mut rb_builder = RingBufferBuilder::new();
1655 if let Err(e) = rb_builder.add(&skel.maps.ktstr_events, move |data: &[u8]| {
1656 if data.len() < std::mem::size_of::<RbEvent>() {
1657 return 0;
1658 }
1659 let raw: &RbEvent = unsafe { &*(data.as_ptr() as *const RbEvent) };
1660
1661 if raw.type_ == EVENT_TRIGGER {
1662 triggered_clone.store(true, Ordering::Release);
1663
1664 let kstack_sz = (raw.kstack_sz as usize).min(32);
1665 let event = ProbeEvent {
1666 func_idx: 0,
1667 task_ptr: raw.args[0],
1668 ts: raw.ts,
1669 args: raw.args,
1670 fields: vec![],
1671 kstack: raw.kstack[..kstack_sz].to_vec(),
1672 str_val: None,
1673 exit_fields: vec![],
1674 exit_ts: None,
1675 };
1676
1677 events_clone.lock().unwrap().push(event);
1678 }
1679
1680 0
1681 }) {
1682 tracing::error!(%e, "failed to register ring buffer callback");
1683 ready.set();
1684 return (None, diag, Vec::new());
1685 }
1686
1687 let rb = match rb_builder.build() {
1688 Ok(rb) => rb,
1689 Err(e) => {
1690 tracing::error!(%e, "failed to build ring buffer");
1691 ready.set();
1692 return (None, diag, Vec::new());
1693 }
1694 };
1695
1696 // Enable is handled by the BPF program reading the volatile const.
1697 // Since we can't mutate rodata after load, the program starts enabled.
1698 // (ktstr_enabled defaults to false in BPF, but we always want probes
1699 // active once attached — remove the gate or set it before load.)
1700
1701 tracing::debug!(
1702 funcs = func_ips.len(),
1703 links = links.len(),
1704 trigger_type = %diag.trigger_type,
1705 "polling for probe data",
1706 );
1707
1708 // Signal Phase A probes attached (kprobes + kernel fexit +
1709 // trigger). When phase_b_rx is None, this means all probes.
1710 // When Some, BPF fentry is deferred to Phase B.
1711 ready.set();
1712
1713 // Phase B: receive BPF fentry targets and attach them while
1714 // polling the ring buffer. The channel is consumed once; after
1715 // that phase_b_done prevents re-checking.
1716 let mut phase_b_rx = phase_b_rx;
1717 let mut phase_b_done = false;
1718 // Accumulates BTF from Phase B functions so the readout phase
1719 // can resolve field keys for both Phase A and Phase B functions.
1720 let mut phase_b_btf: Vec<BtfFunc> = Vec::new();
1721
1722 // Poll until trigger fires or stop requested. When stop is
1723 // signaled, iterate all probe_data entries instead of waiting
1724 // for the trigger.
1725 loop {
1726 let _ = rb.poll(Duration::from_millis(100));
1727
1728 // Check for Phase B input while polling.
1729 if !phase_b_done && let Some(ref rx) = phase_b_rx {
1730 match rx.try_recv() {
1731 Ok(pb) => {
1732 tracing::debug!(
1733 bpf_funcs = pb.functions.len(),
1734 "Phase B: attaching BPF fentry/fexit"
1735 );
1736 // Save Phase B BTF for readout field key resolution.
1737 phase_b_btf = pb.btf_funcs.clone();
1738 // Attach Phase B probes: kernel callers (kprobes
1739 // + kernel fexit) and BPF callbacks (fentry/fexit).
1740 attach_phase_b_fentry(
1741 &skel,
1742 &pb,
1743 &mut func_ips,
1744 &mut fentry_links,
1745 &mut fexit_links,
1746 &mut links,
1747 &mut diag,
1748 );
1749 pb.done.set();
1750 phase_b_done = true;
1751 // Drop the receiver to release the channel.
1752 phase_b_rx = None;
1753 }
1754 Err(std::sync::mpsc::TryRecvError::Empty) => {
1755 // No Phase B input yet; if trigger already fired,
1756 // break immediately — the crash happened before
1757 // Phase B could attach.
1758 if triggered.load(Ordering::Acquire) {
1759 tracing::debug!("trigger fired during Phase B wait, skipping fentry");
1760 phase_b_done = true;
1761 phase_b_rx = None;
1762 }
1763 }
1764 Err(std::sync::mpsc::TryRecvError::Disconnected) => {
1765 // Channel sender dropped without delivering Phase B
1766 // input. Without a Phase B payload there is nothing
1767 // left for this probe loop to attach; if `stop` is
1768 // also set, return empty diagnostics rather than
1769 // poll an empty ringbuf for the rest of the timeout.
1770 tracing::debug!("Phase B channel disconnected");
1771 phase_b_done = true;
1772 phase_b_rx = None;
1773 if stop.load(Ordering::Relaxed) {
1774 return (None, diag, Vec::new());
1775 }
1776 }
1777 }
1778 }
1779
1780 // Also check BSS err_exit_detected: stall exits skip the
1781 // ring buffer (bpf_get_current_task is unrelated to the
1782 // stall cause) but still latch the BSS flag. Volatile read
1783 // because the BPF program writes via the kernel-side mmap
1784 // and Rust's aliasing rules let the compiler hoist a normal
1785 // read out of the loop.
1786 let bss_triggered = skel.maps.bss_data.as_ref().is_some_and(|bss| unsafe {
1787 std::ptr::read_volatile(&bss.ktstr_err_exit_detected as *const u32) != 0
1788 });
1789 // Mirror the latch into the cross-thread atomic so scenario
1790 // code can synchronously classify SchedulerDied as Clean vs
1791 // Crashed without owning the skeleton. Monotonic 0→1→2 —
1792 // once a poll iteration completes we know the probe armed,
1793 // and the BSS latch only ever transitions 0→1 within a
1794 // single test run. Release pairs with the Acquire load in
1795 // `sched_exit_kind()`.
1796 PROBE_SCHED_EXIT_STATE.store(
1797 if bss_triggered {
1798 PROBE_EXIT_STATE_CRASHED
1799 } else {
1800 PROBE_EXIT_STATE_CLEAN
1801 },
1802 Ordering::Release,
1803 );
1804 // Snapshot `triggered` once: a second `triggered.load` for the
1805 // diag assignment would observe an unrelated state if a racing
1806 // trigger fires between the two reads, so the gate decision
1807 // and the recorded `trigger_fired` could disagree.
1808 let triggered_snapshot = triggered.load(Ordering::Acquire);
1809 if triggered_snapshot || bss_triggered || stop.load(Ordering::Acquire) {
1810 // Final ringbuf drain before breaking. BPF-side ordering:
1811 // the trigger handler does
1812 // `__sync_val_compare_and_swap(&ktstr_err_exit_detected, 0u, 1u)`
1813 // BEFORE `bpf_ringbuf_reserve` + `bpf_ringbuf_submit`
1814 // (see src/bpf/probe.bpf.c near line 687-731). A userspace
1815 // observer can therefore see `bss_triggered=true` (CAS
1816 // visible) while the ringbuf event is still in transit
1817 // (submit not yet visible to `rb.poll`). Without an
1818 // explicit drain here, breaking out of the loop on
1819 // `bss_triggered` would lose the trigger event — the
1820 // readout phase below would see `events.last() = None`
1821 // (or the prior probe's trailing event), drop into the
1822 // no-causal-tptr fallback path, and produce
1823 // grouped-by-frequency output instead of a verified
1824 // stitch even though the kernel did publish a real
1825 // causal task pointer.
1826 //
1827 // 100 ms is bounded by the same teardown budget the loop's
1828 // top-level `rb.poll(100ms)` uses; libbpf's
1829 // `RingBuffer::poll` returns as soon as events are
1830 // consumed (it uses epoll-edge under the hood), so the
1831 // worst case is one full timeout window when no event
1832 // arrives — acceptable since we already know the trigger
1833 // fired (bss_triggered is true).
1834 let _ = rb.poll(Duration::from_millis(100));
1835 diag.trigger_fired = triggered_snapshot || bss_triggered;
1836
1837 // Read BPF-side diagnostic counters from BSS. The hot
1838 // counters live in the per-CPU `ktstr_pcpu_counters`
1839 // 2D array (`[MAX_CPUS][KTSTR_PCPU_NR]`); each per-CPU
1840 // slot is a cacheline-aligned `pcpu_counter { long
1841 // value; }`. Sum across CPUs to recover the cumulative
1842 // count — see `enum ktstr_pcpu_idx` in
1843 // src/bpf/probe.bpf.c for the slot indices.
1844 //
1845 // Every read below uses [`std::ptr::read_volatile`].
1846 // The BSS struct is mapped to userspace via the BPF
1847 // map's mmap region; the BPF program writes through
1848 // its own kernel-side mapping concurrently with these
1849 // reads. Without `read_volatile`, the userspace
1850 // compiler is free to hoist the loads (Rust's
1851 // aliasing rules: the compiler does not know a
1852 // `&types::bss` reference is shared with a kernel
1853 // writer through an unrelated mapping), miss the
1854 // post-trigger updates, and leave the diagnostic
1855 // counters / miss log / first-trigger timestamp
1856 // showing pre-trigger zeroes. Mirrors the existing
1857 // `ktstr_err_exit_detected` `read_volatile` site
1858 // upstream — same hazard, same fix, applied to every
1859 // BPF-mutated field the diag block reads.
1860 if let Some(bss) = skel.maps.bss_data.as_ref() {
1861 // Slot indices must match `enum ktstr_pcpu_idx`. A
1862 // reorder in the BPF source breaks every reader; the
1863 // explicit constants here surface that drift at the
1864 // call site instead of silently aliasing two
1865 // counters.
1866 const PCPU_PROBE_COUNT: usize = 0;
1867 const PCPU_KPROBE_RETURNS: usize = 1;
1868 const PCPU_META_MISS: usize = 2;
1869 const PCPU_RINGBUF_DROPS: usize = 3;
1870 const PCPU_TIMELINE_COUNT: usize = 4;
1871 const PCPU_TIMELINE_DROPS: usize = 5;
1872 const PCPU_TRIGGER_COUNT: usize = 14;
1873 let counters = &bss.ktstr_pcpu_counters;
1874 // SAFETY: each `pcpu_counter::value` is a plain
1875 // 64-bit integer at a stable BSS offset; the BPF
1876 // side updates it via `__sync_add_and_fetch`
1877 // (atomic add). A volatile load reads whatever
1878 // the kernel-side mmap currently shows — torn
1879 // reads are impossible because aligned 64-bit
1880 // loads are atomic on every supported arch
1881 // (x86_64, aarch64). The volatile qualifier is
1882 // what prevents the compiler from hoisting the
1883 // load out of the `sum` reduction across the
1884 // outer poll loop.
1885 let sum_pcpu = |idx: usize| -> u64 {
1886 counters
1887 .iter()
1888 .map(|cpu_slots| unsafe {
1889 std::ptr::read_volatile(&cpu_slots[idx].value as *const _) as u64
1890 })
1891 .sum()
1892 };
1893 diag.bpf_kprobe_fires = sum_pcpu(PCPU_PROBE_COUNT);
1894 diag.bpf_kprobe_returns = sum_pcpu(PCPU_KPROBE_RETURNS);
1895 diag.bpf_trigger_fires = sum_pcpu(PCPU_TRIGGER_COUNT);
1896 diag.bpf_meta_misses = sum_pcpu(PCPU_META_MISS);
1897 // SAFETY: `ktstr_miss_log_idx` is a `u32` written
1898 // via `__sync_fetch_and_add` from the BPF side
1899 // (see `src/bpf/probe.bpf.c` near the
1900 // `ktstr_miss_log[idx] = ip;` line). Aligned u32
1901 // loads are atomic on x86_64/aarch64. The BPF
1902 // writer increments-then-stores; a volatile read
1903 // observes either the pre- or post-update value
1904 // — both are bounded by the array length, so the
1905 // subsequent `.min(ktstr_miss_log.len())` keeps
1906 // the slice safe even if the kernel-side write
1907 // races this read.
1908 let miss_idx = unsafe {
1909 std::ptr::read_volatile(&bss.ktstr_miss_log_idx as *const u32) as usize
1910 };
1911 let n = miss_idx.min(bss.ktstr_miss_log.len());
1912 // Element-wise volatile reads of the miss-log
1913 // entries that fall within `n`. A bulk
1914 // `to_vec()` over the `bss.ktstr_miss_log[..n]`
1915 // slice would let the compiler vectorise the
1916 // copy and elide the volatile semantics; pulling
1917 // each `u64` through `read_volatile` keeps every
1918 // load ordered against the BPF-side write.
1919 //
1920 // SAFETY: each entry is a 64-bit IP value the
1921 // BPF writer stores after its CAS-like
1922 // increment of `ktstr_miss_log_idx`. Aligned
1923 // u64 loads are atomic on every supported
1924 // arch; the BPF write order
1925 // (increment of `ktstr_miss_log_idx` via
1926 // `__sync_fetch_and_add` BEFORE the store
1927 // `ktstr_miss_log[idx] = ip`) means a
1928 // volatile read of `[..miss_idx]` may cover a
1929 // slot whose index was already reserved but
1930 // whose store has not yet landed. We tolerate
1931 // that race: a stale-zero entry is harmless
1932 // diagnostic noise compared with the
1933 // alternative (compiler-hoisted loads of
1934 // pre-trigger zeroes).
1935 diag.bpf_miss_ips = (0..n)
1936 .map(|i| unsafe {
1937 std::ptr::read_volatile(&bss.ktstr_miss_log[i] as *const u64)
1938 })
1939 .collect();
1940 diag.bpf_ringbuf_drops = sum_pcpu(PCPU_RINGBUF_DROPS);
1941 // SAFETY: `ktstr_last_trigger_ts` is a `u64`
1942 // written by the BPF trigger handler via
1943 // `bpf_ktime_get_ns()` (see
1944 // `src/bpf/probe.bpf.c::ktstr_last_trigger_ts`).
1945 // Aligned u64 loads are atomic; the volatile
1946 // qualifier prevents hoisting across the outer
1947 // poll loop so the userspace reader observes the
1948 // post-trigger timestamp instead of a cached
1949 // pre-trigger zero.
1950 diag.bpf_first_trigger_ns =
1951 unsafe { std::ptr::read_volatile(&bss.ktstr_last_trigger_ts as *const u64) };
1952 // SAFETY: `ktstr_exit_kind_snap` is a `u32` written
1953 // by the BPF trigger handler in the same publishing
1954 // CAS sequence as `ktstr_err_exit_detected` (see
1955 // `src/bpf/probe.bpf.c::ktstr_exit_kind_snap`).
1956 // Aligned u32 loads are atomic on every supported
1957 // arch; the volatile qualifier prevents the compiler
1958 // from hoisting the load across the outer poll loop
1959 // so userspace observes the post-trigger SCX_EXIT_*
1960 // value rather than the pre-trigger zero.
1961 diag.bpf_exit_kind_snap =
1962 unsafe { std::ptr::read_volatile(&bss.ktstr_exit_kind_snap as *const u32) };
1963 diag.bpf_timeline_count = sum_pcpu(PCPU_TIMELINE_COUNT);
1964 diag.bpf_timeline_drops = sum_pcpu(PCPU_TIMELINE_DROPS);
1965 }
1966
1967 let key_size = std::mem::size_of::<types::probe_key>();
1968 let mut probe_events = Vec::new();
1969 let mut total_keys = 0u32;
1970 let mut unmatched_ips = 0u32;
1971
1972 // Build IP → (func_idx, display_name) lookup once. The
1973 // event-drain loop below would otherwise scan every entry
1974 // in `func_ips` per probe_data key — O(events × funcs).
1975 // With thousands of funcs and tens of thousands of events
1976 // on a normal run, the linear scan dominates dump time.
1977 // HashMap turns the per-event lookup into O(1).
1978 let func_ips_by_ip: std::collections::HashMap<u64, (u32, &str)> = func_ips
1979 .iter()
1980 .map(|(idx, ip, name)| (*ip, (*idx, name.as_str())))
1981 .collect();
1982
1983 // Pre-compute per-function `field_keys_hints` once. The
1984 // previous code recomputed `build_field_keys` for every
1985 // event, even when many events share the same function —
1986 // O(events × funcs) field-key construction. Building the
1987 // map keyed by function name turns the per-event work
1988 // into a single HashMap lookup.
1989 let field_keys_by_func: std::collections::HashMap<&str, Vec<(String, RenderHint)>> =
1990 btf_funcs
1991 .iter()
1992 .chain(phase_b_btf.iter())
1993 .map(|f| (f.name.as_str(), build_field_keys(f)))
1994 .collect();
1995
1996 for key_bytes in skel.maps.probe_data.keys() {
1997 if key_bytes.len() < key_size {
1998 continue;
1999 }
2000 total_keys += 1;
2001 let key: &types::probe_key =
2002 unsafe { &*(key_bytes.as_ptr() as *const types::probe_key) };
2003
2004 // Find which function this IP belongs to.
2005 let (func_idx, display_name) = match func_ips_by_ip.get(&key.func_ip) {
2006 Some(&(idx, name)) => (idx, name),
2007 None => {
2008 unmatched_ips += 1;
2009 continue;
2010 }
2011 };
2012
2013 if let Ok(Some(val_bytes)) = skel.maps.probe_data.lookup(&key_bytes, MapFlags::ANY)
2014 {
2015 let entry: &types::probe_entry =
2016 unsafe { &*(val_bytes.as_ptr() as *const types::probe_entry) };
2017 if entry.ts == 0 {
2018 continue;
2019 }
2020
2021 // Borrow the pre-computed hints for this function;
2022 // an empty slice for unknown funcs preserves the
2023 // previous `unwrap_or_default()` behaviour.
2024 let empty: Vec<(String, RenderHint)> = Vec::new();
2025 let field_keys_hints: &Vec<(String, RenderHint)> =
2026 field_keys_by_func.get(display_name).unwrap_or(&empty);
2027
2028 let nr = (entry.nr_fields as usize).min(16);
2029 let fields: Vec<(String, u64)> = entry.fields[..nr]
2030 .iter()
2031 .enumerate()
2032 .filter_map(|(i, &val)| {
2033 field_keys_hints.get(i).map(|(k, _)| (k.clone(), val))
2034 })
2035 .collect();
2036
2037 let str_val = if entry.has_str != 0 {
2038 let s = &entry.str_val;
2039 let bytes: Vec<u8> = s.iter().map(|&b| b as u8).collect();
2040 let len = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
2041 let text = std::str::from_utf8(&bytes[..len]).unwrap_or("").to_string();
2042 if text.is_empty() { None } else { Some(text) }
2043 } else {
2044 None
2045 };
2046
2047 // Extract exit-side fields if fexit fired.
2048 let (exit_fields, exit_ts) = if entry.has_exit != 0 {
2049 let nr_exit = (entry.nr_exit_fields as usize).min(16);
2050 let ef: Vec<(String, u64)> = entry.exit_fields[..nr_exit]
2051 .iter()
2052 .enumerate()
2053 .filter_map(|(i, &val)| {
2054 field_keys_hints.get(i).map(|(k, _)| (k.clone(), val))
2055 })
2056 .collect();
2057 (ef, Some(entry.exit_ts))
2058 } else {
2059 (Vec::new(), None)
2060 };
2061
2062 probe_events.push(ProbeEvent {
2063 func_idx,
2064 task_ptr: key.task_ptr,
2065 ts: entry.ts,
2066 args: entry.args,
2067 fields,
2068 kstack: vec![],
2069 str_val,
2070 exit_fields,
2071 exit_ts,
2072 });
2073 }
2074 }
2075
2076 probe_events.sort_by_key(|e| e.ts);
2077
2078 diag.probe_data_keys = total_keys;
2079 diag.probe_data_unmatched_ips = unmatched_ips;
2080 diag.events_before_stitch = probe_events.len() as u32;
2081
2082 tracing::debug!(
2083 events = probe_events.len(),
2084 total_keys,
2085 unmatched_ips,
2086 "probe_data readout",
2087 );
2088
2089 if probe_events.is_empty() {
2090 return (None, diag, Vec::new());
2091 }
2092
2093 // Stitch by task_struct pointer. Build a map of func_idx ->
2094 // task_struct param index from BPF_OP_CALLERS and BTF, then
2095 // filter events to those referencing the same task_struct
2096 // pointer as the causal task.
2097 //
2098 // The args[0] assignment in ktstr_trigger_tp (the BPF
2099 // trigger handler) sets args[0] to
2100 // bpf_get_current_task() ONLY for
2101 // SCX_EXIT_ERROR_BPF (1025), where a BPF scheduler
2102 // callback faulted in the running task's context — so
2103 // `current` IS the causal task. For SCX_EXIT_ERROR
2104 // (1024), args[0] is 0 because that exit can fire from
2105 // kworker context (async unregistration, sysrq), where
2106 // `current` is the worker thread rather than the task
2107 // that triggered the exit. For SCX_EXIT_ERROR_STALL
2108 // (1026) the trigger handler returns early without
2109 // submitting an event at all (watchdog/timer context).
2110 // The filter below therefore drops args[0] == 0 to
2111 // suppress non-causal probe output: no causal task
2112 // means no useful stitch chain.
2113 let task_param_idx = build_task_param_idx(&func_ips, btf_funcs, &phase_b_btf);
2114
2115 // Extract tptr and kstack from the trigger event in one
2116 // lock acquisition. When the trigger did not fire (stop-
2117 // signaled) or the exit kind lacks a causal task, probe
2118 // output is suppressed.
2119 let (target_tptr, trigger_kstack) = {
2120 let guard = events.lock().unwrap();
2121 let tptr = guard.last().and_then(|e| causal_tptr(e.task_ptr));
2122 let kstack = guard.last().map(|e| e.kstack.clone()).unwrap_or_default();
2123 (tptr, kstack)
2124 };
2125
2126 let Some(tptr) = target_tptr else {
2127 // No causal task identified — the trigger fired with
2128 // args[0]==0 (kind=STALL or generic ERROR from kworker
2129 // context), or never fired at all (probe lifecycle
2130 // race, scheduler clean-exited).
2131 //
2132 // Best-effort fallback: instead of returning empty,
2133 // group the captured events by task_struct pointer
2134 // (resolved via task_param_idx for events with a
2135 // task_struct param, otherwise key.task_ptr from the
2136 // probe map) and pick the top-N most-frequent
2137 // pointers. The events that fired during the actual
2138 // crash window ARE causal data — the host renderer
2139 // will mark the output as "trigger absent —
2140 // best-effort grouped by frequency" so the operator
2141 // doesn't mistake the candidate chain for a verified
2142 // stitch. Cap at 3 candidates to keep the output
2143 // readable; one task usually dominates a single
2144 // crash window.
2145 use std::collections::HashMap;
2146 let mut counts: HashMap<u64, u32> = HashMap::new();
2147 for ev in &probe_events {
2148 let key = if let Some(&pidx) = task_param_idx.get(&ev.func_idx) {
2149 ev.args[pidx]
2150 } else {
2151 ev.task_ptr
2152 };
2153 if key != 0 {
2154 *counts.entry(key).or_default() += 1;
2155 }
2156 }
2157 if counts.is_empty() {
2158 tracing::debug!(
2159 "no causal tptr and no candidate task_ptrs — suppressing probe output"
2160 );
2161 return (None, diag, Vec::new());
2162 }
2163 let mut sorted: Vec<(u64, u32)> = counts.into_iter().collect();
2164 sorted.sort_by(|a, b| b.1.cmp(&a.1));
2165 const MAX_CANDIDATES: usize = 3;
2166 sorted.truncate(MAX_CANDIDATES);
2167 let candidate_set: std::collections::HashSet<u64> =
2168 sorted.iter().map(|(k, _)| *k).collect();
2169 let before = probe_events.len();
2170 probe_events.retain(|e| {
2171 let key = if let Some(&pidx) = task_param_idx.get(&e.func_idx) {
2172 e.args[pidx]
2173 } else {
2174 e.task_ptr
2175 };
2176 candidate_set.contains(&key)
2177 });
2178 tracing::debug!(
2179 candidates = sorted.len(),
2180 kept = probe_events.len(),
2181 total = before,
2182 "stitched by frequency (fallback — no causal tptr)"
2183 );
2184 diag.events_after_stitch = probe_events.len() as u32;
2185 diag.stitch_fallback_used = true;
2186 let fnames: Vec<(u32, String)> = func_ips
2187 .iter()
2188 .map(|(idx, _, name)| (*idx, name.clone()))
2189 .collect();
2190 return (Some(probe_events), diag, fnames);
2191 };
2192
2193 let before = probe_events.len();
2194 probe_events.retain(|e| {
2195 if let Some(&pidx) = task_param_idx.get(&e.func_idx) {
2196 e.args[pidx] == tptr
2197 } else {
2198 e.task_ptr == tptr // no task_struct param — match on current
2199 }
2200 });
2201 tracing::debug!(
2202 tptr = format_args!("0x{tptr:x}"),
2203 kept = probe_events.len(),
2204 total = before,
2205 "stitched by task_struct arg",
2206 );
2207
2208 diag.events_after_stitch = probe_events.len() as u32;
2209
2210 // Attach trigger kstack if available.
2211 if let Some(last) = probe_events.last_mut() {
2212 last.kstack = trigger_kstack;
2213 }
2214
2215 let fnames: Vec<(u32, String)> = func_ips
2216 .iter()
2217 .map(|(idx, _, name)| (*idx, name.clone()))
2218 .collect();
2219 return (Some(probe_events), diag, fnames);
2220 }
2221 }
2222}
2223
2224/// Attach Phase B probes after the scheduler starts.
2225///
2226/// Handles both kernel callers (kprobes + kernel fexit) and BPF
2227/// callbacks (fentry/fexit). Uses `pb.func_idx_offset` for all
2228/// func_idx values to avoid collisions with Phase A indices.
2229///
2230/// Kprobes are attached via the Phase A kprobe skeleton (`skel`),
2231/// which stays alive throughout. BPF fentry/fexit use separate
2232/// skeleton loads that share `probe_data` and `func_meta_map` via
2233/// `reuse_fd`.
2234fn attach_phase_b_fentry(
2235 skel: &crate::bpf_skel::ProbeSkel<'_>,
2236 pb: &PhaseBInput,
2237 func_ips: &mut Vec<(u32, u64, String)>,
2238 fentry_links: &mut Vec<libbpf_rs::Link>,
2239 fexit_links: &mut Vec<libbpf_rs::Link>,
2240 links: &mut Vec<(libbpf_rs::Link, String)>,
2241 diag: &mut ProbeDiagnostics,
2242) {
2243 use libbpf_rs::MapCore;
2244 use libbpf_rs::MapFlags;
2245 use libbpf_rs::skel::{OpenSkel, SkelBuilder};
2246
2247 const FENTRY_BATCH: usize = 4;
2248
2249 struct FentryTarget<'a> {
2250 slot: usize,
2251 fd: i32,
2252 idx: u32,
2253 name: &'a str,
2254 ok: bool,
2255 is_kernel: bool,
2256 }
2257
2258 let offset = pb.func_idx_offset;
2259
2260 // Pre-load vmlinux BTF once for the Phase B kprobe + fentry
2261 // loops below. Same rationale as the Phase A path in
2262 // `run_probe_skeleton`: per-call `resolve_field_specs(_, None)`
2263 // would re-parse the multi-MB vmlinux BTF on every iteration,
2264 // dominating Phase B attach time on a kernel with thousands of
2265 // probed functions.
2266 //
2267 // `cached_vmlinux_btf` shares the process-global memo with Phase
2268 // A so a single auto-repro VM pays one parse for both phases, and
2269 // a multi-VM nextest run pays one parse total.
2270 let cached_btf = crate::monitor::btf_offsets::cached_vmlinux_btf();
2271
2272 // --- Phase B kernel functions: kprobes + func_meta ---
2273 //
2274 // Two-pass shape: first populate `func_meta_map` for every
2275 // resolvable target and stage the (raw_name, idx, ip,
2276 // display_name) tuples, then run all kprobe attaches in
2277 // parallel via [`parallel_attach_kprobes`]. Splitting the loop
2278 // matters because the attach is the slow syscall pair (one
2279 // `perf_event_open` + one `BPF_LINK_CREATE` each); meta
2280 // population is a quick map update. Doing them as a single
2281 // sequential loop forces the slow attach to gate the next
2282 // iteration's meta update for no good reason.
2283 struct PhaseBKprobeTarget {
2284 raw_name: String,
2285 display_name: String,
2286 idx: u32,
2287 ip: u64,
2288 }
2289 let mut kprobe_targets: Vec<PhaseBKprobeTarget> = Vec::new();
2290 for (i, func) in pb.functions.iter().enumerate() {
2291 if func.is_bpf {
2292 continue;
2293 }
2294 let idx = offset + i as u32;
2295 let ip = match resolve_func_ip(&func.raw_name) {
2296 Some(ip) => ip,
2297 None => {
2298 tracing::warn!(func = %func.raw_name, "phase_b: could not resolve function IP");
2299 diag.kprobe_resolve_failed.push(func.raw_name.clone());
2300 continue;
2301 }
2302 };
2303
2304 let mut meta = types::func_meta {
2305 func_idx: idx,
2306 ..Default::default()
2307 };
2308
2309 if let Some(btf_func) = pb.btf_funcs.iter().find(|f| f.name == func.raw_name) {
2310 let field_specs = match cached_btf.as_ref() {
2311 Some(btf) => super::btf::resolve_field_specs_with_btf(btf_func, btf),
2312 None => Vec::new(),
2313 };
2314 populate_field_specs(&mut meta, &field_specs);
2315 meta.str_param_idx = detect_str_param(btf_func);
2316 }
2317
2318 let key_bytes = ip.to_ne_bytes();
2319 let meta_bytes = unsafe {
2320 std::slice::from_raw_parts(
2321 &meta as *const _ as *const u8,
2322 std::mem::size_of::<types::func_meta>(),
2323 )
2324 };
2325
2326 if let Err(e) = skel
2327 .maps
2328 .func_meta_map
2329 .update(&key_bytes, meta_bytes, MapFlags::ANY)
2330 {
2331 tracing::warn!(%e, func = %func.raw_name, "phase_b: failed to update func_meta_map");
2332 continue;
2333 }
2334
2335 kprobe_targets.push(PhaseBKprobeTarget {
2336 raw_name: func.raw_name.clone(),
2337 display_name: func.display_name.clone(),
2338 idx,
2339 ip,
2340 });
2341 }
2342
2343 let attach_input: Vec<String> = kprobe_targets.iter().map(|t| t.raw_name.clone()).collect();
2344 let attach_results = parallel_attach_kprobes(&skel.progs.ktstr_probe, &attach_input);
2345 // Pair the attach result with its target metadata. `attach_results`
2346 // is in the same order as `attach_input` is in the same order as
2347 // `kprobe_targets`, so a zip-and-iterate replays the original
2348 // sequential post-attach bookkeeping (links push, func_ips push,
2349 // counter bumps) without reordering.
2350 for (target, (raw_name, result)) in kprobe_targets.into_iter().zip(attach_results.into_iter()) {
2351 debug_assert_eq!(target.raw_name, raw_name);
2352 match result {
2353 Ok(link) => {
2354 links.push((link, raw_name));
2355 diag.kprobe_attached += 1;
2356 }
2357 Err(e) => {
2358 tracing::warn!(%e, func = %raw_name, "phase_b kprobe attach failed");
2359 diag.kprobe_attach_failed.push((raw_name, e.to_string()));
2360 }
2361 }
2362 diag.kprobe_resolved += 1;
2363 func_ips.push((target.idx, target.ip, target.display_name));
2364 }
2365
2366 // --- Phase B kernel function fexit batches (fd=0 = vmlinux BTF) ---
2367 struct KernelFexitTarget {
2368 idx: u32,
2369 name: String,
2370 }
2371 let kernel_fexit_targets: Vec<KernelFexitTarget> = pb
2372 .functions
2373 .iter()
2374 .enumerate()
2375 .filter(|(_, f)| !f.is_bpf)
2376 .map(|(i, f)| KernelFexitTarget {
2377 idx: offset + i as u32,
2378 name: f.display_name.clone(),
2379 })
2380 .collect();
2381
2382 for chunk in kernel_fexit_targets.chunks(FENTRY_BATCH) {
2383 let mut targets: Vec<FentryTarget<'_>> = Vec::new();
2384 for (slot, kt) in chunk.iter().enumerate() {
2385 targets.push(FentryTarget {
2386 slot,
2387 fd: 0,
2388 idx: kt.idx,
2389 name: &kt.name,
2390 ok: false,
2391 is_kernel: true,
2392 });
2393 }
2394
2395 use crate::bpf_skel::fentry::*;
2396 let mut fentry_open_obj = std::mem::MaybeUninit::uninit();
2397 let fentry_builder = FentryProbeSkelBuilder::default();
2398 let mut fentry_open = match fentry_builder.open(&mut fentry_open_obj) {
2399 Ok(s) => s,
2400 Err(e) => {
2401 tracing::warn!(%e, "phase_b kernel fexit skeleton open failed");
2402 continue;
2403 }
2404 };
2405
2406 if let Some(rodata) = fentry_open.maps.rodata_data.as_mut() {
2407 rodata.ktstr_enabled = true;
2408 for t in &targets {
2409 set_rodata_slot(rodata, t.slot, t.idx, t.is_kernel);
2410 }
2411 }
2412
2413 for t in targets.iter_mut() {
2414 let Some(fentry_prog) = fentry_prog_mut_by_slot(&mut fentry_open, t.slot) else {
2415 continue;
2416 };
2417 fentry_prog.set_autoload(false);
2418
2419 let Some(fexit_prog) = fexit_prog_mut_by_slot(&mut fentry_open, t.slot) else {
2420 continue;
2421 };
2422 match fexit_prog.set_attach_target(0, Some(t.name.to_string())) {
2423 Ok(()) => {
2424 t.ok = true;
2425 }
2426 Err(e) => {
2427 tracing::debug!(func = t.name, %e, "phase_b kernel fexit: set_attach_target failed");
2428 fexit_prog.set_autoload(false);
2429 }
2430 }
2431 }
2432
2433 if !targets.iter().any(|t| t.ok) {
2434 continue;
2435 }
2436
2437 // Disable fexit for unused slots (see the matching single-phase
2438 // kernel-fexit batch for the fentry-left-at-default rationale).
2439 let used_slots: std::collections::HashSet<usize> =
2440 targets.iter().filter(|t| t.ok).map(|t| t.slot).collect();
2441 for slot in 0..FENTRY_BATCH {
2442 if !used_slots.contains(&slot)
2443 && let Some(p) = fexit_prog_mut_by_slot(&mut fentry_open, slot)
2444 {
2445 p.set_autoload(false);
2446 }
2447 }
2448
2449 use std::os::unix::io::AsFd;
2450 if let Err(e) = fentry_open
2451 .maps
2452 .probe_data
2453 .reuse_fd(skel.maps.probe_data.as_fd())
2454 {
2455 tracing::warn!(%e, "phase_b kernel fexit: probe_data reuse_fd failed");
2456 }
2457 if let Err(e) = fentry_open
2458 .maps
2459 .func_meta_map
2460 .reuse_fd(skel.maps.func_meta_map.as_fd())
2461 {
2462 tracing::warn!(%e, "phase_b kernel fexit: func_meta_map reuse_fd failed");
2463 }
2464
2465 let fentry_skel = match fentry_open.load() {
2466 Ok(s) => s,
2467 Err(e) => {
2468 tracing::warn!(%e, "phase_b kernel fexit: batch load failed");
2469 continue;
2470 }
2471 };
2472
2473 for t in &targets {
2474 if !t.ok {
2475 continue;
2476 }
2477 let Some(result) = attach_fexit_by_slot(&fentry_skel, t.slot) else {
2478 continue;
2479 };
2480 match result {
2481 Ok(link) => {
2482 tracing::debug!(func = t.name, "phase_b kernel fexit attached");
2483 fexit_links.push(link);
2484 }
2485 Err(e) => {
2486 tracing::debug!(%e, func = t.name, "phase_b kernel fexit attach failed");
2487 }
2488 }
2489 }
2490
2491 drop(fentry_skel);
2492 }
2493
2494 // --- Phase B BPF callback fentry/fexit batches ---
2495 let valid_bpf: Vec<(u32, &StackFunction)> = pb
2496 .functions
2497 .iter()
2498 .enumerate()
2499 .filter(|(_, f)| f.bpf_prog_id.is_some())
2500 .map(|(i, f)| (offset + i as u32, f))
2501 .collect();
2502 diag.fentry_candidates += valid_bpf.len() as u32;
2503
2504 for chunk in valid_bpf.chunks(FENTRY_BATCH) {
2505 let mut targets: Vec<FentryTarget<'_>> = Vec::new();
2506 for (slot, (idx, func)) in chunk.iter().enumerate() {
2507 let prog_id = func.bpf_prog_id.unwrap();
2508 let fd = if let Some(&pre_fd) = pb.bpf_prog_fds.get(&prog_id) {
2509 let dup_fd = unsafe { libc::dup(pre_fd) };
2510 if dup_fd < 0 {
2511 tracing::warn!(prog_id, func = %func.display_name, "phase_b fentry: dup failed");
2512 diag.fentry_attach_failed.push((
2513 func.display_name.clone(),
2514 format!("dup(pre_fd={pre_fd}) failed"),
2515 ));
2516 continue;
2517 }
2518 dup_fd
2519 } else {
2520 let fd = unsafe { libbpf_rs::libbpf_sys::bpf_prog_get_fd_by_id(prog_id) };
2521 if fd < 0 {
2522 tracing::warn!(prog_id, func = %func.display_name, "phase_b fentry: failed to get fd");
2523 diag.fentry_attach_failed.push((
2524 func.display_name.clone(),
2525 format!("bpf_prog_get_fd_by_id({prog_id}) returned {fd}"),
2526 ));
2527 continue;
2528 }
2529 fd
2530 };
2531 targets.push(FentryTarget {
2532 slot,
2533 fd,
2534 idx: *idx,
2535 name: &func.display_name,
2536 ok: false,
2537 is_kernel: false,
2538 });
2539 }
2540 if targets.is_empty() {
2541 continue;
2542 }
2543
2544 use crate::bpf_skel::fentry::*;
2545 let mut fentry_open_obj = std::mem::MaybeUninit::uninit();
2546 let fentry_builder = FentryProbeSkelBuilder::default();
2547 let mut fentry_open = match fentry_builder.open(&mut fentry_open_obj) {
2548 Ok(s) => s,
2549 Err(e) => {
2550 tracing::warn!(%e, "phase_b fentry skeleton open failed");
2551 for t in &targets {
2552 unsafe { libc::close(t.fd) };
2553 }
2554 continue;
2555 }
2556 };
2557
2558 if let Some(rodata) = fentry_open.maps.rodata_data.as_mut() {
2559 rodata.ktstr_enabled = true;
2560 for t in &targets {
2561 set_rodata_slot(rodata, t.slot, t.idx, t.is_kernel);
2562 }
2563 }
2564
2565 for t in targets.iter_mut() {
2566 let Some(fentry_prog) = fentry_prog_mut_by_slot(&mut fentry_open, t.slot) else {
2567 continue;
2568 };
2569 match fentry_prog.set_attach_target(t.fd, Some(t.name.to_string())) {
2570 Ok(()) => {
2571 t.ok = true;
2572 }
2573 Err(e) => {
2574 tracing::warn!(slot = t.slot, func = t.name, %e, "phase_b fentry: set_attach_target failed");
2575 diag.fentry_attach_failed
2576 .push((t.name.to_string(), format!("set_attach_target: {e}")));
2577 continue;
2578 }
2579 }
2580 let Some(fexit_prog) = fexit_prog_mut_by_slot(&mut fentry_open, t.slot) else {
2581 continue;
2582 };
2583 if let Err(e) = fexit_prog.set_attach_target(t.fd, Some(t.name.to_string())) {
2584 tracing::debug!(slot = t.slot, func = t.name, %e, "phase_b fexit: set_attach_target failed (entry-only)");
2585 fexit_prog.set_autoload(false);
2586 }
2587 }
2588
2589 if !targets.iter().any(|t| t.ok) {
2590 for t in &targets {
2591 unsafe { libc::close(t.fd) };
2592 }
2593 continue;
2594 }
2595
2596 let used_slots: std::collections::HashSet<usize> =
2597 targets.iter().filter(|t| t.ok).map(|t| t.slot).collect();
2598 for slot in 0..FENTRY_BATCH {
2599 if !used_slots.contains(&slot) {
2600 disable_slot_programs(&mut fentry_open, slot);
2601 }
2602 }
2603
2604 use std::os::unix::io::AsFd;
2605 if let Err(e) = fentry_open
2606 .maps
2607 .probe_data
2608 .reuse_fd(skel.maps.probe_data.as_fd())
2609 {
2610 tracing::warn!(%e, "phase_b fentry: probe_data reuse_fd failed");
2611 }
2612 if let Err(e) = fentry_open
2613 .maps
2614 .func_meta_map
2615 .reuse_fd(skel.maps.func_meta_map.as_fd())
2616 {
2617 tracing::warn!(%e, "phase_b fentry: func_meta_map reuse_fd failed");
2618 }
2619
2620 let fentry_skel = match fentry_open.load() {
2621 Ok(s) => {
2622 for t in &targets {
2623 unsafe { libc::close(t.fd) };
2624 }
2625 s
2626 }
2627 Err(e) => {
2628 tracing::warn!(%e, "phase_b fentry: batch load failed");
2629 for t in &targets {
2630 if t.ok {
2631 diag.fentry_attach_failed
2632 .push((t.name.to_string(), format!("batch load: {e}")));
2633 }
2634 unsafe { libc::close(t.fd) };
2635 }
2636 continue;
2637 }
2638 };
2639
2640 for t in &targets {
2641 if !t.ok {
2642 continue;
2643 }
2644
2645 let sentinel_ip = (t.idx as u64) | (1u64 << 63);
2646 let mut meta = crate::bpf_skel::types::func_meta {
2647 func_idx: t.idx,
2648 ..Default::default()
2649 };
2650
2651 if let Some(btf_func) = pb.btf_funcs.iter().find(|f| f.name == t.name) {
2652 let mut field_specs = match cached_btf.as_ref() {
2653 Some(btf) => super::btf::resolve_field_specs_with_btf(btf_func, btf),
2654 None => Vec::new(),
2655 };
2656 if field_specs.is_empty()
2657 && let Some(prog_id) = pb
2658 .functions
2659 .iter()
2660 .find(|f| f.display_name == t.name)
2661 .and_then(|f| f.bpf_prog_id)
2662 {
2663 field_specs = super::btf::resolve_bpf_field_specs(btf_func, prog_id);
2664 }
2665 populate_field_specs(&mut meta, &field_specs);
2666 meta.str_param_idx = detect_str_param(btf_func);
2667 }
2668
2669 let Some(result) = attach_fentry_by_slot(&fentry_skel, t.slot) else {
2670 continue;
2671 };
2672 let link = match result {
2673 Ok(link) => {
2674 tracing::debug!(func = t.name, "phase_b fentry attached");
2675 link
2676 }
2677 Err(e) => {
2678 tracing::warn!(%e, func = t.name, "phase_b fentry attach failed");
2679 diag.fentry_attach_failed
2680 .push((t.name.to_string(), e.to_string()));
2681 continue;
2682 }
2683 };
2684
2685 // func_meta_map.update + func_ips.push run AFTER the
2686 // fentry attach succeeds. See the matching ordering
2687 // rationale at the phase A site above: reversing the
2688 // order would orphan map entries and func_ip tuples on
2689 // attach failure. If the map update fails after the
2690 // attach succeeded, drop the Link so post-attach state
2691 // matches what func_ips reports.
2692 let key_bytes = sentinel_ip.to_ne_bytes();
2693 let meta_bytes = unsafe {
2694 std::slice::from_raw_parts(
2695 &meta as *const _ as *const u8,
2696 std::mem::size_of::<crate::bpf_skel::types::func_meta>(),
2697 )
2698 };
2699 if let Err(e) = skel
2700 .maps
2701 .func_meta_map
2702 .update(&key_bytes, meta_bytes, MapFlags::ANY)
2703 {
2704 tracing::warn!(%e, func = t.name, "phase_b fentry: failed to update func_meta_map; dropping attached link");
2705 drop(link);
2706 continue;
2707 }
2708 fentry_links.push(link);
2709 func_ips.push((t.idx, sentinel_ip, t.name.to_string()));
2710 let Some(fexit_result) = attach_fexit_by_slot(&fentry_skel, t.slot) else {
2711 continue;
2712 };
2713 match fexit_result {
2714 Ok(link) => {
2715 tracing::debug!(func = t.name, "phase_b fexit attached");
2716 fexit_links.push(link);
2717 }
2718 Err(e) => {
2719 tracing::debug!(%e, func = t.name, "phase_b fexit attach failed (entry-only)");
2720 }
2721 }
2722 }
2723
2724 drop(fentry_skel);
2725 }
2726
2727 diag.fentry_attached = fentry_links.len() as u32;
2728 tracing::debug!(
2729 fentry = fentry_links.len(),
2730 fexit = fexit_links.len(),
2731 bpf_targets = valid_bpf.len(),
2732 kernel_targets = kernel_fexit_targets.len(),
2733 "Phase B probes attached",
2734 );
2735}
2736
2737#[cfg(test)]
2738#[path = "process_tests.rs"]
2739mod tests;