ktstr/monitor/
task_enrichment.rs

1//! Per-task failure-dump enrichment: read every Tier-1 task field the
2//! failure-dump renderer surfaces in one pass.
3//!
4//! Given a task_struct KVA from a freeze-time walker (rq->scx walker
5//! for runnable tasks, DSQ walker for queued tasks, init_task→tasks
6//! walker for thread-group enumeration), this module reads:
7//!
8//! - Identity: pid, tgid, comm, group_leader_pid, real_parent_pid+comm
9//! - Process tree: pgid, sid, nr_threads (via signal_struct)
//! - Scheduling: prio, static_prio, normal_prio, rt_priority,
//!   sched_class decoded to a name ("fair" / "rt" / "dl" / "idle" /
//!   "stop" / "ext"), scx.weight, core_cookie (CONFIG_SCHED_CORE-gated)
13//! - Watchdog disambiguation: `pi_boosted_out_of_scx` flag set when
14//!   the runnable task's `sched_class` is not `ext_sched_class` (the
15//!   PI-boost path moved it out and the failure isn't the BPF
16//!   scheduler's fault — see kernel/sched/core.c rt_mutex_setprio interactions)
17//! - Context-switch counters: per-task nvcsw/nivcsw + per-thread-group
18//!   signal->nvcsw/nivcsw
//! - Lock-contention hints: stack-trace pattern match against the
//!   function-entry symbol KVAs of `queued_spin_lock_slowpath`,
21//!   `__mutex_lock_slowpath`, `rwsem_down_read_slowpath`,
22//!   `rwsem_down_write_slowpath`. A PC inside any slowpath function
23//!   on a runnable ('R') task indicates lock contention rather than
24//!   scheduler fault. Stack-trace walking is only attempted when the
25//!   caller supplies a single instruction-pointer PC (`Option<u64>`,
26//!   typically harvested from the freeze coordinator's
27//!   `VcpuRegSnapshot.instruction_pointer`
28//!   for currently-running tasks; runnable-but-not-current tasks have
29//!   no stack PCs without a kernel-side unwinder, which ktstr does
30//!   not implement).
31//!
32//! The walker is best-effort: any pointer follow that fails to
33//! translate yields a `None` for that derived field rather than
34//! aborting the whole enrichment.
35
36use serde::{Deserialize, Serialize};
37
38use super::btf_offsets::{TaskEnrichmentOffsets, pid_type};
39use super::guest::GuestKernel;
40use super::idr::translate_any_kva;
41
/// Maximum bytes of `comm` to read.
///
/// Kernel-pinned at 16 by `include/linux/sched.h::TASK_COMM_LEN`. The
/// walker reads exactly this many bytes into a fixed buffer; the
/// resulting `String` is cut at the first nul byte (or spans the whole
/// buffer when no nul is present) and decoded as lossy UTF-8.
const TASK_COMM_LEN: usize = 16;
48
49/// Sched-class symbol KVAs cached for decode + watchdog
50/// disambiguation. All six are resolved via vmlinux ELF symbol table
51/// at coordinator start; missing symbols (kernel built without the
52/// corresponding scheduling class — typically `dl_sched_class` on
53/// CONFIG_SCHED_DEADLINE=n) leave the slot as `None`, and the
54/// decoder returns `None` for that class.
55///
56/// All addresses are KVAs of the per-class `sched_class` static
57/// (`fair_sched_class`, `rt_sched_class`, `dl_sched_class`,
58/// `idle_sched_class`, `stop_sched_class`, `ext_sched_class`). On a
59/// running guest, `task_struct.sched_class` points at exactly one of
60/// these — comparing the read pointer to the cached set yields the
61/// class name without needing a kallsyms parse.
62#[derive(Debug, Clone, Default)]
63pub struct SchedClassRegistry {
64    pub fair: Option<u64>,
65    pub rt: Option<u64>,
66    pub dl: Option<u64>,
67    pub idle: Option<u64>,
68    pub stop: Option<u64>,
69    pub ext: Option<u64>,
70}
71
72#[allow(dead_code)] // wired through DumpContext.task_enrichment_capture:
73// capture_tasks::build resolves this registry, freeze_coord builds a
74// TaskEnrichmentCapture from it, and dump_state passes it to
75// walk_task_enrichment for each captured task.
76impl SchedClassRegistry {
77    /// Resolve all six class symbols via the GuestKernel's vmlinux
78    /// symbol table. Each lookup is independent — a missing symbol
79    /// for one class doesn't fail the others.
80    pub fn from_guest_kernel(kernel: &GuestKernel) -> Self {
81        Self {
82            fair: kernel.symbol_kva("fair_sched_class"),
83            rt: kernel.symbol_kva("rt_sched_class"),
84            dl: kernel.symbol_kva("dl_sched_class"),
85            idle: kernel.symbol_kva("idle_sched_class"),
86            stop: kernel.symbol_kva("stop_sched_class"),
87            ext: kernel.symbol_kva("ext_sched_class"),
88        }
89    }
90
91    /// Decode a `task_struct.sched_class` pointer to a class name.
92    /// Returns `None` when the pointer matches no known class
93    /// (stripped vmlinux, an out-of-tree class the kernel added,
94    /// or a torn read landing on garbage).
95    pub fn decode(&self, sched_class_kva: u64) -> Option<&'static str> {
96        if sched_class_kva == 0 {
97            return None;
98        }
99        if Some(sched_class_kva) == self.fair {
100            return Some("fair");
101        }
102        if Some(sched_class_kva) == self.rt {
103            return Some("rt");
104        }
105        if Some(sched_class_kva) == self.dl {
106            return Some("dl");
107        }
108        if Some(sched_class_kva) == self.idle {
109            return Some("idle");
110        }
111        if Some(sched_class_kva) == self.stop {
112            return Some("stop");
113        }
114        if Some(sched_class_kva) == self.ext {
115            return Some("ext");
116        }
117        None
118    }
119}
120
121/// Lock-slowpath function KVAs. Used by the stack-trace lock detector
122/// to flag runnable tasks whose stack contains a slowpath PC —
123/// indicating the apparent scheduler stall is actually lock
124/// contention, not BPF scheduler fault.
125///
126/// Each address is the function entry; the detector flags any PC in
127/// `[start, start + LOCK_SLOWPATH_FN_MAX_SIZE)`. Without ELF symbol
128/// size info we can't bound this exactly, so we use a conservative
129/// 4 KiB window. False positives on adjacent functions are acceptable
130/// for a diagnostic flag; false negatives only matter on slowpaths
131/// longer than 4 KiB, none of which occur in mainline.
132#[derive(Debug, Clone, Default)]
133pub struct LockSlowpathRegistry {
134    pub queued_spin_lock_slowpath: Option<u64>,
135    pub mutex_lock_slowpath: Option<u64>,
136    pub rwsem_down_read_slowpath: Option<u64>,
137    pub rwsem_down_write_slowpath: Option<u64>,
138}
139
140/// Conservative max function size for stack-PC matching against
141/// lock-slowpath entry symbols. See `LockSlowpathRegistry` doc.
142const LOCK_SLOWPATH_FN_MAX_SIZE: u64 = 4096;
143
144#[allow(dead_code)] // same wiring as SchedClassRegistry above: resolved by
145// capture_tasks::build, threaded through TaskEnrichmentCapture into
146// walk_task_enrichment.
147impl LockSlowpathRegistry {
148    /// Resolve the four lock-slowpath symbols from the GuestKernel's
149    /// vmlinux. Each lookup is independent; absent symbols leave the
150    /// corresponding slot None and the matcher silently skips that
151    /// pattern.
152    pub fn from_guest_kernel(kernel: &GuestKernel) -> Self {
153        Self {
154            queued_spin_lock_slowpath: kernel.symbol_kva("queued_spin_lock_slowpath"),
155            // `__mutex_lock_slowpath` is the historical name; modern
156            // kernels (~4.15+) inline the slowpath into
157            // `__mutex_lock`, but a leftover symbol remains in many
158            // configs. Fall through to `__mutex_lock` if the
159            // slowpath symbol is absent — both indicate the same
160            // contention pattern at PC granularity.
161            mutex_lock_slowpath: kernel
162                .symbol_kva("__mutex_lock_slowpath")
163                .or_else(|| kernel.symbol_kva("__mutex_lock")),
164            rwsem_down_read_slowpath: kernel.symbol_kva("rwsem_down_read_slowpath"),
165            rwsem_down_write_slowpath: kernel.symbol_kva("rwsem_down_write_slowpath"),
166        }
167    }
168
169    /// Match a single PC against the four slowpath windows. Returns
170    /// the pattern name when any window contains the PC, or `None`
171    /// otherwise.
172    pub fn match_pc(&self, pc: u64) -> Option<&'static str> {
173        let probe = |start: Option<u64>, name: &'static str| -> Option<&'static str> {
174            let s = start?;
175            // Symbol KVAs come from the guest's vmlinux. A corrupt
176            // ELF could place a slowpath symbol near u64::MAX; the
177            // window upper bound `s + 4096` would wrap, and `pc <
178            // wrapped` would falsely match every PC. checked_add
179            // returning None means "this symbol can't define a
180            // valid window" — treat as no match for that pattern.
181            let end = s.checked_add(LOCK_SLOWPATH_FN_MAX_SIZE)?;
182            if pc >= s && pc < end {
183                Some(name)
184            } else {
185                None
186            }
187        };
188        probe(self.queued_spin_lock_slowpath, "queued_spin_lock_slowpath")
189            .or_else(|| probe(self.mutex_lock_slowpath, "mutex_lock_slowpath"))
190            .or_else(|| probe(self.rwsem_down_read_slowpath, "rwsem_down_read_slowpath"))
191            .or_else(|| probe(self.rwsem_down_write_slowpath, "rwsem_down_write_slowpath"))
192    }
193}
194
/// Per-task enrichment captured at freeze time.
///
/// Every field is best-effort: read failures (untranslatable RCU
/// pointer, slab-page eviction race, missing BTF field) yield `None`
/// rather than failing the whole capture. Optional fields cover both
/// "absent on this kernel build" (e.g. `core_cookie` without
/// CONFIG_SCHED_CORE) and "unreadable at this freeze instant" (e.g.
/// `real_parent_pid` when the parent task_struct's slab page didn't
/// translate).
///
/// Wire shape: every `Option` field is annotated
/// `#[serde(default, skip_serializing_if = "Option::is_none")]`, so a
/// `None` is omitted from the serialized form and a missing key
/// deserializes back to `None`. Non-optional fields carry no serde
/// attributes.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct TaskEnrichment {
    /// `task_struct.pid`. The kernel's per-task identifier. Read as a
    /// raw 32-bit value and reinterpreted as `i32`.
    pub pid: i32,
    /// `task_struct.tgid`. Thread-group identifier (POSIX `getpid()`).
    pub tgid: i32,
    /// `task_struct.comm` truncated at the first nul byte. At most
    /// `TASK_COMM_LEN` bytes are read; the bytes are decoded as lossy
    /// UTF-8.
    pub comm: String,
    /// `task_struct.group_leader->pid`. Pointer-followed; `None` when
    /// the group_leader pointer is NULL or fails to translate.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub group_leader_pid: Option<i32>,
    /// `task_struct.real_parent->pid`. RCU pointer-followed;
    /// `None` when the real_parent pointer is NULL or fails to
    /// translate.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub real_parent_pid: Option<i32>,
    /// `task_struct.real_parent->comm` truncated at the first nul.
    /// `None` if real_parent unreadable; populated together with
    /// `real_parent_pid` (both `Some` or both `None`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub real_parent_comm: Option<String>,
    /// `signal->pids[PIDTYPE_PGID]->numbers[0].nr`. Process group id.
    /// `None` on NULL or untranslatable signal_struct pointer, NULL
    /// pids slot, or untranslatable `struct pid` pointer.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pgid: Option<i32>,
    /// `signal->pids[PIDTYPE_SID]->numbers[0].nr`. Session id.
    /// `None` under the same conditions as `pgid`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sid: Option<i32>,
    /// `signal->nr_threads`. Live thread count for the thread group.
    /// `None` when the signal_struct pointer is NULL or fails to
    /// translate.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub nr_threads: Option<i32>,
    /// `task_struct.scx.weight` (u32). scx-domain CFS-equivalent
    /// weight; 100 default.
    pub weight: u32,
    /// `task_struct.prio`. Effective scheduling priority
    /// (PI-boost-aware).
    pub prio: i32,
    /// `task_struct.static_prio`. User-set priority before PI boost.
    pub static_prio: i32,
    /// `task_struct.normal_prio`. Normal priority for the class.
    pub normal_prio: i32,
    /// `task_struct.rt_priority`. RT priority (1-99) for SCHED_FIFO/RR.
    pub rt_priority: u32,
    /// Decoded sched_class name: "fair", "rt", "dl", "idle", "stop",
    /// or "ext". `None` when the pointer is NULL or matches no cached
    /// class (stripped vmlinux or out-of-tree class).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sched_class: Option<String>,
    /// `task_struct.core_cookie` (unsigned long).
    /// CONFIG_SCHED_CORE-gated; `None` on kernels built without it
    /// (i.e. when no offset for the field is available).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub core_cookie: Option<u64>,
    /// True iff the task was on the rq->scx.runnable_list at freeze
    /// time AND the `ext_sched_class` symbol was resolved
    /// (`classes.ext.is_some()`) AND `sched_class != ext_sched_class`;
    /// an unresolved ext symbol leaves the flag false. Indicates the PI
    /// boost path moved it out of SCX (rt_mutex_setprio) — failure
    /// is not the BPF scheduler's fault. Set only by the runnable
    /// walker; the queued-DSQ walker leaves this `false`.
    pub pi_boosted_out_of_scx: bool,
    /// `task_struct.nvcsw` (unsigned long). Voluntary context
    /// switches for this live thread.
    pub nvcsw: u64,
    /// `task_struct.nivcsw` (unsigned long). Involuntary context
    /// switches.
    pub nivcsw: u64,
    /// `signal->nvcsw` (unsigned long). Thread-group accumulator
    /// for dead threads. `None` when the signal_struct pointer is
    /// NULL or fails to translate.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub signal_nvcsw: Option<u64>,
    /// `signal->nivcsw` (unsigned long). Mirror of `signal_nvcsw`,
    /// including its `None` conditions.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub signal_nivcsw: Option<u64>,
    /// `task_struct.utime` (u64, nanoseconds). This live thread's
    /// cumulative user-mode CPU time, task-lifetime monotonic. The raw
    /// kernel accumulator (NOT the cputime_adjust-scaled /proc value),
    /// equal to taskstats `ac_utime` modulo ns→us truncation. Counter
    /// semantics: per-phase user time = end-start delta of the
    /// summed-across-tasks reading.
    pub utime: u64,
    /// `task_struct.stime` (u64, nanoseconds). This live thread's
    /// cumulative system-mode (in-kernel) CPU time — the DSQ-spinlock
    /// regression's direct symptom. Same raw-accumulator / Counter
    /// semantics as [`Self::utime`].
    pub stime: u64,
    /// `signal->utime` (u64, ns). Thread-group accumulator of EXITED
    /// threads' user-mode CPU time (`__exit_signal` folds a dying
    /// thread's utime here). `None` when the signal_struct pointer is
    /// NULL or fails to translate.
    /// Shared across a thread group, so a per-group sum must add it
    /// exactly once; combined with the live-thread [`Self::utime`] sum
    /// it keeps the per-phase total from dipping when a worker exits.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub signal_utime: Option<u64>,
    /// `signal->stime` (u64, ns). Exited threads' system-mode CPU time
    /// accumulator. Mirror of [`Self::signal_utime`].
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub signal_stime: Option<u64>,
    /// Lock-slowpath pattern matched on a PC supplied by the caller
    /// (typically `VcpuRegSnapshot.instruction_pointer` for the task
    /// running on a vCPU at freeze time). One of
    /// "queued_spin_lock_slowpath", "mutex_lock_slowpath",
    /// "rwsem_down_read_slowpath", "rwsem_down_write_slowpath", or
    /// `None` when the supplied PC matched nothing OR the caller
    /// supplied no PCs.
    ///
    /// Set only when `walk_task_enrichment` is called with a `Some(pc)`
    /// that matches a slowpath window; a `None` pc always leaves this
    /// `None`. A stack walker that produces multiple PCs (a future
    /// kernel-side unwinder) would surface them as a `Vec<String>`
    /// in a non_exhaustive struct extension.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub lock_slowpath_match: Option<String>,
}
318
319/// Walk one task_struct and populate every Tier-1 enrichment field.
320/// `task_kva` must point at a valid `struct task_struct` reachable
321/// via `translate_any_kva`. `is_runnable_in_scx` is set by the rq->scx
322/// walker for tasks read off `rq->scx.runnable_list` (used for the
323/// PI-boost-out-of-SCX flag); the queued-DSQ walker passes `false`.
324///
325/// `pc` (`Option<u64>`) is the task's instruction pointer for the
326/// lock-slowpath stack matcher. Pass the corresponding vCPU's
327/// `instruction_pointer` when this task was running on that vCPU at
328/// freeze time; pass `None` for tasks not actively running (the
329/// matcher needs an unwinder we don't have).
330#[allow(dead_code)]
331pub fn walk_task_enrichment(
332    kernel: &GuestKernel,
333    task_kva: u64,
334    offsets: &TaskEnrichmentOffsets,
335    classes: &SchedClassRegistry,
336    locks: &LockSlowpathRegistry,
337    is_runnable_in_scx: bool,
338    pc: Option<u64>,
339) -> Option<TaskEnrichment> {
340    let mem = kernel.mem();
341    let walk = kernel.walk_context();
342
343    let task_pa = translate_any_kva(
344        mem,
345        walk.cr3_pa,
346        walk.page_offset,
347        task_kva,
348        walk.l5,
349        walk.tcr_el1,
350    )?;
351
352    // Identity.
353    let pid = mem.read_u32(task_pa, offsets.task_struct_pid) as i32;
354    let tgid = mem.read_u32(task_pa, offsets.task_struct_tgid) as i32;
355    let comm = read_comm(mem, task_pa, offsets.task_struct_comm);
356
357    // Scheduling fields.
358    let prio = mem.read_u32(task_pa, offsets.task_struct_prio) as i32;
359    let static_prio = mem.read_u32(task_pa, offsets.task_struct_static_prio) as i32;
360    let normal_prio = mem.read_u32(task_pa, offsets.task_struct_normal_prio) as i32;
361    let rt_priority = mem.read_u32(task_pa, offsets.task_struct_rt_priority);
362    let sched_class_kva = mem.read_u64(task_pa, offsets.task_struct_sched_class);
363    let sched_class = classes.decode(sched_class_kva).map(str::to_string);
364    let weight = mem.read_u32(task_pa, offsets.task_struct_scx + offsets.see_weight);
365    let core_cookie = offsets
366        .task_struct_core_cookie
367        .map(|off| mem.read_u64(task_pa, off));
368
369    // PI-boost-out-of-SCX flag: set only when the task was reached
370    // via the rq->scx.runnable_list AND its current sched_class is
371    // not ext_sched_class. This catches the case where rt_mutex_setprio
372    // moved the task to a higher-prio class while it remained on the
373    // SCX runnable list.
374    let pi_boosted_out_of_scx =
375        is_runnable_in_scx && classes.ext.is_some() && Some(sched_class_kva) != classes.ext;
376
377    // Per-task context-switch counters.
378    let nvcsw = mem.read_u64(task_pa, offsets.task_struct_nvcsw);
379    let nivcsw = mem.read_u64(task_pa, offsets.task_struct_nivcsw);
380    // Per-task cumulative CPU time (nanoseconds), task-lifetime
381    // monotonic. Read raw from frozen guest memory — zero guest work
382    // (vs the taskstats genetlink query). utime already includes gtime
383    // (guest time is double-counted into utime); do not sum the two.
384    let utime = mem.read_u64(task_pa, offsets.task_struct_utime);
385    let stime = mem.read_u64(task_pa, offsets.task_struct_stime);
386
387    // Pointer follows: group_leader, real_parent, signal.
388    let group_leader_kva = mem.read_u64(task_pa, offsets.task_struct_group_leader);
389    let group_leader_pid =
390        follow_task_for_pid(mem, walk, group_leader_kva, offsets.task_struct_pid);
391
392    let real_parent_kva = mem.read_u64(task_pa, offsets.task_struct_real_parent);
393    let (real_parent_pid, real_parent_comm) = follow_task_for_pid_and_comm(
394        mem,
395        walk,
396        real_parent_kva,
397        offsets.task_struct_pid,
398        offsets.task_struct_comm,
399    );
400
401    let signal_kva = mem.read_u64(task_pa, offsets.task_struct_signal);
402    let (nr_threads, signal_nvcsw, signal_nivcsw, signal_utime, signal_stime, pgid, sid) =
403        if signal_kva == 0 {
404            (None, None, None, None, None, None, None)
405        } else {
406            match translate_any_kva(
407                mem,
408                walk.cr3_pa,
409                walk.page_offset,
410                signal_kva,
411                walk.l5,
412                walk.tcr_el1,
413            ) {
414                None => (None, None, None, None, None, None, None),
415                Some(signal_pa) => {
416                    let nr_threads_v =
417                        mem.read_u32(signal_pa, offsets.signal_struct_nr_threads) as i32;
418                    let signal_nvcsw_v = mem.read_u64(signal_pa, offsets.signal_struct_nvcsw);
419                    let signal_nivcsw_v = mem.read_u64(signal_pa, offsets.signal_struct_nivcsw);
420                    // Exited-thread CPU-time accumulators (ns): added to
421                    // the live-thread sum so a mid-phase exit does not
422                    // undercount per-phase CPU time.
423                    let signal_utime_v = mem.read_u64(signal_pa, offsets.signal_struct_utime);
424                    let signal_stime_v = mem.read_u64(signal_pa, offsets.signal_struct_stime);
425                    // pids[PIDTYPE_PGID] / pids[PIDTYPE_SID] traversal.
426                    // Each slot is `struct pid *` (8 bytes); the
427                    // numbers[0].nr deref reads the canonical root-ns
428                    // pid number.
429                    let pgid_v = read_pid_nr_at_index(
430                        mem,
431                        walk,
432                        signal_pa,
433                        offsets.signal_struct_pids,
434                        pid_type::PGID,
435                        offsets.pid_numbers,
436                        offsets.upid_size,
437                        offsets.upid_nr,
438                    );
439                    let sid_v = read_pid_nr_at_index(
440                        mem,
441                        walk,
442                        signal_pa,
443                        offsets.signal_struct_pids,
444                        pid_type::SID,
445                        offsets.pid_numbers,
446                        offsets.upid_size,
447                        offsets.upid_nr,
448                    );
449                    (
450                        Some(nr_threads_v),
451                        Some(signal_nvcsw_v),
452                        Some(signal_nivcsw_v),
453                        Some(signal_utime_v),
454                        Some(signal_stime_v),
455                        pgid_v,
456                        sid_v,
457                    )
458                }
459            }
460        };
461
462    // Lock-slowpath PC match, if a PC was supplied.
463    let lock_slowpath_match = pc.and_then(|p| locks.match_pc(p)).map(str::to_string);
464
465    Some(TaskEnrichment {
466        pid,
467        tgid,
468        comm,
469        group_leader_pid,
470        real_parent_pid,
471        real_parent_comm,
472        pgid,
473        sid,
474        nr_threads,
475        weight,
476        prio,
477        static_prio,
478        normal_prio,
479        rt_priority,
480        sched_class,
481        core_cookie,
482        pi_boosted_out_of_scx,
483        nvcsw,
484        nivcsw,
485        signal_nvcsw,
486        signal_nivcsw,
487        utime,
488        stime,
489        signal_utime,
490        signal_stime,
491        lock_slowpath_match,
492    })
493}
494
495/// Read `comm` as a `String` truncated at the first nul.
496fn read_comm(mem: &super::reader::GuestMem, task_pa: u64, comm_off: usize) -> String {
497    let mut buf = [0u8; TASK_COMM_LEN];
498    mem.read_bytes(task_pa + comm_off as u64, &mut buf);
499    let n = buf.iter().position(|&b| b == 0).unwrap_or(TASK_COMM_LEN);
500    String::from_utf8_lossy(&buf[..n]).to_string()
501}
502
503/// Translate a `task_struct *` to its physical address and return
504/// `(pid, comm)`. Returns `(None, None)` on any failure.
505fn follow_task_for_pid_and_comm(
506    mem: &super::reader::GuestMem,
507    walk: super::reader::WalkContext,
508    task_kva: u64,
509    pid_off: usize,
510    comm_off: usize,
511) -> (Option<i32>, Option<String>) {
512    if task_kva == 0 {
513        return (None, None);
514    }
515    let Some(task_pa) = translate_any_kva(
516        mem,
517        walk.cr3_pa,
518        walk.page_offset,
519        task_kva,
520        walk.l5,
521        walk.tcr_el1,
522    ) else {
523        return (None, None);
524    };
525    let pid = mem.read_u32(task_pa, pid_off) as i32;
526    let comm = read_comm(mem, task_pa, comm_off);
527    (Some(pid), Some(comm))
528}
529
530/// Translate a `task_struct *` and read just the pid.
531fn follow_task_for_pid(
532    mem: &super::reader::GuestMem,
533    walk: super::reader::WalkContext,
534    task_kva: u64,
535    pid_off: usize,
536) -> Option<i32> {
537    if task_kva == 0 {
538        return None;
539    }
540    let task_pa = translate_any_kva(
541        mem,
542        walk.cr3_pa,
543        walk.page_offset,
544        task_kva,
545        walk.l5,
546        walk.tcr_el1,
547    )?;
548    Some(mem.read_u32(task_pa, pid_off) as i32)
549}
550
551/// Read `signal->pids[idx]->numbers[0].nr`.
552///
553/// Three pointer hops:
554///   1. `signal_pa + pids_off + idx * 8` reads the `struct pid *`.
555///   2. Translate the pid pointer; the `numbers[0]` element starts at
556///      `pid_pa + numbers_off`.
557///   3. Read the `nr` field at `numbers[0] + nr_off`.
558///
559/// Returns `None` on any translate failure or when the pid pointer is
560/// NULL (typical for `pids[PIDTYPE_PGID/SID]` on threads that aren't
561/// session/process group leaders).
562#[allow(clippy::too_many_arguments)]
563fn read_pid_nr_at_index(
564    mem: &super::reader::GuestMem,
565    walk: super::reader::WalkContext,
566    signal_pa: u64,
567    pids_off: usize,
568    idx: usize,
569    numbers_off: usize,
570    upid_size: usize,
571    nr_off: usize,
572) -> Option<i32> {
573    let pid_kva = mem.read_u64(signal_pa, pids_off + idx * 8);
574    if pid_kva == 0 {
575        return None;
576    }
577    let pid_pa = translate_any_kva(
578        mem,
579        walk.cr3_pa,
580        walk.page_offset,
581        pid_kva,
582        walk.l5,
583        walk.tcr_el1,
584    )?;
585    // numbers[0] is at offset `numbers_off`; subsequent levels are at
586    // `numbers_off + level * upid_size`. We always read level 0
587    // (root pid namespace) per the kernel's `pid_nr` contract.
588    let _ = upid_size; // captured in signature for level>0 callers
589    Some(mem.read_u32(pid_pa, numbers_off + nr_off) as i32)
590}
591
592#[cfg(test)]
593mod tests {
594    use super::*;
595
596    #[test]
597    fn sched_class_registry_decode_known_class() {
598        let r = SchedClassRegistry {
599            fair: Some(0xffff_ffff_8000_1000),
600            rt: Some(0xffff_ffff_8000_1100),
601            dl: None,
602            idle: None,
603            stop: None,
604            ext: Some(0xffff_ffff_8000_1300),
605        };
606        assert_eq!(r.decode(0xffff_ffff_8000_1000), Some("fair"));
607        assert_eq!(r.decode(0xffff_ffff_8000_1100), Some("rt"));
608        assert_eq!(r.decode(0xffff_ffff_8000_1300), Some("ext"));
609    }
610
611    #[test]
612    fn sched_class_registry_decode_unknown_returns_none() {
613        let r = SchedClassRegistry {
614            fair: Some(0xffff_ffff_8000_1000),
615            rt: None,
616            dl: None,
617            idle: None,
618            stop: None,
619            ext: None,
620        };
621        assert_eq!(r.decode(0xffff_ffff_8000_2000), None);
622        // Zero pointer never decodes (would-be-NULL sched_class).
623        assert_eq!(r.decode(0), None);
624    }
625
626    #[test]
627    fn lock_slowpath_match_within_window() {
628        let r = LockSlowpathRegistry {
629            queued_spin_lock_slowpath: Some(0xffff_ffff_8001_0000),
630            mutex_lock_slowpath: Some(0xffff_ffff_8002_0000),
631            rwsem_down_read_slowpath: None,
632            rwsem_down_write_slowpath: None,
633        };
634        // Inside the qsl window.
635        assert_eq!(
636            r.match_pc(0xffff_ffff_8001_0010),
637            Some("queued_spin_lock_slowpath")
638        );
639        // Inside the mutex window.
640        assert_eq!(
641            r.match_pc(0xffff_ffff_8002_0fff),
642            Some("mutex_lock_slowpath")
643        );
644        // Past the qsl window (4 KiB cap).
645        assert!(r.match_pc(0xffff_ffff_8001_2000).is_none());
646        // Before the qsl window.
647        assert!(r.match_pc(0xffff_ffff_8000_ffff).is_none());
648    }
649
650    #[test]
651    fn lock_slowpath_no_match_when_all_none() {
652        let r = LockSlowpathRegistry::default();
653        assert_eq!(r.match_pc(0xdeadbeef), None);
654    }
655
656    /// Pin the wire shape of `TaskEnrichment` — every optional field
657    /// should skip on `None` so a populated capture renders cleanly
658    /// without a wall of nulls in the JSON.
659    #[test]
660    fn task_enrichment_serde_skip_none_fields() {
661        let e = TaskEnrichment {
662            pid: 42,
663            tgid: 42,
664            comm: "ktstr_worker".to_string(),
665            group_leader_pid: None,
666            real_parent_pid: None,
667            real_parent_comm: None,
668            pgid: None,
669            sid: None,
670            nr_threads: None,
671            weight: 100,
672            prio: 120,
673            static_prio: 120,
674            normal_prio: 120,
675            rt_priority: 0,
676            sched_class: Some("fair".to_string()),
677            core_cookie: None,
678            pi_boosted_out_of_scx: false,
679            nvcsw: 0,
680            nivcsw: 0,
681            signal_nvcsw: None,
682            signal_nivcsw: None,
683            utime: 0,
684            stime: 0,
685            signal_utime: None,
686            signal_stime: None,
687            lock_slowpath_match: None,
688        };
689        let json = serde_json::to_string(&e).unwrap();
690        // Skipped fields must not appear in the JSON.
691        assert!(!json.contains("group_leader_pid"));
692        assert!(!json.contains("real_parent_pid"));
693        assert!(!json.contains("pgid"));
694        assert!(!json.contains("nr_threads"));
695        assert!(!json.contains("core_cookie"));
696        assert!(!json.contains("signal_nvcsw"));
697        // signal_utime/signal_stime skip on None (exited-thread
698        // accumulators absent when signal_struct didn't translate).
699        assert!(!json.contains("signal_utime"));
700        assert!(!json.contains("signal_stime"));
701        assert!(!json.contains("lock_slowpath_match"));
702        // Required fields must appear.
703        assert!(json.contains("\"pid\":42"));
704        assert!(json.contains("\"comm\":\"ktstr_worker\""));
705        assert!(json.contains("\"weight\":100"));
706        assert!(json.contains("\"sched_class\":\"fair\""));
707        // utime/stime are unconditional (always serialized, even 0).
708        assert!(json.contains("\"utime\":0"));
709        assert!(json.contains("\"stime\":0"));
710    }
711
712    #[test]
713    fn task_enrichment_serde_roundtrip_populated() {
714        let e = TaskEnrichment {
715            pid: 1234,
716            tgid: 1230,
717            comm: "stress-ng".to_string(),
718            group_leader_pid: Some(1230),
719            real_parent_pid: Some(1),
720            real_parent_comm: Some("systemd".to_string()),
721            pgid: Some(1230),
722            sid: Some(1),
723            nr_threads: Some(8),
724            weight: 200,
725            prio: 100,
726            static_prio: 120,
727            normal_prio: 100,
728            rt_priority: 50,
729            sched_class: Some("rt".to_string()),
730            core_cookie: Some(0xc0c01e),
731            pi_boosted_out_of_scx: true,
732            nvcsw: 12345,
733            nivcsw: 678,
734            signal_nvcsw: Some(50_000),
735            signal_nivcsw: Some(1_234),
736            utime: 99_000,
737            stime: 88_000,
738            signal_utime: Some(7_000),
739            signal_stime: Some(6_000),
740            lock_slowpath_match: Some("queued_spin_lock_slowpath".to_string()),
741        };
742        let json = serde_json::to_string(&e).unwrap();
743        let parsed: TaskEnrichment = serde_json::from_str(&json).unwrap();
744        assert_eq!(parsed.pid, 1234);
745        assert_eq!(parsed.comm, "stress-ng");
746        assert_eq!(parsed.real_parent_comm.as_deref(), Some("systemd"));
747        assert_eq!(parsed.nr_threads, Some(8));
748        assert_eq!(parsed.core_cookie, Some(0xc0c01e));
749        assert!(parsed.pi_boosted_out_of_scx);
750        assert_eq!(parsed.utime, 99_000);
751        assert_eq!(parsed.stime, 88_000);
752        assert_eq!(parsed.signal_utime, Some(7_000));
753        assert_eq!(parsed.signal_stime, Some(6_000));
754        assert_eq!(
755            parsed.lock_slowpath_match.as_deref(),
756            Some("queued_spin_lock_slowpath"),
757        );
758    }
759
760    // -- walk_task_enrichment memory-walk coverage --
761    //
762    // All fixtures use page_offset = 0 so a planted KVA equals its
763    // DRAM offset (kva_to_pa is `kva.wrapping_sub(0)`), and
764    // translate_any_kva returns the KVA directly when it is
765    // `< mem.size()`. cr3_pa = 0 / l5 = false means an out-of-bounds
766    // KVA falls through to the page-table walk, which fails with a
767    // zero root => None. OOB scalar reads return 0 (GuestMem's
768    // read_scalar), so every buffer is sized past the highest
769    // touched offset.
770
771    /// Synthetic struct layout shared by the walk fixtures. Offsets
772    /// are chosen so no two read ranges overlap. `scx` base is 0x30
773    /// and `see_weight` is 0x04 within it, so the weight read lands
774    /// at task+0x34.
775    fn fixture_offsets() -> TaskEnrichmentOffsets {
776        TaskEnrichmentOffsets {
777            // task_struct fields
778            task_struct_pid: 0x00,
779            task_struct_tgid: 0x04,
780            task_struct_prio: 0x08,
781            task_struct_static_prio: 0x0c,
782            task_struct_normal_prio: 0x10,
783            task_struct_rt_priority: 0x14,
784            task_struct_comm: 0x18,
785            task_struct_sched_class: 0x28,
786            task_struct_scx: 0x30,
787            task_struct_core_cookie: Some(0x38),
788            task_struct_nvcsw: 0x40,
789            task_struct_nivcsw: 0x48,
790            task_struct_utime: 0x50,
791            task_struct_stime: 0x58,
792            task_struct_group_leader: 0x60,
793            task_struct_real_parent: 0x68,
794            task_struct_signal: 0x70,
795            task_struct_stack: 0x78,
796            // sched_ext_entity field (relative to scx base)
797            see_weight: 0x04,
798            // signal_struct fields
799            signal_struct_nr_threads: 0x00,
800            signal_struct_nvcsw: 0x08,
801            signal_struct_nivcsw: 0x10,
802            signal_struct_utime: 0x18,
803            signal_struct_stime: 0x20,
804            signal_struct_pids: 0x30,
805            // struct pid / struct upid fields
806            pid_numbers: 0x00,
807            pid_size: 0x00,
808            upid_nr: 0x00,
809            upid_size: 16,
810        }
811    }
812
813    // In-buffer addresses (== KVA == PA under page_offset = 0).
814    const TASK_ADDR: u64 = 0x100;
815    const SIGNAL_ADDR: u64 = 0x400;
816    const PGID_PID_ADDR: u64 = 0x600;
817    const SID_PID_ADDR: u64 = 0x680;
818    const PARENT_TASK_ADDR: u64 = 0x800;
819    // sched_class comparison sentinels. Never dereferenced — only
820    // compared against the SchedClassRegistry slots.
821    const EXT_CLASS_KVA: u64 = 0x9000;
822    const FAIR_CLASS_KVA: u64 = 0x9100;
823
824    fn put_u32(buf: &mut [u8], at: u64, off: usize, val: u32) {
825        let a = at as usize + off;
826        buf[a..a + 4].copy_from_slice(&val.to_le_bytes());
827    }
828
829    fn put_u64(buf: &mut [u8], at: u64, off: usize, val: u64) {
830        let a = at as usize + off;
831        buf[a..a + 8].copy_from_slice(&val.to_le_bytes());
832    }
833
834    fn put_comm(buf: &mut [u8], at: u64, off: usize, bytes: &[u8]) {
835        let a = at as usize + off;
836        buf[a..a + bytes.len()].copy_from_slice(bytes);
837    }
838
839    /// Plant a complete, well-formed task_struct + signal_struct +
840    /// PGID/SID struct pids into `buf` using [`fixture_offsets`].
841    /// sched_class is planted as [`EXT_CLASS_KVA`]. The caller may
842    /// then mutate individual fields before walking to exercise a
843    /// specific branch.
844    fn plant_happy_task(buf: &mut [u8]) {
845        let o = fixture_offsets();
846        // Identity.
847        put_u32(buf, TASK_ADDR, o.task_struct_pid, 4321);
848        put_u32(buf, TASK_ADDR, o.task_struct_tgid, 4300);
849        put_comm(buf, TASK_ADDR, o.task_struct_comm, b"worker\0");
850        // Scheduling scalars.
851        put_u32(buf, TASK_ADDR, o.task_struct_prio, 100);
852        put_u32(buf, TASK_ADDR, o.task_struct_static_prio, 120);
853        put_u32(buf, TASK_ADDR, o.task_struct_normal_prio, 100);
854        put_u32(buf, TASK_ADDR, o.task_struct_rt_priority, 50);
855        put_u64(buf, TASK_ADDR, o.task_struct_sched_class, EXT_CLASS_KVA);
856        // weight at scx + see_weight.
857        put_u32(buf, TASK_ADDR, o.task_struct_scx + o.see_weight, 200);
858        // core_cookie.
859        put_u64(buf, TASK_ADDR, o.task_struct_core_cookie.unwrap(), 0xABCD);
860        // Per-task context-switch + cputime counters.
861        put_u64(buf, TASK_ADDR, o.task_struct_nvcsw, 11);
862        put_u64(buf, TASK_ADDR, o.task_struct_nivcsw, 22);
863        put_u64(buf, TASK_ADDR, o.task_struct_utime, 999_000);
864        put_u64(buf, TASK_ADDR, o.task_struct_stime, 888_000);
865        // Pointer fields: parent + group_leader point at the mini
866        // task; signal points at the signal_struct.
867        put_u64(buf, TASK_ADDR, o.task_struct_group_leader, PARENT_TASK_ADDR);
868        put_u64(buf, TASK_ADDR, o.task_struct_real_parent, PARENT_TASK_ADDR);
869        put_u64(buf, TASK_ADDR, o.task_struct_signal, SIGNAL_ADDR);
870
871        // Mini parent/group-leader task: only pid + comm are read by
872        // follow_task_for_pid (group_leader) and
873        // follow_task_for_pid_and_comm (real_parent), both at the
874        // task_struct pid/comm offsets.
875        put_u32(buf, PARENT_TASK_ADDR, o.task_struct_pid, 1);
876        put_comm(buf, PARENT_TASK_ADDR, o.task_struct_comm, b"init\0");
877
878        // signal_struct.
879        put_u32(buf, SIGNAL_ADDR, o.signal_struct_nr_threads, 8);
880        put_u64(buf, SIGNAL_ADDR, o.signal_struct_nvcsw, 70_000);
881        put_u64(buf, SIGNAL_ADDR, o.signal_struct_nivcsw, 3);
882        put_u64(buf, SIGNAL_ADDR, o.signal_struct_utime, 7_000);
883        put_u64(buf, SIGNAL_ADDR, o.signal_struct_stime, 6_000);
884        // pids[PGID] / pids[SID] slots -> struct pid KVAs.
885        put_u64(
886            buf,
887            SIGNAL_ADDR,
888            o.signal_struct_pids + pid_type::PGID * 8,
889            PGID_PID_ADDR,
890        );
891        put_u64(
892            buf,
893            SIGNAL_ADDR,
894            o.signal_struct_pids + pid_type::SID * 8,
895            SID_PID_ADDR,
896        );
897        // struct pid numbers[0].nr for PGID (4300) and SID (1).
898        put_u32(buf, PGID_PID_ADDR, o.pid_numbers + o.upid_nr, 4300);
899        put_u32(buf, SID_PID_ADDR, o.pid_numbers + o.upid_nr, 1);
900    }
901
902    /// Build a direct-mapped test kernel over `buf` (page_offset = 0,
903    /// cr3_pa = 0, l5 = false). `buf` must outlive the returned
904    /// kernel — the GuestMem holds a raw pointer into it.
905    fn build_kernel(buf: &mut [u8]) -> GuestKernel {
906        // SAFETY: `buf` is a live caller-owned slice that outlives the
907        // GuestKernel (and thus the GuestMem) in every test below.
908        let mem =
909            unsafe { crate::monitor::reader::GuestMem::new(buf.as_mut_ptr(), buf.len() as u64) };
910        GuestKernel::new_for_test(
911            std::sync::Arc::new(mem),
912            std::collections::HashMap::new(),
913            0,
914            0,
915            false,
916        )
917    }
918
919    fn ext_registry() -> SchedClassRegistry {
920        SchedClassRegistry {
921            fair: Some(FAIR_CLASS_KVA),
922            ext: Some(EXT_CLASS_KVA),
923            ..Default::default()
924        }
925    }
926
927    #[test]
928    fn walk_task_enrichment_root_translate_fail_returns_none() {
929        // task_kva == mem.size() => direct_pa == size (NOT < size) so
930        // translate_any_kva skips the direct map, then the cr3_pa = 0
931        // page-table walk fails => the whole fn returns None.
932        let mut buf = vec![0u8; 0x2000];
933        let kernel = build_kernel(&mut buf);
934        let offsets = fixture_offsets();
935        let oob_task_kva = kernel.mem().size(); // == 0x2000
936        let result = walk_task_enrichment(
937            &kernel,
938            oob_task_kva,
939            &offsets,
940            &ext_registry(),
941            &LockSlowpathRegistry::default(),
942            false,
943            None,
944        );
945        assert!(
946            result.is_none(),
947            "unreadable task_struct must abort the whole enrichment"
948        );
949    }
950
951    #[test]
952    fn walk_task_enrichment_full_happy_path_exact_fields() {
953        let mut buf = vec![0u8; 0x2000];
954        plant_happy_task(&mut buf);
955        let kernel = build_kernel(&mut buf);
956        let offsets = fixture_offsets();
957        let e = walk_task_enrichment(
958            &kernel,
959            TASK_ADDR,
960            &offsets,
961            &ext_registry(),
962            &LockSlowpathRegistry::default(),
963            false,
964            None,
965        )
966        .expect("task built");
967
968        assert_eq!(e.pid, 4321);
969        assert_eq!(e.tgid, 4300);
970        assert_eq!(e.comm, "worker");
971        assert_eq!(e.prio, 100);
972        assert_eq!(e.static_prio, 120);
973        assert_eq!(e.normal_prio, 100);
974        assert_eq!(e.rt_priority, 50);
975        assert_eq!(e.sched_class.as_deref(), Some("ext"));
976        assert_eq!(e.weight, 200);
977        assert_eq!(e.core_cookie, Some(0xABCD));
978        assert_eq!(e.nvcsw, 11);
979        assert_eq!(e.nivcsw, 22);
980        assert_eq!(e.utime, 999_000);
981        assert_eq!(e.stime, 888_000);
982        assert_eq!(e.nr_threads, Some(8));
983        assert_eq!(e.signal_nvcsw, Some(70_000));
984        assert_eq!(e.signal_nivcsw, Some(3));
985        assert_eq!(e.signal_utime, Some(7_000));
986        assert_eq!(e.signal_stime, Some(6_000));
987        assert_eq!(e.pgid, Some(4300));
988        assert_eq!(e.sid, Some(1));
989        assert_eq!(e.group_leader_pid, Some(1));
990        assert_eq!(e.real_parent_pid, Some(1));
991        assert_eq!(e.real_parent_comm.as_deref(), Some("init"));
992        // pi_boosted_out_of_scx is false because is_runnable_in_scx
993        // is false on this call.
994        assert!(!e.pi_boosted_out_of_scx);
995        // pc = None => no lock-slowpath match attempted.
996        assert_eq!(e.lock_slowpath_match, None);
997    }
998
999    /// Helper for the pi_boosted truth table: plant a task whose
1000    /// sched_class is `sched_class_kva`, build a registry with the
1001    /// given `ext` slot, and return the resulting flag.
1002    fn pi_boosted(sched_class_kva: u64, ext: Option<u64>, is_runnable_in_scx: bool) -> bool {
1003        let mut buf = vec![0u8; 0x2000];
1004        plant_happy_task(&mut buf);
1005        let offsets = fixture_offsets();
1006        put_u64(
1007            &mut buf,
1008            TASK_ADDR,
1009            offsets.task_struct_sched_class,
1010            sched_class_kva,
1011        );
1012        let kernel = build_kernel(&mut buf);
1013        let classes = SchedClassRegistry {
1014            fair: Some(FAIR_CLASS_KVA),
1015            ext,
1016            ..Default::default()
1017        };
1018        walk_task_enrichment(
1019            &kernel,
1020            TASK_ADDR,
1021            &offsets,
1022            &classes,
1023            &LockSlowpathRegistry::default(),
1024            is_runnable_in_scx,
1025            None,
1026        )
1027        .expect("task built")
1028        .pi_boosted_out_of_scx
1029    }
1030
1031    #[test]
1032    fn walk_task_enrichment_pi_boosted_out_of_scx_truth_table() {
1033        // runnable + sched_class == ext => not boosted out.
1034        assert!(!pi_boosted(EXT_CLASS_KVA, Some(EXT_CLASS_KVA), true));
1035        // runnable + sched_class != ext + ext.is_some() => boosted out.
1036        assert!(pi_boosted(FAIR_CLASS_KVA, Some(EXT_CLASS_KVA), true));
1037        // runnable + ext == None => short-circuits on is_some() => false.
1038        assert!(!pi_boosted(FAIR_CLASS_KVA, None, true));
1039        // not runnable + sched_class != ext => false (gated on runnable).
1040        assert!(!pi_boosted(FAIR_CLASS_KVA, Some(EXT_CLASS_KVA), false));
1041    }
1042
1043    #[test]
1044    fn walk_task_enrichment_core_cookie_absent_offset_yields_none() {
1045        // Even though a non-zero u64 sits at the would-be core_cookie
1046        // offset, task_struct_core_cookie = None skips the read.
1047        let mut buf = vec![0u8; 0x2000];
1048        plant_happy_task(&mut buf);
1049        let kernel = build_kernel(&mut buf);
1050        let mut offsets = fixture_offsets();
1051        offsets.task_struct_core_cookie = None;
1052        let e = walk_task_enrichment(
1053            &kernel,
1054            TASK_ADDR,
1055            &offsets,
1056            &ext_registry(),
1057            &LockSlowpathRegistry::default(),
1058            false,
1059            None,
1060        )
1061        .expect("task built");
1062        assert_eq!(e.core_cookie, None);
1063    }
1064
1065    #[test]
1066    fn walk_task_enrichment_signal_null_pointer_all_signal_fields_none() {
1067        // signal_kva == 0 short-circuits before translate; the whole
1068        // signal-derived septuple is None.
1069        let mut buf = vec![0u8; 0x2000];
1070        plant_happy_task(&mut buf);
1071        let offsets = fixture_offsets();
1072        put_u64(&mut buf, TASK_ADDR, offsets.task_struct_signal, 0);
1073        let kernel = build_kernel(&mut buf);
1074        let e = walk_task_enrichment(
1075            &kernel,
1076            TASK_ADDR,
1077            &offsets,
1078            &ext_registry(),
1079            &LockSlowpathRegistry::default(),
1080            false,
1081            None,
1082        )
1083        .expect("task built");
1084        assert_eq!(e.nr_threads, None);
1085        assert_eq!(e.signal_nvcsw, None);
1086        assert_eq!(e.signal_nivcsw, None);
1087        assert_eq!(e.signal_utime, None);
1088        assert_eq!(e.signal_stime, None);
1089        assert_eq!(e.pgid, None);
1090        assert_eq!(e.sid, None);
1091        // The rest of the task still populated.
1092        assert_eq!(e.pid, 4321);
1093    }
1094
1095    #[test]
1096    fn walk_task_enrichment_signal_translate_fail_all_signal_fields_none() {
1097        // signal_kva != 0 but out of bounds => translate_any_kva None
1098        // (distinct path from the signal_kva == 0 short-circuit).
1099        let mut buf = vec![0u8; 0x2000];
1100        plant_happy_task(&mut buf);
1101        let offsets = fixture_offsets();
1102        // 0xF000_0000 is far above mem.size() (0x2000); cr3_pa = 0 walk fails.
1103        put_u64(&mut buf, TASK_ADDR, offsets.task_struct_signal, 0xF000_0000);
1104        let kernel = build_kernel(&mut buf);
1105        let e = walk_task_enrichment(
1106            &kernel,
1107            TASK_ADDR,
1108            &offsets,
1109            &ext_registry(),
1110            &LockSlowpathRegistry::default(),
1111            false,
1112            None,
1113        )
1114        .expect("task built");
1115        assert_eq!(
1116            (
1117                e.nr_threads,
1118                e.signal_nvcsw,
1119                e.signal_nivcsw,
1120                e.signal_utime,
1121                e.signal_stime,
1122                e.pgid,
1123                e.sid
1124            ),
1125            (None, None, None, None, None, None, None)
1126        );
1127        assert_eq!(e.tgid, 4300);
1128    }
1129
1130    #[test]
1131    fn read_pid_nr_at_index_null_slot_returns_none_via_pgid() {
1132        // PGID slot zeroed => pgid None (the common non-leader case);
1133        // SID slot populated => sid Some. Pins the pids_off + idx*8
1134        // index math discriminating PGID (idx 2) from SID (idx 3).
1135        let mut buf = vec![0u8; 0x2000];
1136        plant_happy_task(&mut buf);
1137        let offsets = fixture_offsets();
1138        put_u64(
1139            &mut buf,
1140            SIGNAL_ADDR,
1141            offsets.signal_struct_pids + pid_type::PGID * 8,
1142            0,
1143        );
1144        let kernel = build_kernel(&mut buf);
1145        let e = walk_task_enrichment(
1146            &kernel,
1147            TASK_ADDR,
1148            &offsets,
1149            &ext_registry(),
1150            &LockSlowpathRegistry::default(),
1151            false,
1152            None,
1153        )
1154        .expect("task built");
1155        assert_eq!(e.pgid, None);
1156        assert_eq!(e.sid, Some(1));
1157    }
1158
1159    #[test]
1160    fn follow_task_for_pid_and_comm_null_and_translate_fail() {
1161        let offsets = fixture_offsets();
1162
1163        // Sub-case A: both pointer fields NULL (init_task case).
1164        let mut buf_a = vec![0u8; 0x2000];
1165        plant_happy_task(&mut buf_a);
1166        put_u64(&mut buf_a, TASK_ADDR, offsets.task_struct_group_leader, 0);
1167        put_u64(&mut buf_a, TASK_ADDR, offsets.task_struct_real_parent, 0);
1168        let kernel_a = build_kernel(&mut buf_a);
1169        let ea = walk_task_enrichment(
1170            &kernel_a,
1171            TASK_ADDR,
1172            &offsets,
1173            &ext_registry(),
1174            &LockSlowpathRegistry::default(),
1175            false,
1176            None,
1177        )
1178        .expect("task built");
1179        assert_eq!(ea.group_leader_pid, None);
1180        assert_eq!(ea.real_parent_pid, None);
1181        assert_eq!(ea.real_parent_comm, None);
1182
1183        // Sub-case B: non-null but out-of-bounds => translate fail.
1184        let mut buf_b = vec![0u8; 0x2000];
1185        plant_happy_task(&mut buf_b);
1186        put_u64(
1187            &mut buf_b,
1188            TASK_ADDR,
1189            offsets.task_struct_group_leader,
1190            0xF000_0000,
1191        );
1192        put_u64(
1193            &mut buf_b,
1194            TASK_ADDR,
1195            offsets.task_struct_real_parent,
1196            0xF000_0000,
1197        );
1198        let kernel_b = build_kernel(&mut buf_b);
1199        let eb = walk_task_enrichment(
1200            &kernel_b,
1201            TASK_ADDR,
1202            &offsets,
1203            &ext_registry(),
1204            &LockSlowpathRegistry::default(),
1205            false,
1206            None,
1207        )
1208        .expect("task built");
1209        assert_eq!(eb.group_leader_pid, None);
1210        assert_eq!(eb.real_parent_pid, None);
1211        assert_eq!(eb.real_parent_comm, None);
1212
1213        // Sub-case C: valid parent (positive control) — the
1214        // happy-path layout already points both at the mini task.
1215        let mut buf_c = vec![0u8; 0x2000];
1216        plant_happy_task(&mut buf_c);
1217        let kernel_c = build_kernel(&mut buf_c);
1218        let ec = walk_task_enrichment(
1219            &kernel_c,
1220            TASK_ADDR,
1221            &offsets,
1222            &ext_registry(),
1223            &LockSlowpathRegistry::default(),
1224            false,
1225            None,
1226        )
1227        .expect("task built");
1228        assert_eq!(ec.group_leader_pid, Some(1));
1229        assert_eq!(ec.real_parent_pid, Some(1));
1230        assert_eq!(ec.real_parent_comm.as_deref(), Some("init"));
1231    }
1232
1233    #[test]
1234    fn read_comm_no_nul_reads_full_16_bytes_and_nul_truncates() {
1235        let offsets = fixture_offsets();
1236
1237        // Nul-terminated: truncates at the nul.
1238        let mut buf_t = vec![0u8; 0x2000];
1239        plant_happy_task(&mut buf_t);
1240        put_comm(&mut buf_t, TASK_ADDR, offsets.task_struct_comm, b"short\0");
1241        let kernel_t = build_kernel(&mut buf_t);
1242        let et = walk_task_enrichment(
1243            &kernel_t,
1244            TASK_ADDR,
1245            &offsets,
1246            &ext_registry(),
1247            &LockSlowpathRegistry::default(),
1248            false,
1249            None,
1250        )
1251        .expect("task built");
1252        assert_eq!(et.comm, "short");
1253
1254        // No nul within the 16-byte window: reads the full 16 bytes.
1255        let mut buf_n = vec![0u8; 0x2000];
1256        plant_happy_task(&mut buf_n);
1257        put_comm(
1258            &mut buf_n,
1259            TASK_ADDR,
1260            offsets.task_struct_comm,
1261            b"sixteencharcomm!",
1262        );
1263        let kernel_n = build_kernel(&mut buf_n);
1264        let en = walk_task_enrichment(
1265            &kernel_n,
1266            TASK_ADDR,
1267            &offsets,
1268            &ext_registry(),
1269            &LockSlowpathRegistry::default(),
1270            false,
1271            None,
1272        )
1273        .expect("task built");
1274        assert_eq!(en.comm, "sixteencharcomm!");
1275        assert_eq!(en.comm.len(), 16);
1276
1277        // Invalid UTF-8 before the nul: from_utf8_lossy substitutes
1278        // the replacement char rather than panicking.
1279        let mut buf_l = vec![0u8; 0x2000];
1280        plant_happy_task(&mut buf_l);
1281        put_comm(
1282            &mut buf_l,
1283            TASK_ADDR,
1284            offsets.task_struct_comm,
1285            &[0xFF, 0x00],
1286        );
1287        let kernel_l = build_kernel(&mut buf_l);
1288        let el = walk_task_enrichment(
1289            &kernel_l,
1290            TASK_ADDR,
1291            &offsets,
1292            &ext_registry(),
1293            &LockSlowpathRegistry::default(),
1294            false,
1295            None,
1296        )
1297        .expect("task built");
1298        assert_eq!(el.comm, "\u{FFFD}");
1299    }
1300
1301    #[test]
1302    fn lock_slowpath_match_pc_supplied_sets_field_and_nonmatch_clears() {
1303        let offsets = fixture_offsets();
1304        let locks = LockSlowpathRegistry {
1305            queued_spin_lock_slowpath: Some(0x5_0000),
1306            ..Default::default()
1307        };
1308
1309        // Matching PC inside the qsl window => field set.
1310        let mut buf_m = vec![0u8; 0x2000];
1311        plant_happy_task(&mut buf_m);
1312        let kernel_m = build_kernel(&mut buf_m);
1313        let em = walk_task_enrichment(
1314            &kernel_m,
1315            TASK_ADDR,
1316            &offsets,
1317            &ext_registry(),
1318            &locks,
1319            false,
1320            Some(0x5_0010),
1321        )
1322        .expect("task built");
1323        assert_eq!(
1324            em.lock_slowpath_match.as_deref(),
1325            Some("queued_spin_lock_slowpath")
1326        );
1327
1328        // Non-matching PC => field stays None.
1329        let mut buf_x = vec![0u8; 0x2000];
1330        plant_happy_task(&mut buf_x);
1331        let kernel_x = build_kernel(&mut buf_x);
1332        let ex = walk_task_enrichment(
1333            &kernel_x,
1334            TASK_ADDR,
1335            &offsets,
1336            &ext_registry(),
1337            &locks,
1338            false,
1339            Some(0x9_9999),
1340        )
1341        .expect("task built");
1342        assert_eq!(ex.lock_slowpath_match, None);
1343    }
1344
1345    #[test]
1346    fn lock_slowpath_match_pc_window_overflow_returns_none() {
1347        // A symbol KVA near u64::MAX whose `s + 4096` window wraps
1348        // must not falsely match a PC that would be inside the
1349        // non-wrapping window. checked_add returning None => no match.
1350        let r = LockSlowpathRegistry {
1351            queued_spin_lock_slowpath: Some(u64::MAX - 100),
1352            ..Default::default()
1353        };
1354        assert_eq!(r.match_pc(u64::MAX - 50), None);
1355        assert_eq!(r.match_pc(u64::MAX), None);
1356    }
1357}