ktstr/monitor/
scx_walker.rs

1//! Host-side rq->scx + DSQ + task enumeration walkers for the
2//! failure dump.
3//!
4//! Entry points:
5//!
6//! 1. [`walk_rq_scx`] — for one CPU's `struct rq.scx`, captures the
7//!    scalar fields the kernel's own `scx_dump_state` reads
8//!    (nr_running, flags, cpu_released, ops_qseq, kick_sync) plus
9//!    nr_immed, clock, and the curr task pid+comm. Walks
10//!    `rq->scx.runnable_list` and emits a list of
11//!    [`super::dump::TaskWalkerEntry`] tuples for each runnable task —
12//!    these feed directly into the per-task enrichment capture
13//!    pipeline.
14//!
15//! 2. [`walk_local_dsqs`] — per-CPU local DSQs at
16//!    `rq->scx.local_dsq`. Runs unconditionally — local DSQs are
17//!    initialized at boot (`init_dsq`, called for every possible CPU
18//!    from `init_sched_ext_class`) and exist whether or not a scheduler
19//!    is attached, so this surfaces local-DSQ state even when
20//!    `*scx_root == 0`.
21//!
22//! 3. [`walk_dsqs`] — sched-rooted DSQs reachable from `*scx_root`
23//!    (excluding per-CPU local DSQs, which [`walk_local_dsqs`]
24//!    handles separately):
25//!    - per-CPU bypass DSQs via `scx_sched_pcpu.bypass_dsq`
26//!    - per-node global DSQs via `scx_sched.pnode[node]->global_dsq`
27//!    - user-allocated DSQs via the `scx_sched.dsq_hash` rhashtable
28//!
29//!    For each DSQ captures the scalar state (id, nr, seq) and walks
30//!    its `list_head` to enumerate queued tasks. The kernel's own
31//!    `scx_dump_state` does NOT enumerate per-DSQ depths — this
32//!    walker surfaces queue depth and per-task ordering that the
33//!    in-tree dump path does not.
34//!
35//! 4. [`walk_scx_tasks_global`] — walks the kernel's global
36//!    `scx_tasks` LIST_HEAD via each task's `scx.tasks_node`.
37//!    Surfaces every task owned by an scx_sched, surviving the
38//!    per-rq runnable_list drain that scheduler teardown
39//!    (`scx_bypass`) triggers.
40//!
41//! All walkers are best-effort: any address that fails to translate
42//! (slab page race, PA out of bounds) yields a partial result rather
43//! than aborting. Cycle protection on every `list_head` walk is
44//! two-layer: a `HashSet` of visited node addresses breaks on the
45//! first revisited node (a corrupt / non-canonical ring), with
46//! `MAX_NODES_PER_LIST` as a hard count backstop; the rhashtable walk
47//! caps total bucket-table chain length at `MAX_RHT_NODES`. List heads
48//! that are static kernel symbols (`scx_tasks`) are slid by the runtime
49//! virtual KASLR offset before the terminator comparison, since the
50//! list's `.next` pointers are runtime addresses.
51//!
52//! # Lock-free reads
53//!
54//! These walkers run from the freeze coordinator after the vCPU
55//! rendezvous. All vCPUs are parked at a known KVM exit; the host
56//! reads guest memory directly with no in-guest synchronization. The
57//! kernel-side locks (`scx_dispatch_q.lock` raw_spinlock,
58//! `rhashtable.mutex`) are not honored — the freeze rendezvous IS
59//! the synchronization. A torn read can still happen if a vCPU was
60//! mid-write at the freeze instant; the walker treats torn results
61//! as best-effort partial output.
62
63use serde::{Deserialize, Serialize};
64
65use super::btf_offsets::{RHT_PTR_LOCK_BIT, SCX_DSQ_LNODE_ITER_CURSOR, ScxWalkerOffsets};
66use super::dump::TaskWalkerEntry;
67use super::guest::GuestKernel;
68use super::idr::translate_any_kva;
69use super::reader::{GuestMem, WalkContext};
70
71/// Maximum entries any single list_head walk visits before bailing
72/// with what's been collected. Bounds CPU + memory cost on a
73/// corrupt-pointer chain that loops back on itself or runs into
74/// arbitrary slab. 4096 is generous: real per-CPU runnable_list has
75/// at most ~num_threads entries on a given CPU, capped well below
76/// this; user DSQs can in principle hold millions of tasks but the
77/// per-DSQ walker still bails at this cap so a million-entry DSQ
78/// surfaces 4096 task entries plus the `nr` count (truncation
79/// surfaces via `truncated: true` on [`DsqState`]).
80const MAX_NODES_PER_LIST: u32 = 4096;
81
82/// Maximum total node visits across all rhashtable buckets in the
83/// `dsq_hash` walk. Bounds the cost of a runaway bucket chain.
84/// Mainline ScxLib creates at most a few hundred user DSQs.
85const MAX_RHT_NODES: u32 = 8192;
86
87/// Maximum number of buckets the rhashtable walker enumerates.
88/// `bucket_table.size` is normally a small power of two
89/// (rhashtable starts at 16, grows by 2x); a pathological torn
90/// read could surface a huge value. Caps the bucket walk at 64K to
91/// protect freeze-path latency.
92const MAX_RHT_BUCKETS: u32 = 65_536;
93
94/// Maximum nodes any single rhashtable bucket chain visits before
95/// bailing. A healthy rhashtable holds ~1 element per bucket on
96/// average; a pathological chain of 1024 entries in one bucket is
97/// orders of magnitude beyond legitimate use and almost certainly
98/// indicates a corrupted `next` chain or torn read. Bounded
99/// independently of [`MAX_RHT_NODES`] so a single runaway bucket
100/// cannot starve the walk's per-bucket budget on the way to the
101/// global cap. The condition `chain_visited < PER_BUCKET_CHAIN_CAP`
102/// admits exactly 1024 body executions: chain_visited starts at 0
103/// and increments inside the loop body, so the comparison reads
104/// 0,1,...,1023 across the 1024 iterations and exits on the next
105/// check (1024 < 1024 is false).
106const PER_BUCKET_CHAIN_CAP: u32 = 1024;
107
108/// Snapshot of one CPU's `struct rq.scx` state at freeze time.
109#[derive(Debug, Clone, Default, Serialize, Deserialize)]
110#[non_exhaustive]
111pub struct RqScxState {
112    /// CPU index (0-based) this state describes.
113    pub cpu: u32,
114    /// `rq->scx.nr_running`.
115    pub nr_running: u32,
116    /// `rq->scx.flags`.
117    pub flags: u32,
118    /// `rq->scx.cpu_released` — `true` when the kernel released
119    /// the CPU back to the BPF scheduler (see `scx_pre_release_cpu`
120    /// in kernel/sched/ext.c).
121    pub cpu_released: bool,
122    /// `rq->scx.ops_qseq`.
123    pub ops_qseq: u64,
124    /// `rq->scx.kick_sync` — the per-rq pick-next sequence counter.
125    /// Version-renamed in the kernel: named `pnt_seq` on v6.12–v6.18
126    /// and `kick_sync` on v6.19+. `ScxRqOffsets::from_btf` resolves
127    /// the offset via a `kick_sync`-then-`pnt_seq` fallback, so this
128    /// is Some (captured) across the supported range. None only when
129    /// neither name resolves (stripped BTF / sched_ext not built in).
130    /// Skipped on serde when None so those dumps stay tight.
131    #[serde(default, skip_serializing_if = "Option::is_none")]
132    pub kick_sync: Option<u64>,
133    /// `rq->scx.nr_immed` — count of ENQ_IMMED tasks on local_dsq.
134    /// A feature-branch (`for-7.1`) field, absent on every release
135    /// tag in the supported range (v6.12 → v7.0-rc5); the offset
136    /// resolves None there and the JSON elides the field. Distinct
137    /// provenance from [`Self::kick_sync`], which is present across
138    /// the range via its `pnt_seq` fallback.
139    #[serde(default, skip_serializing_if = "Option::is_none")]
140    pub nr_immed: Option<u32>,
141    /// `rq->scx.clock` — per-CPU scx_rq clock (the value
142    /// `scx_bpf_now()` returns) at the freeze instant. Optional
143    /// because the field was added by the `scx_bpf_now()` series in
144    /// v6.14 (commit 3a9910b5904d); v6.12 and v6.13 release kernels
145    /// have no equivalent member on `struct scx_rq`. None when the
146    /// BTF lookup of `rq->scx.clock` resolves absent — consumers
147    /// that need the value gate on Some.
148    #[serde(default, skip_serializing_if = "Option::is_none")]
149    pub rq_clock: Option<u64>,
150    /// `rq->curr->pid` — the currently-running task. `None` when
151    /// the curr pointer didn't translate (idle or torn read).
152    #[serde(default, skip_serializing_if = "Option::is_none")]
153    pub curr_pid: Option<i32>,
154    /// `rq->curr->comm`. Mirrors `curr_pid`.
155    #[serde(default, skip_serializing_if = "Option::is_none")]
156    pub curr_comm: Option<String>,
157    /// `task_struct` KVAs of every entry walked off
158    /// `rq->scx.runnable_list`. The freeze coordinator passes this
159    /// vec into the per-task enrichment capture so the same
160    /// task list drives both rq->scx state AND per-task records.
161    pub runnable_task_kvas: Vec<u64>,
162    /// True when the runnable_list walk hit the
163    /// `MAX_NODES_PER_LIST` safety cap before reaching the head
164    /// — typical only on a corrupted chain.
165    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
166    pub runnable_truncated: bool,
167}
168
169/// Snapshot of one DSQ's state — built-in (per-CPU local, per-CPU
170/// bypass, per-node global) or user-allocated.
171#[derive(Debug, Clone, Default, Serialize, Deserialize)]
172#[non_exhaustive]
173pub struct DsqState {
174    /// `scx_dispatch_q.id` — built-in DSQs use synthetic ids
175    /// (`SCX_DSQ_LOCAL`, `SCX_DSQ_GLOBAL` per-node); user DSQs
176    /// carry the BPF-allocated id.
177    pub id: u64,
178    /// Operator-facing tag describing where the DSQ came from:
179    /// `"local cpu N"`, `"bypass cpu N"`, `"global node N"`, or
180    /// `"user"`. Aligned with the kernel's own `scx_dump_state`
181    /// terminology where comparable.
182    pub origin: String,
183    /// `scx_dispatch_q.nr` — number of tasks currently queued.
184    pub nr: u32,
185    /// `scx_dispatch_q.seq` — BPF-iter seq counter, used by the
186    /// dual-snapshot delta to distinguish dead vs busy DSQs:
187    /// `Δnr=0 + Δseq=0` is a dead DSQ; `Δseq>>Δ(seq-nr)` indicates
188    /// unbounded queue growth.
189    pub seq: u32,
190    /// `task_struct` KVAs walked off the DSQ's `list_head`. Same
191    /// shape as [`RqScxState::runnable_task_kvas`] — feeds into
192    /// the same per-task enrichment pipeline.
193    pub task_kvas: Vec<u64>,
194    /// True when the DSQ list walk hit the
195    /// `MAX_NODES_PER_LIST` cap before reaching the head.
196    /// Distinct from `nr`: the kernel may report `nr` larger than
197    /// our walk cap on legitimately-deep DSQs.
198    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
199    pub truncated: bool,
200}
201
202/// Top-level scheduler state captured from `*scx_root`.
203#[derive(Debug, Clone, Default, Serialize, Deserialize)]
204#[non_exhaustive]
205pub struct ScxSchedState {
206    /// `scx_sched.aborting`. `true` when the scheduler is in the
207    /// abort path; `bypass_depth` typically rises here.
208    pub aborting: bool,
209    /// `scx_sched.bypass_depth`. Nesting depth of the bypass-mode
210    /// stack; non-zero means the kernel is dispatching tasks
211    /// without consulting the BPF scheduler.
212    pub bypass_depth: i32,
213    /// `scx_sched.exit_kind` — the SCX_EXIT_* enum value latched
214    /// at `scx_error()` time. 0 means no exit yet; non-zero values
215    /// match `enum scx_exit_kind` in `kernel/sched/ext_internal.h`
216    /// (e.g. `SCX_EXIT_ERROR = 1024`, `SCX_EXIT_ERROR_BPF = 1025`,
217    /// `SCX_EXIT_ERROR_STALL = 1026`).
218    pub exit_kind: u32,
219    /// `scx_sched.watchdog_timeout` (jiffies) at the snapshot
220    /// instant. `None` when the field was not captured — either
221    /// because the live `read_scx_sched_state` path was taken on a
222    /// kernel that still exposes `watchdog_timeout` only via the
223    /// monitor's `WatchdogOverride` plumbing (not as a BTF field on
224    /// every release), or because the BPF .bss fallback was used
225    /// without the snapshot var set. Some when populated via the
226    /// probe BPF .bss snapshot
227    /// (`ktstr_exit_watchdog_timeout`).
228    #[serde(default, skip_serializing_if = "Option::is_none")]
229    pub watchdog_timeout: Option<u64>,
230    /// Provenance tag identifying which path produced this state.
231    /// `None` for the default-built / serde-deserialized case where
232    /// the source isn't recorded; `Some("live")` when populated by
233    /// `read_scx_sched_state` reading `*scx_root` directly;
234    /// `Some("bss_snapshot")` when populated from the probe BPF
235    /// .bss snapshot fallback (the `ktstr_exit_*` vars). Lets the
236    /// dump consumer distinguish "scheduler was alive at freeze
237    /// time" from "scheduler had already torn down and we read the
238    /// pre-teardown snapshot the BPF probe latched".
239    #[serde(default, skip_serializing_if = "Option::is_none")]
240    pub source: Option<String>,
241    /// Kernel virtual address of the `scx_sched` instance these
242    /// values describe. `None` when not captured. Same provenance
243    /// rule as [`Self::source`]: live path stamps the resolved
244    /// `*scx_root` value; the BPF .bss snapshot stamps the
245    /// `ktstr_exit_sched_kva` field. Lets a consumer correlate
246    /// dumps across reloads (a different scx_sched instance has a
247    /// different KVA).
248    #[serde(default, skip_serializing_if = "Option::is_none")]
249    pub sched_kva: Option<u64>,
250}
251
252/// Walk one CPU's `rq->scx` state. Reads the scalar fields and
253/// the runnable_list, returning [`RqScxState`] plus a vec of
254/// [`TaskWalkerEntry`] entries for the per-task enrichment pipeline.
255///
256/// `cpu` is the 0-based CPU index; `rq_kva` and `rq_pa` address that
257/// CPU's `struct rq`. Caller resolves both via
258/// `runqueues + per_cpu_offset[cpu]` (KVA) plus the corresponding PA
259/// via `compute_rq_pas`.
260///
261/// Cap on visited nodes: `MAX_NODES_PER_LIST`. A truncated walk
262/// surfaces via [`RqScxState::runnable_truncated`].
263///
264/// Returns `None` when any of the offset sub-groups required for
265/// scalar reads (`rq`, `scx_rq`, `task`) is absent — the walker
266/// cannot synthesize partial scalars meaningfully without the rq /
267/// scx_rq base offsets. Per-CPU runnable_list walking additionally
268/// requires `see` (sched_ext_entity); when only `see` is missing the
269/// scalar capture still lands but the runnable_list walk yields
270/// nothing.
271#[allow(dead_code)]
272pub fn walk_rq_scx(
273    kernel: &GuestKernel,
274    cpu: u32,
275    rq_kva: u64,
276    rq_pa: u64,
277    offsets: &ScxWalkerOffsets,
278) -> Option<(RqScxState, Vec<TaskWalkerEntry>)> {
279    let rq_offs = offsets.rq.as_ref()?;
280    let scx_rq_offs = offsets.scx_rq.as_ref()?;
281    let task_offs = offsets.task.as_ref()?;
282
283    let mem = kernel.mem();
284    let walk = kernel.walk_context();
285
286    let scx_off = rq_offs.scx;
287
288    // Scalar reads off rq + scx_rq.
289    let nr_running = mem.read_u32(rq_pa, scx_off + scx_rq_offs.nr_running);
290    let flags = mem.read_u32(rq_pa, scx_off + scx_rq_offs.flags);
291    let cpu_released = mem.read_u8(rq_pa, scx_off + scx_rq_offs.cpu_released) != 0;
292    let ops_qseq = mem.read_u64(rq_pa, scx_off + scx_rq_offs.ops_qseq);
293    // nr_immed is a for-7.1 feature-branch field, absent on every
294    // release tag in the supported range (v6.12 → v7.0-rc5), so its
295    // offset resolves None there. kick_sync, by contrast, resolves
296    // Some across the range via the `pnt_seq` fallback in
297    // ScxRqOffsets::from_btf. Gate each read on Some so a truly-
298    // absent field (stripped BTF / sched_ext not built in) is not
299    // fabricated from rq_pa+0 (which would alias the local_dsq head
300    // pointer — a non-zero garbage read the dump would render as
301    // legitimate kernel state).
302    let kick_sync = scx_rq_offs
303        .kick_sync
304        .map(|off| mem.read_u64(rq_pa, scx_off + off));
305    let nr_immed = scx_rq_offs
306        .nr_immed
307        .map(|off| mem.read_u32(rq_pa, scx_off + off));
308    // rq->scx.clock added in v6.14 (commit 3a9910b5904d). Gate the
309    // read on Some(off): on v6.12/v6.13 the offset is None and the
310    // walker must NOT fall back to rq_pa+0 (would alias local_dsq's
311    // raw_spinlock — non-zero junk rendered as a legitimate clock
312    // reading). The downstream RqScxState carries an Option<u64> so
313    // the JSON elides scx_rq_clock on unsupported kernels.
314    let rq_clock = scx_rq_offs
315        .clock
316        .map(|off| mem.read_u64(rq_pa, scx_off + off));
317
318    // curr task — pointer follow.
319    let curr_kva = mem.read_u64(rq_pa, rq_offs.curr);
320    let (curr_pid, curr_comm) =
321        read_task_pid_comm(mem, walk, curr_kva, task_offs.pid, task_offs.comm);
322
323    // Walk runnable_list when sched_ext_entity offsets are available.
324    // Without `see` we can still report scalar state but cannot
325    // container_of a runnable_node back to its task_struct.
326    let (runnable_task_kvas, runnable_truncated) = if let Some(see_offs) = offsets.see.as_ref() {
327        let list_head_off = scx_off + scx_rq_offs.runnable_list;
328        let head_kva = rq_kva.wrapping_add(list_head_off as u64);
329        let head_pa = rq_pa.wrapping_add(list_head_off as u64);
330
331        // container_of offset within task_struct: each runnable_node
332        // is at task + task_struct.scx + see.runnable_node.
333        let runnable_node_off_in_task = task_offs.scx + see_offs.runnable_node;
334
335        walk_list_head_for_task_kvas(mem, walk, head_kva, head_pa, runnable_node_off_in_task)
336    } else {
337        (Vec::new(), false)
338    };
339
340    let walker_entries: Vec<TaskWalkerEntry> = runnable_task_kvas
341        .iter()
342        .map(|&task_kva| TaskWalkerEntry {
343            task_kva,
344            // Runnable on this CPU's scx — eligible for the
345            // pi_boosted_out_of_scx flag.
346            is_runnable_in_scx: true,
347            // running_pc only known for the curr task; the
348            // freeze coordinator can fill that via
349            // VcpuRegSnapshot.instruction_pointer at a higher
350            // level. The walker leaves it None.
351            running_pc: None,
352        })
353        .collect();
354
355    let state = RqScxState {
356        cpu,
357        nr_running,
358        flags,
359        cpu_released,
360        ops_qseq,
361        kick_sync,
362        nr_immed,
363        rq_clock,
364        curr_pid,
365        curr_comm,
366        runnable_task_kvas,
367        runnable_truncated,
368    };
369
370    Some((state, walker_entries))
371}
372
373/// Read scalar `scx_sched` fields off `*scx_root`.
374///
375/// `scx_root` is a kernel-text-mapped pointer at the resolved KVA;
376/// `*scx_root` points at the active `struct scx_sched`. Returns
377/// `None` when scx_root is unset (no scheduler attached), the read
378/// fails, or the `scx_sched` offset sub-group is missing from BTF.
379///
380/// Emits `tracing::debug!` at each gate that returns `None` so an
381/// operator parsing the failure-dump trace can pinpoint exactly
382/// where the read aborted: BTF sub-group missing, scx_root_kva
383/// zero, dereferenced sched_kva zero, or sched_kva translate
384/// failure.
385#[allow(dead_code)]
386pub fn read_scx_sched_state(
387    kernel: &GuestKernel,
388    scx_root_kva: u64,
389    offsets: &ScxWalkerOffsets,
390) -> Option<(u64, ScxSchedState)> {
391    let Some(sched_offs) = offsets.sched.as_ref() else {
392        tracing::debug!(
393            "read_scx_sched_state: ScxSchedOffsets BTF sub-group missing — \
394             vmlinux lacks `struct scx_sched` (kernel without sched_ext or stripped vmlinux)",
395        );
396        return None;
397    };
398
399    let mem = kernel.mem();
400    let walk = kernel.walk_context();
401
402    if scx_root_kva == 0 {
403        tracing::debug!(
404            "read_scx_sched_state: scx_root_kva is 0 — vmlinux had no \
405             `scx_root` symbol (pre-6.16 kernel or stripped vmlinux)",
406        );
407        return None;
408    }
409
410    let root_pa = kernel.text_kva_to_pa(scx_root_kva);
411    let sched_kva = mem.read_u64(root_pa, 0);
412    if sched_kva == 0 {
413        tracing::debug!(
414            scx_root_kva = format_args!("{:#x}", scx_root_kva),
415            root_pa = format_args!("{:#x}", root_pa),
416            "read_scx_sched_state: *scx_root == 0 — no scheduler attached at the freeze instant",
417        );
418        return None;
419    }
420    let Some(sched_pa) = translate_any_kva(
421        mem,
422        walk.cr3_pa,
423        walk.page_offset,
424        sched_kva,
425        walk.l5,
426        walk.tcr_el1,
427    ) else {
428        tracing::debug!(
429            sched_kva = format_args!("{:#x}", sched_kva),
430            "read_scx_sched_state: translate_any_kva failed for sched_kva — \
431             page-table walk yielded no PA (slab page race or torn read)",
432        );
433        return None;
434    };
435
436    // `aborting` and `bypass_depth` are dev-only fields (absent on
437    // every release tag in our supported range). Gate each read on
438    // its offset being Some — falling back to 0 / false matches the
439    // semantics of reading from a kernel that never had the field
440    // (no in-flight abort, no bypass nesting). The downstream
441    // ScxSchedState carries plain bool/i32 because those defaults
442    // are meaningful and serializable; an Option wrapper would just
443    // complicate every consumer for no extra signal on release
444    // kernels.
445    let aborting = sched_offs
446        .aborting
447        .map(|off| mem.read_u8(sched_pa, off) != 0)
448        .unwrap_or(false);
449    let bypass_depth = sched_offs
450        .bypass_depth
451        .map(|off| mem.read_u32(sched_pa, off) as i32)
452        .unwrap_or(0);
453    // `exit_kind` is `atomic_t`; the value lives in the `counter`
454    // field at offset 0 of atomic_t. We're already at the
455    // outer-struct offset of `exit_kind`, so a u32 read at that
456    // offset reads the `counter` directly. Mandatory on every
457    // kernel that has `scx_sched`.
458    let exit_kind = mem.read_u32(sched_pa, sched_offs.exit_kind);
459
460    Some((
461        sched_kva,
462        ScxSchedState {
463            aborting,
464            bypass_depth,
465            exit_kind,
466            // Live read from `*scx_root` doesn't capture
467            // `watchdog_timeout`. The BTF sub-group does not carry
468            // an offset for the field today (it would need to be
469            // added to `ScxSchedOffsets`), and the host tracks the
470            // configured timeout via the `WatchdogOverride` plumbing
471            // anyway. Leave None; the BPF .bss snapshot's
472            // `ktstr_exit_watchdog_timeout` path populates this when
473            // the live read is unavailable.
474            watchdog_timeout: None,
475            source: Some(SCX_SCHED_STATE_SOURCE_LIVE.to_string()),
476            sched_kva: Some(sched_kva),
477        },
478    ))
479}
480
481/// Provenance tag for [`ScxSchedState::source`] when the state was
482/// read directly from `*scx_root` via `read_scx_sched_state`. The
483/// scheduler was alive at freeze time and the host walked its slab
484/// page directly. Pinned as a constant so the dump's display layer
485/// and tests reference the same string without drift.
486pub const SCX_SCHED_STATE_SOURCE_LIVE: &str = "live";
487
488/// Provenance tag for [`ScxSchedState::source`] when the state was
489/// reconstructed from the probe BPF program's `.bss` snapshot
490/// (`ktstr_exit_*` vars). The scheduler had already torn down by
491/// freeze time (`*scx_root == 0`), so the live walker returned None
492/// and the host fell back to the snapshot the BPF tp_btf handler
493/// captured at err-exit time.
494pub const SCX_SCHED_STATE_SOURCE_BSS: &str = "bss_snapshot";
495
496/// `SCX_TASK_CURSOR` flag value (`1 << 31`) on `sched_ext_entity.flags`.
497/// Cursor entries are stack-allocated `sched_ext_entity` placeholders
498/// that `scx_task_iter_start` inserts
499/// into `scx_tasks` to mark the iterator's progress; they are NOT
500/// embedded in any `task_struct` so the global walker must skip them
501/// to avoid container_of producing a bogus task KVA. Pinned per
502/// `SCX_TASK_CURSOR` in `include/linux/sched/ext.h`.
503const SCX_TASK_CURSOR: u32 = 1 << 31;
504/// Walk the kernel's global `scx_tasks` LIST_HEAD and recover every
505/// task linked into it via `task_struct.scx.tasks_node`.
506///
507/// `scx_tasks` is `static LIST_HEAD(scx_tasks)` in
508/// `kernel/sched/ext.c`. Tasks are added in
509/// `scx_post_fork` (`list_add_tail(&p->scx.tasks_node, &scx_tasks)`)
510/// and removed in `sched_ext_dead`
511/// (`list_del_init(&p->scx.tasks_node)`). The list outlives the
512/// per-rq `runnable_list` because `scx_bypass`
513/// drains runnable_list during
514/// scheduler teardown without touching `scx_tasks` — making this
515/// the durable task source for failure-dump enrichment.
516///
517/// Cursor entries (`scx_task_iter_start` inserts a stack-allocated
518/// `sched_ext_entity` with `flags = SCX_TASK_CURSOR` into
519/// `scx_tasks` while iterating) are skipped via the
520/// `tasks_node_off_in_see` parameter — the walker reads
521/// `sched_ext_entity.flags` for each list entry and skips entries
522/// whose flag is set.
523///
524/// `scx_tasks_kva` is the symbol KVA of the global LIST_HEAD;
525/// `tasks_node_off_in_task` is the byte offset of `tasks_node`
526/// within `task_struct` (`task.scx + see.tasks_node`);
527/// `tasks_node_off_in_see` is the byte offset of `tasks_node`
528/// within `sched_ext_entity` (`see.tasks_node` alone — used to
529/// recover the see base for cursor-flag testing on entries that
530/// are not embedded in a `task_struct`); `flags_off_in_see` is the
531/// byte offset of `flags` within `sched_ext_entity`.
532///
533/// `kaslr_offset` is the runtime virtual KASLR slide: `scx_tasks_kva`
534/// is the LINK-TIME symbol value, but the list's `.next` pointers are
535/// runtime addresses (link + slide), so the loop terminator compares
536/// against the RUNTIME head `slid_kernel_kva(scx_tasks_kva, kaslr_offset)`.
537/// Pass `0` when KASLR is off / not yet derived (identity slide).
538///
539/// Returns an empty vec when `scx_tasks_kva` is 0 (symbol absent —
540/// stripped vmlinux or kernel without sched_ext) or when the list
541/// head reads as empty (tasks_node points at itself).
542///
543/// Termination: the canonical circular list closes through the runtime
544/// head on the normal path. A `HashSet` of visited nodes is the
545/// hostile-guest backstop that bounds a corrupt / non-canonical ring to
546/// its unique-node set; `MAX_NODES_PER_LIST` is a further hard cap.
547#[allow(dead_code)]
548pub fn walk_scx_tasks_global(
549    kernel: &GuestKernel,
550    scx_tasks_kva: u64,
551    kaslr_offset: u64,
552    tasks_node_off_in_task: usize,
553    tasks_node_off_in_see: usize,
554    flags_off_in_see: usize,
555) -> Vec<u64> {
556    if scx_tasks_kva == 0 {
557        tracing::debug!(
558            "walk_scx_tasks_global: scx_tasks_kva is 0 — vmlinux had no \
559             `scx_tasks` symbol (kernel without sched_ext or stripped vmlinux)",
560        );
561        return Vec::new();
562    }
563    let mem = kernel.mem();
564    let walk = kernel.walk_context();
565
566    // head_pa reads head.next via the LINK-TIME kva: text_kva_to_pa maps a
567    // link-time kernel-image kva to its PA (phys_base already accounts for
568    // the physical load address), so this correctly reads the LIST_HEAD's
569    // first field (list_head.next) regardless of KASLR.
570    let head_pa = kernel.text_kva_to_pa(scx_tasks_kva);
571    // The terminator, however, must compare against the RUNTIME head: the
572    // .next pointers stored in the list are runtime kvas (link + virtual
573    // KASLR slide), filled by the kernel's list_add_tail(&p->scx.tasks_node,
574    // &scx_tasks). slid_kernel_kva applies the high-half slide (identity
575    // when kaslr_offset == 0). Comparing runtime .next against the link-time
576    // head would never match under KASLR-on (the default), so the canonical
577    // ring would never close and the walk would fall back to the seen-set /
578    // count backstop on every boot — and would process the runtime head
579    // node itself (reading garbage `flags` from .data before &scx_tasks and
580    // emitting a phantom task). With the runtime head the loop terminates ON
581    // the head and never processes it.
582    let head_kva = crate::monitor::symbols::slid_kernel_kva(scx_tasks_kva, kaslr_offset);
583
584    let mut task_kvas: Vec<u64> = Vec::new();
585    let mut node_kva = mem.read_u64(head_pa, 0);
586    if node_kva == 0 {
587        tracing::debug!(
588            scx_tasks_kva = format_args!("{:#x}", scx_tasks_kva),
589            head_pa = format_args!("{:#x}", head_pa),
590            "walk_scx_tasks_global: head.next read as 0 — list-head bytes \
591             unmapped or torn read; no tasks harvested",
592        );
593        return task_kvas;
594    }
595
596    let mut visited: u32 = 0;
597    // Address-cycle backstop. The canonical list closes through the runtime
598    // head_kva (above) on the normal path. This HashSet bounds a genuinely
599    // corrupt / non-canonical ring — a torn read, or a final node.next that
600    // points at an interior node instead of the head — to its unique-node
601    // set rather than cycling to MAX_NODES_PER_LIST, the hostile-guest-safe
602    // list-walk contract. (Before the runtime-head fix this path fired on
603    // every KASLR-on boot, the symptom that surfaced the symbol bug.)
604    let mut seen: std::collections::HashSet<u64> = std::collections::HashSet::new();
605    while node_kva != head_kva {
606        if visited >= MAX_NODES_PER_LIST {
607            return task_kvas;
608        }
609        if !seen.insert(node_kva) {
610            tracing::debug!(
611                node_kva = format_args!("{:#x}", node_kva),
612                unique = seen.len(),
613                "walk_scx_tasks_global: revisited node — list does not close \
614                 through head_kva; stopping at the unique-node set rather than \
615                 cycling to MAX_NODES_PER_LIST",
616            );
617            return task_kvas;
618        }
619        visited += 1;
620
621        // Recover the sched_ext_entity base for this list entry so we
622        // can read its `flags`. For task-embedded entries this base is
623        // inside a task_struct (`task_kva + task.scx`); for cursor
624        // entries this base is a stack-allocated sched_ext_entity.
625        // Either way, `see_kva = node_kva - see.tasks_node`.
626        let see_kva = node_kva.wrapping_sub(tasks_node_off_in_see as u64);
627        let cursor = match translate_any_kva(
628            mem,
629            walk.cr3_pa,
630            walk.page_offset,
631            see_kva,
632            walk.l5,
633            walk.tcr_el1,
634        ) {
635            Some(see_pa) => {
636                let flags = mem.read_u32(see_pa, flags_off_in_see);
637                flags & SCX_TASK_CURSOR != 0
638            }
639            // Translate failure on the see base — be conservative and
640            // treat as not-cursor so the entry surfaces; downstream
641            // walk_task_enrichment will revalidate via translate and
642            // drop it cleanly if the address is bogus.
643            None => false,
644        };
645
646        if !cursor {
647            // container_of: task_kva = node_kva - tasks_node_off_in_task.
648            let task_kva = node_kva.wrapping_sub(tasks_node_off_in_task as u64);
649            task_kvas.push(task_kva);
650        }
651
652        // Advance to the next node via the list_head.next pointer
653        // at offset 0 of the tasks_node list_head.
654        let Some(node_pa) = translate_any_kva(
655            mem,
656            walk.cr3_pa,
657            walk.page_offset,
658            node_kva,
659            walk.l5,
660            walk.tcr_el1,
661        ) else {
662            return task_kvas;
663        };
664        let next_kva = mem.read_u64(node_pa, 0);
665        if next_kva == 0 {
666            return task_kvas;
667        }
668        node_kva = next_kva;
669    }
670    task_kvas
671}
672
673/// Walk every per-CPU local DSQ — the DSQs embedded in `rq->scx.local_dsq`.
674///
675/// This is a strict subset of [`walk_dsqs`]'s pass 1, extracted so
676/// the dump path can call it INDEPENDENTLY of `*scx_root`. Per-CPU
677/// local DSQs are kernel-initialized at boot (`init_dsq`, called for
678/// every possible CPU from `init_sched_ext_class` in the
679/// `__init` path), so they exist even when no scheduler is attached
680/// (`*scx_root == NULL`) and survive scheduler teardown's bypass
681/// drain.
682///
683/// Returns one `DsqState` per CPU whose translate succeeds, plus a
684/// flat vec of [`TaskWalkerEntry`] for the per-task enrichment
685/// pipeline (these entries carry `is_runnable_in_scx: false` —
686/// tasks queued on a DSQ are staged for dispatch, not yet runnable
687/// in the rq->scx sense).
688///
689/// `rq_kvas`, `rq_pas`, and `per_cpu_offsets` index by CPU id
690/// (parallel arrays, same shape `walk_rq_scx` consumes). The walker
691/// skips BSS-zero-tail CPUs by checking the per-CPU offset directly
692/// (`per_cpu_offsets[cpu] == 0 && cpu > 0`) — those entries fall out
693/// of un-written `__per_cpu_offset[]` slots past `nr_cpu_ids` and
694/// would otherwise surface a phantom DSQ row at the bare `runqueues`
695/// symbol KVA. Comparing the resolved `rq_kva` against `rq_kvas[0]`
696/// would miss the alias on x86_64 SMP: `setup_per_cpu_areas`
697/// (`arch/x86/kernel/setup_percpu.c`) writes
698/// `__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]` with
699/// `delta = pcpu_base_addr - __per_cpu_start` non-zero, so CPU 0's
700/// `rq_kva` is `runqueues + delta` while a BSS-zero-tail CPU's is
701/// `runqueues + 0` — the two differ and `rq_kva == rq_kvas[0]` would
702/// let the phantom row through. Mirrors the canonical `cpu_off == 0
703/// && cpu_index > 0` guard
704/// `super::bpf_map::read_percpu_array_value` applies for percpu
705/// reads, expressed against the same `__per_cpu_offset[]` array.
706///
707/// Empty arrays mean "no CPUs walked successfully"; the caller's
708/// freeze-path retry guard normally rejects empty inputs before
709/// reaching this pass.
710///
711/// `None` return when any required offset sub-group is missing
712/// (`rq`, `scx_rq`, `dsq`, `dsq_lnode`, `task`, `see` — the same
713/// leaf set [`walk_dsqs`]'s pass 1 needs). A partial offset set
714/// is the same gating condition that blinds every other DSQ pass.
715#[allow(dead_code)]
716pub fn walk_local_dsqs(
717    kernel: &GuestKernel,
718    rq_kvas: &[u64],
719    rq_pas: &[u64],
720    per_cpu_offsets: &[u64],
721    offsets: &ScxWalkerOffsets,
722) -> Option<(Vec<DsqState>, Vec<TaskWalkerEntry>)> {
723    let Some(rq_offs) = offsets.rq.as_ref() else {
724        tracing::debug!(
725            "walk_local_dsqs: ScxWalkerOffsets.rq sub-group missing — \
726             local DSQ pass blinded",
727        );
728        return None;
729    };
730    let Some(scx_rq_offs) = offsets.scx_rq.as_ref() else {
731        tracing::debug!(
732            "walk_local_dsqs: ScxWalkerOffsets.scx_rq sub-group missing — \
733             local DSQ pass blinded",
734        );
735        return None;
736    };
737    let Some(dsq_offs) = offsets.dsq.as_ref() else {
738        tracing::debug!(
739            "walk_local_dsqs: ScxWalkerOffsets.dsq sub-group missing — \
740             local DSQ pass blinded",
741        );
742        return None;
743    };
744    let Some(dsq_lnode_offs) = offsets.dsq_lnode.as_ref() else {
745        tracing::debug!(
746            "walk_local_dsqs: ScxWalkerOffsets.dsq_lnode sub-group missing — \
747             local DSQ pass blinded",
748        );
749        return None;
750    };
751    let Some(task_offs) = offsets.task.as_ref() else {
752        tracing::debug!(
753            "walk_local_dsqs: ScxWalkerOffsets.task sub-group missing — \
754             local DSQ pass blinded",
755        );
756        return None;
757    };
758    let Some(see_offs) = offsets.see.as_ref() else {
759        tracing::debug!(
760            "walk_local_dsqs: ScxWalkerOffsets.see sub-group missing — \
761             local DSQ pass blinded",
762        );
763        return None;
764    };
765
766    let mem = kernel.mem();
767    let walk = kernel.walk_context();
768
769    let mut states: Vec<DsqState> = Vec::new();
770    let mut entries: Vec<TaskWalkerEntry> = Vec::new();
771
772    for (cpu, (&rq_kva, &rq_pa)) in rq_kvas.iter().zip(rq_pas.iter()).enumerate() {
773        // BSS-zero-tail guard: kernel `setup_per_cpu_areas`
774        // only writes `__per_cpu_offset[cpu]` for CPUs in
775        // `for_each_possible_cpu`, leaving slots beyond
776        // `nr_cpu_ids` at the BSS-initialized 0. The caller
777        // builds rq_kvas via `runqueues + per_cpu_offset[cpu]`,
778        // so a BSS-zero-tail entry produces an rq_kva of
779        // `runqueues + 0` instead of CPU 0's
780        // `runqueues + __per_cpu_offset[0]`. The two are NOT
781        // equal on x86_64 SMP because
782        // `__per_cpu_offset[0] = pcpu_base_addr - __per_cpu_start`
783        // is non-zero (`arch/x86/kernel/setup_percpu.c`); a
784        // resolved-rq_kva comparison would let the phantom
785        // BSS-zero entry through. Check the per-CPU offset
786        // directly instead — `cpu_off == 0 && cpu > 0` is the
787        // canonical guard
788        // [`super::bpf_map::read_percpu_array_value`] uses for
789        // percpu reads and the matching guard at the per-CPU
790        // bypass DSQ pass below. A `per_cpu_offsets` slice
791        // shorter than `rq_kvas` (length-mismatched caller)
792        // is treated conservatively: an absent offset for
793        // `cpu > 0` skips the slot, since the walker can't
794        // distinguish a real CPU from a BSS-zero tail without
795        // the offset.
796        let cpu_off = per_cpu_offsets.get(cpu).copied();
797        match cpu_off {
798            Some(off) if off == 0 && cpu > 0 => continue,
799            None if cpu > 0 => continue,
800            _ => {}
801        }
802        let local_dsq_off = rq_offs.scx + scx_rq_offs.local_dsq;
803        let dsq_kva = rq_kva.wrapping_add(local_dsq_off as u64);
804        let dsq_pa = rq_pa.wrapping_add(local_dsq_off as u64);
805        if let Some((state, e)) = walk_one_dsq(
806            mem,
807            walk,
808            dsq_kva,
809            dsq_pa,
810            || format!("local cpu {cpu}"),
811            dsq_offs,
812            dsq_lnode_offs,
813            task_offs,
814            see_offs,
815        ) {
816            entries.extend(e);
817            states.push(state);
818        }
819    }
820
821    Some((states, entries))
822}
823
824/// Walk every DSQ reachable from a `scx_sched` (the bypass / global
825/// / user-hash passes — NOT per-CPU local DSQs) and produce one
826/// `DsqState` per DSQ plus a flat vec of `TaskWalkerEntry` rows for
827/// the per-task enrichment pipeline.
828///
829/// Per-CPU local DSQs (`rq->scx.local_dsq`) are NOT walked here —
830/// they live in each rq independently of `*scx_root`, so callers
831/// invoke [`walk_local_dsqs`] separately and unconditionally for
832/// the local pass. This split lets the dump path surface local DSQ
833/// state even when no scheduler is attached
834/// (`*scx_root == NULL`) — the local_dsq struct is initialized at
835/// boot per `init_dsq` (called from `init_sched_ext_class`) for every
836/// possible CPU, so it has well-defined contents long before any
837/// scheduler attaches.
838///
839/// Walks (in this order, gated on the relevant sub-group offsets
840/// being present):
841///   1. Per-CPU bypass DSQs at `scx_sched_pcpu.bypass_dsq` for
842///      every CPU (needs `sched`, `sched_pcpu`, plus the leaf set).
843///   2. Per-node global DSQs at `scx_sched.pnode[node]->global_dsq`
844///      for every NUMA node (needs `sched`, `sched_pnode`, plus
845///      leaf set).
846///   3. User-allocated DSQs walked through `scx_sched.dsq_hash`
847///      (needs `sched`, `rht`, plus leaf set).
848///
849/// Each pass is independent: missing offsets for one pass blind
850/// only that pass. A translate failure on one DSQ leaves it out of
851/// the result without affecting the others.
852#[allow(dead_code)]
853pub fn walk_dsqs(
854    kernel: &GuestKernel,
855    sched_pa: u64,
856    per_cpu_offsets: &[u64],
857    nr_nodes: u32,
858    offsets: &ScxWalkerOffsets,
859) -> (Vec<DsqState>, Vec<TaskWalkerEntry>) {
860    let mem = kernel.mem();
861    let walk = kernel.walk_context();
862
863    let mut dsq_states: Vec<DsqState> = Vec::new();
864    let mut all_entries: Vec<TaskWalkerEntry> = Vec::new();
865
866    // Leaf offsets common to every pass — all three DSQ-walking
867    // passes feed `walk_one_dsq` which needs these. If any leaf
868    // group is missing, no pass can run.
869    let (Some(dsq_offs), Some(dsq_lnode_offs), Some(task_offs), Some(see_offs)) = (
870        offsets.dsq.as_ref(),
871        offsets.dsq_lnode.as_ref(),
872        offsets.task.as_ref(),
873        offsets.see.as_ref(),
874    ) else {
875        return (dsq_states, all_entries);
876    };
877
878    // Pass 1: per-CPU bypass DSQs. The percpu base lives at
879    // sched->pcpu, dereferenced as a __percpu pointer; each CPU's
880    // address is `pcpu_base + per_cpu_offset[cpu] +
881    // scx_sched_pcpu.bypass_dsq`.
882    //
883    // Both `sched_offs.pcpu` (v6.18+) and `pcpu_offs.bypass_dsq`
884    // (dev-only) are kernel-version-gated. Skip the entire pass
885    // unless both offsets resolved — partial state would compute
886    // a bogus DSQ KVA from `sched_pa + 0` (aliasing dsq_hash) and
887    // surface phantom DSQ entries.
888    if let (Some(sched_offs), Some(pcpu_offs)) =
889        (offsets.sched.as_ref(), offsets.sched_pcpu.as_ref())
890        && let (Some(sched_pcpu_off), Some(bypass_dsq_off)) =
891            (sched_offs.pcpu, pcpu_offs.bypass_dsq)
892    {
893        let pcpu_kva = mem.read_u64(sched_pa, sched_pcpu_off);
894        if pcpu_kva != 0 {
895            for (cpu, &cpu_off) in per_cpu_offsets.iter().enumerate() {
896                // Skip out-of-range CPUs — same heuristic as
897                // read_percpu_array_value (cpu_off==0 && cpu_index>0
898                // means BSS-zero tail).
899                if cpu_off == 0 && cpu > 0 {
900                    continue;
901                }
902                let dsq_kva = pcpu_kva
903                    .wrapping_add(cpu_off)
904                    .wrapping_add(bypass_dsq_off as u64);
905                if let Some(dsq_pa) = translate_any_kva(
906                    mem,
907                    walk.cr3_pa,
908                    walk.page_offset,
909                    dsq_kva,
910                    walk.l5,
911                    walk.tcr_el1,
912                ) && let Some((state, entries)) = walk_one_dsq(
913                    mem,
914                    walk,
915                    dsq_kva,
916                    dsq_pa,
917                    || format!("bypass cpu {cpu}"),
918                    dsq_offs,
919                    dsq_lnode_offs,
920                    task_offs,
921                    see_offs,
922                ) {
923                    all_entries.extend(entries);
924                    dsq_states.push(state);
925                }
926            }
927        }
928    }
929
930    // Pass 2: per-node global DSQs. `sched->pnode` is a pointer
931    // to an array of `struct scx_sched_pnode *` of length nr_nodes.
932    // Both `sched_offs.pnode` and `pnode_offs.global_dsq` are
933    // dev-only — skip the pass unless both resolved.
934    if let (Some(sched_offs), Some(pnode_offs)) =
935        (offsets.sched.as_ref(), offsets.sched_pnode.as_ref())
936        && let (Some(sched_pnode_off), Some(global_dsq_off)) =
937            (sched_offs.pnode, pnode_offs.global_dsq)
938    {
939        let pnode_kva = mem.read_u64(sched_pa, sched_pnode_off);
940        if pnode_kva != 0
941            && let Some(pnode_arr_pa) = translate_any_kva(
942                mem,
943                walk.cr3_pa,
944                walk.page_offset,
945                pnode_kva,
946                walk.l5,
947                walk.tcr_el1,
948            )
949        {
950            for node in 0..nr_nodes as u64 {
951                let pnode_ptr_kva = mem.read_u64(pnode_arr_pa, (node * 8) as usize);
952                if pnode_ptr_kva == 0 {
953                    continue;
954                }
955                let Some(pnode_pa) = translate_any_kva(
956                    mem,
957                    walk.cr3_pa,
958                    walk.page_offset,
959                    pnode_ptr_kva,
960                    walk.l5,
961                    walk.tcr_el1,
962                ) else {
963                    continue;
964                };
965                let dsq_kva = pnode_ptr_kva.wrapping_add(global_dsq_off as u64);
966                let dsq_pa = pnode_pa.wrapping_add(global_dsq_off as u64);
967                if let Some((state, entries)) = walk_one_dsq(
968                    mem,
969                    walk,
970                    dsq_kva,
971                    dsq_pa,
972                    || format!("global node {node}"),
973                    dsq_offs,
974                    dsq_lnode_offs,
975                    task_offs,
976                    see_offs,
977                ) {
978                    all_entries.extend(entries);
979                    dsq_states.push(state);
980                }
981            }
982        }
983    }
984
985    // Pass 3: user-allocated DSQs via the scx_sched.dsq_hash
986    // rhashtable. Walks at most MAX_RHT_NODES nodes total across
987    // all buckets.
988    if let (Some(sched_offs), Some(rht_offs)) = (offsets.sched.as_ref(), offsets.rht.as_ref()) {
989        // dsq_hash is embedded in scx_sched (not a pointer), so its
990        // PA is the sched_pa with the field offset added directly —
991        // same pattern Pass 2 uses for pnode->global_dsq. Computing a
992        // KVA here would require sched_kva which the caller already
993        // discarded; translating sched_pa as a KVA would underflow
994        // page_offset and silently empty the user-DSQ list.
995        let rht_pa = sched_pa.wrapping_add(sched_offs.dsq_hash as u64);
996        let (user_dsqs, user_dsqs_truncated) =
997            walk_user_dsq_hash(mem, walk, rht_pa, rht_offs, dsq_offs);
998        if user_dsqs_truncated {
999            // Surface the cap-hit so an operator parsing the
1000            // failure dump trace sees that the user-DSQ list is
1001            // partial. Without this log the dump silently
1002            // omits the tail of the dsq_hash bucket table or
1003            // the tail of one bucket's chain.
1004            tracing::warn!(
1005                visited = user_dsqs.len(),
1006                cap_buckets = MAX_RHT_BUCKETS,
1007                cap_nodes = MAX_RHT_NODES,
1008                "walk_user_dsq_hash: truncated — bucket-table or node cap fired; \
1009                 dsq_kvas list is incomplete",
1010            );
1011        }
1012        for dsq_kva in user_dsqs {
1013            let Some(dsq_pa) = translate_any_kva(
1014                mem,
1015                walk.cr3_pa,
1016                walk.page_offset,
1017                dsq_kva,
1018                walk.l5,
1019                walk.tcr_el1,
1020            ) else {
1021                continue;
1022            };
1023            if let Some((state, entries)) = walk_one_dsq(
1024                mem,
1025                walk,
1026                dsq_kva,
1027                dsq_pa,
1028                || "user".to_string(),
1029                dsq_offs,
1030                dsq_lnode_offs,
1031                task_offs,
1032                see_offs,
1033            ) {
1034                all_entries.extend(entries);
1035                dsq_states.push(state);
1036            }
1037        }
1038    }
1039
1040    (dsq_states, all_entries)
1041}
1042
1043/// Walk one `scx_dispatch_q`. Returns the DSQ scalar state plus the
1044/// task entries on its `list`.
1045///
1046/// `origin` is taken as a `FnOnce` closure so the per-call
1047/// `format!("local cpu {cpu}")` / `format!("bypass cpu {cpu}")` /
1048/// `format!("global node {node}")` heap allocation only fires
1049/// after the `dsq_pa == 0` early-out has been cleared. Eagerly
1050/// formatting at every caller wasted one short-string allocation
1051/// per skipped DSQ on every freeze.
1052///
1053/// Returns `None` when `dsq_pa == 0`. Reading at PA 0 would
1054/// surface the boot-page contents as DSQ scalars (`id`, `nr`,
1055/// `seq`) and an all-zero list-head as an apparently-empty queue
1056/// — indistinguishable from a real empty DSQ. The early check
1057/// rejects that case so the caller does not push a phantom
1058/// DsqState row built from PA-0 garbage.
1059#[allow(clippy::too_many_arguments)]
1060fn walk_one_dsq(
1061    mem: &GuestMem,
1062    walk: WalkContext,
1063    dsq_kva: u64,
1064    dsq_pa: u64,
1065    origin: impl FnOnce() -> String,
1066    dsq_offs: &super::btf_offsets::ScxDispatchQOffsets,
1067    dsq_lnode_offs: &super::btf_offsets::ScxDsqListNodeOffsets,
1068    task_offs: &super::btf_offsets::TaskStructCoreOffsets,
1069    see_offs: &super::btf_offsets::SchedExtEntityOffsets,
1070) -> Option<(DsqState, Vec<TaskWalkerEntry>)> {
1071    if dsq_pa == 0 {
1072        tracing::debug!(
1073            dsq_kva = format_args!("{:#x}", dsq_kva),
1074            "walk_one_dsq: dsq_pa == 0 — would alias the boot page; \
1075             skipping to avoid surfacing phantom all-zero DSQ state",
1076        );
1077        return None;
1078    }
1079    let origin = origin();
1080    let id = mem.read_u64(dsq_pa, dsq_offs.id);
1081    let nr = mem.read_u32(dsq_pa, dsq_offs.nr);
1082    let seq = mem.read_u32(dsq_pa, dsq_offs.seq);
1083
1084    // List head at dsq + list.
1085    let head_kva = dsq_kva.wrapping_add(dsq_offs.list as u64);
1086    let head_pa = dsq_pa.wrapping_add(dsq_offs.list as u64);
1087
1088    // The DSQ list links sched_ext_entity.dsq_list.node fields
1089    // (struct list_head inside scx_dsq_list_node inside
1090    // sched_ext_entity inside task_struct). container_of computes:
1091    //   task_kva = node_kva
1092    //            - task.scx
1093    //            - see.dsq_list
1094    //            - dsq_lnode.node
1095    let dsq_node_off_in_task = task_offs.scx + see_offs.dsq_list + dsq_lnode_offs.node;
1096
1097    let (task_kvas, truncated) = walk_list_head_for_dsq_task_kvas(
1098        mem,
1099        walk,
1100        head_kva,
1101        head_pa,
1102        dsq_node_off_in_task,
1103        dsq_lnode_offs,
1104    );
1105
1106    let entries: Vec<TaskWalkerEntry> = task_kvas
1107        .iter()
1108        .map(|&task_kva| TaskWalkerEntry {
1109            task_kva,
1110            // Tasks queued on a DSQ are NOT on the per-CPU
1111            // runnable_list — they're staged for dispatch but not
1112            // yet runnable in the rq->scx sense. The
1113            // pi_boosted_out_of_scx flag only fires for
1114            // runnable_list tasks (the scenario it diagnoses is a
1115            // task that should have left the runnable_list when
1116            // its sched_class changed but didn't).
1117            is_runnable_in_scx: false,
1118            running_pc: None,
1119        })
1120        .collect();
1121
1122    Some((
1123        DsqState {
1124            id,
1125            origin,
1126            nr,
1127            seq,
1128            task_kvas,
1129            truncated,
1130        },
1131        entries,
1132    ))
1133}
1134
1135/// Walk a generic `list_head` chain starting at `head_kva`/`head_pa`,
1136/// recovering each task_struct KVA via container_of with
1137/// `runnable_node_off_in_task` as the field offset within
1138/// task_struct.
1139///
1140/// Returns (task_kvas, truncated). `truncated` is true when the
1141/// MAX_NODES_PER_LIST cap kicked in before the walk closed back to
1142/// the head.
1143fn walk_list_head_for_task_kvas(
1144    mem: &GuestMem,
1145    walk: WalkContext,
1146    head_kva: u64,
1147    head_pa: u64,
1148    runnable_node_off_in_task: usize,
1149) -> (Vec<u64>, bool) {
1150    let mut task_kvas = Vec::new();
1151    let mut node_kva = mem.read_u64(head_pa, 0);
1152    if node_kva == 0 {
1153        return (task_kvas, false);
1154    }
1155
1156    let mut visited: u32 = 0;
1157    // Address-cycle backstop (see module doc): a corrupt / non-canonical
1158    // ring whose final .next points at an interior node instead of the
1159    // head bounds to its unique-node set and flags the partial walk,
1160    // rather than cycling to MAX_NODES_PER_LIST.
1161    let mut seen: std::collections::HashSet<u64> = std::collections::HashSet::new();
1162    while node_kva != head_kva {
1163        if visited >= MAX_NODES_PER_LIST {
1164            return (task_kvas, true);
1165        }
1166        if !seen.insert(node_kva) {
1167            return (task_kvas, true);
1168        }
1169        visited += 1;
1170
1171        // container_of: task_kva = node_kva - runnable_node_off_in_task
1172        let task_kva = node_kva.wrapping_sub(runnable_node_off_in_task as u64);
1173        task_kvas.push(task_kva);
1174
1175        // Step to next node — translate node_kva, read .next at offset 0.
1176        let Some(node_pa) = translate_any_kva(
1177            mem,
1178            walk.cr3_pa,
1179            walk.page_offset,
1180            node_kva,
1181            walk.l5,
1182            walk.tcr_el1,
1183        ) else {
1184            return (task_kvas, false);
1185        };
1186        let next_kva = mem.read_u64(node_pa, 0);
1187        if next_kva == 0 {
1188            return (task_kvas, false);
1189        }
1190        node_kva = next_kva;
1191    }
1192    (task_kvas, false)
1193}
1194
1195/// Walk a DSQ's `list` chain (a list of `scx_dsq_list_node.node`
1196/// entries embedded in `sched_ext_entity.dsq_list`). Skips iterator
1197/// cursor entries marked with `SCX_DSQ_LNODE_ITER_CURSOR`.
1198fn walk_list_head_for_dsq_task_kvas(
1199    mem: &GuestMem,
1200    walk: WalkContext,
1201    head_kva: u64,
1202    head_pa: u64,
1203    dsq_node_off_in_task: usize,
1204    dsq_lnode_offs: &super::btf_offsets::ScxDsqListNodeOffsets,
1205) -> (Vec<u64>, bool) {
1206    let mut task_kvas = Vec::new();
1207    let mut node_kva = mem.read_u64(head_pa, 0);
1208    if node_kva == 0 {
1209        return (task_kvas, false);
1210    }
1211
1212    let mut visited: u32 = 0;
1213    // Address-cycle backstop (see module doc): a corrupt / non-canonical
1214    // DSQ ring bounds to its unique-node set and flags the partial walk,
1215    // rather than cycling to MAX_NODES_PER_LIST.
1216    let mut seen: std::collections::HashSet<u64> = std::collections::HashSet::new();
1217    while node_kva != head_kva {
1218        if visited >= MAX_NODES_PER_LIST {
1219            return (task_kvas, true);
1220        }
1221        if !seen.insert(node_kva) {
1222            return (task_kvas, true);
1223        }
1224        visited += 1;
1225
1226        // The list_head we're walking is `scx_dsq_list_node.node`
1227        // (the inner `struct list_head`). Recover the parent
1228        // scx_dsq_list_node start by subtracting `dsq_lnode.node`
1229        // — fixed at 0 in current kernels but we read it from the
1230        // offsets struct for forward-compatibility.
1231        let lnode_kva = node_kva.wrapping_sub(dsq_lnode_offs.node as u64);
1232
1233        // Read the lnode's flags to skip iterator-cursor entries.
1234        // is_cursor: Some(true) → cursor (skip), Some(false) → real
1235        // task entry (push), None → translate failed and we cannot
1236        // distinguish. When the cursor flag cannot be read, treat
1237        // the entry as a cursor and skip it: pushing it would
1238        // record a phantom `task_kva = node_kva - dsq_node_off_in_task`
1239        // built from a node whose enclosing sched_ext_entity isn't
1240        // mappable, which downstream task enrichment would surface
1241        // as bogus pid/comm reads at an arbitrary address.
1242        let is_cursor = match translate_any_kva(
1243            mem,
1244            walk.cr3_pa,
1245            walk.page_offset,
1246            lnode_kva,
1247            walk.l5,
1248            walk.tcr_el1,
1249        ) {
1250            Some(lnode_pa) => {
1251                let lnode_flags = mem.read_u32(lnode_pa, dsq_lnode_offs.flags);
1252                Some(lnode_flags & SCX_DSQ_LNODE_ITER_CURSOR != 0)
1253            }
1254            None => None,
1255        };
1256
1257        let skip_entry = match is_cursor {
1258            Some(true) => true,   // cursor entry — advance without recording
1259            Some(false) => false, // real task entry — push and advance
1260            None => true,         // cursor-detection unreliable — skip rather than push bogus
1261        };
1262
1263        if !skip_entry {
1264            // Real task entry: container_of from the inner list_head's
1265            // node_kva back to task_struct. The full offset within
1266            // task_struct is task.scx + see.dsq_list + dsq_lnode.node.
1267            let task_kva = node_kva.wrapping_sub(dsq_node_off_in_task as u64);
1268            task_kvas.push(task_kva);
1269        }
1270
1271        // Advance to the next node. The list_head.next pointer
1272        // lives at offset 0 of the inner list_head we landed on,
1273        // which is `node_kva` itself.
1274        let Some(node_pa) = translate_any_kva(
1275            mem,
1276            walk.cr3_pa,
1277            walk.page_offset,
1278            node_kva,
1279            walk.l5,
1280            walk.tcr_el1,
1281        ) else {
1282            return (task_kvas, false);
1283        };
1284        let next_kva = mem.read_u64(node_pa, 0);
1285        if next_kva == 0 {
1286            return (task_kvas, false);
1287        }
1288        node_kva = next_kva;
1289    }
1290    (task_kvas, false)
1291}
1292
1293/// Walk the user-allocated DSQ rhashtable rooted at `rht_pa`.
1294///
1295/// `rht_pa` is the DRAM-relative offset of the embedded
1296/// `struct rhashtable` inside `scx_sched.dsq_hash`. The caller
1297/// computes it as `sched_pa + sched.dsq_hash` — the field is embedded
1298/// (not a pointer), so its PA is just the containing struct's PA plus
1299/// the field offset. The walker reads `tbl` (bucket_table pointer),
1300/// then for each of `tbl.buckets[i]` it strips the LSB tag
1301/// (`RHT_PTR_LOCK_BIT`) and chases the `rhash_head.next` chain. For
1302/// each node the walker computes
1303/// `dsq_kva = node_kva - scx_dispatch_q.hash_node` (container_of).
1304///
1305/// Caps:
1306/// - bucket count at [`MAX_RHT_BUCKETS`]
1307/// - total nodes visited at [`MAX_RHT_NODES`]
1308/// - per-bucket chain length at [`PER_BUCKET_CHAIN_CAP`]
1309///
1310/// Returns `(dsq_kvas, truncated)`. `truncated` is `true` when any
1311/// cap fired before the walk could reach the natural end of every
1312/// bucket chain — either `bucket_table.size > MAX_RHT_BUCKETS`,
1313/// `total_nodes >= MAX_RHT_NODES` mid-walk, or a per-bucket chain
1314/// reached its `PER_BUCKET_CHAIN_CAP` cap. Without this signal,
1315/// callers cannot distinguish "small DSQ count" from "cap silently
1316/// dropped tail entries" — see DsqState.truncated for the same
1317/// pattern on per-DSQ task lists.
1318fn walk_user_dsq_hash(
1319    mem: &GuestMem,
1320    walk: WalkContext,
1321    rht_pa: u64,
1322    rht_offs: &super::btf_offsets::RhashtableOffsets,
1323    dsq_offs: &super::btf_offsets::ScxDispatchQOffsets,
1324) -> (Vec<u64>, bool) {
1325    let mut dsq_kvas = Vec::new();
1326
1327    let tbl_kva = mem.read_u64(rht_pa, rht_offs.tbl);
1328    if tbl_kva == 0 {
1329        return (dsq_kvas, false);
1330    }
1331    let Some(tbl_pa) = translate_any_kva(
1332        mem,
1333        walk.cr3_pa,
1334        walk.page_offset,
1335        tbl_kva,
1336        walk.l5,
1337        walk.tcr_el1,
1338    ) else {
1339        return (dsq_kvas, false);
1340    };
1341
1342    let size = mem.read_u32(tbl_pa, rht_offs.bucket_table_size);
1343    let bucket_count = size.min(MAX_RHT_BUCKETS) as u64;
1344    // A bucket_table.size larger than the bucket cap means we'll
1345    // walk only the first MAX_RHT_BUCKETS buckets and the tail is
1346    // silently dropped. Surface that as truncation up front.
1347    let mut truncated = size as u64 > bucket_count;
1348    let buckets_off = rht_offs.bucket_table_buckets;
1349
1350    let mut total_nodes: u32 = 0;
1351    for i in 0..bucket_count {
1352        if total_nodes >= MAX_RHT_NODES {
1353            // Hit the global node cap — remaining buckets unwalked.
1354            return (dsq_kvas, true);
1355        }
1356        let entry_off = buckets_off + (i as usize) * 8;
1357        let raw_ptr = mem.read_u64(tbl_pa, entry_off);
1358        // Strip the LSB lock-bit tag. NULL or pure tag (0 with
1359        // bit 0 unset) means empty bucket.
1360        let head_kva = raw_ptr & !RHT_PTR_LOCK_BIT;
1361        if head_kva == 0 {
1362            continue;
1363        }
1364        // Chase the `rhash_head.next` chain. Each node is a
1365        // `rhash_head` embedded in scx_dispatch_q at
1366        // `hash_node`; container_of yields the dsq KVA.
1367        let mut node_kva = head_kva;
1368        let mut chain_visited: u32 = 0;
1369        let mut chain_terminated_naturally = false;
1370        while node_kva != 0 && total_nodes < MAX_RHT_NODES && chain_visited < PER_BUCKET_CHAIN_CAP {
1371            chain_visited += 1;
1372            total_nodes += 1;
1373            let dsq_kva = node_kva.wrapping_sub(dsq_offs.hash_node as u64);
1374            dsq_kvas.push(dsq_kva);
1375            let Some(node_pa) = translate_any_kva(
1376                mem,
1377                walk.cr3_pa,
1378                walk.page_offset,
1379                node_kva,
1380                walk.l5,
1381                walk.tcr_el1,
1382            ) else {
1383                // Translate failure — chain ended for this bucket
1384                // without hitting a cap. Not a truncation signal.
1385                chain_terminated_naturally = true;
1386                break;
1387            };
1388            let next_raw = mem.read_u64(node_pa, rht_offs.rhash_head_next);
1389            // The chain terminator is a "nulls" pointer with bit 0
1390            // set encoding the bucket index; treat any LSB-tagged
1391            // pointer as terminator.
1392            if next_raw & RHT_PTR_LOCK_BIT != 0 || next_raw == 0 {
1393                chain_terminated_naturally = true;
1394                break;
1395            }
1396            node_kva = next_raw;
1397        }
1398        // The loop exited; if it wasn't via a natural terminator
1399        // (LSB-tagged pointer / NULL / translate failure), one of
1400        // the two caps fired (chain_visited >= PER_BUCKET_CHAIN_CAP
1401        // or total_nodes >= MAX_RHT_NODES) and we silently dropped
1402        // the rest of this bucket's chain.
1403        if !chain_terminated_naturally {
1404            truncated = true;
1405        }
1406    }
1407
1408    (dsq_kvas, truncated)
1409}
1410
1411/// Read `(pid, comm)` for a `task_struct *` after a NULL-check and
1412/// translate. Returns `(None, None)` on NULL or untranslatable.
1413fn read_task_pid_comm(
1414    mem: &GuestMem,
1415    walk: WalkContext,
1416    task_kva: u64,
1417    pid_off: usize,
1418    comm_off: usize,
1419) -> (Option<i32>, Option<String>) {
1420    if task_kva == 0 {
1421        return (None, None);
1422    }
1423    let Some(task_pa) = translate_any_kva(
1424        mem,
1425        walk.cr3_pa,
1426        walk.page_offset,
1427        task_kva,
1428        walk.l5,
1429        walk.tcr_el1,
1430    ) else {
1431        return (None, None);
1432    };
1433    let pid = mem.read_u32(task_pa, pid_off) as i32;
1434    let mut buf = [0u8; 16];
1435    mem.read_bytes(task_pa + comm_off as u64, &mut buf);
1436    let n = buf.iter().position(|&b| b == 0).unwrap_or(16);
1437    let comm = String::from_utf8_lossy(&buf[..n]).to_string();
1438    (Some(pid), Some(comm))
1439}
1440
1441#[cfg(test)]
1442#[path = "scx_walker_tests.rs"]
1443mod tests;