ktstr/monitor/scx_walker.rs
1//! Host-side rq->scx + DSQ + task enumeration walkers for the
2//! failure dump.
3//!
4//! Entry points:
5//!
6//! 1. [`walk_rq_scx`] — for one CPU's `struct rq.scx`, captures the
7//! scalar fields the kernel's own `scx_dump_state` reads
8//! (nr_running, flags, cpu_released, ops_qseq, kick_sync) plus
9//! nr_immed, clock, and the curr task pid+comm. Walks
10//! `rq->scx.runnable_list` and emits a list of
11//! [`super::dump::TaskWalkerEntry`] tuples for each runnable task —
12//! these feed directly into the per-task enrichment capture
13//! pipeline.
14//!
15//! 2. [`walk_local_dsqs`] — per-CPU local DSQs at
16//! `rq->scx.local_dsq`. Runs unconditionally — local DSQs are
17//! initialized at boot (`init_dsq`, called for every possible CPU
18//! from `init_sched_ext_class`) and exist whether or not a scheduler
19//! is attached, so this surfaces local-DSQ state even when
20//! `*scx_root == 0`.
21//!
22//! 3. [`walk_dsqs`] — sched-rooted DSQs reachable from `*scx_root`
23//! (excluding per-CPU local DSQs, which [`walk_local_dsqs`]
24//! handles separately):
25//! - per-CPU bypass DSQs via `scx_sched_pcpu.bypass_dsq`
26//! - per-node global DSQs via `scx_sched.pnode[node]->global_dsq`
27//! - user-allocated DSQs via the `scx_sched.dsq_hash` rhashtable
28//!
29//! For each DSQ captures the scalar state (id, nr, seq) and walks
30//! its `list_head` to enumerate queued tasks. The kernel's own
31//! `scx_dump_state` does NOT enumerate per-DSQ depths — this
32//! walker surfaces queue depth and per-task ordering that the
33//! in-tree dump path does not.
34//!
35//! 4. [`walk_scx_tasks_global`] — walks the kernel's global
36//! `scx_tasks` LIST_HEAD via each task's `scx.tasks_node`.
37//! Surfaces every task owned by an scx_sched, surviving the
38//! per-rq runnable_list drain that scheduler teardown
39//! (`scx_bypass`) triggers.
40//!
41//! All walkers are best-effort: any address that fails to translate
42//! (slab page race, PA out of bounds) yields a partial result rather
43//! than aborting. Cycle protection on every `list_head` walk is
44//! two-layer: a `HashSet` of visited node addresses breaks on the
45//! first revisited node (a corrupt / non-canonical ring), with
46//! `MAX_NODES_PER_LIST` as a hard count backstop; the rhashtable walk
47//! caps total bucket-table chain length at `MAX_RHT_NODES`. List heads
48//! that are static kernel symbols (`scx_tasks`) are slid by the runtime
49//! virtual KASLR offset before the terminator comparison, since the
50//! list's `.next` pointers are runtime addresses.
51//!
52//! # Lock-free reads
53//!
54//! These walkers run from the freeze coordinator after the vCPU
55//! rendezvous. All vCPUs are parked at a known KVM exit; the host
56//! reads guest memory directly with no in-guest synchronization. The
57//! kernel-side locks (`scx_dispatch_q.lock` raw_spinlock,
58//! `rhashtable.mutex`) are not honored — the freeze rendezvous IS
59//! the synchronization. A torn read can still happen if a vCPU was
60//! mid-write at the freeze instant; the walker treats torn results
61//! as best-effort partial output.
62
63use serde::{Deserialize, Serialize};
64
65use super::btf_offsets::{RHT_PTR_LOCK_BIT, SCX_DSQ_LNODE_ITER_CURSOR, ScxWalkerOffsets};
66use super::dump::TaskWalkerEntry;
67use super::guest::GuestKernel;
68use super::idr::translate_any_kva;
69use super::reader::{GuestMem, WalkContext};
70
71/// Maximum entries any single list_head walk visits before bailing
72/// with what's been collected. Bounds CPU + memory cost on a
73/// corrupt-pointer chain that loops back on itself or runs into
74/// arbitrary slab. 4096 is generous: real per-CPU runnable_list has
75/// at most ~num_threads entries on a given CPU, capped well below
76/// this; user DSQs can in principle hold millions of tasks but the
77/// per-DSQ walker still bails at this cap so a million-entry DSQ
78/// surfaces 4096 task entries plus the `nr` count (truncation
79/// surfaces via `truncated: true` on [`DsqState`]).
80const MAX_NODES_PER_LIST: u32 = 4096;
81
82/// Maximum total node visits across all rhashtable buckets in the
83/// `dsq_hash` walk. Bounds the cost of a runaway bucket chain.
84/// Mainline ScxLib creates at most a few hundred user DSQs.
85const MAX_RHT_NODES: u32 = 8192;
86
87/// Maximum number of buckets the rhashtable walker enumerates.
88/// `bucket_table.size` is normally a small power of two
89/// (rhashtable starts at 16, grows by 2x); a pathological torn
90/// read could surface a huge value. Caps the bucket walk at 64K to
91/// protect freeze-path latency.
92const MAX_RHT_BUCKETS: u32 = 65_536;
93
94/// Maximum nodes any single rhashtable bucket chain visits before
95/// bailing. A healthy rhashtable holds ~1 element per bucket on
96/// average; a pathological chain of 1024 entries in one bucket is
97/// orders of magnitude beyond legitimate use and almost certainly
98/// indicates a corrupted `next` chain or torn read. Bounded
99/// independently of [`MAX_RHT_NODES`] so a single runaway bucket
100/// cannot starve the walk's per-bucket budget on the way to the
101/// global cap. The condition `chain_visited < PER_BUCKET_CHAIN_CAP`
102/// admits exactly 1024 body executions: chain_visited starts at 0
103/// and increments inside the loop body, so the comparison reads
104/// 0,1,...,1023 across the 1024 iterations and exits on the next
105/// check (1024 < 1024 is false).
106const PER_BUCKET_CHAIN_CAP: u32 = 1024;
107
108/// Snapshot of one CPU's `struct rq.scx` state at freeze time.
109#[derive(Debug, Clone, Default, Serialize, Deserialize)]
110#[non_exhaustive]
111pub struct RqScxState {
112 /// CPU index (0-based) this state describes.
113 pub cpu: u32,
114 /// `rq->scx.nr_running`.
115 pub nr_running: u32,
116 /// `rq->scx.flags`.
117 pub flags: u32,
118 /// `rq->scx.cpu_released` — `true` when the kernel released
119 /// the CPU back to the BPF scheduler (see `scx_pre_release_cpu`
120 /// in kernel/sched/ext.c).
121 pub cpu_released: bool,
122 /// `rq->scx.ops_qseq`.
123 pub ops_qseq: u64,
124 /// `rq->scx.kick_sync` — the per-rq pick-next sequence counter.
125 /// Version-renamed in the kernel: named `pnt_seq` on v6.12–v6.18
126 /// and `kick_sync` on v6.19+. `ScxRqOffsets::from_btf` resolves
127 /// the offset via a `kick_sync`-then-`pnt_seq` fallback, so this
128 /// is Some (captured) across the supported range. None only when
129 /// neither name resolves (stripped BTF / sched_ext not built in).
130 /// Skipped on serde when None so those dumps stay tight.
131 #[serde(default, skip_serializing_if = "Option::is_none")]
132 pub kick_sync: Option<u64>,
133 /// `rq->scx.nr_immed` — count of ENQ_IMMED tasks on local_dsq.
134 /// A feature-branch (`for-7.1`) field, absent on every release
135 /// tag in the supported range (v6.12 → v7.0-rc5); the offset
136 /// resolves None there and the JSON elides the field. Distinct
137 /// provenance from [`Self::kick_sync`], which is present across
138 /// the range via its `pnt_seq` fallback.
139 #[serde(default, skip_serializing_if = "Option::is_none")]
140 pub nr_immed: Option<u32>,
141 /// `rq->scx.clock` — per-CPU scx_rq clock (the value
142 /// `scx_bpf_now()` returns) at the freeze instant. Optional
143 /// because the field was added by the `scx_bpf_now()` series in
144 /// v6.14 (commit 3a9910b5904d); v6.12 and v6.13 release kernels
145 /// have no equivalent member on `struct scx_rq`. None when the
146 /// BTF lookup of `rq->scx.clock` resolves absent — consumers
147 /// that need the value gate on Some.
148 #[serde(default, skip_serializing_if = "Option::is_none")]
149 pub rq_clock: Option<u64>,
150 /// `rq->curr->pid` — the currently-running task. `None` when
151 /// the curr pointer didn't translate (idle or torn read).
152 #[serde(default, skip_serializing_if = "Option::is_none")]
153 pub curr_pid: Option<i32>,
154 /// `rq->curr->comm`. Mirrors `curr_pid`.
155 #[serde(default, skip_serializing_if = "Option::is_none")]
156 pub curr_comm: Option<String>,
157 /// `task_struct` KVAs of every entry walked off
158 /// `rq->scx.runnable_list`. The freeze coordinator passes this
159 /// vec into the per-task enrichment capture so the same
160 /// task list drives both rq->scx state AND per-task records.
161 pub runnable_task_kvas: Vec<u64>,
162 /// True when the runnable_list walk hit the
163 /// `MAX_NODES_PER_LIST` safety cap before reaching the head
164 /// — typical only on a corrupted chain.
165 #[serde(default, skip_serializing_if = "std::ops::Not::not")]
166 pub runnable_truncated: bool,
167}
168
169/// Snapshot of one DSQ's state — built-in (per-CPU local, per-CPU
170/// bypass, per-node global) or user-allocated.
171#[derive(Debug, Clone, Default, Serialize, Deserialize)]
172#[non_exhaustive]
173pub struct DsqState {
174 /// `scx_dispatch_q.id` — built-in DSQs use synthetic ids
175 /// (`SCX_DSQ_LOCAL`, `SCX_DSQ_GLOBAL` per-node); user DSQs
176 /// carry the BPF-allocated id.
177 pub id: u64,
178 /// Operator-facing tag describing where the DSQ came from:
179 /// `"local cpu N"`, `"bypass cpu N"`, `"global node N"`, or
180 /// `"user"`. Aligned with the kernel's own `scx_dump_state`
181 /// terminology where comparable.
182 pub origin: String,
183 /// `scx_dispatch_q.nr` — number of tasks currently queued.
184 pub nr: u32,
185 /// `scx_dispatch_q.seq` — BPF-iter seq counter, used by the
186 /// dual-snapshot delta to distinguish dead vs busy DSQs:
187 /// `Δnr=0 + Δseq=0` is a dead DSQ; `Δseq>>Δ(seq-nr)` indicates
188 /// unbounded queue growth.
189 pub seq: u32,
190 /// `task_struct` KVAs walked off the DSQ's `list_head`. Same
191 /// shape as [`RqScxState::runnable_task_kvas`] — feeds into
192 /// the same per-task enrichment pipeline.
193 pub task_kvas: Vec<u64>,
194 /// True when the DSQ list walk hit the
195 /// `MAX_NODES_PER_LIST` cap before reaching the head.
196 /// Distinct from `nr`: the kernel may report `nr` larger than
197 /// our walk cap on legitimately-deep DSQs.
198 #[serde(default, skip_serializing_if = "std::ops::Not::not")]
199 pub truncated: bool,
200}
201
202/// Top-level scheduler state captured from `*scx_root`.
203#[derive(Debug, Clone, Default, Serialize, Deserialize)]
204#[non_exhaustive]
205pub struct ScxSchedState {
206 /// `scx_sched.aborting`. `true` when the scheduler is in the
207 /// abort path; `bypass_depth` typically rises here.
208 pub aborting: bool,
209 /// `scx_sched.bypass_depth`. Nesting depth of the bypass-mode
210 /// stack; non-zero means the kernel is dispatching tasks
211 /// without consulting the BPF scheduler.
212 pub bypass_depth: i32,
213 /// `scx_sched.exit_kind` — the SCX_EXIT_* enum value latched
214 /// at `scx_error()` time. 0 means no exit yet; non-zero values
215 /// match `enum scx_exit_kind` in `kernel/sched/ext_internal.h`
216 /// (e.g. `SCX_EXIT_ERROR = 1024`, `SCX_EXIT_ERROR_BPF = 1025`,
217 /// `SCX_EXIT_ERROR_STALL = 1026`).
218 pub exit_kind: u32,
219 /// `scx_sched.watchdog_timeout` (jiffies) at the snapshot
220 /// instant. `None` when the field was not captured — either
221 /// because the live `read_scx_sched_state` path was taken on a
222 /// kernel that still exposes `watchdog_timeout` only via the
223 /// monitor's `WatchdogOverride` plumbing (not as a BTF field on
224 /// every release), or because the BPF .bss fallback was used
225 /// without the snapshot var set. Some when populated via the
226 /// probe BPF .bss snapshot
227 /// (`ktstr_exit_watchdog_timeout`).
228 #[serde(default, skip_serializing_if = "Option::is_none")]
229 pub watchdog_timeout: Option<u64>,
230 /// Provenance tag identifying which path produced this state.
231 /// `None` for the default-built / serde-deserialized case where
232 /// the source isn't recorded; `Some("live")` when populated by
233 /// `read_scx_sched_state` reading `*scx_root` directly;
234 /// `Some("bss_snapshot")` when populated from the probe BPF
235 /// .bss snapshot fallback (the `ktstr_exit_*` vars). Lets the
236 /// dump consumer distinguish "scheduler was alive at freeze
237 /// time" from "scheduler had already torn down and we read the
238 /// pre-teardown snapshot the BPF probe latched".
239 #[serde(default, skip_serializing_if = "Option::is_none")]
240 pub source: Option<String>,
241 /// Kernel virtual address of the `scx_sched` instance these
242 /// values describe. `None` when not captured. Same provenance
243 /// rule as [`Self::source`]: live path stamps the resolved
244 /// `*scx_root` value; the BPF .bss snapshot stamps the
245 /// `ktstr_exit_sched_kva` field. Lets a consumer correlate
246 /// dumps across reloads (a different scx_sched instance has a
247 /// different KVA).
248 #[serde(default, skip_serializing_if = "Option::is_none")]
249 pub sched_kva: Option<u64>,
250}
251
252/// Walk one CPU's `rq->scx` state. Reads the scalar fields and
253/// the runnable_list, returning [`RqScxState`] plus a vec of
254/// [`TaskWalkerEntry`] entries for the per-task enrichment pipeline.
255///
256/// `cpu` is the 0-based CPU index; `rq_kva` and `rq_pa` address that
257/// CPU's `struct rq`. Caller resolves both via
258/// `runqueues + per_cpu_offset[cpu]` (KVA) plus the corresponding PA
259/// via `compute_rq_pas`.
260///
261/// Cap on visited nodes: `MAX_NODES_PER_LIST`. A truncated walk
262/// surfaces via [`RqScxState::runnable_truncated`].
263///
264/// Returns `None` when any of the offset sub-groups required for
265/// scalar reads (`rq`, `scx_rq`, `task`) is absent — the walker
266/// cannot synthesize partial scalars meaningfully without the rq /
267/// scx_rq base offsets. Per-CPU runnable_list walking additionally
268/// requires `see` (sched_ext_entity); when only `see` is missing the
269/// scalar capture still lands but the runnable_list walk yields
270/// nothing.
271#[allow(dead_code)]
272pub fn walk_rq_scx(
273 kernel: &GuestKernel,
274 cpu: u32,
275 rq_kva: u64,
276 rq_pa: u64,
277 offsets: &ScxWalkerOffsets,
278) -> Option<(RqScxState, Vec<TaskWalkerEntry>)> {
279 let rq_offs = offsets.rq.as_ref()?;
280 let scx_rq_offs = offsets.scx_rq.as_ref()?;
281 let task_offs = offsets.task.as_ref()?;
282
283 let mem = kernel.mem();
284 let walk = kernel.walk_context();
285
286 let scx_off = rq_offs.scx;
287
288 // Scalar reads off rq + scx_rq.
289 let nr_running = mem.read_u32(rq_pa, scx_off + scx_rq_offs.nr_running);
290 let flags = mem.read_u32(rq_pa, scx_off + scx_rq_offs.flags);
291 let cpu_released = mem.read_u8(rq_pa, scx_off + scx_rq_offs.cpu_released) != 0;
292 let ops_qseq = mem.read_u64(rq_pa, scx_off + scx_rq_offs.ops_qseq);
293 // nr_immed is a for-7.1 feature-branch field, absent on every
294 // release tag in the supported range (v6.12 → v7.0-rc5), so its
295 // offset resolves None there. kick_sync, by contrast, resolves
296 // Some across the range via the `pnt_seq` fallback in
297 // ScxRqOffsets::from_btf. Gate each read on Some so a truly-
298 // absent field (stripped BTF / sched_ext not built in) is not
299 // fabricated from rq_pa+0 (which would alias the local_dsq head
300 // pointer — a non-zero garbage read the dump would render as
301 // legitimate kernel state).
302 let kick_sync = scx_rq_offs
303 .kick_sync
304 .map(|off| mem.read_u64(rq_pa, scx_off + off));
305 let nr_immed = scx_rq_offs
306 .nr_immed
307 .map(|off| mem.read_u32(rq_pa, scx_off + off));
308 // rq->scx.clock added in v6.14 (commit 3a9910b5904d). Gate the
309 // read on Some(off): on v6.12/v6.13 the offset is None and the
310 // walker must NOT fall back to rq_pa+0 (would alias local_dsq's
311 // raw_spinlock — non-zero junk rendered as a legitimate clock
312 // reading). The downstream RqScxState carries an Option<u64> so
313 // the JSON elides scx_rq_clock on unsupported kernels.
314 let rq_clock = scx_rq_offs
315 .clock
316 .map(|off| mem.read_u64(rq_pa, scx_off + off));
317
318 // curr task — pointer follow.
319 let curr_kva = mem.read_u64(rq_pa, rq_offs.curr);
320 let (curr_pid, curr_comm) =
321 read_task_pid_comm(mem, walk, curr_kva, task_offs.pid, task_offs.comm);
322
323 // Walk runnable_list when sched_ext_entity offsets are available.
324 // Without `see` we can still report scalar state but cannot
325 // container_of a runnable_node back to its task_struct.
326 let (runnable_task_kvas, runnable_truncated) = if let Some(see_offs) = offsets.see.as_ref() {
327 let list_head_off = scx_off + scx_rq_offs.runnable_list;
328 let head_kva = rq_kva.wrapping_add(list_head_off as u64);
329 let head_pa = rq_pa.wrapping_add(list_head_off as u64);
330
331 // container_of offset within task_struct: each runnable_node
332 // is at task + task_struct.scx + see.runnable_node.
333 let runnable_node_off_in_task = task_offs.scx + see_offs.runnable_node;
334
335 walk_list_head_for_task_kvas(mem, walk, head_kva, head_pa, runnable_node_off_in_task)
336 } else {
337 (Vec::new(), false)
338 };
339
340 let walker_entries: Vec<TaskWalkerEntry> = runnable_task_kvas
341 .iter()
342 .map(|&task_kva| TaskWalkerEntry {
343 task_kva,
344 // Runnable on this CPU's scx — eligible for the
345 // pi_boosted_out_of_scx flag.
346 is_runnable_in_scx: true,
347 // running_pc only known for the curr task; the
348 // freeze coordinator can fill that via
349 // VcpuRegSnapshot.instruction_pointer at a higher
350 // level. The walker leaves it None.
351 running_pc: None,
352 })
353 .collect();
354
355 let state = RqScxState {
356 cpu,
357 nr_running,
358 flags,
359 cpu_released,
360 ops_qseq,
361 kick_sync,
362 nr_immed,
363 rq_clock,
364 curr_pid,
365 curr_comm,
366 runnable_task_kvas,
367 runnable_truncated,
368 };
369
370 Some((state, walker_entries))
371}
372
373/// Read scalar `scx_sched` fields off `*scx_root`.
374///
375/// `scx_root` is a kernel-text-mapped pointer at the resolved KVA;
376/// `*scx_root` points at the active `struct scx_sched`. Returns
377/// `None` when scx_root is unset (no scheduler attached), the read
378/// fails, or the `scx_sched` offset sub-group is missing from BTF.
379///
380/// Emits `tracing::debug!` at each gate that returns `None` so an
381/// operator parsing the failure-dump trace can pinpoint exactly
382/// where the read aborted: BTF sub-group missing, scx_root_kva
383/// zero, dereferenced sched_kva zero, or sched_kva translate
384/// failure.
385#[allow(dead_code)]
386pub fn read_scx_sched_state(
387 kernel: &GuestKernel,
388 scx_root_kva: u64,
389 offsets: &ScxWalkerOffsets,
390) -> Option<(u64, ScxSchedState)> {
391 let Some(sched_offs) = offsets.sched.as_ref() else {
392 tracing::debug!(
393 "read_scx_sched_state: ScxSchedOffsets BTF sub-group missing — \
394 vmlinux lacks `struct scx_sched` (kernel without sched_ext or stripped vmlinux)",
395 );
396 return None;
397 };
398
399 let mem = kernel.mem();
400 let walk = kernel.walk_context();
401
402 if scx_root_kva == 0 {
403 tracing::debug!(
404 "read_scx_sched_state: scx_root_kva is 0 — vmlinux had no \
405 `scx_root` symbol (pre-6.16 kernel or stripped vmlinux)",
406 );
407 return None;
408 }
409
410 let root_pa = kernel.text_kva_to_pa(scx_root_kva);
411 let sched_kva = mem.read_u64(root_pa, 0);
412 if sched_kva == 0 {
413 tracing::debug!(
414 scx_root_kva = format_args!("{:#x}", scx_root_kva),
415 root_pa = format_args!("{:#x}", root_pa),
416 "read_scx_sched_state: *scx_root == 0 — no scheduler attached at the freeze instant",
417 );
418 return None;
419 }
420 let Some(sched_pa) = translate_any_kva(
421 mem,
422 walk.cr3_pa,
423 walk.page_offset,
424 sched_kva,
425 walk.l5,
426 walk.tcr_el1,
427 ) else {
428 tracing::debug!(
429 sched_kva = format_args!("{:#x}", sched_kva),
430 "read_scx_sched_state: translate_any_kva failed for sched_kva — \
431 page-table walk yielded no PA (slab page race or torn read)",
432 );
433 return None;
434 };
435
436 // `aborting` and `bypass_depth` are dev-only fields (absent on
437 // every release tag in our supported range). Gate each read on
438 // its offset being Some — falling back to 0 / false matches the
439 // semantics of reading from a kernel that never had the field
440 // (no in-flight abort, no bypass nesting). The downstream
441 // ScxSchedState carries plain bool/i32 because those defaults
442 // are meaningful and serializable; an Option wrapper would just
443 // complicate every consumer for no extra signal on release
444 // kernels.
445 let aborting = sched_offs
446 .aborting
447 .map(|off| mem.read_u8(sched_pa, off) != 0)
448 .unwrap_or(false);
449 let bypass_depth = sched_offs
450 .bypass_depth
451 .map(|off| mem.read_u32(sched_pa, off) as i32)
452 .unwrap_or(0);
453 // `exit_kind` is `atomic_t`; the value lives in the `counter`
454 // field at offset 0 of atomic_t. We're already at the
455 // outer-struct offset of `exit_kind`, so a u32 read at that
456 // offset reads the `counter` directly. Mandatory on every
457 // kernel that has `scx_sched`.
458 let exit_kind = mem.read_u32(sched_pa, sched_offs.exit_kind);
459
460 Some((
461 sched_kva,
462 ScxSchedState {
463 aborting,
464 bypass_depth,
465 exit_kind,
466 // Live read from `*scx_root` doesn't capture
467 // `watchdog_timeout`. The BTF sub-group does not carry
468 // an offset for the field today (it would need to be
469 // added to `ScxSchedOffsets`), and the host tracks the
470 // configured timeout via the `WatchdogOverride` plumbing
471 // anyway. Leave None; the BPF .bss snapshot's
472 // `ktstr_exit_watchdog_timeout` path populates this when
473 // the live read is unavailable.
474 watchdog_timeout: None,
475 source: Some(SCX_SCHED_STATE_SOURCE_LIVE.to_string()),
476 sched_kva: Some(sched_kva),
477 },
478 ))
479}
480
481/// Provenance tag for [`ScxSchedState::source`] when the state was
482/// read directly from `*scx_root` via `read_scx_sched_state`. The
483/// scheduler was alive at freeze time and the host walked its slab
484/// page directly. Pinned as a constant so the dump's display layer
485/// and tests reference the same string without drift.
486pub const SCX_SCHED_STATE_SOURCE_LIVE: &str = "live";
487
488/// Provenance tag for [`ScxSchedState::source`] when the state was
489/// reconstructed from the probe BPF program's `.bss` snapshot
490/// (`ktstr_exit_*` vars). The scheduler had already torn down by
491/// freeze time (`*scx_root == 0`), so the live walker returned None
492/// and the host fell back to the snapshot the BPF tp_btf handler
493/// captured at err-exit time.
494pub const SCX_SCHED_STATE_SOURCE_BSS: &str = "bss_snapshot";
495
496/// `SCX_TASK_CURSOR` flag value (`1 << 31`) on `sched_ext_entity.flags`.
497/// Cursor entries are stack-allocated `sched_ext_entity` placeholders
498/// that `scx_task_iter_start` inserts
499/// into `scx_tasks` to mark the iterator's progress; they are NOT
500/// embedded in any `task_struct` so the global walker must skip them
501/// to avoid container_of producing a bogus task KVA. Pinned per
502/// `SCX_TASK_CURSOR` in `include/linux/sched/ext.h`.
503const SCX_TASK_CURSOR: u32 = 1 << 31;
504/// Walk the kernel's global `scx_tasks` LIST_HEAD and recover every
505/// task linked into it via `task_struct.scx.tasks_node`.
506///
507/// `scx_tasks` is `static LIST_HEAD(scx_tasks)` in
508/// `kernel/sched/ext.c`. Tasks are added in
509/// `scx_post_fork` (`list_add_tail(&p->scx.tasks_node, &scx_tasks)`)
510/// and removed in `sched_ext_dead`
511/// (`list_del_init(&p->scx.tasks_node)`). The list outlives the
512/// per-rq `runnable_list` because `scx_bypass`
513/// drains runnable_list during
514/// scheduler teardown without touching `scx_tasks` — making this
515/// the durable task source for failure-dump enrichment.
516///
517/// Cursor entries (`scx_task_iter_start` inserts a stack-allocated
518/// `sched_ext_entity` with `flags = SCX_TASK_CURSOR` into
519/// `scx_tasks` while iterating) are skipped via the
520/// `tasks_node_off_in_see` parameter — the walker reads
521/// `sched_ext_entity.flags` for each list entry and skips entries
522/// whose flag is set.
523///
524/// `scx_tasks_kva` is the symbol KVA of the global LIST_HEAD;
525/// `tasks_node_off_in_task` is the byte offset of `tasks_node`
526/// within `task_struct` (`task.scx + see.tasks_node`);
527/// `tasks_node_off_in_see` is the byte offset of `tasks_node`
528/// within `sched_ext_entity` (`see.tasks_node` alone — used to
529/// recover the see base for cursor-flag testing on entries that
530/// are not embedded in a `task_struct`); `flags_off_in_see` is the
531/// byte offset of `flags` within `sched_ext_entity`.
532///
533/// `kaslr_offset` is the runtime virtual KASLR slide: `scx_tasks_kva`
534/// is the LINK-TIME symbol value, but the list's `.next` pointers are
535/// runtime addresses (link + slide), so the loop terminator compares
536/// against the RUNTIME head `slid_kernel_kva(scx_tasks_kva, kaslr_offset)`.
537/// Pass `0` when KASLR is off / not yet derived (identity slide).
538///
539/// Returns an empty vec when `scx_tasks_kva` is 0 (symbol absent —
540/// stripped vmlinux or kernel without sched_ext) or when the list
541/// head reads as empty (tasks_node points at itself).
542///
543/// Termination: the canonical circular list closes through the runtime
544/// head on the normal path. A `HashSet` of visited nodes is the
545/// hostile-guest backstop that bounds a corrupt / non-canonical ring to
546/// its unique-node set; `MAX_NODES_PER_LIST` is a further hard cap.
547#[allow(dead_code)]
548pub fn walk_scx_tasks_global(
549 kernel: &GuestKernel,
550 scx_tasks_kva: u64,
551 kaslr_offset: u64,
552 tasks_node_off_in_task: usize,
553 tasks_node_off_in_see: usize,
554 flags_off_in_see: usize,
555) -> Vec<u64> {
556 if scx_tasks_kva == 0 {
557 tracing::debug!(
558 "walk_scx_tasks_global: scx_tasks_kva is 0 — vmlinux had no \
559 `scx_tasks` symbol (kernel without sched_ext or stripped vmlinux)",
560 );
561 return Vec::new();
562 }
563 let mem = kernel.mem();
564 let walk = kernel.walk_context();
565
566 // head_pa reads head.next via the LINK-TIME kva: text_kva_to_pa maps a
567 // link-time kernel-image kva to its PA (phys_base already accounts for
568 // the physical load address), so this correctly reads the LIST_HEAD's
569 // first field (list_head.next) regardless of KASLR.
570 let head_pa = kernel.text_kva_to_pa(scx_tasks_kva);
571 // The terminator, however, must compare against the RUNTIME head: the
572 // .next pointers stored in the list are runtime kvas (link + virtual
573 // KASLR slide), filled by the kernel's list_add_tail(&p->scx.tasks_node,
574 // &scx_tasks). slid_kernel_kva applies the high-half slide (identity
575 // when kaslr_offset == 0). Comparing runtime .next against the link-time
576 // head would never match under KASLR-on (the default), so the canonical
577 // ring would never close and the walk would fall back to the seen-set /
578 // count backstop on every boot — and would process the runtime head
579 // node itself (reading garbage `flags` from .data before &scx_tasks and
580 // emitting a phantom task). With the runtime head the loop terminates ON
581 // the head and never processes it.
582 let head_kva = crate::monitor::symbols::slid_kernel_kva(scx_tasks_kva, kaslr_offset);
583
584 let mut task_kvas: Vec<u64> = Vec::new();
585 let mut node_kva = mem.read_u64(head_pa, 0);
586 if node_kva == 0 {
587 tracing::debug!(
588 scx_tasks_kva = format_args!("{:#x}", scx_tasks_kva),
589 head_pa = format_args!("{:#x}", head_pa),
590 "walk_scx_tasks_global: head.next read as 0 — list-head bytes \
591 unmapped or torn read; no tasks harvested",
592 );
593 return task_kvas;
594 }
595
596 let mut visited: u32 = 0;
597 // Address-cycle backstop. The canonical list closes through the runtime
598 // head_kva (above) on the normal path. This HashSet bounds a genuinely
599 // corrupt / non-canonical ring — a torn read, or a final node.next that
600 // points at an interior node instead of the head — to its unique-node
601 // set rather than cycling to MAX_NODES_PER_LIST, the hostile-guest-safe
602 // list-walk contract. (Before the runtime-head fix this path fired on
603 // every KASLR-on boot, the symptom that surfaced the symbol bug.)
604 let mut seen: std::collections::HashSet<u64> = std::collections::HashSet::new();
605 while node_kva != head_kva {
606 if visited >= MAX_NODES_PER_LIST {
607 return task_kvas;
608 }
609 if !seen.insert(node_kva) {
610 tracing::debug!(
611 node_kva = format_args!("{:#x}", node_kva),
612 unique = seen.len(),
613 "walk_scx_tasks_global: revisited node — list does not close \
614 through head_kva; stopping at the unique-node set rather than \
615 cycling to MAX_NODES_PER_LIST",
616 );
617 return task_kvas;
618 }
619 visited += 1;
620
621 // Recover the sched_ext_entity base for this list entry so we
622 // can read its `flags`. For task-embedded entries this base is
623 // inside a task_struct (`task_kva + task.scx`); for cursor
624 // entries this base is a stack-allocated sched_ext_entity.
625 // Either way, `see_kva = node_kva - see.tasks_node`.
626 let see_kva = node_kva.wrapping_sub(tasks_node_off_in_see as u64);
627 let cursor = match translate_any_kva(
628 mem,
629 walk.cr3_pa,
630 walk.page_offset,
631 see_kva,
632 walk.l5,
633 walk.tcr_el1,
634 ) {
635 Some(see_pa) => {
636 let flags = mem.read_u32(see_pa, flags_off_in_see);
637 flags & SCX_TASK_CURSOR != 0
638 }
639 // Translate failure on the see base — be conservative and
640 // treat as not-cursor so the entry surfaces; downstream
641 // walk_task_enrichment will revalidate via translate and
642 // drop it cleanly if the address is bogus.
643 None => false,
644 };
645
646 if !cursor {
647 // container_of: task_kva = node_kva - tasks_node_off_in_task.
648 let task_kva = node_kva.wrapping_sub(tasks_node_off_in_task as u64);
649 task_kvas.push(task_kva);
650 }
651
652 // Advance to the next node via the list_head.next pointer
653 // at offset 0 of the tasks_node list_head.
654 let Some(node_pa) = translate_any_kva(
655 mem,
656 walk.cr3_pa,
657 walk.page_offset,
658 node_kva,
659 walk.l5,
660 walk.tcr_el1,
661 ) else {
662 return task_kvas;
663 };
664 let next_kva = mem.read_u64(node_pa, 0);
665 if next_kva == 0 {
666 return task_kvas;
667 }
668 node_kva = next_kva;
669 }
670 task_kvas
671}
672
673/// Walk every per-CPU local DSQ — the DSQs embedded in `rq->scx.local_dsq`.
674///
675/// This is a strict subset of [`walk_dsqs`]'s pass 1, extracted so
676/// the dump path can call it INDEPENDENTLY of `*scx_root`. Per-CPU
677/// local DSQs are kernel-initialized at boot (`init_dsq`, called for
678/// every possible CPU from `init_sched_ext_class` in the
679/// `__init` path), so they exist even when no scheduler is attached
680/// (`*scx_root == NULL`) and survive scheduler teardown's bypass
681/// drain.
682///
683/// Returns one `DsqState` per CPU whose translate succeeds, plus a
684/// flat vec of [`TaskWalkerEntry`] for the per-task enrichment
685/// pipeline (these entries carry `is_runnable_in_scx: false` —
686/// tasks queued on a DSQ are staged for dispatch, not yet runnable
687/// in the rq->scx sense).
688///
689/// `rq_kvas`, `rq_pas`, and `per_cpu_offsets` index by CPU id
690/// (parallel arrays, same shape `walk_rq_scx` consumes). The walker
691/// skips BSS-zero-tail CPUs by checking the per-CPU offset directly
692/// (`per_cpu_offsets[cpu] == 0 && cpu > 0`) — those entries fall out
693/// of un-written `__per_cpu_offset[]` slots past `nr_cpu_ids` and
694/// would otherwise surface a phantom DSQ row at the bare `runqueues`
695/// symbol KVA. Comparing the resolved `rq_kva` against `rq_kvas[0]`
696/// would miss the alias on x86_64 SMP: `setup_per_cpu_areas`
697/// (`arch/x86/kernel/setup_percpu.c`) writes
698/// `__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]` with
699/// `delta = pcpu_base_addr - __per_cpu_start` non-zero, so CPU 0's
700/// `rq_kva` is `runqueues + delta` while a BSS-zero-tail CPU's is
701/// `runqueues + 0` — the two differ and `rq_kva == rq_kvas[0]` would
702/// let the phantom row through. Mirrors the canonical `cpu_off == 0
703/// && cpu_index > 0` guard
704/// `super::bpf_map::read_percpu_array_value` applies for percpu
705/// reads, expressed against the same `__per_cpu_offset[]` array.
706///
707/// Empty arrays mean "no CPUs walked successfully"; the caller's
708/// freeze-path retry guard normally rejects empty inputs before
709/// reaching this pass.
710///
711/// `None` return when any required offset sub-group is missing
712/// (`rq`, `scx_rq`, `dsq`, `dsq_lnode`, `task`, `see` — the same
713/// leaf set [`walk_dsqs`]'s pass 1 needs). A partial offset set
714/// is the same gating condition that blinds every other DSQ pass.
715#[allow(dead_code)]
716pub fn walk_local_dsqs(
717 kernel: &GuestKernel,
718 rq_kvas: &[u64],
719 rq_pas: &[u64],
720 per_cpu_offsets: &[u64],
721 offsets: &ScxWalkerOffsets,
722) -> Option<(Vec<DsqState>, Vec<TaskWalkerEntry>)> {
723 let Some(rq_offs) = offsets.rq.as_ref() else {
724 tracing::debug!(
725 "walk_local_dsqs: ScxWalkerOffsets.rq sub-group missing — \
726 local DSQ pass blinded",
727 );
728 return None;
729 };
730 let Some(scx_rq_offs) = offsets.scx_rq.as_ref() else {
731 tracing::debug!(
732 "walk_local_dsqs: ScxWalkerOffsets.scx_rq sub-group missing — \
733 local DSQ pass blinded",
734 );
735 return None;
736 };
737 let Some(dsq_offs) = offsets.dsq.as_ref() else {
738 tracing::debug!(
739 "walk_local_dsqs: ScxWalkerOffsets.dsq sub-group missing — \
740 local DSQ pass blinded",
741 );
742 return None;
743 };
744 let Some(dsq_lnode_offs) = offsets.dsq_lnode.as_ref() else {
745 tracing::debug!(
746 "walk_local_dsqs: ScxWalkerOffsets.dsq_lnode sub-group missing — \
747 local DSQ pass blinded",
748 );
749 return None;
750 };
751 let Some(task_offs) = offsets.task.as_ref() else {
752 tracing::debug!(
753 "walk_local_dsqs: ScxWalkerOffsets.task sub-group missing — \
754 local DSQ pass blinded",
755 );
756 return None;
757 };
758 let Some(see_offs) = offsets.see.as_ref() else {
759 tracing::debug!(
760 "walk_local_dsqs: ScxWalkerOffsets.see sub-group missing — \
761 local DSQ pass blinded",
762 );
763 return None;
764 };
765
766 let mem = kernel.mem();
767 let walk = kernel.walk_context();
768
769 let mut states: Vec<DsqState> = Vec::new();
770 let mut entries: Vec<TaskWalkerEntry> = Vec::new();
771
772 for (cpu, (&rq_kva, &rq_pa)) in rq_kvas.iter().zip(rq_pas.iter()).enumerate() {
773 // BSS-zero-tail guard: kernel `setup_per_cpu_areas`
774 // only writes `__per_cpu_offset[cpu]` for CPUs in
775 // `for_each_possible_cpu`, leaving slots beyond
776 // `nr_cpu_ids` at the BSS-initialized 0. The caller
777 // builds rq_kvas via `runqueues + per_cpu_offset[cpu]`,
778 // so a BSS-zero-tail entry produces an rq_kva of
779 // `runqueues + 0` instead of CPU 0's
780 // `runqueues + __per_cpu_offset[0]`. The two are NOT
781 // equal on x86_64 SMP because
782 // `__per_cpu_offset[0] = pcpu_base_addr - __per_cpu_start`
783 // is non-zero (`arch/x86/kernel/setup_percpu.c`); a
784 // resolved-rq_kva comparison would let the phantom
785 // BSS-zero entry through. Check the per-CPU offset
786 // directly instead — `cpu_off == 0 && cpu > 0` is the
787 // canonical guard
788 // [`super::bpf_map::read_percpu_array_value`] uses for
789 // percpu reads and the matching guard at the per-CPU
790 // bypass DSQ pass below. A `per_cpu_offsets` slice
791 // shorter than `rq_kvas` (length-mismatched caller)
792 // is treated conservatively: an absent offset for
793 // `cpu > 0` skips the slot, since the walker can't
794 // distinguish a real CPU from a BSS-zero tail without
795 // the offset.
796 let cpu_off = per_cpu_offsets.get(cpu).copied();
797 match cpu_off {
798 Some(off) if off == 0 && cpu > 0 => continue,
799 None if cpu > 0 => continue,
800 _ => {}
801 }
802 let local_dsq_off = rq_offs.scx + scx_rq_offs.local_dsq;
803 let dsq_kva = rq_kva.wrapping_add(local_dsq_off as u64);
804 let dsq_pa = rq_pa.wrapping_add(local_dsq_off as u64);
805 if let Some((state, e)) = walk_one_dsq(
806 mem,
807 walk,
808 dsq_kva,
809 dsq_pa,
810 || format!("local cpu {cpu}"),
811 dsq_offs,
812 dsq_lnode_offs,
813 task_offs,
814 see_offs,
815 ) {
816 entries.extend(e);
817 states.push(state);
818 }
819 }
820
821 Some((states, entries))
822}
823
824/// Walk every DSQ reachable from a `scx_sched` (the bypass / global
825/// / user-hash passes — NOT per-CPU local DSQs) and produce one
826/// `DsqState` per DSQ plus a flat vec of `TaskWalkerEntry` rows for
827/// the per-task enrichment pipeline.
828///
829/// Per-CPU local DSQs (`rq->scx.local_dsq`) are NOT walked here —
830/// they live in each rq independently of `*scx_root`, so callers
831/// invoke [`walk_local_dsqs`] separately and unconditionally for
832/// the local pass. This split lets the dump path surface local DSQ
833/// state even when no scheduler is attached
834/// (`*scx_root == NULL`) — the local_dsq struct is initialized at
835/// boot per `init_dsq` (called from `init_sched_ext_class`) for every
836/// possible CPU, so it has well-defined contents long before any
837/// scheduler attaches.
838///
839/// Walks (in this order, gated on the relevant sub-group offsets
840/// being present):
841/// 1. Per-CPU bypass DSQs at `scx_sched_pcpu.bypass_dsq` for
842/// every CPU (needs `sched`, `sched_pcpu`, plus the leaf set).
843/// 2. Per-node global DSQs at `scx_sched.pnode[node]->global_dsq`
844/// for every NUMA node (needs `sched`, `sched_pnode`, plus
845/// leaf set).
846/// 3. User-allocated DSQs walked through `scx_sched.dsq_hash`
847/// (needs `sched`, `rht`, plus leaf set).
848///
849/// Each pass is independent: missing offsets for one pass blind
850/// only that pass. A translate failure on one DSQ leaves it out of
851/// the result without affecting the others.
852#[allow(dead_code)]
853pub fn walk_dsqs(
854 kernel: &GuestKernel,
855 sched_pa: u64,
856 per_cpu_offsets: &[u64],
857 nr_nodes: u32,
858 offsets: &ScxWalkerOffsets,
859) -> (Vec<DsqState>, Vec<TaskWalkerEntry>) {
860 let mem = kernel.mem();
861 let walk = kernel.walk_context();
862
863 let mut dsq_states: Vec<DsqState> = Vec::new();
864 let mut all_entries: Vec<TaskWalkerEntry> = Vec::new();
865
866 // Leaf offsets common to every pass — all three DSQ-walking
867 // passes feed `walk_one_dsq` which needs these. If any leaf
868 // group is missing, no pass can run.
869 let (Some(dsq_offs), Some(dsq_lnode_offs), Some(task_offs), Some(see_offs)) = (
870 offsets.dsq.as_ref(),
871 offsets.dsq_lnode.as_ref(),
872 offsets.task.as_ref(),
873 offsets.see.as_ref(),
874 ) else {
875 return (dsq_states, all_entries);
876 };
877
878 // Pass 1: per-CPU bypass DSQs. The percpu base lives at
879 // sched->pcpu, dereferenced as a __percpu pointer; each CPU's
880 // address is `pcpu_base + per_cpu_offset[cpu] +
881 // scx_sched_pcpu.bypass_dsq`.
882 //
883 // Both `sched_offs.pcpu` (v6.18+) and `pcpu_offs.bypass_dsq`
884 // (dev-only) are kernel-version-gated. Skip the entire pass
885 // unless both offsets resolved — partial state would compute
886 // a bogus DSQ KVA from `sched_pa + 0` (aliasing dsq_hash) and
887 // surface phantom DSQ entries.
888 if let (Some(sched_offs), Some(pcpu_offs)) =
889 (offsets.sched.as_ref(), offsets.sched_pcpu.as_ref())
890 && let (Some(sched_pcpu_off), Some(bypass_dsq_off)) =
891 (sched_offs.pcpu, pcpu_offs.bypass_dsq)
892 {
893 let pcpu_kva = mem.read_u64(sched_pa, sched_pcpu_off);
894 if pcpu_kva != 0 {
895 for (cpu, &cpu_off) in per_cpu_offsets.iter().enumerate() {
896 // Skip out-of-range CPUs — same heuristic as
897 // read_percpu_array_value (cpu_off==0 && cpu_index>0
898 // means BSS-zero tail).
899 if cpu_off == 0 && cpu > 0 {
900 continue;
901 }
902 let dsq_kva = pcpu_kva
903 .wrapping_add(cpu_off)
904 .wrapping_add(bypass_dsq_off as u64);
905 if let Some(dsq_pa) = translate_any_kva(
906 mem,
907 walk.cr3_pa,
908 walk.page_offset,
909 dsq_kva,
910 walk.l5,
911 walk.tcr_el1,
912 ) && let Some((state, entries)) = walk_one_dsq(
913 mem,
914 walk,
915 dsq_kva,
916 dsq_pa,
917 || format!("bypass cpu {cpu}"),
918 dsq_offs,
919 dsq_lnode_offs,
920 task_offs,
921 see_offs,
922 ) {
923 all_entries.extend(entries);
924 dsq_states.push(state);
925 }
926 }
927 }
928 }
929
930 // Pass 2: per-node global DSQs. `sched->pnode` is a pointer
931 // to an array of `struct scx_sched_pnode *` of length nr_nodes.
932 // Both `sched_offs.pnode` and `pnode_offs.global_dsq` are
933 // dev-only — skip the pass unless both resolved.
934 if let (Some(sched_offs), Some(pnode_offs)) =
935 (offsets.sched.as_ref(), offsets.sched_pnode.as_ref())
936 && let (Some(sched_pnode_off), Some(global_dsq_off)) =
937 (sched_offs.pnode, pnode_offs.global_dsq)
938 {
939 let pnode_kva = mem.read_u64(sched_pa, sched_pnode_off);
940 if pnode_kva != 0
941 && let Some(pnode_arr_pa) = translate_any_kva(
942 mem,
943 walk.cr3_pa,
944 walk.page_offset,
945 pnode_kva,
946 walk.l5,
947 walk.tcr_el1,
948 )
949 {
950 for node in 0..nr_nodes as u64 {
951 let pnode_ptr_kva = mem.read_u64(pnode_arr_pa, (node * 8) as usize);
952 if pnode_ptr_kva == 0 {
953 continue;
954 }
955 let Some(pnode_pa) = translate_any_kva(
956 mem,
957 walk.cr3_pa,
958 walk.page_offset,
959 pnode_ptr_kva,
960 walk.l5,
961 walk.tcr_el1,
962 ) else {
963 continue;
964 };
965 let dsq_kva = pnode_ptr_kva.wrapping_add(global_dsq_off as u64);
966 let dsq_pa = pnode_pa.wrapping_add(global_dsq_off as u64);
967 if let Some((state, entries)) = walk_one_dsq(
968 mem,
969 walk,
970 dsq_kva,
971 dsq_pa,
972 || format!("global node {node}"),
973 dsq_offs,
974 dsq_lnode_offs,
975 task_offs,
976 see_offs,
977 ) {
978 all_entries.extend(entries);
979 dsq_states.push(state);
980 }
981 }
982 }
983 }
984
985 // Pass 3: user-allocated DSQs via the scx_sched.dsq_hash
986 // rhashtable. Walks at most MAX_RHT_NODES nodes total across
987 // all buckets.
988 if let (Some(sched_offs), Some(rht_offs)) = (offsets.sched.as_ref(), offsets.rht.as_ref()) {
989 // dsq_hash is embedded in scx_sched (not a pointer), so its
990 // PA is the sched_pa with the field offset added directly —
991 // same pattern Pass 2 uses for pnode->global_dsq. Computing a
992 // KVA here would require sched_kva which the caller already
993 // discarded; translating sched_pa as a KVA would underflow
994 // page_offset and silently empty the user-DSQ list.
995 let rht_pa = sched_pa.wrapping_add(sched_offs.dsq_hash as u64);
996 let (user_dsqs, user_dsqs_truncated) =
997 walk_user_dsq_hash(mem, walk, rht_pa, rht_offs, dsq_offs);
998 if user_dsqs_truncated {
999 // Surface the cap-hit so an operator parsing the
1000 // failure dump trace sees that the user-DSQ list is
1001 // partial. Without this log the dump silently
1002 // omits the tail of the dsq_hash bucket table or
1003 // the tail of one bucket's chain.
1004 tracing::warn!(
1005 visited = user_dsqs.len(),
1006 cap_buckets = MAX_RHT_BUCKETS,
1007 cap_nodes = MAX_RHT_NODES,
1008 "walk_user_dsq_hash: truncated — bucket-table or node cap fired; \
1009 dsq_kvas list is incomplete",
1010 );
1011 }
1012 for dsq_kva in user_dsqs {
1013 let Some(dsq_pa) = translate_any_kva(
1014 mem,
1015 walk.cr3_pa,
1016 walk.page_offset,
1017 dsq_kva,
1018 walk.l5,
1019 walk.tcr_el1,
1020 ) else {
1021 continue;
1022 };
1023 if let Some((state, entries)) = walk_one_dsq(
1024 mem,
1025 walk,
1026 dsq_kva,
1027 dsq_pa,
1028 || "user".to_string(),
1029 dsq_offs,
1030 dsq_lnode_offs,
1031 task_offs,
1032 see_offs,
1033 ) {
1034 all_entries.extend(entries);
1035 dsq_states.push(state);
1036 }
1037 }
1038 }
1039
1040 (dsq_states, all_entries)
1041}
1042
1043/// Walk one `scx_dispatch_q`. Returns the DSQ scalar state plus the
1044/// task entries on its `list`.
1045///
1046/// `origin` is taken as a `FnOnce` closure so the per-call
1047/// `format!("local cpu {cpu}")` / `format!("bypass cpu {cpu}")` /
1048/// `format!("global node {node}")` heap allocation only fires
1049/// after the `dsq_pa == 0` early-out has been cleared. Eagerly
1050/// formatting at every caller wasted one short-string allocation
1051/// per skipped DSQ on every freeze.
1052///
1053/// Returns `None` when `dsq_pa == 0`. Reading at PA 0 would
1054/// surface the boot-page contents as DSQ scalars (`id`, `nr`,
1055/// `seq`) and an all-zero list-head as an apparently-empty queue
1056/// — indistinguishable from a real empty DSQ. The early check
1057/// rejects that case so the caller does not push a phantom
1058/// DsqState row built from PA-0 garbage.
1059#[allow(clippy::too_many_arguments)]
1060fn walk_one_dsq(
1061 mem: &GuestMem,
1062 walk: WalkContext,
1063 dsq_kva: u64,
1064 dsq_pa: u64,
1065 origin: impl FnOnce() -> String,
1066 dsq_offs: &super::btf_offsets::ScxDispatchQOffsets,
1067 dsq_lnode_offs: &super::btf_offsets::ScxDsqListNodeOffsets,
1068 task_offs: &super::btf_offsets::TaskStructCoreOffsets,
1069 see_offs: &super::btf_offsets::SchedExtEntityOffsets,
1070) -> Option<(DsqState, Vec<TaskWalkerEntry>)> {
1071 if dsq_pa == 0 {
1072 tracing::debug!(
1073 dsq_kva = format_args!("{:#x}", dsq_kva),
1074 "walk_one_dsq: dsq_pa == 0 — would alias the boot page; \
1075 skipping to avoid surfacing phantom all-zero DSQ state",
1076 );
1077 return None;
1078 }
1079 let origin = origin();
1080 let id = mem.read_u64(dsq_pa, dsq_offs.id);
1081 let nr = mem.read_u32(dsq_pa, dsq_offs.nr);
1082 let seq = mem.read_u32(dsq_pa, dsq_offs.seq);
1083
1084 // List head at dsq + list.
1085 let head_kva = dsq_kva.wrapping_add(dsq_offs.list as u64);
1086 let head_pa = dsq_pa.wrapping_add(dsq_offs.list as u64);
1087
1088 // The DSQ list links sched_ext_entity.dsq_list.node fields
1089 // (struct list_head inside scx_dsq_list_node inside
1090 // sched_ext_entity inside task_struct). container_of computes:
1091 // task_kva = node_kva
1092 // - task.scx
1093 // - see.dsq_list
1094 // - dsq_lnode.node
1095 let dsq_node_off_in_task = task_offs.scx + see_offs.dsq_list + dsq_lnode_offs.node;
1096
1097 let (task_kvas, truncated) = walk_list_head_for_dsq_task_kvas(
1098 mem,
1099 walk,
1100 head_kva,
1101 head_pa,
1102 dsq_node_off_in_task,
1103 dsq_lnode_offs,
1104 );
1105
1106 let entries: Vec<TaskWalkerEntry> = task_kvas
1107 .iter()
1108 .map(|&task_kva| TaskWalkerEntry {
1109 task_kva,
1110 // Tasks queued on a DSQ are NOT on the per-CPU
1111 // runnable_list — they're staged for dispatch but not
1112 // yet runnable in the rq->scx sense. The
1113 // pi_boosted_out_of_scx flag only fires for
1114 // runnable_list tasks (the scenario it diagnoses is a
1115 // task that should have left the runnable_list when
1116 // its sched_class changed but didn't).
1117 is_runnable_in_scx: false,
1118 running_pc: None,
1119 })
1120 .collect();
1121
1122 Some((
1123 DsqState {
1124 id,
1125 origin,
1126 nr,
1127 seq,
1128 task_kvas,
1129 truncated,
1130 },
1131 entries,
1132 ))
1133}
1134
1135/// Walk a generic `list_head` chain starting at `head_kva`/`head_pa`,
1136/// recovering each task_struct KVA via container_of with
1137/// `runnable_node_off_in_task` as the field offset within
1138/// task_struct.
1139///
1140/// Returns (task_kvas, truncated). `truncated` is true when the
1141/// MAX_NODES_PER_LIST cap kicked in before the walk closed back to
1142/// the head.
1143fn walk_list_head_for_task_kvas(
1144 mem: &GuestMem,
1145 walk: WalkContext,
1146 head_kva: u64,
1147 head_pa: u64,
1148 runnable_node_off_in_task: usize,
1149) -> (Vec<u64>, bool) {
1150 let mut task_kvas = Vec::new();
1151 let mut node_kva = mem.read_u64(head_pa, 0);
1152 if node_kva == 0 {
1153 return (task_kvas, false);
1154 }
1155
1156 let mut visited: u32 = 0;
1157 // Address-cycle backstop (see module doc): a corrupt / non-canonical
1158 // ring whose final .next points at an interior node instead of the
1159 // head bounds to its unique-node set and flags the partial walk,
1160 // rather than cycling to MAX_NODES_PER_LIST.
1161 let mut seen: std::collections::HashSet<u64> = std::collections::HashSet::new();
1162 while node_kva != head_kva {
1163 if visited >= MAX_NODES_PER_LIST {
1164 return (task_kvas, true);
1165 }
1166 if !seen.insert(node_kva) {
1167 return (task_kvas, true);
1168 }
1169 visited += 1;
1170
1171 // container_of: task_kva = node_kva - runnable_node_off_in_task
1172 let task_kva = node_kva.wrapping_sub(runnable_node_off_in_task as u64);
1173 task_kvas.push(task_kva);
1174
1175 // Step to next node — translate node_kva, read .next at offset 0.
1176 let Some(node_pa) = translate_any_kva(
1177 mem,
1178 walk.cr3_pa,
1179 walk.page_offset,
1180 node_kva,
1181 walk.l5,
1182 walk.tcr_el1,
1183 ) else {
1184 return (task_kvas, false);
1185 };
1186 let next_kva = mem.read_u64(node_pa, 0);
1187 if next_kva == 0 {
1188 return (task_kvas, false);
1189 }
1190 node_kva = next_kva;
1191 }
1192 (task_kvas, false)
1193}
1194
1195/// Walk a DSQ's `list` chain (a list of `scx_dsq_list_node.node`
1196/// entries embedded in `sched_ext_entity.dsq_list`). Skips iterator
1197/// cursor entries marked with `SCX_DSQ_LNODE_ITER_CURSOR`.
1198fn walk_list_head_for_dsq_task_kvas(
1199 mem: &GuestMem,
1200 walk: WalkContext,
1201 head_kva: u64,
1202 head_pa: u64,
1203 dsq_node_off_in_task: usize,
1204 dsq_lnode_offs: &super::btf_offsets::ScxDsqListNodeOffsets,
1205) -> (Vec<u64>, bool) {
1206 let mut task_kvas = Vec::new();
1207 let mut node_kva = mem.read_u64(head_pa, 0);
1208 if node_kva == 0 {
1209 return (task_kvas, false);
1210 }
1211
1212 let mut visited: u32 = 0;
1213 // Address-cycle backstop (see module doc): a corrupt / non-canonical
1214 // DSQ ring bounds to its unique-node set and flags the partial walk,
1215 // rather than cycling to MAX_NODES_PER_LIST.
1216 let mut seen: std::collections::HashSet<u64> = std::collections::HashSet::new();
1217 while node_kva != head_kva {
1218 if visited >= MAX_NODES_PER_LIST {
1219 return (task_kvas, true);
1220 }
1221 if !seen.insert(node_kva) {
1222 return (task_kvas, true);
1223 }
1224 visited += 1;
1225
1226 // The list_head we're walking is `scx_dsq_list_node.node`
1227 // (the inner `struct list_head`). Recover the parent
1228 // scx_dsq_list_node start by subtracting `dsq_lnode.node`
1229 // — fixed at 0 in current kernels but we read it from the
1230 // offsets struct for forward-compatibility.
1231 let lnode_kva = node_kva.wrapping_sub(dsq_lnode_offs.node as u64);
1232
1233 // Read the lnode's flags to skip iterator-cursor entries.
1234 // is_cursor: Some(true) → cursor (skip), Some(false) → real
1235 // task entry (push), None → translate failed and we cannot
1236 // distinguish. When the cursor flag cannot be read, treat
1237 // the entry as a cursor and skip it: pushing it would
1238 // record a phantom `task_kva = node_kva - dsq_node_off_in_task`
1239 // built from a node whose enclosing sched_ext_entity isn't
1240 // mappable, which downstream task enrichment would surface
1241 // as bogus pid/comm reads at an arbitrary address.
1242 let is_cursor = match translate_any_kva(
1243 mem,
1244 walk.cr3_pa,
1245 walk.page_offset,
1246 lnode_kva,
1247 walk.l5,
1248 walk.tcr_el1,
1249 ) {
1250 Some(lnode_pa) => {
1251 let lnode_flags = mem.read_u32(lnode_pa, dsq_lnode_offs.flags);
1252 Some(lnode_flags & SCX_DSQ_LNODE_ITER_CURSOR != 0)
1253 }
1254 None => None,
1255 };
1256
1257 let skip_entry = match is_cursor {
1258 Some(true) => true, // cursor entry — advance without recording
1259 Some(false) => false, // real task entry — push and advance
1260 None => true, // cursor-detection unreliable — skip rather than push bogus
1261 };
1262
1263 if !skip_entry {
1264 // Real task entry: container_of from the inner list_head's
1265 // node_kva back to task_struct. The full offset within
1266 // task_struct is task.scx + see.dsq_list + dsq_lnode.node.
1267 let task_kva = node_kva.wrapping_sub(dsq_node_off_in_task as u64);
1268 task_kvas.push(task_kva);
1269 }
1270
1271 // Advance to the next node. The list_head.next pointer
1272 // lives at offset 0 of the inner list_head we landed on,
1273 // which is `node_kva` itself.
1274 let Some(node_pa) = translate_any_kva(
1275 mem,
1276 walk.cr3_pa,
1277 walk.page_offset,
1278 node_kva,
1279 walk.l5,
1280 walk.tcr_el1,
1281 ) else {
1282 return (task_kvas, false);
1283 };
1284 let next_kva = mem.read_u64(node_pa, 0);
1285 if next_kva == 0 {
1286 return (task_kvas, false);
1287 }
1288 node_kva = next_kva;
1289 }
1290 (task_kvas, false)
1291}
1292
1293/// Walk the user-allocated DSQ rhashtable rooted at `rht_pa`.
1294///
1295/// `rht_pa` is the DRAM-relative offset of the embedded
1296/// `struct rhashtable` inside `scx_sched.dsq_hash`. The caller
1297/// computes it as `sched_pa + sched.dsq_hash` — the field is embedded
1298/// (not a pointer), so its PA is just the containing struct's PA plus
1299/// the field offset. The walker reads `tbl` (bucket_table pointer),
1300/// then for each of `tbl.buckets[i]` it strips the LSB tag
1301/// (`RHT_PTR_LOCK_BIT`) and chases the `rhash_head.next` chain. For
1302/// each node the walker computes
1303/// `dsq_kva = node_kva - scx_dispatch_q.hash_node` (container_of).
1304///
1305/// Caps:
1306/// - bucket count at [`MAX_RHT_BUCKETS`]
1307/// - total nodes visited at [`MAX_RHT_NODES`]
1308/// - per-bucket chain length at [`PER_BUCKET_CHAIN_CAP`]
1309///
1310/// Returns `(dsq_kvas, truncated)`. `truncated` is `true` when any
1311/// cap fired before the walk could reach the natural end of every
1312/// bucket chain — either `bucket_table.size > MAX_RHT_BUCKETS`,
1313/// `total_nodes >= MAX_RHT_NODES` mid-walk, or a per-bucket chain
1314/// reached its `PER_BUCKET_CHAIN_CAP` cap. Without this signal,
1315/// callers cannot distinguish "small DSQ count" from "cap silently
1316/// dropped tail entries" — see DsqState.truncated for the same
1317/// pattern on per-DSQ task lists.
1318fn walk_user_dsq_hash(
1319 mem: &GuestMem,
1320 walk: WalkContext,
1321 rht_pa: u64,
1322 rht_offs: &super::btf_offsets::RhashtableOffsets,
1323 dsq_offs: &super::btf_offsets::ScxDispatchQOffsets,
1324) -> (Vec<u64>, bool) {
1325 let mut dsq_kvas = Vec::new();
1326
1327 let tbl_kva = mem.read_u64(rht_pa, rht_offs.tbl);
1328 if tbl_kva == 0 {
1329 return (dsq_kvas, false);
1330 }
1331 let Some(tbl_pa) = translate_any_kva(
1332 mem,
1333 walk.cr3_pa,
1334 walk.page_offset,
1335 tbl_kva,
1336 walk.l5,
1337 walk.tcr_el1,
1338 ) else {
1339 return (dsq_kvas, false);
1340 };
1341
1342 let size = mem.read_u32(tbl_pa, rht_offs.bucket_table_size);
1343 let bucket_count = size.min(MAX_RHT_BUCKETS) as u64;
1344 // A bucket_table.size larger than the bucket cap means we'll
1345 // walk only the first MAX_RHT_BUCKETS buckets and the tail is
1346 // silently dropped. Surface that as truncation up front.
1347 let mut truncated = size as u64 > bucket_count;
1348 let buckets_off = rht_offs.bucket_table_buckets;
1349
1350 let mut total_nodes: u32 = 0;
1351 for i in 0..bucket_count {
1352 if total_nodes >= MAX_RHT_NODES {
1353 // Hit the global node cap — remaining buckets unwalked.
1354 return (dsq_kvas, true);
1355 }
1356 let entry_off = buckets_off + (i as usize) * 8;
1357 let raw_ptr = mem.read_u64(tbl_pa, entry_off);
1358 // Strip the LSB lock-bit tag. NULL or pure tag (0 with
1359 // bit 0 unset) means empty bucket.
1360 let head_kva = raw_ptr & !RHT_PTR_LOCK_BIT;
1361 if head_kva == 0 {
1362 continue;
1363 }
1364 // Chase the `rhash_head.next` chain. Each node is a
1365 // `rhash_head` embedded in scx_dispatch_q at
1366 // `hash_node`; container_of yields the dsq KVA.
1367 let mut node_kva = head_kva;
1368 let mut chain_visited: u32 = 0;
1369 let mut chain_terminated_naturally = false;
1370 while node_kva != 0 && total_nodes < MAX_RHT_NODES && chain_visited < PER_BUCKET_CHAIN_CAP {
1371 chain_visited += 1;
1372 total_nodes += 1;
1373 let dsq_kva = node_kva.wrapping_sub(dsq_offs.hash_node as u64);
1374 dsq_kvas.push(dsq_kva);
1375 let Some(node_pa) = translate_any_kva(
1376 mem,
1377 walk.cr3_pa,
1378 walk.page_offset,
1379 node_kva,
1380 walk.l5,
1381 walk.tcr_el1,
1382 ) else {
1383 // Translate failure — chain ended for this bucket
1384 // without hitting a cap. Not a truncation signal.
1385 chain_terminated_naturally = true;
1386 break;
1387 };
1388 let next_raw = mem.read_u64(node_pa, rht_offs.rhash_head_next);
1389 // The chain terminator is a "nulls" pointer with bit 0
1390 // set encoding the bucket index; treat any LSB-tagged
1391 // pointer as terminator.
1392 if next_raw & RHT_PTR_LOCK_BIT != 0 || next_raw == 0 {
1393 chain_terminated_naturally = true;
1394 break;
1395 }
1396 node_kva = next_raw;
1397 }
1398 // The loop exited; if it wasn't via a natural terminator
1399 // (LSB-tagged pointer / NULL / translate failure), one of
1400 // the two caps fired (chain_visited >= PER_BUCKET_CHAIN_CAP
1401 // or total_nodes >= MAX_RHT_NODES) and we silently dropped
1402 // the rest of this bucket's chain.
1403 if !chain_terminated_naturally {
1404 truncated = true;
1405 }
1406 }
1407
1408 (dsq_kvas, truncated)
1409}
1410
1411/// Read `(pid, comm)` for a `task_struct *` after a NULL-check and
1412/// translate. Returns `(None, None)` on NULL or untranslatable.
1413fn read_task_pid_comm(
1414 mem: &GuestMem,
1415 walk: WalkContext,
1416 task_kva: u64,
1417 pid_off: usize,
1418 comm_off: usize,
1419) -> (Option<i32>, Option<String>) {
1420 if task_kva == 0 {
1421 return (None, None);
1422 }
1423 let Some(task_pa) = translate_any_kva(
1424 mem,
1425 walk.cr3_pa,
1426 walk.page_offset,
1427 task_kva,
1428 walk.l5,
1429 walk.tcr_el1,
1430 ) else {
1431 return (None, None);
1432 };
1433 let pid = mem.read_u32(task_pa, pid_off) as i32;
1434 let mut buf = [0u8; 16];
1435 mem.read_bytes(task_pa + comm_off as u64, &mut buf);
1436 let n = buf.iter().position(|&b| b == 0).unwrap_or(16);
1437 let comm = String::from_utf8_lossy(&buf[..n]).to_string();
1438 (Some(pid), Some(comm))
1439}
1440
1441#[cfg(test)]
1442#[path = "scx_walker_tests.rs"]
1443mod tests;