ktstr/worker_ready_wait.rs
1//! Test-side poll helper for the worker ready marker. Separated from
2//! [`crate::worker_ready`] because this helper references
3//! [`crate::scenario::payload_run::PayloadHandle`]; the bin crate
4//! `ktstr-jemalloc-alloc-worker` pulls `worker_ready.rs` in via
5//! `#[path]` and must stay dependency-free (see that module's doc for
6//! why). This module is library-only.
7
8use crate::scenario::payload_run::PayloadHandle;
9use crate::worker_ready::{WORKER_READY_MARKER_PREFIX, worker_ready_marker_path};
10
11/// Poll for the worker's ready marker with a deadline, returning
12/// early if the worker exits before writing the marker or after
13/// writing but before the caller's subsequent dispatch.
14///
15/// Event-driven via `inotify` on the marker's parent directory
16/// (`IN_CREATE | IN_MOVED_TO`). The wait wakes on the actual file-
17/// create edge — kernel-scheduling-tick latency — instead of a
18/// 10 ms poll tail. inotify is set up BEFORE the initial existence
19/// probe so a marker that lands between probe and watch-add still
20/// fires the watch on a subsequent edge; the loop body re-checks
21/// existence on every wake so a stale watch edge from an unrelated
22/// file in the same directory doesn't false-fire.
23///
24/// The caller supplies `role` (e.g. `"worker"`, `"churn worker"`) and
25/// `exit_code_legend` (a variant-specific decoder for the
26/// worker-binary exit codes the caller wants printed in the error
27/// message). Worker cleanup on timeout happens via
28/// [`PayloadHandle::drop`] when the caller's `Result` error
29/// propagates — calling `PayloadHandle::kill(self)` here would take
30/// the handle by value, which we can't do behind an `&mut` borrow.
31///
32/// Consolidates what used to be two near-identical 20-line poll
33/// loops in `tests/jemalloc_probe_tests.rs` — a rename of the marker
34/// path, a change in poll interval, or a new early-exit shape now
35/// edits one site instead of two.
36pub fn wait_for_worker_ready(
37 worker: &mut PayloadHandle,
38 worker_pid: u32,
39 timeout: std::time::Duration,
40 role: &str,
41 exit_code_legend: &str,
42) -> anyhow::Result<()> {
43 use nix::poll::{PollFd, PollFlags, PollTimeout, poll};
44 use nix::sys::inotify::{AddWatchFlags, InitFlags, Inotify};
45 use std::os::unix::io::AsFd;
46
47 let ready_path = worker_ready_marker_path(worker_pid);
48 let deadline = std::time::Instant::now() + timeout;
49 // Parent directory of the marker. WORKER_READY_MARKER_PREFIX
50 // encodes the dir + filename prefix; the marker itself lives
51 // under /tmp so the watch attaches there. Pulling the dir from
52 // the prefix (rather than hardcoding `/tmp`) keeps the wait
53 // correct if the prefix moves.
54 let marker_dir = std::path::Path::new(WORKER_READY_MARKER_PREFIX)
55 .parent()
56 .unwrap_or_else(|| std::path::Path::new("/tmp"));
57 // Set up the inotify watch BEFORE the existence probe so a
58 // marker that lands between probe and watch is still observable
59 // on a subsequent unrelated edge (the loop re-checks existence
60 // on every wake). `IN_NONBLOCK` so read_events returns EAGAIN
61 // when the queue is empty instead of blocking — we drive
62 // wake-vs-timeout via poll(2). Fall back to the legacy 10 ms
63 // sleep when inotify_init / add_watch fail (a stripped kernel
64 // without CONFIG_INOTIFY_USER is the only known failure mode
65 // and worker tests must not be blocked by it).
66 let inotify_result =
67 Inotify::init(InitFlags::IN_CLOEXEC | InitFlags::IN_NONBLOCK).and_then(|i| {
68 i.add_watch(
69 marker_dir,
70 AddWatchFlags::IN_CREATE | AddWatchFlags::IN_MOVED_TO,
71 )?;
72 Ok(i)
73 });
74 while !std::path::Path::new(&ready_path).exists() {
75 if let Some((_, metrics)) = worker.try_wait()? {
76 anyhow::bail!(
77 "{role} pid={worker_pid} exited before creating ready marker \
78 {ready_path} (exit_code={} — see stderr; worker exit codes: \
79 {exit_code_legend})",
80 metrics.exit_code,
81 );
82 }
83 let now = std::time::Instant::now();
84 if now >= deadline {
85 anyhow::bail!(
86 "{role} pid={worker_pid} did not create ready marker {ready_path} \
87 within {timeout:?}",
88 );
89 }
90 let remaining_ms = deadline
91 .duration_since(now)
92 .as_millis()
93 .min(u16::MAX as u128) as u16;
94 match inotify_result.as_ref() {
95 Ok(inotify) => {
96 let fd = inotify.as_fd();
97 let mut pollfds = [PollFd::new(fd, PollFlags::POLLIN)];
98 let _ = poll(&mut pollfds, PollTimeout::from(remaining_ms));
99 // Drain pending events so the next poll doesn't
100 // re-fire on the same edge. We don't inspect the
101 // event payload — the path-exists check at the top
102 // of the loop is the source of truth.
103 let _ = inotify.read_events();
104 }
105 Err(_) => {
106 // CONFIG_INOTIFY_USER unavailable. Fall back to the
107 // legacy 10 ms sleep so the wait still completes —
108 // covered by the deadline check above.
109 std::thread::sleep(
110 std::time::Duration::from_millis(10).min(deadline.duration_since(now)),
111 );
112 }
113 }
114 }
115 // Narrow-race close: the worker may have written the marker and
116 // then died between the write and the caller's next probe
117 // dispatch (unusual — the worker is supposed to park — but a
118 // fatal Drop or kernel SIGKILL could still fire). One more
119 // try_wait surfaces that case with an actionable error instead
120 // of letting the caller burn wall-time on a dead pid.
121 if let Some((_, metrics)) = worker.try_wait()? {
122 anyhow::bail!(
123 "{role} pid={worker_pid} exited after writing ready marker but \
124 before the caller's next dispatch (exit_code={} — see stderr)",
125 metrics.exit_code,
126 );
127 }
128 Ok(())
129}