ktstr/
worker_ready_wait.rs

1//! Test-side poll helper for the worker ready marker. Separated from
2//! [`crate::worker_ready`] because this helper references
3//! [`crate::scenario::payload_run::PayloadHandle`]; the bin crate
4//! `ktstr-jemalloc-alloc-worker` pulls `worker_ready.rs` in via
5//! `#[path]` and must stay dependency-free (see that module's doc for
6//! why). This module is library-only.
7
8use crate::scenario::payload_run::PayloadHandle;
9use crate::worker_ready::{WORKER_READY_MARKER_PREFIX, worker_ready_marker_path};
10
11/// Poll for the worker's ready marker with a deadline, returning
12/// early if the worker exits before writing the marker or after
13/// writing but before the caller's subsequent dispatch.
14///
15/// Event-driven via `inotify` on the marker's parent directory
16/// (`IN_CREATE | IN_MOVED_TO`). The wait wakes on the actual file-
17/// create edge — kernel-scheduling-tick latency — instead of a
18/// 10 ms poll tail. inotify is set up BEFORE the initial existence
19/// probe so a marker that lands between probe and watch-add still
20/// fires the watch on a subsequent edge; the loop body re-checks
21/// existence on every wake so a stale watch edge from an unrelated
22/// file in the same directory doesn't false-fire.
23///
24/// The caller supplies `role` (e.g. `"worker"`, `"churn worker"`) and
25/// `exit_code_legend` (a variant-specific decoder for the
26/// worker-binary exit codes the caller wants printed in the error
27/// message). Worker cleanup on timeout happens via
28/// [`PayloadHandle::drop`] when the caller's `Result` error
29/// propagates — calling `PayloadHandle::kill(self)` here would take
30/// the handle by value, which we can't do behind an `&mut` borrow.
31///
32/// Consolidates what used to be two near-identical 20-line poll
33/// loops in `tests/jemalloc_probe_tests.rs` — a rename of the marker
34/// path, a change in poll interval, or a new early-exit shape now
35/// edits one site instead of two.
36pub fn wait_for_worker_ready(
37    worker: &mut PayloadHandle,
38    worker_pid: u32,
39    timeout: std::time::Duration,
40    role: &str,
41    exit_code_legend: &str,
42) -> anyhow::Result<()> {
43    use nix::poll::{PollFd, PollFlags, PollTimeout, poll};
44    use nix::sys::inotify::{AddWatchFlags, InitFlags, Inotify};
45    use std::os::unix::io::AsFd;
46
47    let ready_path = worker_ready_marker_path(worker_pid);
48    let deadline = std::time::Instant::now() + timeout;
49    // Parent directory of the marker. WORKER_READY_MARKER_PREFIX
50    // encodes the dir + filename prefix; the marker itself lives
51    // under /tmp so the watch attaches there. Pulling the dir from
52    // the prefix (rather than hardcoding `/tmp`) keeps the wait
53    // correct if the prefix moves.
54    let marker_dir = std::path::Path::new(WORKER_READY_MARKER_PREFIX)
55        .parent()
56        .unwrap_or_else(|| std::path::Path::new("/tmp"));
57    // Set up the inotify watch BEFORE the existence probe so a
58    // marker that lands between probe and watch is still observable
59    // on a subsequent unrelated edge (the loop re-checks existence
60    // on every wake). `IN_NONBLOCK` so read_events returns EAGAIN
61    // when the queue is empty instead of blocking — we drive
62    // wake-vs-timeout via poll(2). Fall back to the legacy 10 ms
63    // sleep when inotify_init / add_watch fail (a stripped kernel
64    // without CONFIG_INOTIFY_USER is the only known failure mode
65    // and worker tests must not be blocked by it).
66    let inotify_result =
67        Inotify::init(InitFlags::IN_CLOEXEC | InitFlags::IN_NONBLOCK).and_then(|i| {
68            i.add_watch(
69                marker_dir,
70                AddWatchFlags::IN_CREATE | AddWatchFlags::IN_MOVED_TO,
71            )?;
72            Ok(i)
73        });
74    while !std::path::Path::new(&ready_path).exists() {
75        if let Some((_, metrics)) = worker.try_wait()? {
76            anyhow::bail!(
77                "{role} pid={worker_pid} exited before creating ready marker \
78                 {ready_path} (exit_code={} — see stderr; worker exit codes: \
79                 {exit_code_legend})",
80                metrics.exit_code,
81            );
82        }
83        let now = std::time::Instant::now();
84        if now >= deadline {
85            anyhow::bail!(
86                "{role} pid={worker_pid} did not create ready marker {ready_path} \
87                 within {timeout:?}",
88            );
89        }
90        let remaining_ms = deadline
91            .duration_since(now)
92            .as_millis()
93            .min(u16::MAX as u128) as u16;
94        match inotify_result.as_ref() {
95            Ok(inotify) => {
96                let fd = inotify.as_fd();
97                let mut pollfds = [PollFd::new(fd, PollFlags::POLLIN)];
98                let _ = poll(&mut pollfds, PollTimeout::from(remaining_ms));
99                // Drain pending events so the next poll doesn't
100                // re-fire on the same edge. We don't inspect the
101                // event payload — the path-exists check at the top
102                // of the loop is the source of truth.
103                let _ = inotify.read_events();
104            }
105            Err(_) => {
106                // CONFIG_INOTIFY_USER unavailable. Fall back to the
107                // legacy 10 ms sleep so the wait still completes —
108                // covered by the deadline check above.
109                std::thread::sleep(
110                    std::time::Duration::from_millis(10).min(deadline.duration_since(now)),
111                );
112            }
113        }
114    }
115    // Narrow-race close: the worker may have written the marker and
116    // then died between the write and the caller's next probe
117    // dispatch (unusual — the worker is supposed to park — but a
118    // fatal Drop or kernel SIGKILL could still fire). One more
119    // try_wait surfaces that case with an actionable error instead
120    // of letting the caller burn wall-time on a dead pid.
121    if let Some((_, metrics)) = worker.try_wait()? {
122        anyhow::bail!(
123            "{role} pid={worker_pid} exited after writing ready marker but \
124             before the caller's next dispatch (exit_code={} — see stderr)",
125            metrics.exit_code,
126        );
127    }
128    Ok(())
129}