ktstr/ctprof_compare/
pattern.rs

1//! Token-based name normalization used by [`crate::ctprof_compare`]
2//! to fold ephemeral digit/hex suffixes into pattern skeletons.
3//!
4//! Two callers feed this module:
5//! - thread / process axes ([`super::GroupBy::Comm`] /
6//!   [`super::GroupBy::Pcomm`]) use [`pattern_key`] /
7//!   [`pattern_counts_union`] / [`pattern_display_label`]; the
8//!   smaps_rollup keying uses only [`pattern_key`].
9//! - the cgroup axis applies a three-layer pipeline: layers 1-2
10//!   ([`apply_systemd_template`] → [`cgroup_skeleton_tokens`]) are
11//!   wrapped by [`cgroup_normalize_skeleton`]; layer 3
12//!   ([`tighten_group`]) runs separately in
13//!   [`crate::ctprof_compare::build_groups`].
14//!
15//! The rules are pure (no kernel introspection, no IO) so they
16//! sit in their own module without pulling the data-type or
17//! compute layers along.
18
19use std::collections::BTreeMap;
20use std::sync::LazyLock;
21
22use regex::Regex;
23
24use crate::ctprof::{CtprofSnapshot, ThreadState};
25
26/// Placeholder for a pure-digit token (rule 1 of the token-based
27/// normalizer). Replaces a token of all ASCII digits.
28const TOKEN_DIGIT_PLACEHOLDER: &str = "{N}";
29
30/// Placeholder for a hex-like token (rule 2 of the token-based
31/// normalizer). Replaces a token whose chars are all in `[0-9a-f]`,
32/// length ≥ 2, and contain at least one digit.
33const TOKEN_HEX_PLACEHOLDER: &str = "{H}";
34
35/// Placeholder for a systemd template instance whose value is an
36/// opaque ID (rule applied by [`apply_systemd_template`] in cgroup
37/// layer 1). For example, `user@0.service` and `user@1001.service`
38/// both normalize to `user@{I}.service` because their instances
39/// (`0`, `1001`) carry no `[._-]` separators that would suggest a
40/// structured service name.
41const TOKEN_INSTANCE_PLACEHOLDER: &str = "{I}";
42
43/// Rule 1 pattern: pure ASCII digits.
44static TOKEN_RULE_PURE_DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[0-9]+$").unwrap());
45
46/// Rule 2 pattern: hex-like (all chars in `[0-9a-f]`, length ≥ 2).
47/// The "must contain at least one digit" check is applied
48/// separately because anchored character-class repetition does
49/// not natively express that constraint.
50static TOKEN_RULE_HEX_LIKE: LazyLock<Regex> =
51    LazyLock::new(|| Regex::new(r"^[0-9a-f]{2,}$").unwrap());
52
53/// Rule 3 pattern: alpha prefix (length ≥ 1) followed by
54/// trailing digits. Capture group 1 is the alpha prefix.
55static TOKEN_RULE_ALPHA_PREFIX_DIGITS: LazyLock<Regex> =
56    LazyLock::new(|| Regex::new(r"^([A-Za-z]+)[0-9]+$").unwrap());
57
58/// Rule 4 pattern: leading digits followed by an alpha suffix
59/// (length ≥ 1). Capture group 1 is the alpha suffix. Catches
60/// kworker high-priority bound workers (`1H`, `0H`, `2H` etc. —
61/// the `H` suffix added by `format_worker_id` in
62/// `kernel/workqueue.c` when the worker pool's nice value is
63/// negative).
64static TOKEN_RULE_DIGITS_ALPHA_SUFFIX: LazyLock<Regex> =
65    LazyLock::new(|| Regex::new(r"^[0-9]+([A-Za-z]+)$").unwrap());
66
67/// Token-classification rule. The token-based normalizer
68/// ([`pattern_key`]) walks segments produced by
69/// [`split_into_segments`] and applies the first rule that
70/// matches each token. Rules are checked in order; the first
71/// match wins. Rule patterns are direct regex translations of
72/// the thread-name normalization rules.
73pub(super) fn classify_token(t: &str) -> String {
74    if t.is_empty() {
75        return String::new();
76    }
77    // Rule 1: pure digits → `{N}`.
78    if TOKEN_RULE_PURE_DIGITS.is_match(t) {
79        return TOKEN_DIGIT_PLACEHOLDER.to_string();
80    }
81    // Rule 2: hex-like (all chars in [0-9a-f], length ≥ 2,
82    // contains at least one digit) → `{H}`. The regex enforces
83    // the character set + length; the `.contains` check enforces
84    // the "must have at least one digit" gate that the spec
85    // requires. Pure-alpha tokens like `abc` fail the digit check;
86    // pure-digit tokens fall through to rule 1 first.
87    if TOKEN_RULE_HEX_LIKE.is_match(t) && t.chars().any(|c| c.is_ascii_digit()) {
88        return TOKEN_HEX_PLACEHOLDER.to_string();
89    }
90    // Rule 3: alpha prefix + trailing digits → `prefix{N}`. The
91    // captured group is the alpha prefix; the trailing digit run
92    // is replaced with the placeholder. Single-letter alpha
93    // prefixes like `u8` (`kworker/u8:7`) qualify because the
94    // spec sets the prefix lower bound at 1.
95    if let Some(caps) = TOKEN_RULE_ALPHA_PREFIX_DIGITS.captures(t) {
96        return format!("{}{}", &caps[1], TOKEN_DIGIT_PLACEHOLDER);
97    }
98    // Rule 4: leading digits + alpha suffix → `{N}suffix`. The
99    // captured group is the alpha suffix. Comes AFTER rule 2 so
100    // hex-like tokens (`1a`, `0f`) take precedence over the
101    // leading-digit-suffix interpretation.
102    if let Some(caps) = TOKEN_RULE_DIGITS_ALPHA_SUFFIX.captures(t) {
103        return format!("{}{}", TOKEN_DIGIT_PLACEHOLDER, &caps[1]);
104    }
105    // Otherwise: keep literal.
106    t.to_string()
107}
108
109/// One segment of a tokenized string: either a non-separator run
110/// (a token) or a run of separator characters.
111#[derive(Debug, Clone, Copy, PartialEq, Eq)]
112pub(super) enum Segment<'a> {
113    Token(&'a str),
114    Separator(&'a str),
115}
116
117/// Returns true for any character treated as a token separator by
118/// the token-based normalizer. The set is `[.\-_/:@+\[\]\s]+` —
119/// ASCII punctuation `.`, `-`, `_`, `/`, `:`, `@`, `+`, `[`, `]`
120/// plus any Unicode whitespace. The `+` decoration kworker uses on
121/// active workers (`kworker/<cpu>:<id>+<wq>` per `wq_worker_comm`
122/// in `kernel/workqueue.c`) is a separator so the digit tokens on
123/// either side normalize independently. Brackets appear in
124/// process names set via `prctl(PR_SET_NAME)` (kernel threads in
125/// userspace tooling render as `[ksoftirqd/0]`, etc.) AND in the
126/// literal-mode smaps key shape `pcomm[tgid]` produced by
127/// [`crate::ctprof_compare::collect_smaps_rollup`] under
128/// [`crate::ctprof_compare::CompareOptions::no_thread_normalize`];
129/// treating brackets as separators allows the digit / hex tokens
130/// inside them to normalize independently from the surrounding
131/// alpha tokens.
132pub(super) fn is_token_separator(c: char) -> bool {
133    matches!(c, '.' | '-' | '_' | '/' | ':' | '@' | '+' | '[' | ']') || c.is_whitespace()
134}
135
136/// Walk the input and emit alternating token / separator runs.
137/// Empty input yields zero segments. Maximal runs are emitted —
138/// `a..b` produces `[Token("a"), Separator(".."), Token("b")]`.
139pub(super) fn split_into_segments(s: &str) -> Vec<Segment<'_>> {
140    let mut out = Vec::new();
141    if s.is_empty() {
142        return out;
143    }
144    let mut chars = s.char_indices().peekable();
145    while let Some(&(start, first_c)) = chars.peek() {
146        let is_sep = is_token_separator(first_c);
147        let mut end = start;
148        while let Some(&(idx, c)) = chars.peek() {
149            if is_token_separator(c) != is_sep {
150                break;
151            }
152            end = idx + c.len_utf8();
153            chars.next();
154        }
155        let slice = &s[start..end];
156        if is_sep {
157            out.push(Segment::Separator(slice));
158        } else {
159            out.push(Segment::Token(slice));
160        }
161    }
162    out
163}
164
165/// Compute the token-normalized skeleton for a name string.
166///
167/// Consumed by [`crate::ctprof_compare::GroupBy::Comm`]
168/// (thread-name grouping),
169/// [`crate::ctprof_compare::GroupBy::Pcomm`] (process-name
170/// grouping), and
171/// [`crate::ctprof_compare::collect_smaps_rollup`] (per-pcomm
172/// smaps aggregation) — each callsite passes a different field
173/// (`t.comm`, `t.pcomm`, `t.pcomm` respectively) and applies its
174/// own callsite-level policy on top of the skeleton this function
175/// returns.
176///
177/// Splits the input on a separator class (`[.\-_/:@+\[\]\s]+`),
178/// classifies each non-separator token by `classify_token`, and
179/// rejoins with the original separator runs preserved verbatim.
180/// The first matching rule wins per token:
181///
182/// 1. Pure digits → `{N}` (e.g. `42` → `{N}`).
183/// 2. Hex-like (all chars `[0-9a-f]`, length ≥ 2, contains at
184///    least one digit) → `{H}` (e.g. `abc123def` → `{H}`).
185/// 3. Alpha prefix + trailing digits (`^[A-Za-z]+\d+$`, alpha
186///    prefix length ≥ 1) → `prefix{N}` (e.g. `worker7` →
187///    `worker{N}`, `u8` → `u{N}`).
188/// 4. Leading digits + alpha suffix (`^\d+[A-Za-z]+$`) →
189///    `{N}suffix` (e.g. `1H` → `{N}H`, `100Hz` → `{N}Hz`).
190/// 5. Otherwise: keep literal.
191///
192/// Two names that produce the same skeleton group together at
193/// the bucket layer. The singleton-revert policy ("if only one
194/// thread / process matches a skeleton, revert to literal") is a
195/// callsite policy enforced by
196/// [`crate::ctprof_compare::build_groups`] — `pattern_key` itself
197/// always returns the skeleton, leaving callsites free to override
198/// (and indeed [`crate::ctprof_compare::collect_smaps_rollup`]
199/// does NOT singleton-revert; see its doc for why).
200///
201/// Examples:
202/// - `whirly-gig-15` → `whirly-gig-{N}`.
203/// - `kworker/0:0-wq_reclaim` → `kworker/{N}:{N}-wq_reclaim`.
204/// - `kworker/u8:7` → `kworker/u{N}:{N}` (single-letter alpha
205///   prefix `u` qualifies under rule 3).
206/// - `session-a1234` → `session-{H}` (hex-like).
207/// - `BPF_CUBIC` → `BPF_CUBIC` (pure alpha, no digits).
208/// - `bloop-tangler` → `bloop-tangler` (pure alpha).
209pub fn pattern_key(name: &str) -> String {
210    let segments = split_into_segments(name);
211    let mut out = String::new();
212    for seg in segments {
213        match seg {
214            Segment::Token(t) => out.push_str(&classify_token(t)),
215            Segment::Separator(s) => out.push_str(s),
216        }
217    }
218    out
219}
220
221/// Cgroup layer 1: systemd `template@instance.service`
222/// normalization. Walks the path, finding each
223/// `@<instance>.service` segment (bounded by `/` or end-of-string).
224/// If the instance contains any of `[._-]`, it is a structured
225/// service name and the segment is kept verbatim. Otherwise, the
226/// instance is treated as an opaque ID and the segment is rewritten
227/// to `@{I}.service`.
228///
229/// Examples:
230/// - `/user.slice/user-0.slice/user@0.service/boot.scope`
231///   → `/user.slice/user-0.slice/user@{I}.service/boot.scope`
232///   (`0` has no `[._-]`).
233/// - `/critical.slice/launcher@foo.bar.baz.service`
234///   → unchanged (instance `foo.bar.baz` has `.`).
235pub(super) fn apply_systemd_template(path: &str) -> String {
236    let mut out = String::new();
237    let mut rest = path;
238    while let Some(at_idx) = rest.find('@') {
239        out.push_str(&rest[..at_idx]);
240        out.push('@');
241        let after_at = &rest[at_idx + 1..];
242        // Bound the instance segment by the next `/` (or end-of-input).
243        let segment_end = after_at.find('/').unwrap_or(after_at.len());
244        let segment = &after_at[..segment_end];
245        if let Some(instance) = segment.strip_suffix(".service") {
246            if instance.is_empty() || instance.contains(['.', '_', '-']) {
247                // Structured instance — keep verbatim.
248                out.push_str(segment);
249            } else {
250                // Opaque ID — normalize.
251                out.push_str(TOKEN_INSTANCE_PLACEHOLDER);
252                out.push_str(".service");
253            }
254            rest = &after_at[segment_end..];
255        } else {
256            // No `.service` suffix on this segment — leave the `@`
257            // and continue scanning after it.
258            rest = after_at;
259        }
260    }
261    out.push_str(rest);
262    out
263}
264
265/// Cgroup layer 2: token-based normalization. Identical to
266/// [`pattern_key`] but operates on a cgroup path string. Returns
267/// the post-Layer-1 token list alongside the normalized skeleton —
268/// the token list is consumed by [`tighten_group`] to revert
269/// constant-across-members positions to literals (Layer 3).
270pub(super) fn cgroup_skeleton_tokens(post_l1: &str) -> (String, Vec<String>) {
271    let segments = split_into_segments(post_l1);
272    let mut skeleton = String::new();
273    let mut tokens = Vec::new();
274    for seg in segments {
275        match seg {
276            Segment::Token(t) => {
277                tokens.push(t.to_string());
278                skeleton.push_str(&classify_token(t));
279            }
280            Segment::Separator(s) => {
281                skeleton.push_str(s);
282            }
283        }
284    }
285    (skeleton, tokens)
286}
287
288/// Cgroup layer 3 (tighten): for a multi-member group sharing the
289/// same Layer-2 skeleton, revert a token position to its literal
290/// form only when its value is identical across every member AND
291/// that value is a pure literal (`classify_token` returns it
292/// unchanged). Positions that vary across members, or whose
293/// constant value is instance-classified (`{N}` / `{H}` /
294/// `prefix{N}` / `{N}suffix`), keep their Layer-2 placeholder.
295///
296/// Members carry both their post-Layer-1 path (used to recover
297/// separator runs verbatim from a representative member) and their
298/// per-position token list (compared across members for the
299/// position-by-position equality check). All members share the
300/// same number of tokens and the same separator structure by
301/// construction — they share a Layer-2 skeleton.
302///
303/// Returns the tightened skeleton; if every position varies
304/// (nothing to tighten), the result equals the input skeleton.
305pub(super) fn tighten_group(post_l1_paths: &[String], member_tokens: &[Vec<String>]) -> String {
306    let representative = match post_l1_paths.first() {
307        Some(p) => p,
308        None => return String::new(),
309    };
310    let segments = split_into_segments(representative);
311    let mut out = String::new();
312    let mut token_pos = 0;
313    for seg in segments {
314        match seg {
315            Segment::Token(_) => {
316                let first = &member_tokens[0][token_pos];
317                let classified = classify_token(first);
318                let all_equal = member_tokens
319                    .iter()
320                    .all(|tokens| &tokens[token_pos] == first);
321                if all_equal && classified == *first {
322                    out.push_str(first);
323                } else {
324                    out.push_str(&classified);
325                }
326                token_pos += 1;
327            }
328            Segment::Separator(s) => {
329                out.push_str(s);
330            }
331        }
332    }
333    out
334}
335
336/// Compute the cgroup grouping key for a path under
337/// [`crate::ctprof_compare::GroupBy::Cgroup`] aggregation. Applies
338/// Layer 1 (systemd template) and Layer 2 (token normalization).
339/// Layer 3 (tighten) runs separately on multi-member groups inside
340/// [`crate::ctprof_compare::build_groups`].
341///
342/// Returns `(layer2_skeleton, post_l1_path, post_l1_tokens)`. The
343/// skeleton is the join key; the post-L1 path and tokens feed
344/// [`tighten_group`] for groups with ≥ 2 members.
345pub(super) fn cgroup_normalize_skeleton(path: &str) -> (String, String, Vec<String>) {
346    let post_l1 = apply_systemd_template(path);
347    let (skeleton, tokens) = cgroup_skeleton_tokens(&post_l1);
348    (skeleton, post_l1, tokens)
349}
350
351/// Compute the operator-facing display label for a pattern-aware
352/// group, given the union of baseline+candidate member comms. For
353/// buckets with ≥ 2 distinct member names, runs grex over the
354/// sorted union to emit a regex that exactly matches the
355/// constituent thread names. For singleton or all-identical
356/// buckets, returns the join key unchanged so the rendered label
357/// equals what would have shown under literal grouping.
358///
359/// Empty `members` returns `key` — defensive against synthetic
360/// inputs; production builds populate `members` for every
361/// bucket.
362///
363/// When built without the `pretty-labels` feature, the grex
364/// regex-synthesis path is compiled out and the function returns
365/// the join key for every input — same bucket identity, just no
366/// rendered regex.
367pub fn pattern_display_label(key: &str, members: &[String]) -> String {
368    if members.len() < 2 {
369        return key.to_string();
370    }
371    #[cfg(feature = "pretty-labels")]
372    {
373        let regex = grex::RegExpBuilder::from(members).build();
374        if regex.len() <= key.len() {
375            return regex;
376        }
377    }
378    key.to_string()
379}
380
381/// Build the union frequency map for pattern-aware grouping
382/// ([`crate::ctprof_compare::GroupBy::Comm`] or
383/// [`crate::ctprof_compare::GroupBy::Pcomm`]) across the
384/// baseline + candidate snapshots. The frequency gate that
385/// promotes a `pattern_key` from per-thread literal to a clustered
386/// bucket must be evaluated against the UNION of both
387/// snapshots' threads — otherwise a pattern that has 1 thread
388/// in baseline + 3 threads in candidate would join under a
389/// `worker-{N}` key in candidate but a literal `worker-7` key in
390/// baseline, and `compare()` would surface the row as
391/// only-in-candidate. Computing the count from the union ensures
392/// the same key is used on both sides.
393///
394/// `field` selects which [`ThreadState`] string feeds the count:
395/// `|t| t.comm.as_str()` for `Comm`, `|t| t.pcomm.as_str()` for
396/// `Pcomm`. The two axes share the same union-frequency contract
397/// so one helper covers both.
398pub(super) fn pattern_counts_union(
399    baseline: &CtprofSnapshot,
400    candidate: &CtprofSnapshot,
401    field: fn(&ThreadState) -> &str,
402) -> BTreeMap<String, usize> {
403    let mut counts: BTreeMap<String, usize> = BTreeMap::new();
404    for t in baseline.threads.iter().chain(candidate.threads.iter()) {
405        *counts.entry(pattern_key(field(t))).or_insert(0) += 1;
406    }
407    counts
408}