ktstr/ctprof_compare/pattern.rs
1//! Token-based name normalization used by [`crate::ctprof_compare`]
2//! to fold ephemeral digit/hex suffixes into pattern skeletons.
3//!
4//! Two callers feed this module:
5//! - thread / process axes ([`super::GroupBy::Comm`] /
6//! [`super::GroupBy::Pcomm`]) use [`pattern_key`] /
7//! [`pattern_counts_union`] / [`pattern_display_label`]; the
8//! smaps_rollup keying uses only [`pattern_key`].
9//! - the cgroup axis applies a three-layer pipeline: layers 1-2
10//! ([`apply_systemd_template`] → [`cgroup_skeleton_tokens`]) are
11//! wrapped by [`cgroup_normalize_skeleton`]; layer 3
12//! ([`tighten_group`]) runs separately in
13//! [`crate::ctprof_compare::build_groups`].
14//!
15//! The rules are pure (no kernel introspection, no IO) so they
16//! sit in their own module without pulling the data-type or
17//! compute layers along.
18
19use std::collections::BTreeMap;
20use std::sync::LazyLock;
21
22use regex::Regex;
23
24use crate::ctprof::{CtprofSnapshot, ThreadState};
25
26/// Placeholder for a pure-digit token (rule 1 of the token-based
27/// normalizer). Replaces a token of all ASCII digits.
28const TOKEN_DIGIT_PLACEHOLDER: &str = "{N}";
29
30/// Placeholder for a hex-like token (rule 2 of the token-based
31/// normalizer). Replaces a token whose chars are all in `[0-9a-f]`,
32/// length ≥ 2, and contain at least one digit.
33const TOKEN_HEX_PLACEHOLDER: &str = "{H}";
34
35/// Placeholder for a systemd template instance whose value is an
36/// opaque ID (rule applied by [`apply_systemd_template`] in cgroup
37/// layer 1). For example, `user@0.service` and `user@1001.service`
38/// both normalize to `user@{I}.service` because their instances
39/// (`0`, `1001`) carry no `[._-]` separators that would suggest a
40/// structured service name.
41const TOKEN_INSTANCE_PLACEHOLDER: &str = "{I}";
42
43/// Rule 1 pattern: pure ASCII digits.
44static TOKEN_RULE_PURE_DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[0-9]+$").unwrap());
45
46/// Rule 2 pattern: hex-like (all chars in `[0-9a-f]`, length ≥ 2).
47/// The "must contain at least one digit" check is applied
48/// separately because anchored character-class repetition does
49/// not natively express that constraint.
50static TOKEN_RULE_HEX_LIKE: LazyLock<Regex> =
51 LazyLock::new(|| Regex::new(r"^[0-9a-f]{2,}$").unwrap());
52
53/// Rule 3 pattern: alpha prefix (length ≥ 1) followed by
54/// trailing digits. Capture group 1 is the alpha prefix.
55static TOKEN_RULE_ALPHA_PREFIX_DIGITS: LazyLock<Regex> =
56 LazyLock::new(|| Regex::new(r"^([A-Za-z]+)[0-9]+$").unwrap());
57
58/// Rule 4 pattern: leading digits followed by an alpha suffix
59/// (length ≥ 1). Capture group 1 is the alpha suffix. Catches
60/// kworker high-priority bound workers (`1H`, `0H`, `2H` etc. —
61/// the `H` suffix added by `format_worker_id` in
62/// `kernel/workqueue.c` when the worker pool's nice value is
63/// negative).
64static TOKEN_RULE_DIGITS_ALPHA_SUFFIX: LazyLock<Regex> =
65 LazyLock::new(|| Regex::new(r"^[0-9]+([A-Za-z]+)$").unwrap());
66
67/// Token-classification rule. The token-based normalizer
68/// ([`pattern_key`]) walks segments produced by
69/// [`split_into_segments`] and applies the first rule that
70/// matches each token. Rules are checked in order; the first
71/// match wins. Rule patterns are direct regex translations of
72/// the thread-name normalization rules.
73pub(super) fn classify_token(t: &str) -> String {
74 if t.is_empty() {
75 return String::new();
76 }
77 // Rule 1: pure digits → `{N}`.
78 if TOKEN_RULE_PURE_DIGITS.is_match(t) {
79 return TOKEN_DIGIT_PLACEHOLDER.to_string();
80 }
81 // Rule 2: hex-like (all chars in [0-9a-f], length ≥ 2,
82 // contains at least one digit) → `{H}`. The regex enforces
83 // the character set + length; the `.contains` check enforces
84 // the "must have at least one digit" gate that the spec
85 // requires. Pure-alpha tokens like `abc` fail the digit check;
86 // pure-digit tokens fall through to rule 1 first.
87 if TOKEN_RULE_HEX_LIKE.is_match(t) && t.chars().any(|c| c.is_ascii_digit()) {
88 return TOKEN_HEX_PLACEHOLDER.to_string();
89 }
90 // Rule 3: alpha prefix + trailing digits → `prefix{N}`. The
91 // captured group is the alpha prefix; the trailing digit run
92 // is replaced with the placeholder. Single-letter alpha
93 // prefixes like `u8` (`kworker/u8:7`) qualify because the
94 // spec sets the prefix lower bound at 1.
95 if let Some(caps) = TOKEN_RULE_ALPHA_PREFIX_DIGITS.captures(t) {
96 return format!("{}{}", &caps[1], TOKEN_DIGIT_PLACEHOLDER);
97 }
98 // Rule 4: leading digits + alpha suffix → `{N}suffix`. The
99 // captured group is the alpha suffix. Comes AFTER rule 2 so
100 // hex-like tokens (`1a`, `0f`) take precedence over the
101 // leading-digit-suffix interpretation.
102 if let Some(caps) = TOKEN_RULE_DIGITS_ALPHA_SUFFIX.captures(t) {
103 return format!("{}{}", TOKEN_DIGIT_PLACEHOLDER, &caps[1]);
104 }
105 // Otherwise: keep literal.
106 t.to_string()
107}
108
109/// One segment of a tokenized string: either a non-separator run
110/// (a token) or a run of separator characters.
111#[derive(Debug, Clone, Copy, PartialEq, Eq)]
112pub(super) enum Segment<'a> {
113 Token(&'a str),
114 Separator(&'a str),
115}
116
117/// Returns true for any character treated as a token separator by
118/// the token-based normalizer. The set is `[.\-_/:@+\[\]\s]+` —
119/// ASCII punctuation `.`, `-`, `_`, `/`, `:`, `@`, `+`, `[`, `]`
120/// plus any Unicode whitespace. The `+` decoration kworker uses on
121/// active workers (`kworker/<cpu>:<id>+<wq>` per `wq_worker_comm`
122/// in `kernel/workqueue.c`) is a separator so the digit tokens on
123/// either side normalize independently. Brackets appear in
124/// process names set via `prctl(PR_SET_NAME)` (kernel threads in
125/// userspace tooling render as `[ksoftirqd/0]`, etc.) AND in the
126/// literal-mode smaps key shape `pcomm[tgid]` produced by
127/// [`crate::ctprof_compare::collect_smaps_rollup`] under
128/// [`crate::ctprof_compare::CompareOptions::no_thread_normalize`];
129/// treating brackets as separators allows the digit / hex tokens
130/// inside them to normalize independently from the surrounding
131/// alpha tokens.
132pub(super) fn is_token_separator(c: char) -> bool {
133 matches!(c, '.' | '-' | '_' | '/' | ':' | '@' | '+' | '[' | ']') || c.is_whitespace()
134}
135
136/// Walk the input and emit alternating token / separator runs.
137/// Empty input yields zero segments. Maximal runs are emitted —
138/// `a..b` produces `[Token("a"), Separator(".."), Token("b")]`.
139pub(super) fn split_into_segments(s: &str) -> Vec<Segment<'_>> {
140 let mut out = Vec::new();
141 if s.is_empty() {
142 return out;
143 }
144 let mut chars = s.char_indices().peekable();
145 while let Some(&(start, first_c)) = chars.peek() {
146 let is_sep = is_token_separator(first_c);
147 let mut end = start;
148 while let Some(&(idx, c)) = chars.peek() {
149 if is_token_separator(c) != is_sep {
150 break;
151 }
152 end = idx + c.len_utf8();
153 chars.next();
154 }
155 let slice = &s[start..end];
156 if is_sep {
157 out.push(Segment::Separator(slice));
158 } else {
159 out.push(Segment::Token(slice));
160 }
161 }
162 out
163}
164
165/// Compute the token-normalized skeleton for a name string.
166///
167/// Consumed by [`crate::ctprof_compare::GroupBy::Comm`]
168/// (thread-name grouping),
169/// [`crate::ctprof_compare::GroupBy::Pcomm`] (process-name
170/// grouping), and
171/// [`crate::ctprof_compare::collect_smaps_rollup`] (per-pcomm
172/// smaps aggregation) — each callsite passes a different field
173/// (`t.comm`, `t.pcomm`, `t.pcomm` respectively) and applies its
174/// own callsite-level policy on top of the skeleton this function
175/// returns.
176///
177/// Splits the input on a separator class (`[.\-_/:@+\[\]\s]+`),
178/// classifies each non-separator token by `classify_token`, and
179/// rejoins with the original separator runs preserved verbatim.
180/// The first matching rule wins per token:
181///
182/// 1. Pure digits → `{N}` (e.g. `42` → `{N}`).
183/// 2. Hex-like (all chars `[0-9a-f]`, length ≥ 2, contains at
184/// least one digit) → `{H}` (e.g. `abc123def` → `{H}`).
185/// 3. Alpha prefix + trailing digits (`^[A-Za-z]+\d+$`, alpha
186/// prefix length ≥ 1) → `prefix{N}` (e.g. `worker7` →
187/// `worker{N}`, `u8` → `u{N}`).
188/// 4. Leading digits + alpha suffix (`^\d+[A-Za-z]+$`) →
189/// `{N}suffix` (e.g. `1H` → `{N}H`, `100Hz` → `{N}Hz`).
190/// 5. Otherwise: keep literal.
191///
192/// Two names that produce the same skeleton group together at
193/// the bucket layer. The singleton-revert policy ("if only one
194/// thread / process matches a skeleton, revert to literal") is a
195/// callsite policy enforced by
196/// [`crate::ctprof_compare::build_groups`] — `pattern_key` itself
197/// always returns the skeleton, leaving callsites free to override
198/// (and indeed [`crate::ctprof_compare::collect_smaps_rollup`]
199/// does NOT singleton-revert; see its doc for why).
200///
201/// Examples:
202/// - `whirly-gig-15` → `whirly-gig-{N}`.
203/// - `kworker/0:0-wq_reclaim` → `kworker/{N}:{N}-wq_reclaim`.
204/// - `kworker/u8:7` → `kworker/u{N}:{N}` (single-letter alpha
205/// prefix `u` qualifies under rule 3).
206/// - `session-a1234` → `session-{H}` (hex-like).
207/// - `BPF_CUBIC` → `BPF_CUBIC` (pure alpha, no digits).
208/// - `bloop-tangler` → `bloop-tangler` (pure alpha).
209pub fn pattern_key(name: &str) -> String {
210 let segments = split_into_segments(name);
211 let mut out = String::new();
212 for seg in segments {
213 match seg {
214 Segment::Token(t) => out.push_str(&classify_token(t)),
215 Segment::Separator(s) => out.push_str(s),
216 }
217 }
218 out
219}
220
221/// Cgroup layer 1: systemd `template@instance.service`
222/// normalization. Walks the path, finding each
223/// `@<instance>.service` segment (bounded by `/` or end-of-string).
224/// If the instance contains any of `[._-]`, it is a structured
225/// service name and the segment is kept verbatim. Otherwise, the
226/// instance is treated as an opaque ID and the segment is rewritten
227/// to `@{I}.service`.
228///
229/// Examples:
230/// - `/user.slice/user-0.slice/user@0.service/boot.scope`
231/// → `/user.slice/user-0.slice/user@{I}.service/boot.scope`
232/// (`0` has no `[._-]`).
233/// - `/critical.slice/launcher@foo.bar.baz.service`
234/// → unchanged (instance `foo.bar.baz` has `.`).
235pub(super) fn apply_systemd_template(path: &str) -> String {
236 let mut out = String::new();
237 let mut rest = path;
238 while let Some(at_idx) = rest.find('@') {
239 out.push_str(&rest[..at_idx]);
240 out.push('@');
241 let after_at = &rest[at_idx + 1..];
242 // Bound the instance segment by the next `/` (or end-of-input).
243 let segment_end = after_at.find('/').unwrap_or(after_at.len());
244 let segment = &after_at[..segment_end];
245 if let Some(instance) = segment.strip_suffix(".service") {
246 if instance.is_empty() || instance.contains(['.', '_', '-']) {
247 // Structured instance — keep verbatim.
248 out.push_str(segment);
249 } else {
250 // Opaque ID — normalize.
251 out.push_str(TOKEN_INSTANCE_PLACEHOLDER);
252 out.push_str(".service");
253 }
254 rest = &after_at[segment_end..];
255 } else {
256 // No `.service` suffix on this segment — leave the `@`
257 // and continue scanning after it.
258 rest = after_at;
259 }
260 }
261 out.push_str(rest);
262 out
263}
264
265/// Cgroup layer 2: token-based normalization. Identical to
266/// [`pattern_key`] but operates on a cgroup path string. Returns
267/// the post-Layer-1 token list alongside the normalized skeleton —
268/// the token list is consumed by [`tighten_group`] to revert
269/// constant-across-members positions to literals (Layer 3).
270pub(super) fn cgroup_skeleton_tokens(post_l1: &str) -> (String, Vec<String>) {
271 let segments = split_into_segments(post_l1);
272 let mut skeleton = String::new();
273 let mut tokens = Vec::new();
274 for seg in segments {
275 match seg {
276 Segment::Token(t) => {
277 tokens.push(t.to_string());
278 skeleton.push_str(&classify_token(t));
279 }
280 Segment::Separator(s) => {
281 skeleton.push_str(s);
282 }
283 }
284 }
285 (skeleton, tokens)
286}
287
288/// Cgroup layer 3 (tighten): for a multi-member group sharing the
289/// same Layer-2 skeleton, revert a token position to its literal
290/// form only when its value is identical across every member AND
291/// that value is a pure literal (`classify_token` returns it
292/// unchanged). Positions that vary across members, or whose
293/// constant value is instance-classified (`{N}` / `{H}` /
294/// `prefix{N}` / `{N}suffix`), keep their Layer-2 placeholder.
295///
296/// Members carry both their post-Layer-1 path (used to recover
297/// separator runs verbatim from a representative member) and their
298/// per-position token list (compared across members for the
299/// position-by-position equality check). All members share the
300/// same number of tokens and the same separator structure by
301/// construction — they share a Layer-2 skeleton.
302///
303/// Returns the tightened skeleton; if every position varies
304/// (nothing to tighten), the result equals the input skeleton.
305pub(super) fn tighten_group(post_l1_paths: &[String], member_tokens: &[Vec<String>]) -> String {
306 let representative = match post_l1_paths.first() {
307 Some(p) => p,
308 None => return String::new(),
309 };
310 let segments = split_into_segments(representative);
311 let mut out = String::new();
312 let mut token_pos = 0;
313 for seg in segments {
314 match seg {
315 Segment::Token(_) => {
316 let first = &member_tokens[0][token_pos];
317 let classified = classify_token(first);
318 let all_equal = member_tokens
319 .iter()
320 .all(|tokens| &tokens[token_pos] == first);
321 if all_equal && classified == *first {
322 out.push_str(first);
323 } else {
324 out.push_str(&classified);
325 }
326 token_pos += 1;
327 }
328 Segment::Separator(s) => {
329 out.push_str(s);
330 }
331 }
332 }
333 out
334}
335
336/// Compute the cgroup grouping key for a path under
337/// [`crate::ctprof_compare::GroupBy::Cgroup`] aggregation. Applies
338/// Layer 1 (systemd template) and Layer 2 (token normalization).
339/// Layer 3 (tighten) runs separately on multi-member groups inside
340/// [`crate::ctprof_compare::build_groups`].
341///
342/// Returns `(layer2_skeleton, post_l1_path, post_l1_tokens)`. The
343/// skeleton is the join key; the post-L1 path and tokens feed
344/// [`tighten_group`] for groups with ≥ 2 members.
345pub(super) fn cgroup_normalize_skeleton(path: &str) -> (String, String, Vec<String>) {
346 let post_l1 = apply_systemd_template(path);
347 let (skeleton, tokens) = cgroup_skeleton_tokens(&post_l1);
348 (skeleton, post_l1, tokens)
349}
350
351/// Compute the operator-facing display label for a pattern-aware
352/// group, given the union of baseline+candidate member comms. For
353/// buckets with ≥ 2 distinct member names, runs grex over the
354/// sorted union to emit a regex that exactly matches the
355/// constituent thread names. For singleton or all-identical
356/// buckets, returns the join key unchanged so the rendered label
357/// equals what would have shown under literal grouping.
358///
359/// Empty `members` returns `key` — defensive against synthetic
360/// inputs; production builds populate `members` for every
361/// bucket.
362///
363/// When built without the `pretty-labels` feature, the grex
364/// regex-synthesis path is compiled out and the function returns
365/// the join key for every input — same bucket identity, just no
366/// rendered regex.
367pub fn pattern_display_label(key: &str, members: &[String]) -> String {
368 if members.len() < 2 {
369 return key.to_string();
370 }
371 #[cfg(feature = "pretty-labels")]
372 {
373 let regex = grex::RegExpBuilder::from(members).build();
374 if regex.len() <= key.len() {
375 return regex;
376 }
377 }
378 key.to_string()
379}
380
381/// Build the union frequency map for pattern-aware grouping
382/// ([`crate::ctprof_compare::GroupBy::Comm`] or
383/// [`crate::ctprof_compare::GroupBy::Pcomm`]) across the
384/// baseline + candidate snapshots. The frequency gate that
385/// promotes a `pattern_key` from per-thread literal to a clustered
386/// bucket must be evaluated against the UNION of both
387/// snapshots' threads — otherwise a pattern that has 1 thread
388/// in baseline + 3 threads in candidate would join under a
389/// `worker-{N}` key in candidate but a literal `worker-7` key in
390/// baseline, and `compare()` would surface the row as
391/// only-in-candidate. Computing the count from the union ensures
392/// the same key is used on both sides.
393///
394/// `field` selects which [`ThreadState`] string feeds the count:
395/// `|t| t.comm.as_str()` for `Comm`, `|t| t.pcomm.as_str()` for
396/// `Pcomm`. The two axes share the same union-frequency contract
397/// so one helper covers both.
398pub(super) fn pattern_counts_union(
399 baseline: &CtprofSnapshot,
400 candidate: &CtprofSnapshot,
401 field: fn(&ThreadState) -> &str,
402) -> BTreeMap<String, usize> {
403 let mut counts: BTreeMap<String, usize> = BTreeMap::new();
404 for t in baseline.threads.iter().chain(candidate.threads.iter()) {
405 *counts.entry(pattern_key(field(t))).or_insert(0) += 1;
406 }
407 counts
408}