ktstr/
fun.rs

1//! Fun mode — deterministically rename a JSON dump's non-metric values to
2//! playful `adjective-animal` names (and hashed numeric IDs). Every non-metric
3//! value is funified by default: strings and integers under non-metric keys
4//! become a deterministic fun name, while values under metric-allowlisted keys
5//! (counts, rates, ratios, byte/duration units, structural enums) pass through
6//! unchanged. Structure and relationships are preserved — which keys nest
7//! where, which values co-refer — so `agile-otter migrated from CPU 3 to
8//! CPU 7` reads the same, just with fun names. Deterministic per seed; a
9//! funify is a one-way rename.
10//!
11//! # Polarity: metric allowlist
12//!
13//! The walker funifies **every** value by default and passes through
14//! only the values whose containing key is a recognised metric
15//! ([`Funifier::is_metric_passthrough`]). This is the inverse of v1's
16//! identifier deny-list. A novel identifier-shaped field added to a
17//! schema is hidden by default; only counts / rates / ratios /
18//! byte-and-duration units / structural enums survive funification.
19//! The suffix-based allowlist may over-match novel keys ending in
20//! structural-enum suffixes (`_type`, `_kind`, `_state`, `_len`,
21//! `_offset`) — schema-driven classification is a future direction
22//! that would remove the heuristic's false positives.
23//!
24//! # Surfaces
25//!
26//!   - [`Funifier::petname_for`] turns a string identifier (cgroup
27//!     name, process comm, scheduler name, ...) into a deterministic
28//!     `adjective-animal` pair like `"agile-otter"`.
//!   - [`Funifier::numeric_id`] turns a u64 identifier (pid, tid, cpu,
//!     cgroup id, ...) into another u64 via a SipHash-2-4 keyed hash
//!     truncated to 64 bits (a hash, not a bijection — distinct inputs
//!     can in principle collide). The mapping is deterministic per
//!     `(seed, category, n)` so cross-references inside a dump survive.
33//!
34//! Categories namespace the mapping: `petname_for("pid", "42")` and
35//! `petname_for("cgroup", "42")` produce different fun names because
//! the category byte string is mixed into the keyed hash. The walker
//! uses each non-metric key's literal name as the namespace, so equal
//! values under the same key always map to the same fun name
//! (intentional — cross-reference preservation), while equal values
//! under different keys map to unrelated fun names. Two pids with the
//! same numeric value in two different dumps map to the same fun name
//! only when they sit under the same key and both dumps were funified
//! with the same `--seed`.
43//!
44//! Determinism contract: given a fixed seed, the same input always
45//! produces the same fun output. With the default
46//! [`Funifier::ephemeral`] constructor a fresh random key is
47//! generated per process invocation; `--seed` on the CLI passes
48//! through to [`Funifier::with_seed`] so a user can correlate fun
49//! names across multiple `funify` runs of the same dump.
50
51use std::hash::Hasher;
52
53use sha2::{Digest, Sha256};
54use siphasher::sip128::{Hasher128, SipHasher24};
55
/// Fixed domain-separation pepper hashed ahead of every key derivation
/// in this module: both [`Funifier::ephemeral`] and
/// [`Funifier::with_seed`] feed it to SHA-256 before any other input.
///
/// The pepper is compiled into the binary and is not configurable, so
/// it does not distinguish one user from another: two runs of the same
/// build with the same `--seed` produce the same mapping. What it does
/// is tie the mapping to this constant — a build with a different
/// pepper (e.g. a bumped `/v1` suffix) derives unrelated keys from the
/// same seed. The determinism contract is therefore "same seed within
/// one binary", not "same seed across the world".
const FUN_PEPPER: &[u8] = b"ktstr-fun-mode/v1";
63
/// Fun-mode renamer: owns the SipHash key under which every
/// identifier in a dump is hashed before being mapped to a petname or
/// a replacement number. Cheap to clone (the only field is a
/// `[u8; 16]`); typically constructed once per CLI invocation and
/// reused for every identifier in the dump.
///
/// NOTE(review): `Debug` is derived, so formatting a `Funifier` with
/// `{:?}` prints the raw key bytes — avoid logging it if the mapping
/// is meant to stay opaque.
#[derive(Clone, Debug)]
pub struct Funifier {
    /// 16-byte SipHash key. SipHash-2-4 is a keyed PRF; 128 bits give a
    /// deterministic, low-collision mapping from an identifier to its fun
    /// name. Always the first 16 bytes of a SHA-256 digest: over
    /// [`FUN_PEPPER`] plus per-process inputs for [`Self::ephemeral`],
    /// or over [`FUN_PEPPER`] plus a user-supplied seed for
    /// [`Self::with_seed`].
    key: [u8; 16],
}
78
79impl Funifier {
80    /// Construct a Funifier with a process-fresh random key. Two
81    /// invocations in the same process give DIFFERENT mappings —
82    /// callers who need cross-invocation determinism use
83    /// [`Self::with_seed`] instead. Used by callers that just want
84    /// "produce a fun version of this output" without any need to
85    /// reproduce the mapping later.
86    ///
87    /// Derives the key from SHA-256 over the process pid and a
88    /// nanosecond timestamp; no rand/getrandom dependency (see
89    /// body comment). Two instances in one process differ only
90    /// via the ns timestamp.
91    pub fn ephemeral() -> Self {
92        // SHA-256 over (process pid, monotonic ns) for the
93        // ephemeral key. Avoids depending on a specific rand-crate
94        // trait import path (rand 0.10's RNG-core trait paths
95        // shifted between minor versions); the inputs here are
96        // already non-replayable across processes — pid is unique
97        // per kernel concurrent-life, ns timestamp gives 64-bit
98        // intra-process distinctness. SHA-256 then mixes those
99        // into a 16-byte key with adequate avalanche for the
100        // value-substitution goal.
101        let pid = std::process::id() as u64;
102        let ns = std::time::SystemTime::now()
103            .duration_since(std::time::UNIX_EPOCH)
104            .map(|d| d.as_nanos() as u64)
105            .unwrap_or(0);
106        let mut h = Sha256::new();
107        h.update(FUN_PEPPER);
108        h.update([0u8]);
109        h.update(b"ephemeral");
110        h.update([0u8]);
111        h.update(pid.to_le_bytes());
112        h.update(ns.to_le_bytes());
113        let digest = h.finalize();
114        let mut key = [0u8; 16];
115        key.copy_from_slice(&digest[..16]);
116        Self { key }
117    }
118
119    /// Construct a Funifier whose mapping is fully determined by
120    /// `seed`. Two invocations with the same `seed` (in the same
121    /// binary build) produce identical fun names for the same
122    /// inputs. Different seeds give independent mappings.
123    ///
124    /// Uses SHA-256 over the fixed `FUN_PEPPER` || seed bytes,
125    /// truncated to 128 bits for SipHash — enough for a stable,
126    /// low-collision fun mapping.
127    pub fn with_seed(seed: &str) -> Self {
128        let mut h = Sha256::new();
129        h.update(FUN_PEPPER);
130        h.update([0u8]);
131        h.update(seed.as_bytes());
132        let digest = h.finalize();
133        let mut key = [0u8; 16];
134        key.copy_from_slice(&digest[..16]);
135        Self { key }
136    }
137
138    /// Internal: keyed 128-bit hash over (`category` || NUL ||
139    /// `payload`). The NUL byte separator guarantees that
140    /// `("pid", "42")` and `("pi", "d42")` yield distinct hashes
141    /// even with concatenation (no length prefix needed because
142    /// every category we use is a fixed-shape ASCII identifier
143    /// that does not embed NUL).
144    fn keyed_hash(&self, category: &[u8], payload: &[u8]) -> u128 {
145        let mut buf = Vec::with_capacity(category.len() + 1 + payload.len());
146        buf.extend_from_slice(category);
147        buf.push(0u8);
148        buf.extend_from_slice(payload);
149        let mut h = SipHasher24::new_with_key(&self.key);
150        h.write(&buf);
151        h.finish128().as_u128()
152    }
153
154    /// Replace a string identifier with a deterministic
155    /// `adjective-animal` pair. The 65 536 (adjective, animal)
156    /// pairs the dictionary supports give a comfortable margin for
157    /// dumps with hundreds of distinct identifiers per category —
158    /// the birthday-paradox collision probability for 100 names
159    /// drawn from 65k buckets is ~7%, for 50 names ~2%. A future
160    /// extension could append a 4-digit suffix on collision; for
161    /// v1 we accept the rare collision.
162    ///
163    /// Examples (with a fixed seed):
164    /// ```ignore
165    /// let f = Funifier::with_seed("demo");
166    /// // Each call yields an adjective-animal pair; the exact
167    /// // pair is seed-dependent.
168    /// f.petname_for("comm", "ktstr_test_main");
169    /// f.petname_for("comm", "scx_simple");
170    /// ```
171    pub fn petname_for(&self, category: &str, payload: &str) -> String {
172        let h = self.keyed_hash(category.as_bytes(), payload.as_bytes());
173        let adj_idx = (h & 0xff) as usize;
174        let ani_idx = ((h >> 8) & 0xff) as usize;
175        let adj = ADJECTIVES[adj_idx % ADJECTIVES.len()];
176        let ani = ANIMALS[ani_idx % ANIMALS.len()];
177        format!("{adj}-{ani}")
178    }
179
180    /// Replace a u64 identifier with another u64. The mapping is a
181    /// deterministic permutation per (seed, category): the keyed
182    /// hash mixes (category, n.to_le_bytes()), and we take the low
183    /// 64 bits as the new identifier.
184    ///
185    /// The permutation just needs to be deterministic and to rarely
186    /// collide; fun mode keeps the numbers stable and distinct, not
187    /// meaningful.
188    ///
189    /// Two distinct `(category, n)` inputs collide on the same
190    /// output u64 with probability ~2^-64. Within a single
191    /// category, n=0 always maps to 0 is NOT guaranteed; consumers
192    /// that need a sentinel zero should call [`Self::is_sentinel_u64`]
193    /// or carry the original value out-of-band.
194    pub fn numeric_id(&self, category: &str, n: u64) -> u64 {
195        let h = self.keyed_hash(category.as_bytes(), &n.to_le_bytes());
196        // Take the low 64 bits. The high 64 bits are discarded —
197        // SipHash's avalanche means either half is uniformly
198        // distributed conditional on the input.
199        h as u64
200    }
201
202    /// Replace an i64 identifier (e.g. a kernel pid_t which is
203    /// signed). Same contract as [`Self::numeric_id`] but
204    /// preserves the i64 zero (since dumps frequently use 0 or
205    /// -1 as sentinels). Negative values are funified by their
206    /// absolute value; the sign survives.
207    pub fn numeric_id_i64(&self, category: &str, n: i64) -> i64 {
208        if n == 0 {
209            return 0;
210        }
211        let abs = n.unsigned_abs();
212        // Mask to 63 bits so the result always fits in i64.
213        let funified = (self.numeric_id(category, abs) & ((1u64 << 63) - 1)) as i64;
214        if n < 0 { -funified } else { funified }
215    }
216
217    /// Replace an i32 identifier (e.g. a kernel pid_t / signed
218    /// uid_t / any 32-bit-wide signed field) with another i32.
219    /// Same contract as [`Self::numeric_id_i64`] but the output
220    /// is masked so it fits in 31 bits of magnitude (so a
221    /// downstream `as i32` cast of the funified value can never
222    /// wrap a high-bit hash output back into the legal i32
223    /// range). Sentinels (`0`, `i32::MIN`, `i32::MAX`) round-
224    /// trip unchanged so failure-dump renderers see the same
225    /// "kthread / no value" markers in the funified output.
226    pub fn numeric_id_i32(&self, category: &str, n: i32) -> i32 {
227        if Self::is_sentinel_i32(n) {
228            return n;
229        }
230        // i32::MIN is a sentinel per the schema convention;
231        // filtering it preserves round-trip semantics
232        // (i32::MIN funifies to i32::MIN). The 31-bit mask
233        // makes the cast safe regardless, but the sentinel
234        // guard also avoids routing i32::MIN through the hash
235        // which would lose the "no value" marker meaning.
236        let abs = n.unsigned_abs() as u64;
237        // Reuse numeric_id then mask to 31 bits so the result
238        // always fits in i32 with sign preserved.
239        let funified = (self.numeric_id(category, abs) & ((1u32 << 31) - 1) as u64) as i32;
240        if n < 0 { -funified } else { funified }
241    }
242
243    /// 32-bit-wide analog of [`Self::is_sentinel_u64`] for signed
244    /// 32-bit identifiers. Schemas commonly use `0` for
245    /// "kernel/unset", `i32::MIN` for "no value" / error sentinels,
246    /// and `i32::MAX` for "max" markers. Kept distinct from
247    /// [`Self::is_sentinel_u32`] because the negative sentinel
248    /// (`i32::MIN`) has no u32 analog.
249    pub fn is_sentinel_i32(n: i32) -> bool {
250        n == 0 || n == i32::MIN || n == i32::MAX
251    }
252
253    /// Replace a u32 identifier (e.g. a host CPU number, uid, gid,
254    /// nlink, or any other 32-bit-wide field) with another u32.
255    /// Same contract as [`Self::numeric_id`] but the output is
256    /// masked to fit in 32 bits so a downstream consumer that
257    /// `as u32`-casts the funified value cannot wrap a high-bit
258    /// hash output back into the legal 0..=u32::MAX range. Mirror
259    /// of [`Self::numeric_id_i64`] for the unsigned narrow case.
260    ///
261    /// Sentinel preservation differs from `numeric_id`: this
262    /// method preserves both `0` and `u32::MAX` exactly, since
263    /// 32-bit identifier schemas frequently use those as
264    /// sentinels (CPU 0, "no value" 0xFFFFFFFF). Consumers that
265    /// want the universal u64 sentinel-check semantics call
266    /// [`Self::is_sentinel_u64`] on the up-cast value, which is
267    /// equivalent because the u32 sentinels round-trip through
268    /// the u64 check.
269    pub fn numeric_id_u32(&self, category: &str, n: u32) -> u32 {
270        if Self::is_sentinel_u32(n) {
271            return n;
272        }
273        // Reuse numeric_id then mask to 32 bits. SipHash's
274        // avalanche means the low 32 bits are uniformly
275        // distributed conditional on the input, so the
276        // collision rate is the natural 2^-32 for a 32-bit
277        // permutation — same statistical posture as the i64
278        // narrowing in [`Self::numeric_id_i64`].
279        (self.numeric_id(category, n as u64) & u32::MAX as u64) as u32
280    }
281
282    /// True when the given identifier is "obvious sentinel" — 0
283    /// or "max" — and should be passed through unchanged. Lets
284    /// downstream renderers preserve the failure-dump's "kthread"
285    /// vs "pid 0" semantics without leaking real pids.
286    pub fn is_sentinel_u64(n: u64) -> bool {
287        n == 0 || n == u64::MAX
288    }
289
290    /// 32-bit-wide analog of [`Self::is_sentinel_u64`] for the
291    /// narrow u32 paths in [`Self::numeric_id_u32`]. Schemas
292    /// frequently use `u32::MAX` as the "no value" marker for
293    /// 32-bit fields and `0` as "kernel / unset", same shape as
294    /// the u64 check — kept distinct so downstream callers using
295    /// `numeric_id_u32` don't have to up-cast just to check.
296    pub fn is_sentinel_u32(n: u32) -> bool {
297        n == 0 || n == u32::MAX
298    }
299
300    /// Categories whose JSON value is u32-width in the originating
301    /// schema and must be funified through
302    /// [`Self::numeric_id_u32`] (32-bit-masked output) instead of
303    /// the default [`Self::numeric_id`] (full u64 output).
304    ///
305    /// Why this matters: serde_json's `Value::Number` only carries
306    /// `is_u64`/`is_i64`/`is_f64`, not the original Rust width.
307    /// When a struct field is typed `u32` but serialized through a
308    /// generic `serde_json::Value`, the funify walker can't see
309    /// the narrowing. A full-u64 funified output then overflows
310    /// when a downstream consumer (CLI parser, `as u32` cast,
311    /// JSON round-trip into a u32-field struct) narrows it back.
312    /// Naming the u32-width identifier categories explicitly
313    /// is the only mechanism available without schema metadata.
314    ///
315    /// The allowlist is conservative: only includes keys whose
316    /// originating Rust field is documented or named-matchable as
317    /// u32-wide. New u32 fields added to ktstr's schemas must be
318    /// declared here or they fall through to the u64 path and
319    /// the overflow returns.
320    pub fn is_u32_category(key: &str) -> bool {
321        // Match strategy mirrors `is_metric_passthrough`:
322        // whole-key match against a fixed vocabulary, then suffix
323        // match against narrow-width naming patterns. Keep the
324        // suffix list short — false positives here flip a u64
325        // identifier into a u32 funify, which silently
326        // collision-rate-bumps from 2^-64 to 2^-32.
327        let lc = key.to_ascii_lowercase();
328        if matches!(
329            lc.as_str(),
330            // CPU number — the kernel exposes them as `unsigned
331            // int` in /proc and sysfs; ktstr's u32-typed CPU
332            // fields (e.g. WorkerReport's u32 cpu samples) round-
333            // trip through u32 in the schema layer.
334            "cpu_id"
335            // Real / effective UID and GID. Linux kernel
336            // `uid_t`/`gid_t` are `unsigned int` (u32). Capture
337            // both bare and resolved forms used across ktstr's
338            // failure-dump enrichment.
339            | "uid" | "euid" | "ruid" | "suid" | "fsuid"
340            | "gid" | "egid" | "rgid" | "sgid" | "fsgid"
341            // Kernel-namespace uid/gid resolved forms. Linux
342            // kuid_t/kgid_t wrap unsigned int (u32), so the
343            // whole-key match keeps them in the 32-bit-masked
344            // narrow path so the masked output fits a downstream
345            // u32 cast.
346            | "kuid" | "kgid"
347        ) {
348            return true;
349        }
350        // Suffix vocabulary. `_u32` is the explicit marker some
351        // schemas use; the narrow-namespace conventions
352        // `*_id_u32` / `*_u32_id` are reserved for callers that
353        // know the field is 32-bit-wide. No general `_id` suffix —
354        // that catches both u64 and u32 fields and the false-
355        // positive rate would be too high.
356        const U32_SUFFIXES: &[&str] = &["_u32", "_u32_id"];
357        for suffix in U32_SUFFIXES {
358            if lc.ends_with(suffix) {
359                return true;
360            }
361        }
362        false
363    }
364
365    /// Allowlist gate for the funify walker: returns `true` when
366    /// the JSON-object key holds a value that is a METRIC (count,
367    /// rate, ratio, byte/duration unit, structural enum) and
368    /// should pass through funification unchanged. Returns `false`
369    /// for everything else — those values get funified.
370    ///
371    /// Inverted polarity vs. v1: previously a deny-list of known
372    /// identifier keys (pid/cpu/cgroup/...) selected the funify
373    /// path. The deny-list missed every novel identifier-shaped
374    /// field as the schema grew. The allowlist makes the safe
375    /// default "funify it" — any new or unrecognised field is
376    /// hidden by default, only metrics whose values are
377    /// numeric/categorical truth (and therefore safe to retain)
378    /// pass through.
379    ///
380    /// Match strategy:
381    ///   * lowercased-key whole-match against a fixed structural
382    ///     vocabulary (schema/version/type/kind/status/...);
383    ///   * suffix-match against unit/quantity vocabulary
384    ///     (_count/_total/_per_sec/_ns/_bytes/_ratio/_pct/...);
385    ///   * everything else returns false.
386    ///
387    /// Returns true when `key` names a metric value.
388    pub fn is_metric_passthrough(key: &str) -> bool {
389        let lc = key.to_ascii_lowercase();
390
391        // Whole-key allowlist. Structural enums, schema markers,
392        // top-level kernel/runqueue counters, and other named
393        // metrics whose value is numeric/categorical truth.
394        if matches!(
395            lc.as_str(),
396            "schema"
397                | "version"
398                | "type"
399                | "kind"
400                | "status"
401                | "state"
402                | "result"
403                | "verdict"
404                | "outcome"
405                | "phase"
406                | "policy"
407                | "priority"
408                | "nice"
409                | "weight"
410                | "capacity"
411                | "size"
412                | "len"
413                | "length"
414                | "depth"
415                | "index"
416                | "idx"
417                | "level"
418                | "tier"
419                | "rank"
420                | "slot"
421                | "epoch"
422                | "generation"
423                | "nr_running"
424                | "nr_queued"
425                | "nr_failed"
426                | "nr_switches"
427                | "runqueue_depth"
428                // NUMA event counters (vm_numa_event)
429                | "numa_hit" | "numa_miss" | "numa_foreign" | "numa_interleave_hit" | "numa_local" | "numa_other"
430                // SCX event counters (scx_exit_info)
431                | "select_cpu_fallback" | "dispatch_local_dsq_offline" | "dispatch_keep_last" | "enq_skip_exiting" | "enq_skip_migration_disabled" | "reenq_immed" | "reenq_local_repeat" | "refill_slice_dfl" | "bypass_duration" | "bypass_dispatch" | "bypass_activate" | "insert_not_owned" | "sub_bypass_dispatch"
432                // BPF prog runtime stats
433                | "cnt" | "nsecs" | "misses" | "verified_insns"
434                // Hardware perf counters
435                | "cycles" | "instructions" | "cache_misses" | "branch_misses"
436                // Per-rq SCX state
437                | "flags" | "ops_qseq" | "kick_sync" | "nr_immed" | "rq_clock"
438                // DSQ state
439                | "nr" | "seq"
440                // Task enrichment
441                | "nr_threads" | "prio" | "static_prio" | "normal_prio" | "nvcsw" | "nivcsw" | "signal_nvcsw" | "signal_nivcsw"
442                // VirtioBlkCounters disk metrics
443                | "bytes_read" | "bytes_written" | "io_errors"
444                // Topology metrics — CPU IDs in cpusets and affinity
445                // masks are placement information about the workload,
446                // not personally-identifying data. Funifying these to
447                // a u64 keyed-hash also breaks round-trip into the
448                // schema's `Vec<usize>` typing: the CPU IDs sit at
449                // 0..N for an N-CPU host (small values), but
450                // `numeric_id` returns a full 64-bit hash that scales
451                // CPU IDs into the u64 range and changes their
452                // semantic identity.
453                //
454                // # Collision risk
455                //
456                // `cpus` is a short, common key name. A future
457                // schema that adds a different field also called
458                // `cpus` — for example a list of pid-shaped task
459                // identifiers, a sequence of byte-counts named
460                // after a CPU-related metric, or any other
461                // payload that SHOULD funify — would silently
462                // pass through the allowlist instead of being
463                // funified, leaking the identifiers into
464                // user-visible output. Schema authors adding a
465                // new `cpus`-keyed field whose value is NOT a
466                // topology cpuset must either:
467                //   1. rename their field (preferred — `cpus` as
468                //      a bare key is reserved for topology
469                //      placement),
470                //   2. namespace this match by walker context
471                //      (would require threading parent-key
472                //      provenance through the walker), or
473                //   3. demonstrate that the value is also placement
474                //      information that's safe to pass through.
475                //
476                // The bare key is retained because every existing
477                // ktstr schema's `cpus` field IS a topology
478                // cpuset (`Vec<usize>` of CPU IDs); a rename here
479                // would break round-trip parsing of every
480                // failure-dump emitted to date.
481                | "cpus" | "cpuset_cpus"
482        ) {
483            return true;
484        }
485
486        // Suffix allowlist. Quantity / rate / ratio / unit
487        // vocabulary the failure-dump schemas use across
488        // VirtioBlkCounters, FailureDumpReport, ctprof samples,
489        // and the topology/cgroup-stats trees.
490        const METRIC_SUFFIXES: &[&str] = &[
491            "_count",
492            "_total",
493            "_completed",
494            "_dropped",
495            "_failed",
496            "_skipped",
497            "_throttled",
498            "_read",
499            "_written",
500            "_errors",
501            "_per_sec",
502            "_per_ms",
503            "_rate",
504            "_hz",
505            "_ratio",
506            "_fraction",
507            "_pct",
508            "_percent",
509            "_ns",
510            "_us",
511            "_ms",
512            "_sec",
513            "_seconds",
514            "_bytes",
515            "_kb",
516            "_mb",
517            "_gb",
518            "_pages",
519            "_min",
520            "_max",
521            "_mean",
522            "_avg",
523            "_stddev",
524            "_p50",
525            "_p90",
526            "_p95",
527            "_p99",
528            "_capacity",
529            "_size",
530            "_depth",
531            "_len",
532            "_length",
533            "_weight",
534            "_nice",
535            "_priority",
536            "_index",
537            "_idx",
538            "_offset",
539            "_generation",
540            "_epoch",
541            "_version",
542            "_status",
543            "_state",
544            "_kind",
545            "_type",
546            "_phase",
547            "_verdict",
548            "_outcome",
549        ];
550        for suffix in METRIC_SUFFIXES {
551            if lc.ends_with(suffix) {
552                return true;
553            }
554        }
555
556        false
557    }
558}
559
560// ---------------------------------------------------------------------------
561// JSON walker
562// ---------------------------------------------------------------------------
563
564/// Recursively walk a `serde_json::Value` and funify every value
565/// whose containing key is NOT in [`Funifier::is_metric_passthrough`].
566/// Returns the funified value — input is consumed (cheaper than
567/// cloning a deep tree).
568///
569/// Inverted polarity (metric allowlist): the default action is
570/// "funify it" — a value passes through unchanged ONLY when its
571/// containing key is a metric (count/rate/ratio/byte/duration/
572/// structural enum). Any other field — pid, comm, cgroup_path,
573/// scheduler name, version string, novel identifier-shaped key
574/// the schema didn't have last week — gets replaced.
575///
576/// Funification rules at the leaves:
577/// * **String** under a non-metric key — replaced via
578///   [`Funifier::petname_for`] using the key name itself as the
579///   namespace. Two distinct keys with the same string value get
580///   different fun names; the same key + same value yields the
581///   same fun name everywhere in the dump (cross-reference
582///   preservation).
583/// * **Integer** (u64 or i64) under a non-metric key — replaced
584///   via [`Funifier::numeric_id`] / [`Funifier::numeric_id_i64`]
585///   with the key name as namespace. Sentinel zero and `u64::MAX`
586///   pass through unchanged ([`Funifier::is_sentinel_u64`]); the
587///   i64 path also preserves zero per [`Funifier::numeric_id_i64`].
588/// * **Float** — always passes through. Floats are quasi-
589///   exclusively rates/ratios/durations in the dump schemas
590///   (cpu_time_fraction, wakeups_per_sec, ...) and there is no
591///   sensible fun mapping for IEEE-754 values; making the rule
592///   uniform avoids hazarding the rate/ratio metrics that happen
593///   to live under non-metric-keyed parents (e.g. inside an
594///   anonymous-object array element).
595/// * **Bool / null** — always pass through.
596///
597/// Recursive rules:
598/// * **Object** — re-classify each key independently. Nested
599///   objects do NOT inherit metric state across the boundary.
600/// * **Array** — children inherit the parent key's
601///   metric/non-metric verdict and (when non-metric) the parent
602///   key's namespace. So `"pids": [1, 2, 3]` funifies each int
603///   under namespace "pids" and `"counters": [...]` passes every
604///   element through.
605pub fn funify_json(value: serde_json::Value, f: &Funifier) -> serde_json::Value {
606    funify_json_with_context(value, f, None)
607}
608
609/// `category` semantics:
610/// * `Some(key)` — value sits under a NON-metric key whose name
611///   is `key`; leaves get funified using `key` as the namespace.
612/// * `None` — value sits at the root or under a metric key;
613///   leaves pass through unchanged.
614fn funify_json_with_context(
615    value: serde_json::Value,
616    f: &Funifier,
617    category: Option<&str>,
618) -> serde_json::Value {
619    use serde_json::Value;
620    match value {
621        Value::Object(map) => {
622            let mut out = serde_json::Map::with_capacity(map.len());
623            for (k, v) in map {
624                // Re-classify each key independently. Metric ⇒
625                // descendants pass through (`None`); non-metric
626                // ⇒ descendants funify under `k`'s namespace.
627                let child_cat: Option<&str> = if Funifier::is_metric_passthrough(&k) {
628                    None
629                } else {
630                    Some(k.as_str())
631                };
632                let funified = funify_json_with_context(v, f, child_cat);
633                out.insert(k, funified);
634            }
635            Value::Object(out)
636        }
637        Value::Array(items) => {
638            // Inherit the parent key's category verbatim. An
639            // array under a metric key passes through; an array
640            // under a non-metric key funifies each element using
641            // the parent's name as namespace.
642            let out: Vec<Value> = items
643                .into_iter()
644                .map(|v| funify_json_with_context(v, f, category))
645                .collect();
646            Value::Array(out)
647        }
648        Value::String(s) => {
649            if let Some(cat) = category {
650                Value::String(f.petname_for(cat, &s))
651            } else {
652                Value::String(s)
653            }
654        }
655        Value::Number(num) => {
656            // Floats always pass through (see module doc) — check
657            // first so the u64/i64 cascade only runs for integer
658            // numbers.
659            if num.is_f64() {
660                return Value::Number(num);
661            }
662            // Sentinel preservation applies universally — even
663            // at a non-metric key, 0 and u64::MAX retain their
664            // sentinel meaning (kthread pid 0, "no value"
665            // u64::MAX) so failure-dump renderers downstream
666            // don't have to special-case the funified bytes.
667            if let Some(cat) = category {
668                if let Some(u) = num.as_u64() {
669                    if Funifier::is_sentinel_u64(u) {
670                        return Value::Number(num);
671                    }
672                    // Schemas that serialize a u32-width identifier
673                    // through a generic `serde_json::Value` lose
674                    // the width — the walker sees only `as_u64`.
675                    // `Funifier::is_u32_category` names the keys
676                    // whose originating Rust field is u32-wide;
677                    // route those through the 32-bit-masked path
678                    // so a downstream `as u32` cast / u32-typed
679                    // struct round-trip never wraps a high-bit
680                    // hash output back into the legal range.
681                    if Funifier::is_u32_category(cat) {
682                        // Values that exceed u32::MAX shouldn't
683                        // appear under a u32-category key in
684                        // well-formed input; clamp explicitly so a
685                        // hostile / malformed input can't bypass
686                        // the narrowing through truncation. Casting
687                        // a > u32::MAX value `as u32` would silently
688                        // discard the high bits and the funified
689                        // output would be derived from a different
690                        // input than the one the operator sees.
691                        let narrow = if u > u32::MAX as u64 {
692                            u32::MAX
693                        } else {
694                            u as u32
695                        };
696                        let funified = f.numeric_id_u32(cat, narrow);
697                        Value::Number(serde_json::Number::from(funified))
698                    } else {
699                        Value::Number(serde_json::Number::from(f.numeric_id(cat, u)))
700                    }
701                } else if let Some(i) = num.as_i64() {
702                    // numeric_id_i64 itself preserves zero.
703                    // For u32-category keys with negative
704                    // values, narrow to i32 so a downstream
705                    // `as i32`/`as u32` cast cannot wrap a high-
706                    // bit hash output back into the legal i32/u32
707                    // range. Same rationale as the unsigned u32
708                    // branch above; the negative-value case is
709                    // reachable when serde lowers a kernel
710                    // pid_t/uid_t value (signed) through
711                    // `as_i64` because the field declared i32 in
712                    // Rust serializes as i64 in `serde_json::Value`
713                    // when the value is negative.
714                    if Funifier::is_u32_category(cat) {
715                        let narrow = if i < i32::MIN as i64 {
716                            i32::MIN
717                        } else if i > i32::MAX as i64 {
718                            i32::MAX
719                        } else {
720                            i as i32
721                        };
722                        let funified = f.numeric_id_i32(cat, narrow);
723                        Value::Number(serde_json::Number::from(funified))
724                    } else {
725                        Value::Number(serde_json::Number::from(f.numeric_id_i64(cat, i)))
726                    }
727                } else {
728                    // Defensive: serde_json::Number variants are
729                    // u64/i64/f64; the float case is handled above.
730                    Value::Number(num)
731                }
732            } else {
733                Value::Number(num)
734            }
735        }
736        // Booleans, null pass through.
737        other => other,
738    }
739}
740
741// ---------------------------------------------------------------------------
742// Petname dictionary
743// ---------------------------------------------------------------------------
744//
745// 272 adjectives x 264 animals are listed, but `petname_for` masks
746// each index with `& 0xff` (0..=255) before the `% .len()` modulo,
747// so only the first 256 of each are reachable: 256 x 256 = 65 536
748// selectable (adjective, animal) pairs (the count the module doc on
749// `petname_for` bases its birthday-collision math on). Entries past
750// index 255 — the last 16 adjectives and last 8 animals — are
// currently unreachable.
//
// Words are common-language, public-domain, and single-word (no
// spaces or hyphens), so the rendered name is always a clean
// `adjective-animal` token suitable for downstream tooling.
//
// The word lists are curated for readable fun names: playful,
// recognizable, and free of edge cases (no profanity, no political
// terms, no unusual spellings). The order is fixed for the lifetime
// of this v1 — appending new words to the END is safe; reordering
// would break the determinism contract for callers using a fixed seed.
761
/// Adjective half of every `adjective-animal` fun name (272 entries).
///
/// Position is part of the determinism contract: append new words at
/// the END only. Reordering or replacing an entry changes the names
/// produced for a fixed seed.
///
/// Entries are laid out eight per row, so row `r` (counting from 0)
/// holds indices `8 * r ..= 8 * r + 7`.
// rustfmt would expand the table back to one element per line.
#[rustfmt::skip]
const ADJECTIVES: &[&str] = &[
    "able", "agile", "airy", "amber", "ample", "amused", "ancient", "angry",
    "antsy", "apt", "ardent", "arid", "ashen", "auburn", "aware", "awesome",
    "balmy", "bashful", "beaded", "beamy", "bendy", "best", "big", "bitter",
    "black", "blameless", "blazing", "bleached", "blissful", "blithe", "blocky", "bloomy",
    "blue", "blunt", "bold", "bony", "bouncy", "brainy", "brassy", "brave",
    "breezy", "bright", "brisk", "bristly", "brittle", "broad", "bronze", "brown",
    "bubbly", "burly", "busy", "buttery", "calm", "candid", "casual", "cheery",
    "chilly", "chipper", "chubby", "chummy", "civic", "classy", "clean", "clear",
    "clever", "cloudy", "clumsy", "coiled", "cold", "comfy", "cool", "copper",
    "cosmic", "cozy", "crafty", "crimson", "crisp", "crystal", "curious", "dainty",
    "damp", "dapper", "daring", "dark", "dashing", "dazed", "deep", "deft",
    "delft", "dewy", "dim", "dimpled", "dingy", "dippy", "distant", "dizzy",
    "dopey", "dotted", "drafty", "dreamy", "dressy", "drowsy", "dry", "dual",
    "dulcet", "dusty", "eager", "early", "easy", "eclectic", "edgy", "eerie",
    "elastic", "elated", "elder", "electric", "elfin", "emerald", "empty", "endless",
    "ethereal", "even", "exact", "fabled", "faint", "fancy", "fawn", "fearless",
    "feisty", "ferny", "festive", "fey", "fierce", "fiery", "filmy", "fine",
    "fizzy", "flat", "fleet", "fleeting", "flighty", "flinty", "floaty", "floral",
    "flowy", "fluffy", "fluted", "foamy", "fond", "foppish", "frank", "fresh",
    "fretful", "frilly", "frisky", "frosty", "frugal", "fudgy", "funky", "furry",
    "fuzzy", "gallant", "game", "gawky", "gentle", "genuine", "ghostly", "giddy",
    "giggly", "glad", "glassy", "gleaming", "glib", "global", "glossy", "glowing",
    "glum", "golden", "good", "goopy", "gossamer", "graceful", "grainy", "grand",
    "grassy", "great", "green", "grim", "groovy", "grown", "grumpy", "gummy",
    "gusty", "hale", "halting", "handy", "happy", "hardy", "harmless", "hasty",
    "hazy", "heady", "hearty", "heavy", "helpful", "high", "hilly", "hippy",
    "hoarse", "hollow", "holy", "homely", "honest", "hooked", "hopeful", "hot",
    "humble", "hungry", "icy", "ideal", "iffy", "immense", "indigo", "inland",
    "inner", "ironic", "itchy", "ivory", "jade", "jaunty", "jazzy", "jelly",
    "jiffy", "jiggly", "jolly", "jovial", "joyful", "jumpy", "kelpy", "keen",
    "kind", "kindly", "kinetic", "knotty", "lacy", "ladylike", "lambent", "lanky",
    "lapis", "large", "late", "lavish", "lawful", "lazy", "leafy", "lean",
    // Index 256 onward.
    "lemony", "lenient", "level", "lifelong", "light", "lily", "linen", "linked",
    "lithe", "little", "lively", "loamy", "lofty", "long", "loud", "lovely",
];
1036
/// Animal half of every `adjective-animal` fun name (264 entries).
///
/// Position is part of the determinism contract: append new words at
/// the END only. Reordering or replacing an entry changes the names
/// produced for a fixed seed.
///
/// Entries are laid out eight per row, so row `r` (counting from 0)
/// holds indices `8 * r ..= 8 * r + 7`.
// rustfmt would expand the table back to one element per line.
#[rustfmt::skip]
const ANIMALS: &[&str] = &[
    "aardvark", "albatross", "alligator", "alpaca", "ant", "antelope", "ape", "armadillo",
    "ass", "auk", "axolotl", "baboon", "badger", "bandicoot", "barnacle", "barracuda",
    "basilisk", "bat", "bear", "beaver", "bee", "beetle", "bison", "blackbird",
    "boar", "bobcat", "bonobo", "boomslang", "buffalo", "bulldog", "bullfrog", "bumblebee",
    "bushbaby", "butterfly", "buzzard", "camel", "canary", "capybara", "caracal", "cardinal",
    "caribou", "carp", "cat", "caterpillar", "catfish", "centaur", "centipede", "chameleon",
    "cheetah", "chickadee", "chicken", "chihuahua", "chinchilla", "chipmunk", "civet", "clam",
    "cobra", "cockatoo", "cod", "coral", "cougar", "cow", "coyote", "crab",
    "crane", "crayfish", "cricket", "crocodile", "crow", "cuckoo", "curlew", "cuttlefish",
    "dachshund", "dalmatian", "deer", "dingo", "dodo", "dog", "dolphin", "donkey",
    "dormouse", "dove", "dragon", "dragonfly", "drake", "duck", "dugong", "eagle",
    "eel", "egret", "elephant", "elk", "emu", "ermine", "falcon", "fawn",
    "ferret", "finch", "firefly", "fish", "flamingo", "flatfish", "flounder", "fly",
    "flycatcher", "fowl", "fox", "frog", "fulmar", "gannet", "gar", "gazelle",
    "gecko", "gerbil", "gibbon", "giraffe", "gnat", "gnu", "goat", "goldfish",
    "goose", "gopher", "gorilla", "goshawk", "grasshopper", "greyhound", "grouse", "guanaco",
    "gull", "guppy", "haddock", "hagfish", "halibut", "hamster", "hare", "harrier",
    "hawk", "hedgehog", "hen", "heron", "herring", "hippo", "hognose", "hornet",
    "horse", "hound", "hyena", "ibex", "ibis", "iguana", "impala", "jackal",
    "jackrabbit", "jaguar", "javelina", "jay", "jellyfish", "kangaroo", "katydid", "kestrel",
    "kingfisher", "kite", "kiwi", "koala", "kookaburra", "krill", "lamb", "lamprey",
    "langur", "lark", "lemming", "lemur", "leopard", "lion", "lizard", "llama",
    "lobster", "locust", "loon", "louse", "lynx", "macaque", "macaw", "mackerel",
    "magpie", "mallard", "mammoth", "manatee", "mandrill", "marlin", "marmoset", "marmot",
    "marten", "meerkat", "mink", "minnow", "mole", "molly", "mongoose", "monkey",
    "moose", "mosquito", "moth", "mouse", "mule", "muskrat", "narwhal", "newt",
    "nightingale", "ocelot", "octopus", "okapi", "opossum", "orangutan", "orca", "oriole",
    "ostrich", "otter", "owl", "ox", "oyster", "panda", "pangolin", "panther",
    "parakeet", "parrot", "partridge", "peacock", "pelican", "penguin", "perch", "petrel",
    "pheasant", "pig", "pigeon", "piglet", "pika", "pike", "pinscher", "piranha",
    "platypus", "polecat", "pony", "poodle", "porcupine", "porpoise", "possum", "prawn",
    "puffin", "puma", "puppy", "python", "quagga", "quail", "quetzal", "quokka",
    // Index 256 onward.
    "rabbit", "raccoon", "ram", "rat", "raven", "reindeer", "rhino", "robin",
];
1303
1304#[cfg(test)]
1305mod tests {
1306    use super::*;
1307    use serde_json::json;
1308
1309    /// Same seed → same fun name. Two Funifiers built with the
1310    /// same seed must agree on every input.
1311    #[test]
1312    fn petname_deterministic_per_seed() {
1313        let a = Funifier::with_seed("demo-seed");
1314        let b = Funifier::with_seed("demo-seed");
1315        assert_eq!(
1316            a.petname_for("comm", "ktstr_test"),
1317            b.petname_for("comm", "ktstr_test"),
1318        );
1319    }
1320
1321    /// Different categories must produce different fun names for
1322    /// the SAME payload — pid 42 and cgroup 42 should not collapse
1323    /// to the same name.
1324    #[test]
1325    fn petname_namespaced_by_category() {
1326        let f = Funifier::with_seed("demo");
1327        let pid_name = f.petname_for("pid", "42");
1328        let cg_name = f.petname_for("cgroup", "42");
1329        // Could rarely collide by chance (1/65536); pin a specific
1330        // payload pair where the dictionary lookup differs.
1331        // The seed is fixed, so this is a stable assertion.
1332        assert_ne!(
1333            pid_name, cg_name,
1334            "category bytes must namespace the keyed hash"
1335        );
1336    }
1337
1338    /// Petname output is always two non-empty tokens joined by
1339    /// `-`. Pins the wire shape so a CLI consumer can tokenize.
1340    #[test]
1341    fn petname_format_is_adjective_dash_animal() {
1342        let f = Funifier::with_seed("demo");
1343        let name = f.petname_for("comm", "anything");
1344        let parts: Vec<&str> = name.split('-').collect();
1345        assert_eq!(parts.len(), 2, "expected exactly two segments: {name}");
1346        assert!(!parts[0].is_empty());
1347        assert!(!parts[1].is_empty());
1348        assert!(parts[0].chars().all(|c| c.is_ascii_lowercase()));
1349        assert!(parts[1].chars().all(|c| c.is_ascii_lowercase()));
1350    }
1351
1352    /// Numeric id is deterministic per (seed, category, n).
1353    #[test]
1354    fn numeric_id_deterministic() {
1355        let f = Funifier::with_seed("demo");
1356        assert_eq!(f.numeric_id("pid", 42), f.numeric_id("pid", 42));
1357        assert_ne!(f.numeric_id("pid", 42), f.numeric_id("pid", 43));
1358        assert_ne!(f.numeric_id("pid", 42), f.numeric_id("cgroup", 42));
1359    }
1360
1361    /// `numeric_id_i64` preserves zero verbatim (sentinel) and
1362    /// keeps sign across funification.
1363    #[test]
1364    fn numeric_id_i64_preserves_zero_and_sign() {
1365        let f = Funifier::with_seed("demo");
1366        assert_eq!(f.numeric_id_i64("pid", 0), 0);
1367        let pos = f.numeric_id_i64("pid", 42);
1368        let neg = f.numeric_id_i64("pid", -42);
1369        assert!(pos > 0);
1370        assert!(neg < 0);
1371        assert_eq!(pos, -neg, "abs value must match across signs");
1372    }
1373
1374    /// Sentinel u64 values pass through is_sentinel_u64.
1375    #[test]
1376    fn is_sentinel_u64_table() {
1377        assert!(Funifier::is_sentinel_u64(0));
1378        assert!(Funifier::is_sentinel_u64(u64::MAX));
1379        assert!(!Funifier::is_sentinel_u64(1));
1380        assert!(!Funifier::is_sentinel_u64(42));
1381    }
1382
1383    /// Sentinel u32 values pass through is_sentinel_u32.
1384    #[test]
1385    fn is_sentinel_u32_table() {
1386        assert!(Funifier::is_sentinel_u32(0));
1387        assert!(Funifier::is_sentinel_u32(u32::MAX));
1388        assert!(!Funifier::is_sentinel_u32(1));
1389        assert!(!Funifier::is_sentinel_u32(42));
1390    }
1391
1392    /// `numeric_id_u32` produces u32-range output deterministically.
1393    /// Pins (a) the output is masked to fit u32, (b) sentinels round-
1394    /// trip unchanged, (c) determinism per (seed, category, n) is
1395    /// preserved, and (d) the sample is well-distributed across the
1396    /// 32-bit range — a regression that left the high bits of the
1397    /// SipHash output in the result would have nearly all test
1398    /// inputs land above u32::MAX.
1399    #[test]
1400    fn numeric_id_u32_masks_to_u32_range() {
1401        let f = Funifier::with_seed("demo");
1402        // Sentinels preserved.
1403        assert_eq!(f.numeric_id_u32("cpu_id", 0), 0);
1404        assert_eq!(f.numeric_id_u32("cpu_id", u32::MAX), u32::MAX);
1405        // Non-sentinels are guaranteed to fit in u32 because the
1406        // return type is u32 — the value-level assertion is that
1407        // the result is non-zero and not the trivial echo of the
1408        // input (catching a regression that would make
1409        // numeric_id_u32 return its argument unchanged).
1410        let id_42 = f.numeric_id_u32("cpu_id", 42);
1411        assert_ne!(id_42, 42, "non-sentinel input must be funified");
1412        assert_ne!(id_42, 0, "non-sentinel input must not collapse to 0");
1413        // Determinism: same (seed, category, n) → same output.
1414        assert_eq!(f.numeric_id_u32("cpu_id", 42), id_42);
1415        // Different category → different funified output. Could
1416        // collide by chance at 2^-32, but the seed is fixed so
1417        // this is a stable assertion.
1418        assert_ne!(
1419            f.numeric_id_u32("cpu_id", 42),
1420            f.numeric_id_u32("uid", 42),
1421            "category must namespace the funified output",
1422        );
1423    }
1424
1425    /// `is_u32_category` allowlist hits — pins the documented u32-
1426    /// width identifier vocabulary so a future edit that drops an
1427    /// entry (and silently routes a u32 field through the u64
1428    /// path) trips here.
1429    #[test]
1430    fn is_u32_category_allowlist_hits() {
1431        // CPU IDs.
1432        assert!(Funifier::is_u32_category("cpu_id"));
1433        // UID/GID family.
1434        assert!(Funifier::is_u32_category("uid"));
1435        assert!(Funifier::is_u32_category("euid"));
1436        assert!(Funifier::is_u32_category("ruid"));
1437        assert!(Funifier::is_u32_category("gid"));
1438        assert!(Funifier::is_u32_category("egid"));
1439        assert!(Funifier::is_u32_category("kuid"));
1440        assert!(Funifier::is_u32_category("kgid"));
1441        // Suffix vocabulary.
1442        assert!(Funifier::is_u32_category("worker_u32"));
1443        assert!(Funifier::is_u32_category("alien_id_u32_id"));
1444        // Misses — generic identifier fields stay on the u64
1445        // path. A `_id` suffix would over-classify; pin that the
1446        // current cautious allowlist does NOT include it.
1447        assert!(!Funifier::is_u32_category("pid"));
1448        assert!(!Funifier::is_u32_category("worker_id"));
1449        assert!(!Funifier::is_u32_category("cgroup"));
1450        assert!(!Funifier::is_u32_category("comm"));
1451    }
1452
1453    /// JSON walker: u32-categorised values funify through the
1454    /// 32-bit-masked path, NEVER exceed u32::MAX. Pins the bug fix
1455    /// for the original "funifier produces u64 for u32 schema fields"
1456    /// regression — without the dispatch, a value funified under
1457    /// `cpu_id` could surface > u32::MAX and overflow a downstream
1458    /// `as u32` cast.
1459    #[test]
1460    fn funify_json_u32_category_stays_in_u32_range() {
1461        let f = Funifier::with_seed("demo");
1462        // Iterate a range of u32-shaped inputs under a u32-category
1463        // key. Every funified output must be representable in u32.
1464        for n in [1u32, 7, 42, 100, 1024, 65535, 1_000_000, 0x7FFF_FFFF] {
1465            let input = json!({ "cpu_id": n });
1466            let out = funify_json(input, &f);
1467            let funified = out["cpu_id"]
1468                .as_u64()
1469                .expect("u32 category must remain a Number");
1470            assert!(
1471                funified <= u32::MAX as u64,
1472                "funified `cpu_id`={n} produced {funified}, exceeds u32::MAX. \
1473                 The u32-narrow dispatch is broken or the category fell through \
1474                 to numeric_id (full u64).",
1475            );
1476        }
1477    }
1478
1479    /// JSON walker: u32 sentinels (0 and u32::MAX) round-trip
1480    /// unchanged through the u32-category dispatch. Without this,
1481    /// a `uid: 0` (root) or `cpu_id: u32::MAX` ("no value") marker
1482    /// would silently turn into a random funified u32, hiding the
1483    /// sentinel meaning the failure-dump renderers depend on.
1484    #[test]
1485    fn funify_json_u32_category_preserves_sentinels() {
1486        let f = Funifier::with_seed("demo");
1487        let input = json!({
1488            "cpu_id_zero": { "cpu_id": 0 },
1489            "cpu_id_max":  { "cpu_id": u32::MAX },
1490            "uid_zero":    { "uid": 0 },
1491            "uid_max":     { "uid": u32::MAX },
1492        });
1493        let out = funify_json(input, &f);
1494        assert_eq!(out["cpu_id_zero"]["cpu_id"], json!(0));
1495        assert_eq!(out["cpu_id_max"]["cpu_id"], json!(u32::MAX));
1496        assert_eq!(out["uid_zero"]["uid"], json!(0));
1497        assert_eq!(out["uid_max"]["uid"], json!(u32::MAX));
1498    }
1499
1500    /// `numeric_id_i32` produces in-range output deterministically.
1501    /// Pins (a) sentinels round-trip (0, i32::MIN, i32::MAX),
1502    /// (b) sign survives funification, (c) abs values match
1503    /// across signs, (d) determinism per (seed, category, n),
1504    /// and (e) the i32::MIN sentinel guard prevents the
1505    /// `unsigned_abs` overflow that would otherwise wrap to 0.
1506    #[test]
1507    fn numeric_id_i32_masks_to_i32_range_and_preserves_sentinels() {
1508        let f = Funifier::with_seed("demo");
1509        // Sentinels.
1510        assert_eq!(f.numeric_id_i32("kuid", 0), 0);
1511        assert_eq!(f.numeric_id_i32("kuid", i32::MIN), i32::MIN);
1512        assert_eq!(f.numeric_id_i32("kuid", i32::MAX), i32::MAX);
1513        // Sign + abs symmetry mirrors numeric_id_i64.
1514        let pos = f.numeric_id_i32("kuid", 42);
1515        let neg = f.numeric_id_i32("kuid", -42);
1516        assert!(pos > 0, "positive input must funify to positive output");
1517        assert!(neg < 0, "negative input must funify to negative output");
1518        assert_eq!(pos, -neg, "abs value must match across signs");
1519        // Determinism.
1520        assert_eq!(f.numeric_id_i32("kuid", 42), pos);
1521        // Different category → different funified output.
1522        assert_ne!(
1523            f.numeric_id_i32("kuid", 42),
1524            f.numeric_id_i32("cpu_id", 42),
1525            "category must namespace the funified output",
1526        );
1527    }
1528
1529    /// `is_sentinel_i32` recognises the documented signed-32
1530    /// sentinels.
1531    #[test]
1532    fn is_sentinel_i32_table() {
1533        assert!(Funifier::is_sentinel_i32(0));
1534        assert!(Funifier::is_sentinel_i32(i32::MIN));
1535        assert!(Funifier::is_sentinel_i32(i32::MAX));
1536        assert!(!Funifier::is_sentinel_i32(1));
1537        assert!(!Funifier::is_sentinel_i32(-1));
1538        assert!(!Funifier::is_sentinel_i32(42));
1539    }
1540
1541    /// JSON walker: u32-categorised NEGATIVE values (lowered
1542    /// through serde's i64 path) funify through the i32-narrow
1543    /// dispatch and stay in i32 range. Without the dispatch a
1544    /// negative kuid/kgid would funify to a full-range i64 that
1545    /// overflows a downstream `as i32` cast — this is the
1546    /// regression the new branch closes.
1547    #[test]
1548    fn funify_json_u32_category_negative_stays_in_i32_range() {
1549        let f = Funifier::with_seed("demo");
1550        for n in [
1551            -1i64,
1552            -7,
1553            -42,
1554            -100,
1555            -1024,
1556            -65535,
1557            -1_000_000,
1558            i32::MIN as i64,
1559            // Below i32::MIN — exercises the lower clamp arm
1560            // (`i < i32::MIN as i64` → `i32::MIN`).
1561            i32::MIN as i64 - 1,
1562            // i64::MIN — extreme of the clamp domain; pins the
1563            // arm against an i64-wide negative.
1564            i64::MIN,
1565        ] {
1566            let input = json!({ "kuid": n });
1567            let out = funify_json(input, &f);
1568            let funified = out["kuid"]
1569                .as_i64()
1570                .expect("u32 category negative must remain a signed Number");
1571            assert!(
1572                (i32::MIN as i64..=i32::MAX as i64).contains(&funified),
1573                "funified `kuid`={n} produced {funified}, exceeds i32 range. \
1574                 The i32-narrow dispatch in the i64 branch is broken or the \
1575                 category fell through to numeric_id_i64 (full i64).",
1576            );
1577        }
1578    }
1579
1580    /// `is_metric_passthrough` allowlist hits — whole-key
1581    /// structural vocabulary plus suffix-based unit/quantity
1582    /// patterns. Pins the allowlist content so a future edit
1583    /// that drops an entry (and silently un-allowlists a metric)
1584    /// trips here.
1585    #[test]
1586    fn is_metric_passthrough_allowlist_hits() {
1587        // Whole-key structural vocabulary.
1588        assert!(Funifier::is_metric_passthrough("schema"));
1589        assert!(Funifier::is_metric_passthrough("version"));
1590        assert!(Funifier::is_metric_passthrough("type"));
1591        assert!(Funifier::is_metric_passthrough("kind"));
1592        assert!(Funifier::is_metric_passthrough("status"));
1593        assert!(Funifier::is_metric_passthrough("nr_running"));
1594        assert!(Funifier::is_metric_passthrough("nr_queued"));
1595        assert!(Funifier::is_metric_passthrough("runqueue_depth"));
1596        assert!(Funifier::is_metric_passthrough("nice"));
1597        assert!(Funifier::is_metric_passthrough("weight"));
1598        assert!(Funifier::is_metric_passthrough("priority"));
1599
1600        // Suffix vocabulary — count / rate / unit / ratio.
1601        assert!(Funifier::is_metric_passthrough("reads_completed"));
1602        assert!(Funifier::is_metric_passthrough("io_errors_total"));
1603        assert!(Funifier::is_metric_passthrough("wakeups_per_sec"));
1604        assert!(Funifier::is_metric_passthrough("memory_max_bytes"));
1605        assert!(Funifier::is_metric_passthrough("cpu_max_quota_us"));
1606        assert!(Funifier::is_metric_passthrough("page_locality_ratio"));
1607        assert!(Funifier::is_metric_passthrough("cpu_time_fraction"));
1608        assert!(Funifier::is_metric_passthrough("idle_pct"));
1609        assert!(Funifier::is_metric_passthrough("queue_depth"));
1610        assert!(Funifier::is_metric_passthrough("buffer_size"));
1611        assert!(Funifier::is_metric_passthrough("thread_count"));
1612    }
1613
1614    /// `is_metric_passthrough` allowlist misses — identifier-
1615    /// shaped keys that the inverted polarity now FUNIFIES (vs.
1616    /// v1, which passed through everything not in the
1617    /// identifier deny-list).
1618    #[test]
1619    fn is_metric_passthrough_allowlist_misses() {
1620        // Keys the v1 deny-list classified as identifiers — now
1621        // funified through the default-funify path.
1622        assert!(!Funifier::is_metric_passthrough("pid"));
1623        assert!(!Funifier::is_metric_passthrough("tid"));
1624        assert!(!Funifier::is_metric_passthrough("tgid"));
1625        assert!(!Funifier::is_metric_passthrough("ppid"));
1626        assert!(!Funifier::is_metric_passthrough("comm"));
1627        assert!(!Funifier::is_metric_passthrough("cpu"));
1628        assert!(!Funifier::is_metric_passthrough("cgroup"));
1629        assert!(!Funifier::is_metric_passthrough("dest_cpu"));
1630        assert!(!Funifier::is_metric_passthrough("running_pid"));
1631        assert!(!Funifier::is_metric_passthrough("scheduler"));
1632
1633        // Known suffix-aliasing gaps. The current allowlist treats
1634        // `_type`, `_kind`, `_state`, `_len`, `_offset` (and other
1635        // structural-enum / quantity suffixes) as metric markers,
1636        // which is sound when the value is a structural enum or
1637        // numeric quantity but over-matches on identifier-shaped
1638        // keys whose tail happens to resemble one. The keys below
1639        // SHOULD funify and DO NOT under the suffix gate. No
1640        // assertions are added — they would fail today, and the
1641        // resolution is schema-driven classification rather than
1642        // encoding a known-bad expectation. Examples observed in
1643        // the failure-dump and capture schemas:
1644        //   - `task_type`, `node_type`   — cgroup / NUMA tags whose
1645        //                                  values are identifier-
1646        //                                  shaped enums
1647        //   - `parent_kind`              — task-relationship tag
1648        //   - `path_len`                 — ends in `_len`, but the
1649        //                                  sibling `path` carries
1650        //                                  the actual identifier
1651        //                                  string
1652        //   - `mount_offset`             — ends in `_offset`, but
1653        //                                  co-locates with a
1654        //                                  mount-point identifier
1655        // All of the above pass through is_metric_passthrough
1656        // today. Schema-driven classification (tagging each field's
1657        // intent at the type level) is a future direction that
1658        // would remove the suffix heuristic's false positives.
1659
1660        // Novel identifier-shaped keys the v1 deny-list missed
1661        // entirely — now funified by default. The suffix heuristic
1662        // can over-match keys ending in structural-enum suffixes
1663        // (see the gap comment above); the cases below avoid those
1664        // suffixes and are reliably hidden.
1665        assert!(!Funifier::is_metric_passthrough("cgroup_path"));
1666        assert!(!Funifier::is_metric_passthrough("path"));
1667        assert!(!Funifier::is_metric_passthrough("hostname"));
1668        assert!(!Funifier::is_metric_passthrough("xyz"));
1669    }
1670
1671    /// Every VirtioBlkCounters field name passes the metric
1672    /// allowlist. Pinning each name guards against fun mode
1673    /// silently hiding disk counters in failure dumps when a
1674    /// future allowlist edit drops a suffix or whole-key entry
1675    /// these counters depend on.
1676    #[test]
1677    fn virtio_blk_counter_names_are_metric_passthrough() {
1678        for name in [
1679            "reads_completed",
1680            "writes_completed",
1681            "flushes_completed",
1682            "bytes_read",
1683            "bytes_written",
1684            "throttled_count",
1685            "io_errors",
1686        ] {
1687            assert!(
1688                Funifier::is_metric_passthrough(name),
1689                "{name} must be metric",
1690            );
1691        }
1692    }
1693
1694    /// funify_json funifies non-metric-keyed values and
1695    /// preserves metric-keyed values. The input mixes both
1696    /// classes plus an array of objects to exercise every
1697    /// walker path.
1698    #[test]
1699    fn funify_json_funifies_non_metric_keys_and_preserves_metrics() {
1700        let f = Funifier::with_seed("demo");
1701        let input = json!({
1702            "schema": "single",
1703            "version": "1.2.3",
1704            "comm": "ktstr_test",
1705            "pid": 42,
1706            "nr_running": 7,
1707            "scheduler": "scx_simple",
1708            "wakeups_per_sec": 500.0,
1709            "thread_count": 4,
1710            "cpus": [
1711                { "cpu": 1, "comm": "swapper" },
1712                { "cpu": 3, "comm": "ktstr_worker" }
1713            ]
1714        });
1715        let out = funify_json(input.clone(), &f);
1716
1717        // Metric-keyed values pass through unchanged.
1718        assert_eq!(out["schema"], json!("single"));
1719        assert_eq!(out["version"], json!("1.2.3"));
1720        assert_eq!(out["nr_running"], json!(7));
1721        assert_eq!(out["wakeups_per_sec"], json!(500.0));
1722        assert_eq!(out["thread_count"], json!(4));
1723
1724        // Non-metric-keyed values get funified.
1725        assert_ne!(out["comm"], input["comm"]);
1726        assert_ne!(out["pid"], input["pid"]);
1727        assert_ne!(out["scheduler"], input["scheduler"]);
1728
1729        // String funification renders as "adjective-animal".
1730        let comm = out["comm"].as_str().unwrap();
1731        assert!(
1732            comm.contains('-'),
1733            "expected adjective-animal token, got {comm:?}",
1734        );
1735
1736        // Array of objects: each object's keys are independently
1737        // re-classified. cpu and comm are non-metric so they get
1738        // funified per element.
1739        assert_ne!(out["cpus"][0]["comm"], input["cpus"][0]["comm"]);
1740        assert_ne!(out["cpus"][1]["comm"], input["cpus"][1]["comm"]);
1741        // cpu=1 is non-sentinel so funification swaps it.
1742        assert_ne!(out["cpus"][0]["cpu"], input["cpus"][0]["cpu"]);
1743        assert_ne!(out["cpus"][1]["cpu"], input["cpus"][1]["cpu"]);
1744
1745        // Round-trip through serde_json::to_string succeeds.
1746        let s = serde_json::to_string(&out).expect("serialize");
1747        assert!(!s.is_empty());
1748    }
1749
1750    /// Sentinel preservation under non-metric keys: `cpu: 0`
1751    /// stays 0, `pid: u64::MAX` stays u64::MAX. Sentinels carry
1752    /// kthread / "no value" semantics that downstream renderers
1753    /// must still see.
1754    #[test]
1755    fn funify_json_preserves_numeric_sentinels() {
1756        let f = Funifier::with_seed("demo");
1757        let input = json!({
1758            "cpu": 0,
1759            "pid": u64::MAX,
1760            "tid": 1,
1761        });
1762        let out = funify_json(input.clone(), &f);
1763        // Sentinel u64 zero preserved (cpu).
1764        assert_eq!(out["cpu"], json!(0));
1765        // Sentinel u64::MAX preserved (pid).
1766        assert_eq!(out["pid"], json!(u64::MAX));
1767        // Non-sentinel funified (tid=1).
1768        assert_ne!(out["tid"], json!(1));
1769    }
1770
1771    /// Floats always pass through, regardless of whether the
1772    /// containing key is a metric. Non-metric float keys stay
1773    /// stable because there is no sensible fun mapping for
1774    /// IEEE-754 values and rates/ratios live everywhere in the
1775    /// schemas.
1776    #[test]
1777    fn funify_json_floats_pass_through_unconditionally() {
1778        let f = Funifier::with_seed("demo");
1779        let input = json!({
1780            "wakeups_per_sec": 500.5,
1781            "fairness_score": 0.75,
1782            "anonymous_float": 4.25,
1783        });
1784        let out = funify_json(input.clone(), &f);
1785        assert_eq!(out["wakeups_per_sec"], json!(500.5));
1786        assert_eq!(out["fairness_score"], json!(0.75));
1787        assert_eq!(out["anonymous_float"], json!(4.25));
1788    }
1789
1790    /// Cross-reference preservation: two values that share both
1791    /// a key AND a payload yield the same funified output, so
1792    /// downstream tooling can correlate "same pid mentioned in
1793    /// two places" without leaking the real pid.
1794    #[test]
1795    fn funify_json_cross_reference_within_dump() {
1796        let f = Funifier::with_seed("demo");
1797        let input = json!({
1798            "running": [
1799                { "pid": 100 },
1800                { "pid": 100 },
1801                { "pid": 200 }
1802            ]
1803        });
1804        let out = funify_json(input, &f);
1805        let p0 = &out["running"][0]["pid"];
1806        let p1 = &out["running"][1]["pid"];
1807        let p2 = &out["running"][2]["pid"];
1808        assert_eq!(p0, p1, "same key + same value must funify identically");
1809        assert_ne!(p0, p2, "same key + different value must differ");
1810    }
1811
1812    /// Array elements inherit the parent key's category. A
1813    /// non-metric parent key (e.g. `pids`) makes every array
1814    /// element funify under that key's namespace; a metric
1815    /// parent key passes every element through.
1816    #[test]
1817    fn funify_json_array_inherits_parent_category() {
1818        let f = Funifier::with_seed("demo");
1819        let input = json!({
1820            "pids": [1, 2, 3],
1821            "completed_per_sec": [10.0, 20.0, 30.0],
1822        });
1823        let out = funify_json(input.clone(), &f);
1824        // Non-metric parent → each element funified.
1825        for i in 0..3 {
1826            assert_ne!(out["pids"][i], input["pids"][i]);
1827        }
1828        // Metric parent → array passes through.
1829        assert_eq!(out["completed_per_sec"], input["completed_per_sec"]);
1830    }
1831
1832    /// Two seeds produce different mappings for the same input.
1833    #[test]
1834    fn distinct_seeds_produce_distinct_mappings() {
1835        let a = Funifier::with_seed("seed-a");
1836        let b = Funifier::with_seed("seed-b");
1837        let na = a.petname_for("comm", "x");
1838        let nb = b.petname_for("comm", "x");
1839        // Could rarely collide by chance; assert at least one
1840        // category differs.
1841        let na2 = a.numeric_id("pid", 42);
1842        let nb2 = b.numeric_id("pid", 42);
1843        assert!(
1844            na != nb || na2 != nb2,
1845            "two seeds must differ on at least one mapping"
1846        );
1847    }
1848
1849    /// Ephemeral Funifier produces stable names within ITS OWN
1850    /// process life but two ephemeral instances differ.
1851    #[test]
1852    fn ephemeral_within_instance_stable_across_instances_random() {
1853        let a = Funifier::ephemeral();
1854        let n1 = a.petname_for("comm", "same");
1855        let n2 = a.petname_for("comm", "same");
1856        assert_eq!(n1, n2);
1857        // Two ephemerals nearly always differ. Compare two
1858        // different categories to keep the test stable in the
1859        // 1-in-65536 collision case.
1860        let b = Funifier::ephemeral();
1861        let a_bundle = (
1862            a.petname_for("comm", "same"),
1863            a.numeric_id("pid", 42),
1864            a.numeric_id("cgroup", 7),
1865        );
1866        let b_bundle = (
1867            b.petname_for("comm", "same"),
1868            b.numeric_id("pid", 42),
1869            b.numeric_id("cgroup", 7),
1870        );
1871        assert_ne!(a_bundle, b_bundle, "two ephemeral instances must differ");
1872    }
1873
1874    /// Dictionary sizes — pinned so a future word-list edit that
1875    /// trims either array trips here before downstream callers
1876    /// see fewer fun names than expected.
1877    #[test]
1878    fn dictionary_sizes_pinned() {
1879        assert_eq!(ADJECTIVES.len(), 272, "adjective list must be 272 entries");
1880        assert_eq!(ANIMALS.len(), 264, "animal list must be 264 entries");
1881    }
1882
1883    /// Every dictionary entry is non-empty lowercase ASCII (no
1884    /// spaces, hyphens, or special characters). Guards against a
1885    /// future word-list addition that breaks the
1886    /// "adjective-animal" tokenization invariant.
1887    #[test]
1888    fn dictionary_entries_are_lowercase_ascii_words() {
1889        for w in ADJECTIVES.iter().chain(ANIMALS.iter()) {
1890            assert!(!w.is_empty(), "empty word in dictionary");
1891            assert!(
1892                w.chars().all(|c| c.is_ascii_lowercase()),
1893                "non-lowercase-ASCII word in dictionary: {w:?}",
1894            );
1895        }
1896    }
1897}
ktstr/fun.rs

ktstr/
fun.rs