From 734d615eadb702f27a28737af461d306ef8b1af6 Mon Sep 17 00:00:00 2001
From: Claude
Date: Mon, 8 Jun 2026 17:21:14 +0000
Subject: [PATCH] report: count obfuscator.io corpus in readability metrics

The synthetic javascript-obfuscator (obfuscator.io) fixtures in
samples/generated/ gated correctness via manifest.json but were
excluded from the readability metrics: both the live report binary and
the committed SCOREBOARD.md read samples/ non-recursively.

Add a per-profile rollup of the generated corpus (aggregated over all
seeds, one row per obfuscation technique) to both surfaces, so the
obfuscator.io samples count toward readability the same way they count
toward correctness. kept% is byte-weighted, opaque% is the mean
per-file ratio, rounds is the worst case, and converged flags any
non-fixpoint.
---
 src/bin/report.rs             | 120 ++++++++++++++++++++++++++++++++
 tests/golden.rs               | 125 ++++++++++++++++++++++++++++++++++
 tests/snapshots/SCOREBOARD.md |  15 ++++
 3 files changed, 260 insertions(+)

diff --git a/src/bin/report.rs b/src/bin/report.rs
index 097d43c..b9480f1 100644
--- a/src/bin/report.rs
+++ b/src/bin/report.rs
@@ -45,6 +45,126 @@ fn main() {
             r.rounds
         );
     }
+
+    report_generated_corpus();
+}
+
+/// Profile order mirrors `scripts/gen-samples.js`: each is one javascript-obfuscator
+/// (obfuscator.io) technique. Unknown profiles are appended in sorted order.
+const PROFILE_ORDER: &[&str] = &[
+    "minimal",
+    "strarr_base64",
+    "strarr_rc4",
+    "controlflow",
+    "deadcode",
+    "numbers_keys",
+    "strong",
+];
+
+/// Aggregate of the obfuscator.io corpus (`samples/generated/`) by profile.
+/// Synthetic but obfuscator.io-faithful, so these rows track per-technique
+/// readability the same way the real-sample rows track per-target readability.
+fn report_generated_corpus() {
+    let dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("samples")
+        .join("generated");
+    let Ok(rd) = fs::read_dir(&dir) else {
+        return; // corpus not generated locally; nothing to add
+    };
+
+    let mut by_profile: std::collections::BTreeMap<String, ProfileAgg> = Default::default();
+    for entry in rd.flatten() {
+        let p = entry.path();
+        if p.extension().and_then(|s| s.to_str()) != Some("js") {
+            continue;
+        }
+        let name = p.file_name().unwrap().to_string_lossy().into_owned();
+        let Some(profile) = profile_of(&name) else {
+            continue;
+        };
+        let src = fs::read_to_string(&p).unwrap();
+        let r = deobfuscate(&src, &name);
+        let agg = by_profile.entry(profile).or_default();
+        agg.files += 1;
+        agg.in_bytes += src.len();
+        agg.out_bytes += r.code.len();
+        agg.brackets += count_bracket_access(&r.code);
+        agg.opaque_sum += opaque_name_ratio(&r.code);
+        agg.rounds_max = agg.rounds_max.max(r.rounds);
+        if !r.converged {
+            agg.not_converged += 1;
+        }
+    }
+    if by_profile.is_empty() {
+        return;
+    }
+
+    println!("\nobfuscator.io corpus (samples/generated/) — aggregated by profile");
+    println!(
+        "{:<13} {:>5} {:>9} {:>9} {:>6} {:>8} {:>8} {:>6} {:>5}",
+        "profile", "files", "in_bytes", "out_bytes", "kept%", "brackets", "opaque%", "rounds", "conv"
+    );
+
+    let mut profiles: Vec<String> = by_profile.keys().cloned().collect();
+    profiles.sort_by_key(|p| {
+        PROFILE_ORDER
+            .iter()
+            .position(|x| x == p)
+            .unwrap_or(PROFILE_ORDER.len())
+    });
+
+    let mut total = ProfileAgg::default();
+    for profile in &profiles {
+        let a = &by_profile[profile];
+        print_agg_row(profile, a);
+        total.merge(a);
+    }
+    print_agg_row("ALL", &total);
+}
+
+#[derive(Default)]
+struct ProfileAgg {
+    files: usize,
+    in_bytes: usize,
+    out_bytes: usize,
+    brackets: usize,
+    opaque_sum: usize,
+    rounds_max: usize,
+    not_converged: usize,
+}
+
+impl ProfileAgg {
+    fn merge(&mut self, o: &ProfileAgg) {
+        self.files += o.files;
+        self.in_bytes += o.in_bytes;
+        self.out_bytes += o.out_bytes;
+        self.brackets += o.brackets;
+        self.opaque_sum += o.opaque_sum;
+        self.rounds_max = self.rounds_max.max(o.rounds_max);
+        self.not_converged += o.not_converged;
+    }
+}
+
+fn print_agg_row(label: &str, a: &ProfileAgg) {
+    let kept = a.out_bytes * 100 / a.in_bytes.max(1);
+    let opaque = a.opaque_sum / a.files.max(1); // mean per-file ratio
+    let conv = if a.not_converged == 0 {
+        "ok".to_string()
+    } else {
+        format!("!{}", a.not_converged)
+    };
+    println!(
+        "{:<13} {:>5} {:>9} {:>9} {:>5}% {:>8} {:>7}% {:>6} {:>5}",
+        label, a.files, a.in_bytes, a.out_bytes, kept, a.brackets, opaque, a.rounds_max, conv
+    );
+}
+
+/// `seed_01_arithmetic__strong.js` -> `strong`. Profiles are the segment after
+/// the `__` separator (seed names use single `_`, the join uses `__`).
+fn profile_of(name: &str) -> Option<String> {
+    let stem = name.strip_suffix(".js")?;
+    let (_, profile) = stem.rsplit_once("__")?;
+    Some(profile.to_string())
 }
 
 fn count_bracket_access(s: &str) -> usize {
diff --git a/tests/golden.rs b/tests/golden.rs
index 72ed387..328d79e 100644
--- a/tests/golden.rs
+++ b/tests/golden.rs
@@ -186,9 +186,134 @@ fn render_scoreboard() -> String {
             if r.converged { "yes" } else { "**no**" },
         ));
     }
+    render_generated_corpus(&mut s);
     s
 }
 
+/// Profile order mirrors `scripts/gen-samples.js` — each is one
+/// javascript-obfuscator (obfuscator.io) technique. Unknown profiles append last.
+const PROFILE_ORDER: &[&str] = &[
+    "minimal",
+    "strarr_base64",
+    "strarr_rc4",
+    "controlflow",
+    "deadcode",
+    "numbers_keys",
+    "strong",
+];
+
+/// Append a per-profile rollup of the obfuscator.io corpus (`samples/generated/`).
+/// These are synthetic but obfuscator.io-faithful, and they gate correctness via
+/// `manifest.json`; aggregating them here makes them count toward readability too,
+/// per technique. `kept%` is byte-weighted (Σout/Σin); `opaque%` is the mean of
+/// per-file ratios; `rounds` is the worst case; `converged` flags any non-fixpoint.
+fn render_generated_corpus(s: &mut String) {
+    let dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("samples")
+        .join("generated");
+    let mut by_profile: std::collections::BTreeMap<String, ProfileAgg> = Default::default();
+    let Ok(rd) = fs::read_dir(&dir) else {
+        return;
+    };
+    for entry in rd.flatten() {
+        let p = entry.path();
+        if p.extension().and_then(|x| x.to_str()) != Some("js") {
+            continue;
+        }
+        let name = p.file_name().unwrap().to_string_lossy().into_owned();
+        let Some((_, profile)) = name.strip_suffix(".js").and_then(|st| st.rsplit_once("__")) else {
+            continue;
+        };
+        let src = fs::read_to_string(&p).unwrap();
+        let r = deobfuscate(&src, &name);
+        let a = by_profile.entry(profile.to_string()).or_default();
+        a.files += 1;
+        a.in_bytes += src.len();
+        a.out_bytes += r.code.len();
+        a.brackets += count_bracket_access(&r.code);
+        a.opaque_sum += opaque_name_ratio(&r.code);
+        a.rounds_max = a.rounds_max.max(r.rounds);
+        if !r.converged {
+            a.not_converged += 1;
+        }
+    }
+    if by_profile.is_empty() {
+        return;
+    }
+
+    let mut profiles: Vec<String> = by_profile.keys().cloned().collect();
+    profiles.sort_by_key(|p| {
+        PROFILE_ORDER
+            .iter()
+            .position(|x| x == p)
+            .unwrap_or(PROFILE_ORDER.len())
+    });
+
+    s.push_str("\n## obfuscator.io corpus (`samples/generated/`), by profile\n\n");
+    s.push_str(
+        "Synthetic javascript-obfuscator fixtures (one row per technique, \
+         aggregated over all seeds). `kept%` is byte-weighted Σout/Σin; \
+         `opaque%` is the mean per-file ratio; `rounds` is the worst case; \
+         `converged` is `yes` only if every file reached a fixpoint.\n\n",
+    );
+    s.push_str(
+        "| profile | files | in_bytes | out_bytes | kept% | brackets | opaque% | rounds | converged |\n",
+    );
+    s.push_str(
+        "|---------|------:|---------:|----------:|------:|---------:|--------:|-------:|:---------:|\n",
+    );
+    let mut total = ProfileAgg::default();
+    for profile in &profiles {
+        push_agg_row(s, profile, &by_profile[profile]);
+        total.merge(&by_profile[profile]);
+    }
+    push_agg_row(s, "**all**", &total);
+}
+
+#[derive(Default)]
+struct ProfileAgg {
+    files: usize,
+    in_bytes: usize,
+    out_bytes: usize,
+    brackets: usize,
+    opaque_sum: usize,
+    rounds_max: usize,
+    not_converged: usize,
+}
+
+impl ProfileAgg {
+    fn merge(&mut self, o: &ProfileAgg) {
+        self.files += o.files;
+        self.in_bytes += o.in_bytes;
+        self.out_bytes += o.out_bytes;
+        self.brackets += o.brackets;
+        self.opaque_sum += o.opaque_sum;
+        self.rounds_max = self.rounds_max.max(o.rounds_max);
+        self.not_converged += o.not_converged;
+    }
+}
+
+fn push_agg_row(s: &mut String, label: &str, a: &ProfileAgg) {
+    let kept = a.out_bytes * 100 / a.in_bytes.max(1);
+    let opaque = a.opaque_sum / a.files.max(1);
+    s.push_str(&format!(
+        "| {} | {} | {} | {} | {}% | {} | {}% | {} | {} |\n",
+        label,
+        a.files,
+        a.in_bytes,
+        a.out_bytes,
+        kept,
+        a.brackets,
+        opaque,
+        a.rounds_max,
+        if a.not_converged == 0 {
+            "yes".to_string()
+        } else {
+            format!("**{} no**", a.not_converged)
+        },
+    ));
+}
+
 fn count_bracket_access(s: &str) -> usize {
     s.matches("[\"").count()
 }
diff --git a/tests/snapshots/SCOREBOARD.md b/tests/snapshots/SCOREBOARD.md
index 79510fe..c584c71 100644
--- a/tests/snapshots/SCOREBOARD.md
+++ b/tests/snapshots/SCOREBOARD.md
@@ -19,3 +19,18 @@ Regenerated by `tests/golden.rs` (re-bless with `UPDATE_SNAPSHOTS=1 cargo test -
 | sample_7.js | 745944 | 609607 | 81% | 576 | 6% | 6 | yes |
 | sample_8.js | 401876 | 232934 | 57% | 4 | 1% | 7 | yes |
 | sample_9.js | 156692 | 157990 | 100% | 0 | 5% | 5 | yes |
+
+## obfuscator.io corpus (`samples/generated/`), by profile
+
+Synthetic javascript-obfuscator fixtures (one row per technique, aggregated over all seeds). `kept%` is byte-weighted Σout/Σin; `opaque%` is the mean per-file ratio; `rounds` is the worst case; `converged` is `yes` only if every file reached a fixpoint.
+
+| profile | files | in_bytes | out_bytes | kept% | brackets | opaque% | rounds | converged |
+|---------|------:|---------:|----------:|------:|---------:|--------:|-------:|:---------:|
+| minimal | 20 | 31067 | 19334 | 62% | 12 | 8% | 4 | yes |
+| strarr_base64 | 20 | 68836 | 47423 | 68% | 12 | 5% | 4 | yes |
+| strarr_rc4 | 20 | 107162 | 77393 | 72% | 12 | 4% | 4 | yes |
+| controlflow | 20 | 62944 | 20273 | 32% | 12 | 8% | 7 | yes |
+| deadcode | 20 | 56174 | 19709 | 35% | 12 | 8% | 5 | yes |
+| numbers_keys | 20 | 90297 | 48641 | 53% | 12 | 5% | 4 | yes |
+| strong | 20 | 164800 | 95481 | 57% | 12 | 5% | 5 | yes |
+| **all** | 140 | 581280 | 328254 | 56% | 84 | 6% | 7 | yes |