Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 40 additions & 7 deletions src/bin/report.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
//! - bracket string accesses `["x"]` (lower = more `.x` member access)
//! - "opaque name ratio": fraction of identifiers that are 1-2 chars or hex
//! (lower = more meaningful names)
//! - "hexrefs": raw (non-distinct) occurrence count of `_0x…`-style identifiers
//! (lower = less decoder residue; spikes when an undecoded string-array
//! decoder is referenced throughout the output)

use std::collections::HashSet;
use std::fs;
Expand All @@ -23,8 +26,8 @@ fn main() {
entries.sort();

println!(
"{:<13} {:>9} {:>9} {:>6} {:>8} {:>8} {:>6}",
"sample", "in_bytes", "out_bytes", "kept%", "brackets", "opaque%", "rounds"
"{:<13} {:>9} {:>9} {:>6} {:>8} {:>8} {:>8} {:>6}",
"sample", "in_bytes", "out_bytes", "kept%", "brackets", "opaque%", "hexrefs", "rounds"
);
for p in entries {
let name = p.file_name().unwrap().to_string_lossy().into_owned();
Expand All @@ -34,14 +37,16 @@ fn main() {
let kept = r.code.len() * 100 / src.len().max(1);
let brackets = count_bracket_access(&r.code);
let opaque = opaque_name_ratio(&r.code);
let hexrefs = count_hex_refs(&r.code);
println!(
"{:<13} {:>9} {:>9} {:>5}% {:>8} {:>7}% {:>6}",
"{:<13} {:>9} {:>9} {:>5}% {:>8} {:>7}% {:>8} {:>6}",
name,
src.len(),
r.code.len(),
kept,
brackets,
opaque,
hexrefs,
r.rounds
);
}
Expand Down Expand Up @@ -90,6 +95,7 @@ fn report_generated_corpus() {
agg.out_bytes += r.code.len();
agg.brackets += count_bracket_access(&r.code);
agg.opaque_sum += opaque_name_ratio(&r.code);
agg.hexrefs += count_hex_refs(&r.code);
agg.rounds_max = agg.rounds_max.max(r.rounds);
if !r.converged {
agg.not_converged += 1;
Expand All @@ -101,8 +107,8 @@ fn report_generated_corpus() {

println!("\nobfuscator.io corpus (samples/generated/) — aggregated by profile");
println!(
"{:<13} {:>5} {:>9} {:>9} {:>6} {:>8} {:>8} {:>6} {:>5}",
"profile", "files", "in_bytes", "out_bytes", "kept%", "brackets", "opaque%", "rounds", "conv"
"{:<13} {:>5} {:>9} {:>9} {:>6} {:>8} {:>8} {:>8} {:>6} {:>5}",
"profile", "files", "in_bytes", "out_bytes", "kept%", "brackets", "opaque%", "hexrefs", "rounds", "conv"
);

let mut profiles: Vec<String> = by_profile.keys().cloned().collect();
Expand All @@ -129,6 +135,7 @@ struct ProfileAgg {
out_bytes: usize,
brackets: usize,
opaque_sum: usize,
hexrefs: usize,
rounds_max: usize,
not_converged: usize,
}
Expand All @@ -140,6 +147,7 @@ impl ProfileAgg {
self.out_bytes += o.out_bytes;
self.brackets += o.brackets;
self.opaque_sum += o.opaque_sum;
self.hexrefs += o.hexrefs;
self.rounds_max = self.rounds_max.max(o.rounds_max);
self.not_converged += o.not_converged;
}
Expand All @@ -154,8 +162,8 @@ fn print_agg_row(label: &str, a: &ProfileAgg) {
format!("!{}", a.not_converged)
};
println!(
"{:<13} {:>5} {:>9} {:>9} {:>5}% {:>8} {:>7}% {:>6} {:>5}",
label, a.files, a.in_bytes, a.out_bytes, kept, a.brackets, opaque, a.rounds_max, conv
"{:<13} {:>5} {:>9} {:>9} {:>5}% {:>8} {:>7}% {:>8} {:>6} {:>5}",
label, a.files, a.in_bytes, a.out_bytes, kept, a.brackets, opaque, a.hexrefs, a.rounds_max, conv
);
}

Expand All @@ -172,6 +180,31 @@ fn count_bracket_access(s: &str) -> usize {
s.matches("[\"").count()
}

/// Raw (non-distinct) occurrence count of `_0x…`-style identifiers in `s`.
///
/// `s` is split into identifier-ish tokens — maximal runs of ASCII
/// alphanumerics, `_` and `$` — and every token that begins with `_0x` and has
/// at least one further character is counted. Repeats are counted each time,
/// so a single name referenced N times contributes N.
///
/// The scan is purely textual: it does not distinguish code from string
/// literals or comments, and the characters after `_0x` are *not* checked to
/// be hex digits (`_0xzz` counts just like `_0x1a2b`). A bare `_0x`, or a
/// `_0x` that only appears mid-token (`a_0x12`), does not count.
///
/// Lower is better in the report; the count is intended as a measure of
/// leftover decoder residue in the output.
fn count_hex_refs(s: &str) -> usize {
    // A qualifying token must be strictly longer than the prefix itself.
    const PREFIX: &str = "_0x";
    // Splitting on every non-identifier character yields exactly the maximal
    // identifier runs (plus empty strings, which the filter rejects), with no
    // per-token allocation and no separate end-of-input flush.
    s.split(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '_' || ch == '$'))
        .filter(|token| token.len() > PREFIX.len() && token.starts_with(PREFIX))
        .count()
}

/// Fraction (%) of distinct identifier-ish tokens that look obfuscated:
/// length ≤ 2, or hex-style `_0x...`.
fn opaque_name_ratio(s: &str) -> usize {
Expand Down
91 changes: 74 additions & 17 deletions src/passes/dce.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

use oxc_allocator::{Allocator, Vec as ArenaVec};
use oxc_ast::ast::{BindingPattern, Program, Statement, VariableDeclarator};
use oxc_ast_visit::Visit;
use oxc_semantic::Scoping;
use oxc_traverse::{traverse_mut, Traverse, TraverseCtx};

Expand Down Expand Up @@ -75,10 +76,10 @@ fn keep_statement(s: &Statement<'_>, scoping: &Scoping) -> bool {
match s {
Statement::EmptyStatement(_) => false,
Statement::VariableDeclaration(vd) => !vd.declarations.is_empty(),
Statement::FunctionDeclaration(f) => {
f.id.as_ref()
.is_none_or(|id| !fn_decl_is_dead(id.symbol_id(), scoping))
}
Statement::FunctionDeclaration(f) => f
.id
.as_ref()
.is_none_or(|id| !fn_decl_is_dead(id.symbol_id(), f, scoping)),
// A bare literal statement (`80214130;`, `true;`, `null;`) computes a
// value that is immediately discarded with no side effect — pure
// obfuscation noise (often a spent opaque-predicate constant). Drop it.
Expand Down Expand Up @@ -111,20 +112,76 @@ fn is_removable_symbol(symbol: oxc_syntax::symbol::SymbolId, scoping: &Scoping)
scoping.symbol_is_unused(symbol)
}

/// A function declaration is dead if its name is never *read* anywhere — i.e. it
/// is never called or otherwise observed. Defining a function declaration has no
/// side effect, so when nothing reads the binding, neither its body nor any
/// self-reassignment inside it can ever run, and the whole statement is inert.
/// A function declaration is dead if its name is never *read* from outside its
/// own body — i.e. no live code calls or otherwise observes it. Defining a
/// function declaration has no side effect, so when nothing external reads the
/// binding, the whole statement is inert: its body only runs when called, and a
/// call can only come through an external read.
///
/// Two gaps over plain `symbol_is_unused` (which counts *any* reference — read or
/// write — as a use) are closed here, both exploited by obfuscator string-array
/// accessors left dead after every call site is inlined:
///
/// This is strictly broader than `symbol_is_unused`, which counts a *write* as a
/// use. The obfuscator string-array accessor relies on that gap: after every
/// call site is inlined away, all that remains is the self-memoizing write inside
/// its own body —
/// `function f(){ const a=[…]; f = function(){ return a; }; return […]; }`
/// — a lone write reference that pins the function alive even though it is dead.
/// Keying on read references instead lets it be collected.
fn fn_decl_is_dead(symbol: oxc_syntax::symbol::SymbolId, scoping: &Scoping) -> bool {
!scoping.get_resolved_references(symbol).any(|r| r.is_read())
/// * **Self-reassignment.** `function f(){ … f = function(){ return a; } … }`
/// leaves a lone *write* reference that `symbol_is_unused` treats as live.
/// Keying on *reads* drops it.
/// * **Self-reads.** The javascript-obfuscator base64/RC4 accessor memoizes
/// through its own name: `function f(i){ … if (f.X === undefined){ f.Y = …;
/// f.Z = {}; f.X = true; } … f.Z[k] … }`. Every surviving reference is a
/// *read* of `f`, but all are lexically inside `f`'s own body, so they only
/// execute if `f` is called from outside — which never happens once the call
/// sites are lifted. Excluding self-body reads lets it be collected.
///
/// Self-body reads are identified by counting, inside the function's own body,
/// the read references that resolve to its symbol, and comparing against the
/// symbol's *total* resolved read count. When they match, every read is in-body
/// and the declaration is dead; an external read makes the in-body count smaller
/// and keeps it live.
fn fn_decl_is_dead(
    symbol: oxc_syntax::symbol::SymbolId,
    func: &oxc_ast::ast::Function<'_>,
    scoping: &Scoping,
) -> bool {
    // Every read reference that resolves to this binding, wherever it occurs.
    let reads_anywhere = scoping
        .get_resolved_references(symbol)
        .filter(|reference| reference.is_read())
        .count();
    if reads_anywhere == 0 {
        // Nothing reads the name at all, so there is no need to walk the body.
        return true;
    }

    // Tally the reads that sit inside the function's own body statements
    // (e.g. the `f.cache` lookups of a self-memoizing accessor). The scan
    // matches on the resolved symbol of each reference, so an identically
    // named parameter or local that shadows the function is not counted.
    let mut scan = SelfReadScan {
        symbol,
        scoping,
        in_body_reads: 0,
    };
    if let Some(body) = func.body.as_ref() {
        body.statements
            .iter()
            .for_each(|stmt| scan.visit_statement(stmt));
    }

    // Dead exactly when no read is left over outside the body.
    reads_anywhere == scan.in_body_reads
}

/// AST visitor that tallies, over whatever subtree it is driven across, the
/// identifier references that both resolve to `symbol` and are reads.
struct SelfReadScan<'r> {
    /// The binding whose reads are being counted.
    symbol: oxc_syntax::symbol::SymbolId,
    /// Scoping tables used to resolve each reference id to its reference.
    scoping: &'r Scoping,
    /// Running total of matching read references seen so far.
    in_body_reads: usize,
}

impl<'a> Visit<'a> for SelfReadScan<'_> {
    fn visit_identifier_reference(&mut self, ident: &oxc_ast::ast::IdentifierReference<'a>) {
        // An identifier with no reference id recorded cannot be resolved; skip it.
        let Some(reference_id) = ident.reference_id.get() else {
            return;
        };
        let reference = self.scoping.get_reference(reference_id);
        let binds_to_target = reference.symbol_id() == Some(self.symbol);
        if binds_to_target && reference.is_read() {
            self.in_body_reads += 1;
        }
    }
}

fn is_terminator(s: &Statement<'_>) -> bool {
Expand Down
22 changes: 21 additions & 1 deletion src/passes/member_normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

use oxc_allocator::{Allocator, TakeIn};
use oxc_ast::ast::{
ComputedMemberExpression, Expression, Program, SimpleAssignmentTarget, StaticMemberExpression,
ChainElement, ComputedMemberExpression, Expression, Program, SimpleAssignmentTarget,
StaticMemberExpression,
};
use oxc_traverse::{traverse_mut, Traverse, TraverseCtx};

Expand Down Expand Up @@ -88,4 +89,23 @@ impl<'a> Traverse<'a, ()> for MemberNormalize {
self.changed = true;
}
}

/// Normalizes computed member accesses that appear as elements of an
/// optional chain, e.g. `a?.["x"]` → `a?.x`.
///
/// Such a node is a `ChainElement::ComputedMemberExpression` rather than an
/// `Expression::ComputedMemberExpression`, so the expression hook never
/// visits it; this hook covers that case (including the trailing member of
/// `a["x"]?.["y"]`, which is likewise a chain element). All other chain
/// element kinds are left untouched.
fn enter_chain_element(
    &mut self,
    element: &mut ChainElement<'a>,
    ctx: &mut TraverseCtx<'a, ()>,
) {
    if let ChainElement::ComputedMemberExpression(computed) = element {
        // `try_static` yields the static-member replacement when one applies.
        if let Some(member) = try_static(computed, ctx) {
            *element = ChainElement::StaticMemberExpression(member);
            self.changed = true;
        }
    }
}
}
Loading
Loading