diff --git a/progress-handoff-91xXCG.md b/progress-handoff-91xXCG.md
new file mode 100644
index 0000000000000..f849d0fbe4e51
--- /dev/null
+++ b/progress-handoff-91xXCG.md
@@ -0,0 +1,279 @@
+# Progress for handoff-91xXCG
+
+Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-91xXCG.md`
+
+## Status
+
+- [x] Confirmed no active `html-decoder-fuzz` run before editing.
+- [x] Tier 1 item 1: run both decoder contexts per generated case.
+- [x] Tier 1 item 2: add oracle-free arbitrary byte-space lane.
+- [x] Tier 1 item 3: add reference-at-EOF generation strategy.
+- [x] Tier 1 item 4: add `attribute_starts_with()` monotonicity invariants.
+- [x] Tier 1 item 5: exercise multi-code-point `attribute_starts_with()` prefix paths.
+- [x] Tier 1 item 6: add range-based numeric code point generation.
+- [x] Tier 2 item 7: add exhaustive deterministic name sweep lane.
+- [x] Tier 2 item 8: add edit-distance-1 lookalike generation.
+- [x] Tier 2 item 9: add full follower-byte sweep after legacy names.
+- [x] Tier 2 item 10: add prefix-family stress generation.
+- [x] Tier 2 item 11: add digit-count numeric boundary stress generation.
+- [x] Tier 2 item 12: add strategy composition and generalized attribute-prefix encoding.
+- [x] Tier 2 item 13: add mutation/corpus mode.
+- [x] Tier 2 item 14: add reader compositionality invariant.
+- [x] Tier 2 item 15: add case-mangled valid-name near-misses.
+- [x] Tier 3 item 16: assert null reader matches leave `match_byte_length` untouched.
+- [x] Tier 3 item 17: assert non-ampersand reader offsets never match.
+- [x] Tier 3 item 18: assert attribute no-amp identity in oracle mode.
+- [x] Tier 3 item 19: add tab, LF, and FF to the oracle-safe generator alphabet.
+- [x] Tier 3 item 20: assert reader reconstruction walks input without gaps or overlaps.
+- [x] Tier 3 item 21: assert invalid numeric references decode to exactly U+FFFD.
+- [x] Tier 3 item 22: assert C1 remapping applies only to numeric references while raw C1 bytes pass through unchanged.
+- [x] Tier 3 item 23: add `html_entity_decode( ENT_HTML5 | ENT_QUOTES )` as a secondary text-context oracle.
+- [x] Tier 3 item 24: add token-map structure-aware deterministic inputs.
+- [x] Tier 3 item 25: add pcov-backed coverage-guided lane with new-edge corpus retention.
+- [x] Tier 3 item 26: assert documented single-level decoding for nested ampersand references.
+- [x] Cross-cutting concerns: sort derived name lists deterministically and document DOM oracle throughput limits.
+
+## Verification
+
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` passed.
+- 2026-06-11: `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 20 --progress-every 20` passed and reported `by_context: {"both":20}`.
+- 2026-06-11: `php -l` passed for `Generator.php`, `Checks.php`, `Targets.php`, `worker.php`, `runner.php`, `replay.php`, `minimize.php`, and `tests/harness-smoke.php`.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200` passed.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=byte-no-amp-identity php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200` reported findings as expected.
+- 2026-06-11: `php tools/html-decoder-fuzz/runner.php --mode bytes --lanes 1 --duration-seconds 0 --max-cases 200 --cases-per-batch 200 --summary-mode none --output-dir /tmp/html-decoder-fuzz-byte-check` passed.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=byte-no-amp-identity php tools/html-decoder-fuzz/runner.php --mode bytes --lanes 1 --duration-seconds 0 --max-cases 200 --cases-per-batch 200 --max-artifacts-per-signature 1 --output-dir /tmp/html-decoder-fuzz-byte-fault-runner` reported findings as expected.
+- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode bytes --seed 1 --case 0` passed.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after byte-space lane coverage was added.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding mode-aware artifact separation, oracle-trap, and bogus-mode malformed-record coverage.
+- 2026-06-11: `git diff --check` passed.
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` passed after adding the reference-at-EOF strategy.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed and reported `reference-at-eof: 46`.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding reference-at-EOF coverage.
+- 2026-06-11: Documented that adding the new weighted strategy intentionally changes generated-case `--seed --case` payload mapping; failure-manifest replay remains payload-stable.
+- 2026-06-11: Verified `reference-at-eof` still ends in a reference for `max-bytes` 1, 2, 3, 4, 5, and 8 after reserving suffix space.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after tightening EOF suffix-shape coverage.
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Checks.php`, `php -l tools/html-decoder-fuzz/lib/Targets.php`, and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding `attribute_starts_with()` monotonicity checks.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding `attribute_starts_with()` prefix, extension, case monotonicity, and fault-target coverage.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed with no real-target findings after adding `attribute_starts_with()` monotonicity checks.
+- 2026-06-11: `git diff --check` passed after adding `attribute_starts_with()` monotonicity checks.
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Checks.php`, `php -l tools/html-decoder-fuzz/lib/Generator.php`, `php -l tools/html-decoder-fuzz/lib/Targets.php`, and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding multi-code-point `attribute_starts_with()` prefix coverage.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding byte-slice search probes, multi-code-point generator cases, and the `attribute-multicodepoint-prefix` fault target.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed with no real-target findings after adding multi-code-point prefix coverage.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-multicodepoint-prefix php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 681 --cases 1 --progress-every 1` reported findings as expected and verified invalid-UTF-8 search details remain JSON-safe.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-multicodepoint-prefix php tools/html-decoder-fuzz/replay.php --seed 1 --case 681` reproduced the multi-code-point prefix finding.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-multicodepoint-prefix php tools/html-decoder-fuzz/minimize.php --failure /tmp/html-decoder-fuzz-multicodepoint-fault-681/failure-seed1-case681/failure.json` minimized the finding from 18 to 6 bytes.
+- 2026-06-11: `git diff --check` passed after adding multi-code-point prefix coverage.
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding range-based numeric code point generation.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting numeric range buckets, all 32 C1 remap rows, and all 16 noncharacter planes.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed with no real-target findings after adding range-based numeric code points.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=skip-c1-remap php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 128 --cases 1 --progress-every 1` reported findings as expected after the range generator shifted the deterministic C1 fault case from 170 to 128.
+- 2026-06-11: `git diff --check` passed after adding range-based numeric code points.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, and `git diff --check` passed after addressing reviewer feedback on post-surrogate BMP coverage and multi-reference numeric smoke classification.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, and `git diff --check` passed after adding explicit BMP terminal noncharacter coverage for `0xFFFE` and `0xFFFF`.
+- 2026-06-11: `php -l` passed for `Cli.php`, `Generator.php`, `worker.php`, `runner.php`, `replay.php`, `minimize.php`, and `tests/harness-smoke.php` after adding the deterministic name-sweep lane.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding full-period name-sweep generator coverage plus worker, runner, and replay smoke checks for `--mode names`.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode names --seed 1 --cases 1000 --progress-every 1000` passed and reported `by_strategy: {"name-sweep":1000}` and `by_context: {"both":1000}`.
+- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode names --seed 1 --case 11593` passed for the deterministic `&Aacutex` case.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-semicolonless php tools/html-decoder-fuzz/replay.php --mode names --seed 1 --case 11593` reproduced the expected attribute decode mismatch for `&Aacutex`.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-semicolonless php tools/html-decoder-fuzz/minimize.php --failure /tmp/html-decoder-fuzz-name-fault-11593/failure-seed1-case11593/failure.json` minimized the finding from 8 to 7 bytes.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding reviewer-requested checks for distinct `names` runner start-case windows and the faulted name-sweep worker/replay/minimize pipeline.
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding edit-distance-1 lookalike generation.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting lookalike samples produce edit-distance-1 name misses and a sparse-name corpus exercises delete, insert, substitute, and transpose branches.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed with no real-target findings after adding dynamic lookalikes.
+- 2026-06-11: `git diff --check` passed after adding dynamic lookalikes.
+- 2026-06-11: `php -l` passed for `Cli.php`, `Generator.php`, `worker.php`, `runner.php`, `replay.php`, `tests/harness-smoke.php`, `class-wp-html-decoder.php`, and `wpHtmlDecoder.php` after adding the legacy-follower sweep and ASCII-only ambiguous follower fix.
+- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode legacy-followers --seed 1 --case 124` initially reproduced a real attribute decode mismatch for `&Aacute\xC2\x80`; after replacing locale-sensitive `ctype_alnum()` with ASCII byte checks, the replay passed.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `vendor/bin/phpunit --group html-api tests/phpunit/tests/html-api/wpHtmlDecoder.php`, `php tools/html-decoder-fuzz/worker.php --mode legacy-followers --seed 1 --cases 300 --progress-every 300`, `php tools/html-decoder-fuzz/runner.php --mode legacy-followers --lanes 2 --duration-seconds 0 --max-cases 200 --cases-per-batch 100 --summary-mode all --output-dir /tmp/html-decoder-fuzz-legacy-followers-check-fixed`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, and `git diff --check` passed.
+- 2026-06-11: `php -l` passed for `Cli.php`, `Generator.php`, `worker.php`, `runner.php`, `replay.php`, and `tests/harness-smoke.php` after adding the prefix-family sweep mode.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting prefix-family full-period mapping over the exact expected reference set, reference splits, and ambiguous followers plus worker, runner, replay, seed-replay fault, and failure-manifest fault-pipeline coverage.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode prefix-families --seed 1 --cases 300 --progress-every 300`, `php tools/html-decoder-fuzz/runner.php --mode prefix-families --lanes 2 --duration-seconds 0 --max-cases 200 --cases-per-batch 100 --summary-mode all --output-dir /tmp/html-decoder-fuzz-prefix-families-runner-check`, `php tools/html-decoder-fuzz/replay.php --mode prefix-families --seed 1 --case 37`, `HTML_DECODER_FUZZ_FAULT=attribute-semicolonless php tools/html-decoder-fuzz/worker.php --mode prefix-families --seed 1 --start-case 37 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-prefix-families-fault-check`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, and `git diff --check` passed.
+- 2026-06-11: `php -l` passed for `Cli.php`, `Generator.php`, `worker.php`, `replay.php`, and `tests/harness-smoke.php` after adding the numeric-boundary sweep mode.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting numeric-boundary full-period mapping over 6/7 hex and 7/8 decimal significant digit counts, leading-zero variants, semicolon variants, mixed-case hex digits, worker, runner, replay, seed-replay fault, and failure-manifest fault-pipeline coverage.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode numeric-boundaries --seed 1 --cases 300 --progress-every 300`, `php tools/html-decoder-fuzz/runner.php --mode numeric-boundaries --lanes 2 --duration-seconds 0 --max-cases 200 --cases-per-batch 100 --summary-mode all --output-dir /tmp/html-decoder-fuzz-numeric-boundaries-runner-check`, `php tools/html-decoder-fuzz/replay.php --mode numeric-boundaries --seed 1 --case 25`, `HTML_DECODER_FUZZ_FAULT=match-length-off-by-one php tools/html-decoder-fuzz/worker.php --mode numeric-boundaries --seed 1 --start-case 0 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-numeric-boundaries-fault-check`, `HTML_DECODER_FUZZ_FAULT=match-length-off-by-one php tools/html-decoder-fuzz/replay.php --failure /tmp/html-decoder-fuzz-numeric-boundaries-fault-check/failure-seed1-case0/failure.json`, `HTML_DECODER_FUZZ_FAULT=match-length-off-by-one php tools/html-decoder-fuzz/minimize.php --failure /tmp/html-decoder-fuzz-numeric-boundaries-fault-check/failure-seed1-case0/failure.json`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, and `git diff --check` passed.
+- 2026-06-11: After reviewer feedback, exact-max numeric-boundary cases now use in-range payloads (`&#x10FFFF;` and `&#1114111;` casing variants) while max-plus-one cases remain invalid; `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/replay.php --mode numeric-boundaries --seed 1 --case 25`, the refreshed fault-manifest replay/minimize, default 500-case worker, and `git diff --check` passed.
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding the weighted composition strategy and generalized attribute-prefix encoder.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting all 12 weighted strategies appear, composition generates multi-reference splices, generalized attribute-prefix encoding covers every target string and literal/decimal/leading-zero/hex/semicolonless forms, and the skip-C1 fault artifact checks use the new deterministic case 157 after generator weighting shifted seed/case mapping.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed after adding strategy composition and reported `by_strategy` including `"composition":30`; `git diff --check` passed.
+- 2026-06-11: After reviewer feedback, semicolonless numeric boundary protection now treats `;` as a reference-extending follower, composition inserts explicit fragment separators, and smoke asserts the exact weighted strategy set plus 2-3 separated composition fragments; `php tools/html-decoder-fuzz/tests/harness-smoke.php` and `git diff --check` passed.
+- 2026-06-11: After follow-up reviewer feedback, composition now keeps separated fragments nonempty under small public `max-bytes` values; a targeted probe for max bytes 3, 5, 7, and 12, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed.
+- 2026-06-11: `php -l` passed for `Cli.php`, `Generator.php`, `worker.php`, `replay.php`, and `tests/harness-smoke.php` after adding the corpus mutation mode.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting corpus seed-corpus size, retained html5lib text and attribute vectors, all four mutation strategies, semicolon-toggle/reference-duplication shapes, UTF-8-safe splice/perturb mutations, oracle-safe diversified payloads, worker, runner start windows, seed replay, faulted seed replay, and failure-manifest replay/minimize coverage.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode corpus --seed 1 --cases 300 --progress-every 300` passed and reported all corpus mutation strategies with `by_context: {"both":300}`.
+- 2026-06-11: `php tools/html-decoder-fuzz/runner.php --mode corpus --lanes 2 --duration-seconds 0 --max-cases 200 --cases-per-batch 100 --summary-mode all --output-dir /tmp/html-decoder-fuzz-corpus-runner-check-20260611-2` passed.
+- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode corpus --seed 1 --case 0` passed for the deterministic `corpus-byte-perturb` case with hex preview `67262335383b`.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=match-length-off-by-one php tools/html-decoder-fuzz/worker.php --mode corpus --seed 1 --start-case 0 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-corpus-fault-check-20260611-2` reported the expected `reader-overran-input` findings; replaying and minimizing the resulting failure manifest with the same fault both succeeded.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` and `git diff --check` passed after adding corpus mutation mode.
+- 2026-06-11: After reviewer feedback, html5lib tree-construction entity rows now normalize simple `<div bar="...">` and `<div>...</div>` fixtures into decoder payloads before oracle-safety filtering, corpus mutations choose splice/edit offsets on UTF-8 boundaries, and smoke asserts retained WPT attribute sentinels plus mutation helper shapes; `php tools/html-decoder-fuzz/tests/harness-smoke.php`, the refreshed corpus worker/runner/replay/fault-manifest checks, default 500-case worker, and `git diff --check` passed.
+- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, and `tests/harness-smoke.php` after adding the reader compositionality invariant.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting empty reader chunks, one-byte matches, and non-compositional local-slice reads are detected by fault targets.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/replay.php --seed 1 --case 31`, and `git diff --check` passed after adding reader compositionality checks.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=reader-substring-composition php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 31 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-reader-composition-fault-20260611-1` reported `reader-composition-mismatch` findings; replaying and minimizing the resulting failure manifest with the same fault both succeeded.
+- 2026-06-11: After reviewer feedback, `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed with automated worker, failure-manifest, replay, and minimize coverage for `reader-empty-chunk`, `reader-short-match-length`, and `reader-substring-composition`.
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding the case-mangled named-reference strategy.
+- 2026-06-11: A targeted probe over 2,000 seeds produced 115 distinct `case-mangled-name` candidates with zero invalid shape/collision samples, including both lowercase-to-uppercase and uppercase-to-lowercase flips.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=skip-c1-remap php tools/html-decoder-fuzz/worker.php --seed 2 --start-case 36 --cases 1 --progress-every 1`, `HTML_DECODER_FUZZ_FAULT=reader-empty-chunk php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 57 --cases 1 --progress-every 1`, and `HTML_DECODER_FUZZ_FAULT=reader-substring-composition php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 97 --cases 1 --progress-every 1` reported the expected findings after the weighted strategy shifted generated-case mappings.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding case-mangled valid-name near-misses.
+- 2026-06-11: After reviewer feedback, case-mangled smoke coverage now directly invokes `case_mangle_name_base()` against lowercase and uppercase source names; `php -l tools/html-decoder-fuzz/lib/Generator.php`, `php -l tools/html-decoder-fuzz/tests/harness-smoke.php`, a direct helper probe reporting `errors=0`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed.
+- 2026-06-11: Adding the null-return `match_byte_length` sentinel invariant exposed a real `WP_HTML_Decoder::read_character_reference()` issue for unmatched named references in `data` context; `WP_Token_Map::read_token()` returns `null`, and the decoder now checks for `null` instead of `false`.
+- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, `tests/harness-smoke.php`, `class-wp-html-decoder.php`, and `wpHtmlDecoder.php` after adding the null-return match-length invariant and decoder regression test.
+- 2026-06-11: `vendor/bin/phpunit --group html-api tests/phpunit/tests/html-api/wpHtmlDecoder.php` passed with the unmatched named-reference match-length regression coverage.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=reader-null-mutates-match-length php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 7 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-null-match-fault-check` reported `reader-mutated-match-length-on-null` findings; replaying the failure manifest reproduced the findings and minimizing it preserved the signature.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after fixing the decoder and adding the invariant.
+- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, `tests/harness-smoke.php`, and `wpHtmlDecoder.php` after adding non-ampersand reader-offset probes.
+- 2026-06-11: `vendor/bin/phpunit --group html-api tests/phpunit/tests/html-api/wpHtmlDecoder.php` passed with non-ampersand offset match-length regression coverage.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=reader-non-amp-match php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 0 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-non-amp-fault-check` reported `reader-non-amp-match` findings; replaying the failure manifest reproduced the findings and minimizing it preserved the signature.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding non-ampersand reader-offset probes.
+- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, and `tests/harness-smoke.php` after generalizing no-amp identity checks to attribute context.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-no-amp-identity php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 38 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-attr-no-amp-fault-check-item18` reported `attribute-without-ampersand-not-identity` findings; replaying the failure manifest reproduced the findings and minimizing it completed successfully.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding attribute no-amp identity coverage.
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding tab, LF, and FF to the oracle-safe generator alphabet.
+- 2026-06-11: A reflection probe confirmed the generator alphabet contains space, tab, LF, and FF and remains `Generator::is_oracle_safe_payload()` safe.
+- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode corpus --seed 1 --case 0` passed with the refreshed deterministic corpus byte-perturb preview `64262335383b`, and `php tools/html-decoder-fuzz/worker.php --mode corpus --seed 1 --cases 300 --progress-every 300` passed.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after expanding the generator alphabet with tab, LF, and FF.
+- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, and `tests/harness-smoke.php` after adding the gapless reader-walk invariant.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=reader-gapless-drop-span php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 0 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-gapless-fault-check` reported `reader-walk-not-gapless` findings; replaying the failure manifest reproduced the findings and minimizing it preserved the signature.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding gapless reader-walk coverage.
+- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, and `tests/harness-smoke.php` after adding the invalid numeric replacement invariant.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=numeric-invalid-not-replacement php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 0 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-invalid-numeric-fault-check` reported `numeric-invalid-not-replacement` findings; replaying the failure manifest reproduced the findings and minimizing it preserved the signature.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding invalid numeric replacement coverage.
+- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, and `tests/harness-smoke.php` after adding numeric C1 remap and raw C1 pass-through invariants.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=numeric-c1-not-remapped php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 2 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-c1-fault-check` reported `numeric-c1-not-remapped` findings; replaying the failure manifest reproduced the findings and minimizing it preserved the signature.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=raw-c1-not-pass-through php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --start-case 3 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-raw-c1-fault-check` reported `raw-c1-not-pass-through` findings; replaying the failure manifest reproduced the findings and minimizing it with `--signature raw-c1-not-pass-through:text` preserved the raw-C1-specific signature.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding C1 remap-only-for-numeric coverage.
+- 2026-06-11: `php -l` passed for `Oracles.php`, `Checks.php`, `Targets.php`, `worker.php`, and `tests/harness-smoke.php` after adding the secondary text oracle.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=text-secondary-oracle php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 4 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-secondary-fault-check` reported `text-secondary-oracle-mismatch` findings; replaying the failure manifest reproduced the findings and minimizing it with `--signature text-secondary-oracle-mismatch:text` preserved the secondary-oracle signature.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding the secondary text oracle and tightening it to known semicolon-terminated names.
+- 2026-06-11: `php -l` passed for `Bootstrap.php`, `Cli.php`, `Generator.php`, `worker.php`, `replay.php`, and `tests/harness-smoke.php` after adding the token-map structure-aware sweep mode.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode token-map --seed 1 --cases 764 --progress-every 764` passed for one full token-map period and reported `by_strategy: {"token-map-structure-sweep":764}` and `by_context: {"both":764}`.
+- 2026-06-11: `php tools/html-decoder-fuzz/runner.php --mode token-map --lanes 2 --duration-seconds 0 --max-cases 200 --cases-per-batch 100 --summary-mode all --output-dir /tmp/html-decoder-fuzz-token-map-runner-check` passed with distinct start-case windows.
+- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode token-map --seed 1 --case 0` passed for the deterministic `&AEaQQ;` large-prefix divergent case.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-semicolonless php tools/html-decoder-fuzz/worker.php --mode token-map --seed 1 --start-case 631 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-token-map-fault-check` reported the expected `decode-mismatch:attribute` finding; replaying and minimizing the resulting failure manifest with the same fault both succeeded.
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding the token-map mode, smoke coverage, and docs.
+- 2026-06-11: Local PHP did not have the `pcov` extension installed (`php --ri pcov` reported `Extension 'pcov' not present`), so coverage-mode smoke coverage used the explicit `HTML_DECODER_FUZZ_FAKE_COVERAGE=1` provider while the real mode reports a fatal error when pcov is unavailable.
+- 2026-06-11: `php -l` passed for `CoverageGuidance.php`, `Cli.php`, `worker.php`, `runner.php`, `replay.php`, and `tests/harness-smoke.php` after adding coverage mode.
+- 2026-06-11: `HTML_DECODER_FUZZ_DISABLE_PCOV=1 HTML_DECODER_FUZZ_FAKE_COVERAGE=0 php tools/html-decoder-fuzz/worker.php --mode coverage --seed 1 --cases 1 --progress-every 1` exited `2` with the expected fatal `coverage mode requires pcov`.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAKE_COVERAGE=1 php tools/html-decoder-fuzz/worker.php --mode coverage --seed 1 --cases 8 --progress-every 8 --output-dir /tmp/html-decoder-fuzz-coverage-worker-check` passed and retained fake new-edge payloads under `coverage-corpus/`.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAKE_COVERAGE=1 php tools/html-decoder-fuzz/runner.php --mode coverage --lanes 2 --duration-seconds 0 --max-cases 40 --cases-per-batch 20 --summary-mode failures --output-dir /tmp/html-decoder-fuzz-coverage-runner-check` passed and wrote coverage state with `cases=40`, `edges=76`, `payloads=40`, and `40` coverage corpus manifests.
+- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode coverage --seed 1 --case 0` passed for the deterministic coverage-mode generated case.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAKE_COVERAGE=1 HTML_DECODER_FUZZ_FAULT=reader-empty-chunk php tools/html-decoder-fuzz/worker.php --mode coverage --seed 1 --start-case 57 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-coverage-fault-check` reported the expected reader findings; replaying and minimizing the resulting coverage-mode failure manifest with `HTML_DECODER_FUZZ_FAULT=reader-empty-chunk` both succeeded.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, and `git diff --check` passed after adding coverage mode, fake-provider smoke coverage, and docs.
+- 2026-06-11: `php -l` passed for `Checks.php`, `Oracles.php`, `Targets.php`, and `tests/harness-smoke.php` after adding the single-level decode invariant and `single-level-overdecode` fault target.
+- 2026-06-11: A direct real-target probe over `pre&post` returned no failures, and `php tools/html-decoder-fuzz/replay.php --mode corpus --seed 1 --case 11875` passed for the deterministic `&Z` corpus-splice fixture.
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=single-level-overdecode php tools/html-decoder-fuzz/worker.php --mode corpus --seed 1 --start-case 11875 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-single-level-fault-check` reported `single-level-decode-overdecoded` findings in text and attribute contexts; replaying the manifest reproduced the findings and minimizing it with `--signature single-level-decode-overdecoded:text` reduced the payload to `&`.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, and `git diff --check` passed after adding single-level decode checks, smoke coverage, and docs.
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after sorting derived named-reference lists.
+- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode corpus --seed 1 --case 11875` and `php tools/html-decoder-fuzz/replay.php --mode names --seed 1 --case 11593` still passed with the expected deterministic payloads after adding explicit generator list sorting.
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, and `git diff --check` passed after addressing the cross-cutting determinism and throughput notes.
+
+## Review Log
+
+- Tier 1 item 1:
+ - Curie: APPROVE, determinism/API behavior.
+ - Dewey: APPROVE, harness and fault-injection coverage.
+ - Mencius: APPROVE, runtime/replay compatibility.
+- Tier 1 item 2:
+ - Jason: APPROVE, byte generator/check semantics.
+ - Rawls: APPROVE, CLI/artifact compatibility after mode-aware artifact keying fix.
+ - Sartre: APPROVE, tests and documentation after oracle-trap and bogus-mode coverage.
+- Tier 1 item 3:
+ - Hegel: APPROVE, generator semantics after max-bytes suffix reservation fix.
+ - Lovelace: APPROVE, smoke coverage after strict EOF suffix-shape checks.
+ - Erdos: APPROVE, docs/replay compatibility after documenting generated-case mapping drift.
+- Tier 1 item 4:
+ - Copernicus: APPROVE, invariant semantics and exception handling.
+ - Maxwell: APPROVE, fault-target and smoke coverage.
+ - Poincare: APPROVE, integration/runtime compatibility.
+- Tier 1 item 5:
+ - Banach: APPROVE, generator and fault-target coverage.
+ - Meitner: APPROVE, byte-slice search semantics and JSON-safe failure details.
+ - Carver: APPROVE, worker/replay/minimize integration and runtime compatibility.
+- Tier 1 item 6:
+ - Kepler: APPROVE, numeric generator ranges after post-surrogate BMP coverage fix.
+ - Pascal: APPROVE, numeric smoke coverage after BMP noncharacter and multi-reference fixes.
+ - Beauvoir: APPROVE, integration/runtime compatibility after explicit `0xFFFE`/`0xFFFF` coverage.
+- Tier 2 item 7:
+ - Mendel: APPROVE, generator semantics and deterministic mapping after smoke additions.
+ - Pasteur: APPROVE, CLI/worker/replay/minimize/runner integration and mode handling.
+ - Popper: APPROVE, smoke and fault-pipeline coverage after requested start-window and name-fault checks.
+- Tier 2 item 8:
+ - Hilbert: APPROVE, generator semantics and single-edit mutation filtering after sparse smoke fix.
+ - Sagan: APPROVE, smoke rigor after branch-specific sparse corpus coverage replaced inferred operation coverage.
+ - Turing: APPROVE, runtime/integration compatibility and deterministic replay behavior.
+- Tier 2 item 9:
+ - Chandrasekhar: APPROVE, `legacy-followers` generator/mode semantics and deterministic sharding.
+ - Linnaeus: APPROVE, ASCII-only ambiguous follower decoder fix and PHPUnit coverage.
+ - Leibniz: APPROVE, smoke/integration coverage for full-period sweep, runner windows, and fault pipeline.
+- Tier 2 item 10:
+ - Gauss: APPROVE, prefix-family generator semantics after exact reference-set and replay smoke tightening.
+ - Peirce: APPROVE, CLI/worker/replay/runner integration and oracle-backed deterministic sharding.
+ - Noether: APPROVE, smoke coverage after requested exact reference and seed/case replay checks.
+- Tier 2 item 11:
+ - Plato: APPROVE, numeric-boundary generator semantics after in-range exact-max correction and decode-outcome smoke tightening.
+ - Socrates: APPROVE, CLI/worker/replay/runner integration and artifact replay after the mixed-case case update.
+ - Volta: APPROVE, smoke/docs/progress coverage after exact-max and max-plus-one replacement assertions.
+- Tier 2 item 12:
+ - Lagrange: APPROVE, generator semantics after semicolon follower protection and small-`max-bytes` composition fixes.
+ - Pauli: APPROVE, smoke coverage after exact strategy-set and delimiter-based composition assertions.
+ - Locke: APPROVE, integration/docs/progress accuracy after weighted composition and generalized encoder changes.
+- Tier 2 item 13:
+ - Russell: APPROVE, corpus generator semantics after WPT attribute retention and UTF-8 boundary fixes.
+ - Ramanujan: APPROVE, CLI/worker/replay/runner integration and deterministic corpus replay.
+ - Zeno: APPROVE, smoke/docs/progress coverage after WPT sentinel and mutation-shape assertions.
+- Tier 2 item 14:
+ - Boyle: APPROVE, reader compositionality invariant semantics and deterministic cases after pipeline coverage.
+ - Kuhn: APPROVE, fault-target and smoke coverage after automated worker/replay/minimize pipelines.
+ - Bohr: APPROVE, integration/runtime/docs/progress coverage after shared reader-path verification.
+- Tier 2 item 15:
+ - Anscombe: APPROVE, generator semantics after independent generated-candidate and raw-helper probes.
+ - Cicero: APPROVE, smoke and deterministic fault fixture coverage after direct lowercase/uppercase helper checks replaced ambiguous source inference.
+ - Parfit: APPROVE, integration/docs/progress scope and generated-case mapping drift notes.
+- Tier 3 item 16:
+ - Singer: APPROVE, production decoder semantics and PHPUnit regression coverage.
+ - Darwin: APPROVE, fuzzer invariant, fault target, and smoke pipeline coverage.
+ - Harvey: APPROVE, integration/docs/progress scope including the decoder fix exposed by the invariant.
+- Tier 3 item 17:
+ - Lorentz: APPROVE, non-ampersand reader-offset invariant semantics and byte-safety.
+ - Arendt: APPROVE, fault target, smoke pipeline, and PHPUnit coverage.
+ - Gibbs: APPROVE, integration/docs/progress scope and commit boundaries.
+- Tier 3 item 18:
+ - Wegener: APPROVE, no-amp identity invariant semantics after README wording correction.
+ - Descartes: APPROVE, attribute no-amp fault target, smoke pipeline, and docs after stale Checks doc fix.
+ - Hypatia: APPROVE, integration/progress scope and commit boundaries after README wording correction.
+- Tier 3 item 19:
+ - Galileo: APPROVE, generator alphabet semantics and oracle-safety after explicit whitespace wording.
+ - Bacon: APPROVE, smoke coverage and docs after replacing broad HTML-whitespace wording.
+ - Euclid: APPROVE, integration/progress scope and commit boundaries after explicit tab/LF/FF wording.
+- Tier 3 item 20:
+ - Carson: APPROVE, gapless reader-walk invariant semantics and failure signature stability.
+ - Herschel: APPROVE, span-drop fault target, smoke pipeline, and docs.
+ - Bernoulli: APPROVE, integration/progress scope and commit boundaries.
+- Tier 3 item 21:
+ - Einstein: APPROVE, invalid numeric replacement invariant semantics and signature stability.
+ - Confucius: APPROVE, invalid numeric fault target, smoke pipeline, and docs.
+ - Aristotle: APPROVE, integration/progress scope and commit boundaries.
+- Tier 3 item 22:
+ - McClintock: APPROVE, numeric C1 remap and raw C1 pass-through invariant semantics.
+ - Averroes: APPROVE, fault targets and smoke coverage after adding raw-C1 byte worker/replay/signature-pinned minimize coverage.
+ - Heisenberg: APPROVE, docs/progress scope and commit boundaries.
+- Tier 3 item 23:
+ - Boole: APPROVE, secondary text oracle semantics and support gating.
+ - Ampere: APPROVE, secondary-oracle check, fault target, and smoke pipeline coverage.
+ - Feynman: APPROVE, docs/progress scope and commit boundaries.
+- Tier 3 item 24:
+ - Hooke: APPROVE, token-map extraction and generator semantics after verifying name extraction, deterministic coverage, oracle-safety, and default mapping stability.
+ - Nash: APPROVE, CLI/worker/replay/runner integration and mode-aware failure artifact behavior.
+ - Goodall: APPROVE, smoke/docs/progress coverage and commit scope after full smoke and targeted token-map verification.
+- Tier 3 item 25:
+ - Helmholtz: APPROVE, coverage-guidance and pcov semantics after static pcov-path review plus fake-provider verification on this no-pcov runtime.
+ - Laplace: APPROVE, worker/runner/replay/minimize integration and coverage-corpus artifact safety after duplicate-pruning verification.
+ - Nietzsche: APPROVE, smoke/docs/progress scope with explicit no-pcov residual-risk note and fake-provider coverage checks.
+- Tier 3 item 26:
+ - Fermat: APPROVE, single-level decode invariant semantics and oracle-free byte-mode narrowness.
+ - Newton: APPROVE, fault target and worker/replay/minimize integration after README fault-target docs fix.
+ - Euler: APPROVE, docs/progress scope after README self-test and fault-target list updates.
+- Cross-cutting concerns:
+ - Ohm: APPROVE, generator derived-list sorting and default mapping stability.
+ - Archimedes: APPROVE, injected-order smoke coverage and verification scope.
+ - Faraday: APPROVE, README throughput note, progress accuracy, and commit scope.
diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php
index d902f4b7cabc4..f9402b86d33ad 100644
--- a/src/wp-includes/html-api/class-wp-html-decoder.php
+++ b/src/wp-includes/html-api/class-wp-html-decoder.php
@@ -60,17 +60,23 @@ public static function attribute_starts_with( $haystack, $search_text, $case_sen
continue;
}
- // If there is a character reference, then the decoded value must exactly match what follows in the search string.
- if ( 0 !== substr_compare( $search_text, $next_chunk, $search_at, strlen( $next_chunk ), $loose_case ) ) {
+ /*
+ * If there is a character reference, then the decoded value must
+ * match what follows in the search string. The search string may
+ * end within a multi-code-point replacement, such as `&nvlt;`
+ * decoding to `<⃒`, and still be a prefix match.
+ */
+ $match_length = min( strlen( $next_chunk ), $search_length - $search_at );
+ if ( 0 !== substr_compare( $search_text, $next_chunk, $search_at, $match_length, $loose_case ) ) {
return false;
}
// The character reference matched, so continue checking.
$haystack_at += $token_length;
- $search_at += strlen( $next_chunk );
+ $search_at += $match_length;
}
- return true;
+ return $search_at === $search_length;
}
/**
@@ -361,7 +367,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
$name_length = 0;
$replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length );
- if ( false === $replacement ) {
+ if ( null === $replacement ) {
return null;
}
@@ -378,12 +384,14 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
* character reference table but the match doesn't end in `;`.
* It may be allowed if it's followed by something unambiguous.
*/
+ $follower_byte = $after_name < $length ? ord( $text[ $after_name ] ) : null;
$ambiguous_follower = (
- $after_name < $length &&
- $name_at < $length &&
+ null !== $follower_byte &&
(
- ctype_alnum( $text[ $after_name ] ) ||
- '=' === $text[ $after_name ]
+ ( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) ||
+ ( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) ||
+ ( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) ||
+ 0x3D === $follower_byte
)
);
diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php
index 97954f4eb3e30..f51b25bc9a88f 100644
--- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php
+++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php
@@ -61,6 +61,126 @@ static function ( int $errno, string $errstr ) use ( &$errors ) {
$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
}
+	/**
+	 * Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes.
+	 */
+	public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower() {
+		$raw_attribute = "&Aacute\xC2\x80"; // `&Aacute` without `;`, then U+0080 encoded as UTF-8.
+
+		$this->assertSame(
+			"\xC3\x81\xC2\x80",
+			WP_HTML_Decoder::decode_attribute( $raw_attribute ),
+			'Should have decoded the semicolonless legacy reference before a multibyte follower.'
+		);
+
+		$match_byte_length = null;
+		$this->assertSame(
+			"\xC3\x81",
+			WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ),
+			'Should have matched the semicolonless legacy reference before a multibyte follower.'
+		);
+		$this->assertSame( strlen( '&Aacute' ), $match_byte_length ); // Only the reference bytes are consumed.
+	}
+
+	/**
+	 * Verifies that an ASCII alphanumeric or `=` follower leaves a semicolonless legacy reference undecoded.
+	 *
+	 * @dataProvider data_ambiguous_ascii_attribute_followers
+	 *
+	 * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower.
+	 */
+	public function test_semicolonless_legacy_reference_before_ascii_attribute_follower_is_ambiguous( $raw_attribute ) {
+		$decoded = WP_HTML_Decoder::decode_attribute( $raw_attribute );
+		$this->assertSame(
+			$raw_attribute,
+			$decoded,
+			'Should not have decoded an ambiguous semicolonless legacy reference.'
+		);
+
+		$reported_length = 'sentinel';
+		$reference       = WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $reported_length );
+
+		$this->assertNull( $reference, 'Should not have matched an ambiguous semicolonless legacy reference.' );
+		$this->assertSame( 'sentinel', $reported_length );
+	}
+
+	/**
+	 * Data provider: semicolonless `&Aacute` followed by one ASCII digit, letter, or equals sign.
+	 *
+	 * @return array[].
+	 */
+	public static function data_ambiguous_ascii_attribute_followers() {
+		return array(
+			'ASCII digit'           => array( '&Aacute0' ),
+			'ASCII uppercase alpha' => array( '&AacuteA' ),
+			'ASCII lowercase alpha' => array( '&Aacutea' ),
+			'equals'                => array( '&Aacute=' ),
+		);
+	}
+
+	/**
+	 * Verifies that a failed named-reference read leaves the by-reference match length as it was.
+	 *
+	 * @dataProvider data_unmatched_named_character_references
+	 *
+	 * @param string $context       Decoder context.
+	 * @param string $raw_text_node Raw text containing an unmatched named character reference.
+	 */
+	public function test_unmatched_named_character_reference_does_not_set_match_byte_length( $context, $raw_text_node ) {
+		// Any write to the by-reference argument would replace the sentinel.
+		$reported_length = 'sentinel';
+		$reference       = WP_HTML_Decoder::read_character_reference( $context, $raw_text_node, 0, $reported_length );
+
+		$this->assertNull( $reference, 'Should not have matched an unmatched named character reference.' );
+		$this->assertSame( 'sentinel', $reported_length );
+	}
+
+	/**
+	 * Data provider: the same two unmatched payloads, once per decoder context.
+	 *
+	 * @return array[].
+	 */
+	public static function data_unmatched_named_character_references() {
+		$datasets = array();
+		foreach ( array( 'text' => 'data', 'attribute' => 'attribute' ) as $label => $context ) {
+			$datasets[ "{$label} invalid name" ]                 = array( $context, '&bogus;' );
+			$datasets[ "{$label} invalid short-name candidate" ] = array( $context, '&Fv=q' );
+		}
+		return $datasets;
+	}
+
+	/**
+	 * Verifies that reading at an offset which is not an ampersand yields no match and no length write.
+	 *
+	 * @dataProvider data_non_ampersand_character_reference_offsets
+	 *
+	 * @param string $context       Decoder context.
+	 * @param string $raw_text_node Raw text containing a character reference away from offset.
+	 * @param int    $offset        Offset that does not point at an ampersand.
+	 */
+	public function test_non_ampersand_offset_does_not_set_match_byte_length( $context, $raw_text_node, $offset ) {
+		// Any write to the by-reference argument would replace the sentinel.
+		$reported_length = 'sentinel';
+		$reference       = WP_HTML_Decoder::read_character_reference( $context, $raw_text_node, $offset, $reported_length );
+
+		$this->assertNull( $reference, 'Should not have matched a character reference away from an ampersand.' );
+		$this->assertSame( 'sentinel', $reported_length );
+	}
+
+	/**
+	 * Data provider: offsets into `a&amp;b` that land before the `&` (0) or inside the name `amp` (2).
+	 *
+	 * @return array[].
+	 */
+	public static function data_non_ampersand_character_reference_offsets() {
+		return array(
+			'text before reference'           => array( 'data', 'a&amp;b', 0 ),
+			'text inside reference name'      => array( 'data', 'a&amp;b', 2 ),
+			'attribute before reference'      => array( 'attribute', 'a&amp;b', 0 ),
+			'attribute inside reference name' => array( 'attribute', 'a&amp;b', 2 ),
+		);
+	}
+
/**
* Ensures proper detection of attribute prefixes ignoring ASCII case.
*
@@ -161,6 +281,11 @@ public static function data_attributes_with_prefix_and_case_sensitive_match() {
array( 'http://wordpress.org', 'Http', 'ascii-case-insensitive', true ),
array( 'http://wordpress.org', 'https', 'case-sensitive', false ),
array( 'http://wordpress.org', 'https', 'ascii-case-insensitive', false ),
+ array( '', 'http', 'case-sensitive', false ),
+ array( 'jav', 'javascript:', 'case-sensitive', false ),
+ array( 'jav', 'javascript:', 'ascii-case-insensitive', false ),
+ array( '&nvlt;script', '<', 'case-sensitive', true ),
+ array( '&nvgt;script', '>', 'case-sensitive', true ),
);
}
}
diff --git a/tools/html-decoder-fuzz/README.md b/tools/html-decoder-fuzz/README.md
new file mode 100644
index 0000000000000..9eb6df2f76a48
--- /dev/null
+++ b/tools/html-decoder-fuzz/README.md
@@ -0,0 +1,347 @@
+# WP_HTML_Decoder Fuzzer
+
+Differential fuzzer for `WP_HTML_Decoder`:
+
+- `decode_text_node()`
+- `decode_attribute()`
+- `read_character_reference()`
+- `attribute_starts_with()`
+
+The fuzzer runs in a bare PHP process. It loads only `WP_Token_Map`, the
+generated HTML5 named-character-reference map, and `WP_HTML_Decoder`; it does
+not bootstrap WordPress, a database, browsers, Node, or `wp-env`.
+
+## Requirements
+
+- PHP 8.4+ with `Dom\HTMLDocument`
+- `mbstring`
+- `pcov` for `coverage` mode
+- Run from the repository root
+
+## Oracle
+
+The primary oracle is PHP's HTML5 parser:
+
+- Text context: parse `<div>PAYLOAD</div>` and read the
+  div's `textContent`.
+- Attribute context: parse `<div title="PAYLOAD"></div>` and read
+  `getAttribute( 'title' )`.
+
+For text context, `html_entity_decode( ENT_HTML5 | ENT_QUOTES, 'UTF-8' )` also
+runs as a secondary oracle on payloads whose references it supports:
+known semicolon-terminated named references and literal text. Numeric
+references, unknown named-looking references, and semicolonless named-looking
+references stay with the DOM oracle and fuzzer
+invariants, because `html_entity_decode()` does not implement those parser
+states. `html_entity_decode()` is deliberately not used as the primary oracle or
+as an attribute-context oracle because it does not implement the HTML
+attribute-context rule for semicolonless named references followed by `=` or an
+alphanumeric byte.
+
+In the default `oracle` mode, the generator neutralizes parser-vs-decoder
+confounders by producing valid UTF-8 payloads with no raw `<`, no raw double
+quote, no CR, and no NUL. This keeps the DOM parser focused on
+character-reference decoding instead of tag structure, attribute termination,
+input-preprocessing newline normalization, or NUL substitution.
+
+The separate `bytes` mode deliberately generates arbitrary byte payloads,
+including invalid UTF-8, NUL, raw `<`, raw double quote, and CR. These payloads
+never go to the DOM oracle. They run only oracle-free decoder invariants.
+
+The separate `names` mode deterministically sweeps every generated named
+character reference base name with and without `;`, followed by representative
+end, alphanumeric, equals, punctuation, whitespace, and multibyte followers.
+
+## Checks
+
+For each generated payload, the fuzzer runs both text and attribute contexts:
+
+1. Compare `decode_text_node()` or `decode_attribute()` to the DOM oracle, and
+ compare supported text payloads to the secondary `html_entity_decode()`
+ oracle.
+2. Rebuild the decoded string with repeated `read_character_reference()` calls
+ plus literal spans, then compare it to the high-level decoder.
+3. Assert every matched character reference reports a nonempty chunk, a byte
+ length of at least two, no input overrun, and the same chunk/length when
+ the matched slice is read again at offset zero.
+4. Check `attribute_starts_with()` against the decoded attribute prefix for
+ ASCII search strings in both case-sensitive and ASCII-case-insensitive modes,
+ leading byte-slice prefixes that can end inside UTF-8 replacements, and
+ monotonic prefix, extension, and case-sensitivity invariants.
+5. Assert decoded output is valid UTF-8.
+6. Assert known nested ampersand fixtures decode exactly one level, so
+ `&amp;amp;` decodes to `&amp;` rather than `&`.
+7. Assert text and attribute payloads without `&` are identity decodes.
+
+In `bytes` mode, checks 1, 4, and 5 are skipped because they depend on
+DOM-safe UTF-8 payloads or a DOM-derived decoded attribute value. The lane keeps
+the reader rebuild, advance/overrun, single-level decode, and no-`&` identity
+checks for both text and attribute contexts.
+
+Decoding is not treated as idempotent. The checks and smoke suite explicitly
+verify this with nested ampersand-reference fixtures.
+
+## Generator
+
+Every case is determined by `(seed, case index)`. Generated cases run in both
+text and attribute contexts so the same payload exercises semicolonless and
+attribute-disambiguation differences side by side. The generator preserves the
+former context PRNG draw, so the earlier both-context lane change did not by
+itself shift payload mapping.
+Named-reference lists derived from the generated token map, or injected for
+tests, are sorted by length and byte value before case-index mapping, so token
+map storage order and caller array order do not affect generated payloads.
+Adding or reweighting generation strategies intentionally changes future
+`--seed --case` payload mapping; failure-manifest replay remains stable because
+manifests store `payload_base64`.
+
+The generator uses the real generated named-reference map, with weighted
+strategies for:
+
+- exact named references
+- semicolonless legacy references
+- attribute-context ambiguous followers
+- numeric decimal and hex references drawn from ranges covering C0 controls,
+ all C1 controls, surrogates, BMP and per-plane noncharacters, astral values,
+ above-Unicode values with legal digit counts, digit-count overflow, zero-only
+ references, and leading zeros
+- adjacent references
+- truncation sweeps
+- references ending at EOF, including bare introducers, partial numeric
+ references, semicolonless numeric references, and truncated names
+- multibyte UTF-8 around references
+- `attribute_starts_with()` prefixes generated by encoding target strings
+ per character as literals, decimal references, hex references, leading-zero
+ references, and semicolonless references
+- `attribute_starts_with()` prefixes that split multi-code-point named-reference
+ replacements such as `<⃒`
+- edit-distance-1 named-reference lookalikes plus ampersand boundaries
+- valid named-reference names with letter case mangled into case-sensitive
+ near-misses
+- composed cases that splice two or three generated strategy outputs
+- plain no-ampersand text from an oracle-safe alphabet that includes space, tab,
+ LF, and FF
+
+`bytes` mode uses separate weighted strategies for uniform random bytes,
+no-ampersand byte strings, arbitrary bytes around `&` boundaries, invalid UTF-8
+sequences, and raw HTML delimiters/control bytes.
+
+`names` mode uses a deterministic sweep instead of weighted random generation.
+Case index maps directly to a named-reference base, semicolon variant, and
+follower class; the same payload still runs in both text and attribute contexts.
+
+`legacy-followers` mode deterministically sweeps every semicolonless legacy
+name followed by each oracle-safe ASCII byte, plus valid UTF-8 sequences
+covering multibyte lead and continuation byte values.
+
+`prefix-families` mode deterministically sweeps known named-reference prefix
+families such as `&not`/`&notin;`/`&notinva;` and `&nGt;`/`&ngt;`, truncating
+each reference at every byte split and appending ambiguous followers.
+
+`numeric-boundaries` mode deterministically sweeps decimal and hex numeric
+references at the decoder's maximum significant-digit count and one digit past
+it, with and without leading zeros, semicolons, and mixed-case hex digits.
+
+`corpus` mode mutates a seed corpus built from retained decoder payloads, the
+oracle battery, and html5lib entity vectors. Mutations splice corpus fragments,
+perturb bytes within the oracle-safe alphabet, including space, tab, LF, and FF,
+add or remove semicolons, and duplicate references to diversify structure beyond
+the grammar.
+
+`token-map` mode deterministically sweeps the generated `WP_Token_Map` layout:
+large-word group prefixes with names that diverge immediately after the shared
+two-byte prefix, every small-word boundary name, and every large-word name at
+the small/large length boundary.
+
+`coverage` mode runs the normal oracle-safe generator under pcov and treats each
+new covered executable line in `WP_HTML_Decoder` or `WP_Token_Map` as a coverage
+edge. Workers emit `coverage` events for payloads that discover new edges and
+write those payloads under `coverage-corpus/` when an output directory is
+provided. The runner deduplicates coverage edges across lanes and prunes
+duplicate coverage-corpus artifacts.
+
+## Common Commands
+
+Run the smoke test:
+
+```sh
+php tools/html-decoder-fuzz/tests/harness-smoke.php
+```
+
+Run one worker batch:
+
+```sh
+php tools/html-decoder-fuzz/worker.php --seed 1 --cases 5000
+```
+
+Run one oracle-free arbitrary-byte worker batch:
+
+```sh
+php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 5000
+```
+
+Run one deterministic named-reference sweep batch:
+
+```sh
+php tools/html-decoder-fuzz/worker.php --mode names --seed 1 --cases 5000
+```
+
+Run one deterministic legacy-follower sweep batch:
+
+```sh
+php tools/html-decoder-fuzz/worker.php --mode legacy-followers --seed 1 --cases 5000
+```
+
+Run one deterministic prefix-family sweep batch:
+
+```sh
+php tools/html-decoder-fuzz/worker.php --mode prefix-families --seed 1 --cases 5000
+```
+
+Run one deterministic numeric-boundary sweep batch:
+
+```sh
+php tools/html-decoder-fuzz/worker.php --mode numeric-boundaries --seed 1 --cases 5000
+```
+
+Run one corpus mutation batch:
+
+```sh
+php tools/html-decoder-fuzz/worker.php --mode corpus --seed 1 --cases 5000
+```
+
+Run one token-map structure sweep batch:
+
+```sh
+php tools/html-decoder-fuzz/worker.php --mode token-map --seed 1 --cases 5000
+```
+
+Run one coverage-guided batch:
+
+```sh
+php -d pcov.enabled=1 -d pcov.directory=src/wp-includes tools/html-decoder-fuzz/worker.php --mode coverage --seed 1 --cases 5000 --output-dir /tmp/html-decoder-fuzz-coverage
+```
+
+Run parallel lanes for one minute:
+
+```sh
+php tools/html-decoder-fuzz/runner.php --lanes 4 --duration-seconds 60
+```
+
+Oracle modes spend most of their time in the two DOM parser calls per payload,
+not in the PRNG. Scale long oracle runs with more lanes; the oracle-free `bytes`
+mode avoids that DOM cost, and future high-throughput oracle work should batch
+payloads into fewer documents or cache repeated sub-payloads.
+
+Run indefinitely:
+
+```sh
+php tools/html-decoder-fuzz/runner.php --lanes 8 --duration-seconds 0 --max-cases 0
+```
+
+Long runs keep disk use bounded by default. The runner records aggregate
+counters in `state.json`, writes only newly retained failure exemplars plus
+oracle/fatal events to `summary.ndjson`, and retains at most five failure
+artifact directories for each distinct signature. Per-lane stderr logs are
+capped at 64 KiB each, including reused output directories with existing
+oversized lane logs. Repeated over-cap failures remain counted in `state.json`
+without growing the event log.
+
+When startup verification is unavailable, the runner preserves complete
+existing artifacts instead of pruning them to the cap; without the verifier it
+cannot safely distinguish stale or fake full-shape manifests from valuable
+findings.
+
+Useful retention options:
+
+```sh
+# Preserve the previous verbose event log.
+php tools/html-decoder-fuzz/runner.php --summary-mode all
+
+# Keep only one on-disk exemplar per signature.
+php tools/html-decoder-fuzz/runner.php --max-artifacts-per-signature 1
+
+# Prune every failure artifact and rely on state counters/signatures.
+php tools/html-decoder-fuzz/runner.php --artifact-retention none
+
+# Keep every failure artifact for a short diagnostic run.
+php tools/html-decoder-fuzz/runner.php --artifact-retention all
+
+# Raise or disable per-lane stderr capture.
+php tools/html-decoder-fuzz/runner.php --max-stderr-bytes 262144
+php tools/html-decoder-fuzz/runner.php --max-stderr-bytes 0
+```
+
+Replay a failure, an input file, or a generated case:
+
+```sh
+php tools/html-decoder-fuzz/replay.php --failure artifacts/html-decoder-fuzz/run-.../failure-seedS-caseN/failure.json
+php tools/html-decoder-fuzz/replay.php --input payload.txt --context attribute
+php tools/html-decoder-fuzz/replay.php --seed 123 --case 45
+php tools/html-decoder-fuzz/replay.php --mode bytes --seed 123 --case 45
+```
+
+Minimize a failure while preserving its signature:
+
+```sh
+php tools/html-decoder-fuzz/minimize.php --failure artifacts/html-decoder-fuzz/run-.../failure-seedS-caseN/failure.json
+```
+
+Exit codes everywhere: `0` clean, `1` findings, `2` harness error.
+
+## Artifacts
+
+The runner writes under `artifacts/html-decoder-fuzz/run-*` by default:
+
+- `summary.ndjson` with retained failure exemplars plus oracle/fatal events by
+ default (`--summary-mode all` preserves every worker event;
+ `--summary-mode none` disables the file)
+- `state.json` with aggregate counters, stop reason, Git metadata, and failure
+ seeds for retained exemplars, including retained/pruned artifact counts by
+ signature
+- per-lane stderr logs, capped by `--max-stderr-bytes`
+- retained failure directories containing `payload.txt` and a self-contained
+ `failure.json` with base64 payload, context, signatures, failure details,
+ full expected/got output as base64 for differential failures, environment
+ metadata, and Git metadata. By default retention is capped per signature;
+ use `--artifact-retention all` to keep every directory.
+
+## Harness Self-Test
+
+`tests/harness-smoke.php` verifies the DOM oracle battery, real target behavior
+on the battery, generator determinism and safety, a short real fuzz run, and
+mutation-tested broken targets:
+
+- C1 numeric references not remapped through the Windows-1252 table, and raw
+ C1 bytes not passing through unchanged
+- supported text payloads disagreeing with the secondary `html_entity_decode()`
+ oracle
+- nested ampersand references being decoded more than one level
+- zero, surrogate, and above-Unicode numeric references not decoding to exactly
+ U+FFFD
+- semicolonless named references decoded in attributes despite ambiguous
+ followers
+- off-by-one `read_character_reference()` match lengths
+- empty `read_character_reference()` chunks, one-byte matches,
+ null-return match-length mutations, non-ampersand offset matches,
+ non-compositional local slice reads, and non-gapless reader walks
+- partial-prefix `attribute_starts_with()` matches
+- partial multi-code-point `attribute_starts_with()` replacement matches
+- non-monotonic `attribute_starts_with()` prefix, extension, and
+ case-sensitivity results
+- safe attribute payloads and raw byte payloads without `&` not decoding
+ identically
+
+For end-to-end failure-pipeline checks, set `HTML_DECODER_FUZZ_FAULT` to one of
+`skip-c1-remap`, `numeric-c1-not-remapped`, `raw-c1-not-pass-through`,
+`text-secondary-oracle`, `numeric-invalid-not-replacement`,
+`attribute-semicolonless`, `match-length-off-by-one`,
+`reader-empty-chunk`, `reader-short-match-length`,
+`reader-substring-composition`, `reader-null-mutates-match-length`,
+`reader-non-amp-match`, `reader-gapless-drop-span`,
+`attribute-no-amp-identity`, `byte-no-amp-identity`,
+`single-level-overdecode`,
+`attribute-prefix-monotonicity`,
+`attribute-extension-monotonicity`, `attribute-case-monotonicity`, or
+`attribute-multicodepoint-prefix` before running `worker.php`, `runner.php`,
+`replay.php`, or `minimize.php`.
diff --git a/tools/html-decoder-fuzz/lib/Bootstrap.php b/tools/html-decoder-fuzz/lib/Bootstrap.php
new file mode 100644
index 0000000000000..5096937a8951d
--- /dev/null
+++ b/tools/html-decoder-fuzz/lib/Bootstrap.php
@@ -0,0 +1,166 @@
+,
+ * small_names: string[]
+ * }
+ */
+ public static function named_reference_structure(): array {
+ // Memoized: computed on the first call and returned unchanged afterwards.
+ static $structure = null;
+ if ( null !== $structure ) {
+ return $structure;
+ }
+
+ self::load_targets();
+
+ global $html5_named_character_references;
+ $map = $html5_named_character_references;
+
+ // Read the map object's internal tables by property name through reflection.
+ $reflection = new \ReflectionObject( $map );
+ $get = static function ( string $property ) use ( $reflection, $map ) {
+ $ref = $reflection->getProperty( $property );
+ $ref->setAccessible( true );
+ return $ref->getValue( $map );
+ };
+
+ $key_length = (int) $get( 'key_length' );
+ $groups = (string) $get( 'groups' );
+ $large_words = (array) $get( 'large_words' );
+ $small_words = (string) $get( 'small_words' );
+ $large_names_by_key = array();
+ $large_names_by_prefix = array();
+ $small_names_by_key = array();
+ // Both packed strings are walked in records of key_length + 1 bytes.
+ $group_stride = $key_length + 1;
+ $groups_length = strlen( $groups );
+
+ // The Nth group prefix pairs with the Nth row of $large_words.
+ for ( $group_at = 0, $group_index = 0; $group_at + $key_length <= $groups_length; $group_at += $group_stride, ++$group_index ) {
+ $prefix = substr( $groups, $group_at, $key_length );
+ if ( '' === $prefix || ! isset( $large_words[ $group_index ] ) ) {
+ continue;
+ }
+
+ // Each row is read as repeated records: a one-byte token length, the
+ // token bytes, a one-byte mapping length, then the mapping bytes.
+ $row = $large_words[ $group_index ];
+ $row_at = 0;
+ while ( $row_at < strlen( $row ) ) {
+ $token_length = unpack( 'C', $row[ $row_at++ ] )[1];
+ $token = substr( $row, $row_at, $token_length );
+ $row_at += $token_length;
+
+ // Only the names are needed here, so the mapping bytes are skipped.
+ $mapping_length = unpack( 'C', $row[ $row_at++ ] )[1];
+ $row_at += $mapping_length;
+
+ $name = $prefix . $token;
+ $large_names_by_key[ $name ] = true;
+ $large_names_by_prefix[ $prefix ][ $name ] = true;
+ }
+ }
+
+ // Small names are read as fixed-width records with trailing NUL bytes stripped.
+ for ( $at = 0; $at < strlen( $small_words ); $at += $group_stride ) {
+ $name = rtrim( substr( $small_words, $at, $group_stride ), "\x00" );
+ if ( '' !== $name ) {
+ $small_names_by_key[ $name ] = true;
+ }
+ }
+
+ // Only prefixes that yielded at least one name are reported.
+ $group_prefixes = array_keys( $large_names_by_prefix );
+ sort( $group_prefixes, SORT_STRING );
+
+ // Names were collected as array keys, which de-duplicates them.
+ $large_names = array_keys( $large_names_by_key );
+ $small_names = array_keys( $small_names_by_key );
+ self::sort_names( $large_names );
+ self::sort_names( $small_names );
+ // Replace each per-prefix key set with a sorted list of names.
+ foreach ( $large_names_by_prefix as $prefix => $prefix_names_by_key ) {
+ $prefix_names = array_keys( $prefix_names_by_key );
+ self::sort_names( $prefix_names );
+ $large_names_by_prefix[ $prefix ] = $prefix_names;
+ }
+ ksort( $large_names_by_prefix, SORT_STRING );
+
+ $structure = array(
+ 'key_length' => $key_length,
+ 'group_prefixes' => $group_prefixes,
+ 'large_names' => $large_names,
+ 'large_names_by_prefix' => $large_names_by_prefix,
+ 'small_names' => $small_names,
+ );
+
+ return $structure;
+ }
+
+ /**
+  * Orders names longest-first, breaking ties with a byte-wise string comparison.
+  *
+  * @param string[] $names Sorted in place.
+  */
+ private static function sort_names( array &$names ): void {
+     $longest_first = static function ( string $left, string $right ): int {
+         $by_length = strlen( $right ) <=> strlen( $left );
+         if ( 0 !== $by_length ) {
+             return $by_length;
+         }
+
+         return strcmp( $left, $right );
+     };
+
+     usort( $names, $longest_first );
+ }
+}
+}
diff --git a/tools/html-decoder-fuzz/lib/Checks.php b/tools/html-decoder-fuzz/lib/Checks.php
new file mode 100644
index 0000000000000..5c9015121c120
--- /dev/null
+++ b/tools/html-decoder-fuzz/lib/Checks.php
@@ -0,0 +1,1016 @@
+ '&',
+ '<' => '<',
+ ':' => ':',
+ ':' => ':',
+ );
+
+ private Oracles $oracles;
+
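+ /** @var array<string, callable> Target callables keyed by name, e.g. 'decode_text' or 'read_character_reference'. */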
+ private array $targets;
+
+ /**
+  * @param Oracles    $oracles Oracle set used to compute expected decodes.
+  * @param array|null $targets Target callables; when null, Targets::resolve() supplies them.
+  */
+ public function __construct( Oracles $oracles, ?array $targets = null ) {
+     $this->oracles = $oracles;
+
+     if ( null === $targets ) {
+         $targets = Targets::resolve();
+     }
+
+     $this->targets = $targets;
+ }
+
+ /**
+  * Runs the oracle-backed checks for a payload.
+  *
+  * Payloads rejected by Generator::is_oracle_safe_payload() produce a single
+  * 'unsafe-oracle-payload' failure and nothing else is checked.
+  *
+  * @param string $context 'both' checks the text and attribute decoders; any other value is used as the single context.
+  * @param string $payload Bytes to decode.
+  * @return array[] Failure records; empty when every check passes.
+  */
+ public function run( string $context, string $payload ): array {
+     if ( ! Generator::is_oracle_safe_payload( $payload ) ) {
+         $detail = array(
+             'context' => $context,
+             'payload' => self::preview( $payload ),
+         );
+
+         return array( self::failure( 'unsafe-oracle-payload', $context, $detail ) );
+     }
+
+     if ( 'both' === $context ) {
+         $contexts = array( 'text', 'attribute' );
+     } else {
+         $contexts = array( $context );
+     }
+
+     $batches = array();
+     foreach ( $contexts as $decode_context ) {
+         $batches[] = $this->check_decode_context( $decode_context, $payload );
+     }
+
+     // The prefix checks run once per payload, whichever context was requested.
+     $batches[] = $this->check_attribute_starts_with( $payload );
+
+     return array_merge( array(), ...$batches );
+ }
+
+ /**
+  * Runs the checks that need no oracle for a payload.
+  *
+  * @param string $context 'both' checks the text and attribute decoders; any other value is used as the single context.
+  * @param string $payload Bytes to decode.
+  * @return array[] Failure records; empty when every check passes.
+  */
+ public function run_without_oracle( string $context, string $payload ): array {
+     $contexts = array( $context );
+     if ( 'both' === $context ) {
+         $contexts = array( 'text', 'attribute' );
+     }
+
+     $batches = array();
+     foreach ( $contexts as $decode_context ) {
+         $batches[] = $this->check_decode_context_without_oracle( $decode_context, $payload );
+     }
+
+     return array_merge( array(), ...$batches );
+ }
+
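+ /**
+ * Decodes one payload in one context and compares the target's output with
+ * the primary oracle, the single-level fixture expectation, the secondary
+ * entity-decode oracle (text context only), UTF-8 validity, the
+ * no-ampersand identity rule, and the reconstruction produced by walking
+ * the payload with `read_character_reference`.
+ *
+ * An exception from the primary oracle or the decode target ends the check
+ * early with a single failure record.
+ *
+ * @param string $context 'text' selects `decode_text`; any other value selects `decode_attribute`.
+ * @param string $payload Bytes to decode.
+ * @return array[] Failure records; empty when every check passes.
+ */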
+ private function check_decode_context( string $context, string $payload ): array {
+ $failures = array();
+
+ try {
+ $expected = $this->oracles->decode( $context, $payload );
+ } catch ( \Throwable $error ) {
+ return array(
+ self::failure(
+ 'oracle-exception',
+ $context,
+ array(
+ 'context' => $context,
+ 'class' => get_class( $error ),
+ 'message' => $error->getMessage(),
+ )
+ ),
+ );
+ }
+
+ $target_key = 'text' === $context ? 'decode_text' : 'decode_attribute';
+ try {
+ $got = ( $this->targets[ $target_key ] )( $payload );
+ } catch ( \Throwable $error ) {
+ return array(
+ self::failure(
+ 'target-exception',
+ "{$context}:decode",
+ array(
+ 'context' => $context,
+ 'target' => $target_key,
+ 'class' => get_class( $error ),
+ 'message' => $error->getMessage(),
+ )
+ ),
+ );
+ }
+
+ if ( $got !== $expected ) {
+ $failures[] = self::failure(
+ 'decode-mismatch',
+ $context,
+ self::diff_detail( $context, $expected, $got )
+ );
+ }
+
+ $single_level_expected = self::single_level_decode_expected( $payload );
+ if ( null !== $single_level_expected && $got !== $single_level_expected ) {
+ $failures[] = self::failure(
+ 'single-level-decode-overdecoded',
+ $context,
+ array_merge(
+ self::diff_detail( $context, $single_level_expected, $got ),
+ self::byte_detail( 'payload', $payload )
+ )
+ );
+ }
+
+ if ( 'text' === $context ) {
+ try {
+ $entity_decode_expected = $this->oracles->decode_text_with_entity_decode( $payload );
+ } catch ( \Throwable $error ) {
+ $failures[] = self::failure(
+ 'oracle-exception',
+ 'text:entity-decode',
+ array(
+ 'context' => $context,
+ 'oracle' => 'entity-decode',
+ 'class' => get_class( $error ),
+ 'message' => $error->getMessage(),
+ )
+ );
+ $entity_decode_expected = null;
+ }
+
+ if ( null !== $entity_decode_expected && $got !== $entity_decode_expected ) {
+ $failures[] = self::failure(
+ 'text-secondary-oracle-mismatch',
+ $context,
+ array_merge(
+ self::diff_detail( $context, $entity_decode_expected, $got ),
+ array(
+ 'secondary_oracle' => 'html_entity_decode',
+ 'dom_expected_base64' => base64_encode( $expected ),
+ )
+ )
+ );
+ }
+ }
+
+ if ( ! mb_check_encoding( $got, 'UTF-8' ) ) {
+ $failures[] = self::failure(
+ 'decoded-not-valid-utf8',
+ $context,
+ array(
+ 'context' => $context,
+ 'decoded' => self::preview( $got ),
+ )
+ );
+ }
+
+ if ( ! str_contains( $payload, '&' ) && $got !== $payload ) {
+ $failures[] = self::failure(
+ "{$context}-without-ampersand-not-identity",
+ $context,
+ self::diff_detail( $context, $payload, $got )
+ );
+ }
+
+ $reader = $this->decode_with_reader( $context, $payload );
+ foreach ( $reader['failures'] as $failure ) {
+ $failures[] = $failure;
+ }
+
+ if ( $reader['decoded'] !== $got ) {
+ $failures[] = self::failure(
+ 'reader-decode-mismatch',
+ $context,
+ self::diff_detail( $context, $got, $reader['decoded'] )
+ );
+ }
+
+ return $failures;
+ }
+
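+ /**
+ * Checks one payload in one context using only oracle-free invariants:
+ * no-ampersand identity, raw C1 byte pass-through, the single-level fixture
+ * expectation, and agreement with the `read_character_reference` walk.
+ *
+ * @param string $context 'text' selects `decode_text`; any other value selects `decode_attribute`.
+ * @param string $payload Bytes to decode.
+ * @return array[] Failure records; empty when every check passes.
+ */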
+ private function check_decode_context_without_oracle( string $context, string $payload ): array {
+ $failures = array();
+ $target_key = 'text' === $context ? 'decode_text' : 'decode_attribute';
+
+ try {
+ $got = ( $this->targets[ $target_key ] )( $payload );
+ } catch ( \Throwable $error ) {
+ return array(
+ self::failure(
+ 'target-exception',
+ "{$context}:decode",
+ array(
+ 'context' => $context,
+ 'target' => $target_key,
+ 'class' => get_class( $error ),
+ 'message' => $error->getMessage(),
+ )
+ ),
+ );
+ }
+
+ if ( ! str_contains( $payload, '&' ) && $got !== $payload ) {
+ $failures[] = self::failure(
+ "{$context}-without-ampersand-not-identity",
+ $context,
+ self::diff_detail( $context, $payload, $got )
+ );
+ }
+
+ if ( ! str_contains( $payload, '&' ) && self::contains_raw_c1_byte( $payload ) && $got !== $payload ) {
+ $failures[] = self::failure(
+ 'raw-c1-not-pass-through',
+ $context,
+ self::diff_detail( $context, $payload, $got )
+ );
+ }
+
+ $single_level_expected = self::single_level_decode_expected( $payload );
+ if ( null !== $single_level_expected && $got !== $single_level_expected ) {
+ $failures[] = self::failure(
+ 'single-level-decode-overdecoded',
+ $context,
+ array_merge(
+ self::diff_detail( $context, $single_level_expected, $got ),
+ self::byte_detail( 'payload', $payload )
+ )
+ );
+ }
+
+ $reader = $this->decode_with_reader( $context, $payload );
+ foreach ( $reader['failures'] as $failure ) {
+ $failures[] = $failure;
+ }
+
+ if ( $reader['decoded'] !== $got ) {
+ $failures[] = self::failure(
+ 'reader-decode-mismatch',
+ $context,
+ self::diff_detail( $context, $got, $reader['decoded'] )
+ );
+ }
+
+ return $failures;
+ }
+
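+ /**
+ * Re-decodes the payload by calling the `read_character_reference` target at
+ * each `&`, validating every match (match length left untouched on null,
+ * positive advance, non-empty chunk, length of at least two, no overrun,
+ * C1 remapping, invalid numeric references, and agreement with a read of
+ * the isolated reference) and recording the spans that were walked.
+ *
+ * The walk stops at the first structural failure; the spans are validated
+ * for gapless coverage only when no failure was recorded.
+ *
+ * @param string $context 'text' maps to the 'data' decoder context; any other value maps to 'attribute'.
+ * @param string $payload Bytes to walk.
+ * @return array{decoded: string, failures: array[]}
+ */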
+ private function decode_with_reader( string $context, string $payload ): array {
+ $decoder_context = 'text' === $context ? 'data' : 'attribute';
+ $decoded = '';
+ $failures = array();
+ $end = strlen( $payload );
+ $at = 0;
+ $was_at = 0;
+ $walk_at = 0;
+ $walk_spans = array();
+
+ $failures = array_merge( $failures, $this->check_reader_non_amp_offsets( $context, $decoder_context, $payload ) );
+
+ while ( $at < $end ) {
+ $amp_at = strpos( $payload, '&', $at );
+ if ( false === $amp_at ) {
+ break;
+ }
+
+ $match_byte_length = self::MATCH_BYTE_LENGTH_SENTINEL;
+ try {
+ $chunk = ( $this->targets['read_character_reference'] )( $decoder_context, $payload, $amp_at, $match_byte_length );
+ } catch ( \Throwable $error ) {
+ $failures[] = self::failure(
+ 'target-exception',
+ "{$context}:read-character-reference",
+ array(
+ 'context' => $context,
+ 'class' => get_class( $error ),
+ 'message' => $error->getMessage(),
+ )
+ );
+ break;
+ }
+
+ if ( null === $chunk ) {
+ if ( self::MATCH_BYTE_LENGTH_SENTINEL !== $match_byte_length ) {
+ $failures[] = self::failure(
+ 'reader-mutated-match-length-on-null',
+ $context,
+ array(
+ 'context' => $context,
+ 'at' => $amp_at,
+ 'match_byte_length' => $match_byte_length,
+ 'match_byte_length_type' => gettype( $match_byte_length ),
+ )
+ );
+ break;
+ }
+ if ( $walk_at < $amp_at + 1 ) {
+ $walk_spans[] = array(
+ 'type' => 'literal',
+ 'start' => $walk_at,
+ 'end' => $amp_at + 1,
+ );
+ $walk_at = $amp_at + 1;
+ }
+ $at = $amp_at + 1;
+ continue;
+ }
+
+ if ( ! is_int( $match_byte_length ) || $match_byte_length <= 0 ) {
+ $failures[] = self::failure(
+ 'reader-did-not-advance',
+ $context,
+ array(
+ 'context' => $context,
+ 'at' => $amp_at,
+ 'match_byte_length' => $match_byte_length,
+ )
+ );
+ break;
+ }
+
+ if ( '' === $chunk ) {
+ $failures[] = self::failure(
+ 'reader-returned-empty-chunk',
+ $context,
+ array(
+ 'context' => $context,
+ 'at' => $amp_at,
+ 'match_byte_length' => $match_byte_length,
+ )
+ );
+ break;
+ }
+
+ if ( $match_byte_length < 2 ) {
+ $failures[] = self::failure(
+ 'reader-match-too-short',
+ $context,
+ array(
+ 'context' => $context,
+ 'at' => $amp_at,
+ 'match_byte_length' => $match_byte_length,
+ )
+ );
+ break;
+ }
+
+ if ( $amp_at + $match_byte_length > $end ) {
+ $failures[] = self::failure(
+ 'reader-overran-input',
+ $context,
+ array(
+ 'context' => $context,
+ 'at' => $amp_at,
+ 'match_byte_length' => $match_byte_length,
+ 'input_length' => $end,
+ )
+ );
+ break;
+ }
+
+ if ( $walk_at < $amp_at ) {
+ $walk_spans[] = array(
+ 'type' => 'literal',
+ 'start' => $walk_at,
+ 'end' => $amp_at,
+ );
+ }
+ $walk_spans[] = array(
+ 'type' => 'reference',
+ 'start' => $amp_at,
+ 'end' => $amp_at + $match_byte_length,
+ );
+ $walk_at = $amp_at + $match_byte_length;
+
+ $reference = substr( $payload, $amp_at, $match_byte_length );
+ $numeric_c1_replacement = self::numeric_c1_replacement( $reference );
+ if ( null !== $numeric_c1_replacement && $numeric_c1_replacement !== $chunk ) {
+ $failures[] = self::failure(
+ 'numeric-c1-not-remapped',
+ $context,
+ array_merge(
+ array(
+ 'context' => $context,
+ 'at' => $amp_at,
+ 'expected_base64' => base64_encode( $numeric_c1_replacement ),
+ 'got_base64' => base64_encode( $chunk ),
+ 'match_byte_length' => $match_byte_length,
+ ),
+ self::byte_detail( 'reference', $reference )
+ )
+ );
+ }
+
+ $invalid_numeric_reason = self::invalid_numeric_replacement_reason( $reference );
+ if ( null !== $invalid_numeric_reason && self::REPLACEMENT_CHARACTER !== $chunk ) {
+ $failures[] = self::failure(
+ 'numeric-invalid-not-replacement',
+ $context,
+ array_merge(
+ array(
+ 'context' => $context,
+ 'at' => $amp_at,
+ 'reason' => $invalid_numeric_reason,
+ 'expected_base64' => base64_encode( self::REPLACEMENT_CHARACTER ),
+ 'got_base64' => base64_encode( $chunk ),
+ 'match_byte_length' => $match_byte_length,
+ ),
+ self::byte_detail( 'reference', $reference )
+ )
+ );
+ }
+
+ $local_match_byte_length = null;
+ try {
+ $local_chunk = ( $this->targets['read_character_reference'] )( $decoder_context, $reference, 0, $local_match_byte_length );
+ } catch ( \Throwable $error ) {
+ $failures[] = self::failure(
+ 'target-exception',
+ "{$context}:read-character-reference-local",
+ array(
+ 'context' => $context,
+ 'class' => get_class( $error ),
+ 'message' => $error->getMessage(),
+ )
+ );
+ break;
+ }
+
+ if ( $local_chunk !== $chunk || $local_match_byte_length !== $match_byte_length ) {
+ $failures[] = self::failure(
+ 'reader-composition-mismatch',
+ $context,
+ array_merge(
+ array(
+ 'context' => $context,
+ 'at' => $amp_at,
+ 'match_byte_length' => $match_byte_length,
+ 'local_match_byte_length' => $local_match_byte_length,
+ 'expected_chunk_base64' => base64_encode( $chunk ),
+ 'local_chunk_base64' => is_string( $local_chunk ) ? base64_encode( $local_chunk ) : null,
+ 'local_chunk_type' => gettype( $local_chunk ),
+ ),
+ self::byte_detail( 'reference', $reference )
+ )
+ );
+ }
+
+ $decoded .= substr( $payload, $was_at, $amp_at - $was_at );
+ $decoded .= $chunk;
+ $at = $amp_at + $match_byte_length;
+ $was_at = $at;
+ }
+
+ if ( $was_at < $end ) {
+ $decoded .= substr( $payload, $was_at );
+ }
+
+ if ( array() === $failures ) {
+ if ( $walk_at < $end ) {
+ $walk_spans[] = array(
+ 'type' => 'literal',
+ 'start' => $walk_at,
+ 'end' => $end,
+ );
+ }
+ if ( isset( $this->targets['reader_span_filter'] ) ) {
+ $walk_spans = ( $this->targets['reader_span_filter'] )( $walk_spans );
+ }
+ $failures = array_merge( $failures, $this->validate_reader_walk( $context, $payload, $walk_spans ) );
+ }
+
+ return array(
+ 'decoded' => $decoded,
+ 'failures' => $failures,
+ );
+ }
+
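+ /**
+ * Verifies that the recorded spans tile the payload exactly: each span
+ * starts where the previous one ended, none runs backwards or past the
+ * end, and together they cover every input byte.
+ *
+ * @param string  $context Context label used in the failure signature.
+ * @param string  $payload Input whose byte length the spans must cover.
+ * @param array[] $spans   Spans with integer 'start' and 'end' offsets, in walk order.
+ *
+ * @return array[] Empty on success, otherwise one 'reader-walk-not-gapless' failure.
+ */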
+ private function validate_reader_walk( string $context, string $payload, array $spans ): array {
+ $cursor = 0;
+ $consumed_bytes = 0;
+ $input_length = strlen( $payload );
+
+ foreach ( $spans as $index => $span ) {
+ if ( $span['start'] !== $cursor ) {
+ return array(
+ self::failure(
+ 'reader-walk-not-gapless',
+ $context,
+ array(
+ 'context' => $context,
+ 'reason' => $span['start'] < $cursor ? 'overlap' : 'gap',
+ 'span_index' => $index,
+ 'expected_start' => $cursor,
+ 'actual_start' => $span['start'],
+ 'actual_end' => $span['end'],
+ 'input_length' => $input_length,
+ 'spans' => self::preview_reader_spans( $spans ),
+ )
+ ),
+ );
+ }
+
+ if ( $span['end'] < $span['start'] || $span['end'] > $input_length ) {
+ return array(
+ self::failure(
+ 'reader-walk-not-gapless',
+ $context,
+ array(
+ 'context' => $context,
+ 'reason' => $span['end'] < $span['start'] ? 'negative-span' : 'overrun',
+ 'span_index' => $index,
+ 'expected_start' => $cursor,
+ 'actual_start' => $span['start'],
+ 'actual_end' => $span['end'],
+ 'input_length' => $input_length,
+ 'spans' => self::preview_reader_spans( $spans ),
+ )
+ ),
+ );
+ }
+
+ $consumed_bytes += $span['end'] - $span['start'];
+ $cursor = $span['end'];
+ }
+
+ if ( $cursor !== $input_length || $consumed_bytes !== $input_length ) {
+ return array(
+ self::failure(
+ 'reader-walk-not-gapless',
+ $context,
+ array(
+ 'context' => $context,
+ 'reason' => 'length-mismatch',
+ 'covered_until' => $cursor,
+ 'consumed_bytes' => $consumed_bytes,
+ 'input_length' => $input_length,
+ 'spans' => self::preview_reader_spans( $spans ),
+ )
+ ),
+ );
+ }
+
+ return array();
+ }
+
+ /**
+  * Probes `read_character_reference` at offsets that do not hold an `&` and
+  * expects a null result with the match length left at its sentinel value.
+  *
+  * Probing stops at the first exception or unexpected match.
+  *
+  * @param string $context         Context label used in failure records.
+  * @param string $decoder_context Context value handed to the reader target.
+  * @param string $payload         Bytes being probed.
+  * @return array[] Empty, or a single failure record.
+  */
+ private function check_reader_non_amp_offsets( string $context, string $decoder_context, string $payload ): array {
+     foreach ( $this->reader_non_amp_probe_offsets( $payload ) as $probe_at ) {
+         $reported_length = self::MATCH_BYTE_LENGTH_SENTINEL;
+
+         try {
+             $result = ( $this->targets['read_character_reference'] )( $decoder_context, $payload, $probe_at, $reported_length );
+         } catch ( \Throwable $error ) {
+             $detail = array(
+                 'context' => $context,
+                 'at'      => $probe_at,
+                 'class'   => get_class( $error ),
+                 'message' => $error->getMessage(),
+             );
+
+             return array( self::failure( 'target-exception', "{$context}:read-character-reference-non-amp", $detail ) );
+         }
+
+         $left_untouched = null === $result && self::MATCH_BYTE_LENGTH_SENTINEL === $reported_length;
+         if ( $left_untouched ) {
+             continue;
+         }
+
+         $detail = array(
+             'context'                => $context,
+             'at'                     => $probe_at,
+             'byte_hex'               => bin2hex( $payload[ $probe_at ] ),
+             'chunk_type'             => gettype( $result ),
+             'chunk_base64'           => is_string( $result ) ? base64_encode( $result ) : null,
+             'match_byte_length'      => $reported_length,
+             'match_byte_length_type' => gettype( $reported_length ),
+         );
+
+         return array( self::failure( 'reader-non-amp-match', $context, $detail ) );
+     }
+
+     return array();
+ }
+
+ /**
+  * Picks a few offsets that do not hold an `&`: the start, middle, and end of
+  * the payload plus the bytes on either side of the first `&`, if any.
+  *
+  * @return int[] Distinct in-range offsets, in first-seen order.
+  */
+ private function reader_non_amp_probe_offsets( string $payload ): array {
+     $byte_count = strlen( $payload );
+     if ( 0 === $byte_count ) {
+         return array();
+     }
+
+     $candidates = array( 0, intdiv( $byte_count, 2 ), $byte_count - 1 );
+
+     $first_amp = strpos( $payload, '&' );
+     if ( false !== $first_amp ) {
+         array_push( $candidates, $first_amp - 1, $first_amp + 1 );
+     }
+
+     $usable = array_filter(
+         $candidates,
+         static function ( int $offset ) use ( $payload, $byte_count ): bool {
+             return $offset >= 0 && $offset < $byte_count && '&' !== $payload[ $offset ];
+         }
+     );
+
+     return array_values( array_unique( $usable ) );
+ }
+
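+ /**
+ * Compares the `attribute_starts_with` target against prefix matches
+ * computed from the oracle's attribute decode of the payload, in both
+ * case-sensitivity modes, then runs the monotonicity checks over the same
+ * cached results.
+ *
+ * @param string $payload Raw attribute bytes handed to the target.
+ * @return array[] Failure records; empty when every check passes.
+ */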
+ private function check_attribute_starts_with( string $payload ): array {
+ $failures = array();
+
+ try {
+ $decoded = $this->oracles->decode( 'attribute', $payload );
+ } catch ( \Throwable $error ) {
+ return array(
+ self::failure(
+ 'oracle-exception',
+ 'attribute:decode-for-prefix',
+ array(
+ 'context' => 'attribute',
+ 'class' => get_class( $error ),
+ 'message' => $error->getMessage(),
+ )
+ ),
+ );
+ }
+
+ $searches = $this->attribute_searches( $decoded );
+ $results = array();
+ $get_result = function ( string $search, string $case_sensitivity ) use ( $payload, &$failures, &$results ): ?bool {
+ $result_key = $case_sensitivity . "\0" . $search;
+ if ( array_key_exists( $result_key, $results ) ) {
+ return $results[ $result_key ];
+ }
+
+ try {
+ $results[ $result_key ] = ( $this->targets['attribute_starts_with'] )( $payload, $search, $case_sensitivity );
+ } catch ( \Throwable $error ) {
+ $failures[] = self::failure(
+ 'target-exception',
+ "attribute-starts-with:{$case_sensitivity}",
+ array(
+ 'target' => 'attribute_starts_with',
+ 'case_sensitivity' => $case_sensitivity,
+ 'class' => get_class( $error ),
+ 'message' => $error->getMessage(),
+ )
+ );
+ $results[ $result_key ] = null;
+ }
+
+ return $results[ $result_key ];
+ };
+
+ foreach ( $searches as $search ) {
+ foreach ( array( 'case-sensitive', 'ascii-case-insensitive' ) as $case_sensitivity ) {
+ $expected = $this->expected_prefix_match( $decoded, $search, $case_sensitivity );
+ $got = $get_result( $search, $case_sensitivity );
+ if ( null === $got ) {
+ continue;
+ }
+
+ if ( $got !== $expected ) {
+ $failures[] = self::failure(
+ 'attribute-starts-with-mismatch',
+ $case_sensitivity,
+ array_merge(
+ array(
+ 'case_sensitivity' => $case_sensitivity,
+ 'expected' => $expected,
+ 'got' => $got,
+ 'decoded' => self::preview( $decoded ),
+ ),
+ self::byte_detail( 'search', $search )
+ )
+ );
+ }
+ }
+ }
+
+ $monotonicity_failures = $this->check_attribute_starts_with_monotonicity( $searches, $get_result );
+ $failures = array_merge( $failures, $monotonicity_failures );
+
+ return $failures;
+ }
+
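+ /**
+ * Checks three invariants of `attribute_starts_with` results: a matching
+ * search implies that every shorter byte prefix of it matches, a
+ * non-matching search stays non-matching when extended, and a
+ * case-sensitive match implies an ASCII-case-insensitive match.
+ *
+ * @param string[] $searches   Search strings; their byte prefixes are checked too.
+ * @param callable $get_result Called with a search and a case-sensitivity mode; returns bool, or null when the target threw.
+ * @return array[] Failure records; empty when every invariant holds.
+ */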
+ private function check_attribute_starts_with_monotonicity( array $searches, callable $get_result ): array {
+ $failures = array();
+ $candidates = array();
+
+ foreach ( $searches as $search ) {
+ $candidates[ $search ] = true;
+ foreach ( self::byte_prefixes( $search ) as $prefix ) {
+ $candidates[ $prefix ] = true;
+ }
+ }
+
+ $case_sensitivities = array( 'case-sensitive', 'ascii-case-insensitive' );
+ foreach ( array_keys( $candidates ) as $search ) {
+ foreach ( $case_sensitivities as $case_sensitivity ) {
+ $got = $get_result( $search, $case_sensitivity );
+ if ( true === $got ) {
+ foreach ( self::byte_prefixes( $search ) as $prefix ) {
+ $prefix_got = $get_result( $prefix, $case_sensitivity );
+ if ( false === $prefix_got ) {
+ $failures[] = self::failure(
+ 'attribute-starts-with-prefix-monotonicity',
+ $case_sensitivity,
+ array_merge(
+ array(
+ 'case_sensitivity' => $case_sensitivity,
+ ),
+ self::byte_detail( 'search', $search ),
+ self::byte_detail( 'prefix', $prefix )
+ )
+ );
+ break;
+ }
+ }
+ }
+
+ if ( false === $got ) {
+ foreach ( self::attribute_search_extensions() as $suffix ) {
+ $extension = $search . $suffix;
+ $extension_got = $get_result( $extension, $case_sensitivity );
+ if ( true === $extension_got ) {
+ $failures[] = self::failure(
+ 'attribute-starts-with-extension-monotonicity',
+ $case_sensitivity,
+ array_merge(
+ array(
+ 'case_sensitivity' => $case_sensitivity,
+ ),
+ self::byte_detail( 'search', $search ),
+ self::byte_detail( 'extension', $extension )
+ )
+ );
+ break;
+ }
+ }
+ }
+ }
+
+ $case_sensitive = $get_result( $search, 'case-sensitive' );
+ if ( true === $case_sensitive ) {
+ $case_insensitive = $get_result( $search, 'ascii-case-insensitive' );
+ if ( false === $case_insensitive ) {
+ $failures[] = self::failure(
+ 'attribute-starts-with-case-monotonicity',
+ 'case-sensitive',
+ self::byte_detail( 'search', $search )
+ );
+ }
+ }
+ }
+
+ return $failures;
+ }
+
+ /**
+  * Builds the search strings to try against a decoded attribute value: a
+  * fixed list, sampled ASCII prefixes (as-is, with an 'x' appended, and
+  * upper-cased), and every byte prefix up to ATTRIBUTE_SEARCH_PREFIX_BYTES
+  * (as-is and with an 'x' appended).
+  *
+  * @return string[] De-duplicated search strings.
+  */
+ private function attribute_searches( string $decoded ): array {
+     $searches = array( '', 'a', 'A', 'http', 'https:', 'javascript:', ':', '&' );
+
+     foreach ( array( 1, 2, 4, 8, 11 ) as $sample_length ) {
+         $head = substr( $decoded, 0, $sample_length );
+         if ( '' === $head || ! self::is_ascii( $head ) ) {
+             continue;
+         }
+
+         array_push( $searches, $head, $head . 'x', strtoupper( $head ) );
+     }
+
+     $limit = min( strlen( $decoded ), self::ATTRIBUTE_SEARCH_PREFIX_BYTES );
+     $take  = 1;
+     while ( $take <= $limit ) {
+         $head = substr( $decoded, 0, $take );
+         array_push( $searches, $head, $head . 'x' );
+         ++$take;
+     }
+
+     return array_values( array_unique( $searches ) );
+ }
+
+ /**
+  * Computes whether the decoded value starts with the search string.
+  *
+  * An empty search always matches. In 'ascii-case-insensitive' mode only the
+  * letters A-Z are folded; any other mode compares bytes exactly.
+  */
+ private function expected_prefix_match( string $decoded, string $search, string $case_sensitivity ): bool {
+     $needle_length = strlen( $search );
+     if ( 0 === $needle_length ) {
+         return true;
+     }
+
+     if ( strlen( $decoded ) < $needle_length ) {
+         return false;
+     }
+
+     $head = substr( $decoded, 0, $needle_length );
+
+     return 'ascii-case-insensitive' === $case_sensitivity
+         ? self::ascii_lower( $head ) === self::ascii_lower( $search )
+         : $head === $search;
+ }
+
+ /**
+  * Reports whether the text contains no byte in the range 0x80-0xFF.
+  */
+ private static function is_ascii( string $text ): bool {
+     $has_high_byte = preg_match( '/[\x80-\xFF]/', $text );
+
+     return ! $has_high_byte;
+ }
+
+ /**
+  * Lower-cases the letters A-Z and leaves every other byte unchanged.
+  */
+ private static function ascii_lower( string $text ): string {
+     $upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+     $lower = 'abcdefghijklmnopqrstuvwxyz';
+
+     return strtr( $text, $upper, $lower );
+ }
+
+ /**
+  * Lists every proper byte prefix of the text, shortest first, starting
+  * with the empty string. The text itself is not included.
+  *
+  * @return string[]
+  */
+ private static function byte_prefixes( string $text ): array {
+     $prefixes = array();
+     $total    = strlen( $text );
+     $take     = 0;
+
+     while ( $take < $total ) {
+         $prefixes[] = substr( $text, 0, $take );
+         ++$take;
+     }
+
+     return $prefixes;
+ }
+
+ /**
+  * Suffixes appended to a non-matching search in the extension monotonicity check.
+  *
+  * @return string[]
+  */
+ private static function attribute_search_extensions(): array {
+     $suffixes = array( "\x7F", 'x', 'A', '0', ':' );
+
+     return $suffixes;
+ }
+
+ /**
+  * Returns the expected replacement for a numeric reference whose value is
+  * in 0x80-0x9F, looked up in C1_NUMERIC_REMAP, or null for any other input.
+  */
+ private static function numeric_c1_replacement( string $reference ): ?string {
+     $value = self::numeric_reference_value( $reference );
+     if ( null === $value ) {
+         return null;
+     }
+
+     $in_c1_range = $value >= 0x80 && $value <= 0x9F;
+     if ( ! $in_c1_range ) {
+         return null;
+     }
+
+     $replacement = mb_chr( self::C1_NUMERIC_REMAP[ $value - 0x80 ], 'UTF-8' );
+     if ( false === $replacement ) {
+         return null;
+     }
+
+     return $replacement;
+ }
+
+ /**
+  * Classifies a numeric reference whose value is expected to decode to the
+  * replacement character.
+  *
+  * @return string|null 'zero', 'surrogate', or 'above-unicode'; null when the
+  *                     reference is not numeric or its value is none of those.
+  */
+ private static function invalid_numeric_replacement_reason( string $reference ): ?string {
+     $value = self::numeric_reference_value( $reference );
+     if ( null === $value ) {
+         return null;
+     }
+
+     $reason = null;
+     if ( 0 === $value ) {
+         $reason = 'zero';
+     } elseif ( $value >= 0xD800 && $value <= 0xDFFF ) {
+         $reason = 'surrogate';
+     } elseif ( $value > 0x10FFFF ) {
+         $reason = 'above-unicode';
+     }
+
+     return $reason;
+ }
+
+ /**
+  * Parses a complete numeric character reference and returns its code point value.
+  *
+  * The reference must be the whole string and start with `&#`: decimal
+  * (`&#128;`) or hexadecimal (`&#x80;` / `&#X80;`), with an optional trailing
+  * semicolon. Callers pass the matched slice of the payload, which begins at
+  * the ampersand, so the pattern has to include that prefix to ever match.
+  *
+  * @param string $reference Candidate reference, starting at its `&`.
+  * @return int|null The numeric value (0 when every digit is zero), or null when
+  *                  the input is not a numeric reference or has more significant
+  *                  digits than this helper evaluates (6 hex / 7 decimal).
+  */
+ private static function numeric_reference_value( string $reference ): ?int {
+     if ( 1 !== preg_match( '/^&#(?:([xX])([0-9A-Fa-f]+)|([0-9]+));?$/', $reference, $match ) ) {
+         return null;
+     }
+
+     $is_hex     = '' !== ( $match[1] ?? '' );
+     $digits     = $is_hex ? $match[2] : $match[3];
+     $base       = $is_hex ? 16 : 10;
+     $max_digits = $is_hex ? 6 : 7;
+
+     // Leading zeros do not count towards the digit limit.
+     $significant_digits = substr( $digits, strspn( $digits, '0' ) );
+
+     if ( '' === $significant_digits ) {
+         return 0;
+     }
+
+     // Longer digit runs are left unclassified rather than converted.
+     if ( strlen( $significant_digits ) > $max_digits ) {
+         return null;
+     }
+
+     return intval( $significant_digits, $base );
+ }
+
+ /**
+  * Reports whether any byte of the input is in the range 0x80-0x9F.
+  */
+ private static function contains_raw_c1_byte( string $bytes ): bool {
+     $match_count = preg_match( '/[\x80-\x9F]/', $bytes );
+
+     return 1 === $match_count;
+ }
+
+ /**
+  * Computes the expected decode for payloads in which every `&` begins one
+  * of the SINGLE_LEVEL_DECODE_FIXTURES keys.
+  *
+  * @return string|null The payload with each fixture replaced by its mapped
+  *                     value, or null when the payload has no `&` or has an
+  *                     `&` that does not start a fixture.
+  */
+ private static function single_level_decode_expected( string $payload ): ?string {
+     $expected    = '';
+     $cursor      = 0;
+     $matched_any = false;
+
+     for ( ;; ) {
+         $amp_at = strpos( $payload, '&', $cursor );
+         if ( false === $amp_at ) {
+             break;
+         }
+
+         // Copy the literal text between the previous fixture and this ampersand.
+         $expected .= substr( $payload, $cursor, $amp_at - $cursor );
+
+         $rest = substr( $payload, $amp_at );
+         $hit  = null;
+         foreach ( self::SINGLE_LEVEL_DECODE_FIXTURES as $fixture => $decoded ) {
+             if ( str_starts_with( $rest, $fixture ) ) {
+                 $hit       = $fixture;
+                 $expected .= $decoded;
+                 break;
+             }
+         }
+
+         if ( null === $hit ) {
+             return null;
+         }
+
+         $cursor      = $amp_at + strlen( $hit );
+         $matched_any = true;
+     }
+
+     if ( ! $matched_any ) {
+         return null;
+     }
+
+     return $expected . substr( $payload, $cursor );
+ }
+
+ /**
+  * Describes a byte string for a failure record, with keys prefixed by $name:
+  * length, base64, and a hex preview, plus the raw text when it is valid UTF-8.
+  */
+ private static function byte_detail( string $name, string $bytes ): array {
+     $detail = array();
+
+     $detail[ $name . '_length' ]  = strlen( $bytes );
+     $detail[ $name . '_base64' ]  = base64_encode( $bytes );
+     $detail[ $name . '_preview' ] = self::preview( $bytes );
+
+     $is_utf8 = mb_check_encoding( $bytes, 'UTF-8' );
+     if ( $is_utf8 ) {
+         $detail[ $name . '_text' ] = $bytes;
+     }
+
+     return $detail;
+ }
+
+ /**
+  * Limits the span list included in a failure record to its first 16 entries.
+  *
+  * @param array[] $spans Walk spans.
+  *
+  * @return array[] At most 16 spans from the start of the list.
+  */
+ private static function preview_reader_spans( array $spans ): array {
+     $shown = array_slice( $spans, 0, 16 );
+
+     return $shown;
+ }
+
+ /**
+  * Builds a failure record; its signature is the check name and the party
+  * joined by a colon.
+  */
+ private static function failure( string $check, string $party, array $detail ): array {
+     $signature = $check . ':' . $party;
+
+     return array(
+         'check'     => $check,
+         'signature' => $signature,
+         'detail'    => $detail,
+     );
+ }
+
+ /**
+  * Describes a mismatch between two byte strings: their lengths, the first
+  * differing offset, base64 copies, and hex windows positioned at that offset.
+  */
+ private static function diff_detail( string $context, string $expected, string $got ): array {
+     $diff_at = self::first_difference( $expected, $got );
+
+     $detail = array( 'context' => $context );
+
+     $detail['expected_length'] = strlen( $expected );
+     $detail['got_length']      = strlen( $got );
+     $detail['first_diff_at']   = $diff_at;
+     $detail['expected_base64'] = base64_encode( $expected );
+     $detail['got_base64']      = base64_encode( $got );
+     $detail['expected_window'] = self::preview( $expected, $diff_at );
+     $detail['got_window']      = self::preview( $got, $diff_at );
+
+     return $detail;
+ }
+
+ /**
+  * Returns the offset of the first differing byte, or the length of the
+  * shorter string when one is a prefix of the other (or they are equal).
+  */
+ private static function first_difference( string $a, string $b ): int {
+     $shared = min( strlen( $a ), strlen( $b ) );
+     $at     = 0;
+
+     while ( $at < $shared && $a[ $at ] === $b[ $at ] ) {
+         ++$at;
+     }
+
+     return $at;
+ }
+
+ /**
+  * Hex-encodes up to PREVIEW_BYTES bytes of the input, starting half a
+  * window before $center and never before offset 0.
+  */
+ private static function preview( string $bytes, int $center = 0 ): string {
+     $half_window = intdiv( self::PREVIEW_BYTES, 2 );
+     $start       = max( 0, $center - $half_window );
+     $window      = substr( $bytes, $start, self::PREVIEW_BYTES );
+
+     return bin2hex( $window );
+ }
+}
diff --git a/tools/html-decoder-fuzz/lib/Cli.php b/tools/html-decoder-fuzz/lib/Cli.php
new file mode 100644
index 0000000000000..b3c2935d4ea5a
--- /dev/null
+++ b/tools/html-decoder-fuzz/lib/Cli.php
@@ -0,0 +1,251 @@
+ $defaults
+ * @return array
+ */
+ public static function parse_args( array $argv, array $defaults ): array {
+ // Start from the defaults; every accepted option overwrites its entry.
+ $options = $defaults;
+ $count = count( $argv );
+
+ // Index 0 is skipped. Any invalid input below writes to STDERR and exits with status 2.
+ for ( $i = 1; $i < $count; $i++ ) {
+ $arg = $argv[ $i ];
+ if ( 0 !== strncmp( $arg, '--', 2 ) ) {
+ fwrite( STDERR, "Unexpected argument: {$arg}\n" );
+ exit( 2 );
+ }
+
+ // Accept both `--name=value` and `--name value`.
+ $body = substr( $arg, 2 );
+ if ( false !== strpos( $body, '=' ) ) {
+ list( $name, $value ) = explode( '=', $body, 2 );
+ } else {
+ $name = $body;
+ if ( $i + 1 >= $count ) {
+ fwrite( STDERR, "Missing value for --{$name}\n" );
+ exit( 2 );
+ }
+ // Consume the following argument as this option's value.
+ $value = $argv[ ++$i ];
+ }
+
+ // Only names present in $defaults are valid options.
+ if ( ! array_key_exists( $name, $defaults ) ) {
+ fwrite( STDERR, "Unknown option --{$name}\n" );
+ exit( 2 );
+ }
+
+ // The default's type decides parsing: integer defaults require an integer value.
+ if ( is_int( $defaults[ $name ] ) ) {
+ if ( 1 !== preg_match( '/^-?\d+$/', $value ) ) {
+ fwrite( STDERR, "--{$name} must be an integer\n" );
+ exit( 2 );
+ }
+ // Compare the magnitude with PHP_INT_MAX as decimal strings before casting.
+ $digits = '-' === $value[0] ? substr( $value, 1 ) : $value;
+ $digits = ltrim( $digits, '0' );
+ $digits = '' === $digits ? '0' : $digits;
+ $max = (string) PHP_INT_MAX;
+ if ( strlen( $digits ) > strlen( $max ) || ( strlen( $digits ) === strlen( $max ) && strcmp( $digits, $max ) > 0 ) ) {
+ fwrite( STDERR, "--{$name} is outside the supported integer range\n" );
+ exit( 2 );
+ }
+ $options[ $name ] = (int) $value;
+ } else {
+ $options[ $name ] = $value;
+ }
+ }
+
+ return $options;
+ }
+
+ /**
+  * Writes one record to STDOUT as a single JSON line.
+  *
+  * If encoding or the write fails, reports on STDERR and exits with status 2.
+  */
+ public static function emit( array $record ): void {
+     $json    = json_encode( $record, JSON_UNESCAPED_SLASHES );
+     $written = false !== $json && self::write_stream( STDOUT, $json . "\n" );
+
+     if ( $written ) {
+         return;
+     }
+
+     fwrite( STDERR, "Cannot write worker event\n" );
+     exit( 2 );
+ }
+
+ /**
+  * Writes the contents to a stream and reports whether every byte was written.
+  *
+  * @param resource $stream Writable stream.
+  */
+ public static function write_stream( $stream, string $contents ): bool {
+     $written = fwrite( $stream, $contents );
+     if ( ! is_int( $written ) ) {
+         return false;
+     }
+
+     return strlen( $contents ) === $written;
+ }
+
+ /**
+  * Replaces a file's contents, refusing paths that is_linked_file() flags.
+  *
+  * @return bool True only when every byte was written.
+  */
+ public static function write_file( string $path, string $contents ): bool {
+     $refused = self::is_linked_file( $path );
+     if ( $refused ) {
+         return false;
+     }
+
+     $written = file_put_contents( $path, $contents );
+     if ( ! is_int( $written ) ) {
+         return false;
+     }
+
+     return strlen( $contents ) === $written;
+ }
+
+ /**
+  * Appends to a file, refusing paths that is_linked_file() flags.
+  *
+  * Appending an empty string succeeds without touching the filesystem.
+  *
+  * @return bool True only when every byte was written (or there was nothing to write).
+  */
+ public static function append_file( string $path, string $contents ): bool {
+     if ( '' === $contents ) {
+         return true;
+     }
+
+     $refused = self::is_linked_file( $path );
+     if ( $refused ) {
+         return false;
+     }
+
+     $written = file_put_contents( $path, $contents, FILE_APPEND );
+     if ( ! is_int( $written ) ) {
+         return false;
+     }
+
+     return strlen( $contents ) === $written;
+ }
+
+ /**
+  * Reports whether the path is a symbolic link, an existing entry that is
+  * neither a directory nor a regular file, or a regular file whose lstat()
+  * link count is greater than one.
+  *
+  * Missing paths and directories are reported as not linked.
+  */
+ public static function is_linked_file( string $path ): bool {
+     if ( is_link( $path ) ) {
+         return true;
+     }
+
+     $missing_or_directory = ! file_exists( $path ) || is_dir( $path );
+     if ( $missing_or_directory ) {
+         return false;
+     }
+
+     if ( ! is_file( $path ) ) {
+         return true;
+     }
+
+     $stat = @lstat( $path );
+     if ( ! is_array( $stat ) || ! isset( $stat['nlink'] ) ) {
+         return false;
+     }
+
+     return $stat['nlink'] > 1;
+ }
+
+ /**
+  * Derives an order-independent key for a set of failure signatures: the
+  * SHA-256 of the mode and the sorted signatures, separated by NUL bytes.
+  */
+ public static function failure_signature_key( array $signatures, string $mode = 'oracle' ): string {
+     $normalized = array();
+     foreach ( $signatures as $signature ) {
+         $normalized[] = strval( $signature );
+     }
+     sort( $normalized, SORT_STRING );
+
+     $material = $mode . "\0" . implode( "\0", $normalized );
+
+     return hash( 'sha256', $material );
+ }
+
+ /**
+ * Deletes a file, link, or directory tree, but only when it resolves to a
+ * location inside $root.
+ *
+ * @param string $path Entry to delete.
+ * @param string $root Directory that must contain the entry; the root itself is never removed.
+ * @return bool True when the entry was removed or was not a file, link, or
+ *              directory to begin with; false when a containment check or
+ *              any removal step failed.
+ */
+ public static function remove_tree( string $path, string $root ): bool {
+ if ( is_link( $path ) || is_file( $path ) ) {
+ // Resolve the parent directory rather than the path itself, so that a
+ // link is judged by where it lives instead of where it points.
+ $real_root = realpath( $root );
+ $real_parent = realpath( dirname( $path ) );
+ if ( false === $real_root || false === $real_parent ) {
+ return false;
+ }
+
+ // The trailing separator keeps a sibling such as "root-other" from passing as "root".
+ $root_prefix = rtrim( $real_root, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR;
+ if ( $real_parent !== $real_root && 0 !== strncmp( $real_parent . DIRECTORY_SEPARATOR, $root_prefix, strlen( $root_prefix ) ) ) {
+ return false;
+ }
+
+ return @unlink( $path );
+ }
+ // Neither a file, a link, nor a directory: there is nothing to remove.
+ if ( ! is_dir( $path ) ) {
+ return true;
+ }
+
+ $real_path = realpath( $path );
+ $real_root = realpath( $root );
+ // Refuse unresolvable paths and refuse to delete the root itself.
+ if ( false === $real_path || false === $real_root || $real_path === $real_root ) {
+ return false;
+ }
+
+ $root_prefix = rtrim( $real_root, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR;
+ if ( 0 !== strncmp( $real_path . DIRECTORY_SEPARATOR, $root_prefix, strlen( $root_prefix ) ) ) {
+ return false;
+ }
+
+ // CHILD_FIRST yields a directory's contents before the directory itself.
+ $items = new \RecursiveIteratorIterator(
+ new \RecursiveDirectoryIterator( $real_path, \FilesystemIterator::SKIP_DOTS ),
+ \RecursiveIteratorIterator::CHILD_FIRST
+ );
+
+ foreach ( $items as $item ) {
+ $pathname = $item->getPathname();
+ // Real directories are removed with rmdir(); everything else, including links to directories, is unlinked.
+ if ( $item->isDir() && ! $item->isLink() ) {
+ if ( ! @rmdir( $pathname ) ) {
+ return false;
+ }
+ } elseif ( ! @unlink( $pathname ) ) {
+ return false;
+ }
+ }
+
+ return @rmdir( $real_path );
+ }
+
+ /**
+  * Exits with status 2 unless the named option is an integer that is at
+  * least $minimum.
+  */
+ public static function require_int_at_least( array $options, string $name, int $minimum ): void {
+     $value = $options[ $name ] ?? null;
+
+     if ( is_int( $value ) && $value >= $minimum ) {
+         return;
+     }
+
+     fwrite( STDERR, "--{$name} must be at least {$minimum}\n" );
+     exit( 2 );
+ }
+
+ /**
+  * Exits with status 2 unless the named option is set and strictly equal to
+  * one of the allowed values.
+  *
+  * @param string[] $allowed Accepted values.
+  */
+ public static function require_one_of( array $options, string $name, array $allowed ): void {
+     $value = $options[ $name ] ?? null;
+
+     if ( null !== $value && in_array( $value, $allowed, true ) ) {
+         return;
+     }
+
+     fwrite( STDERR, "--{$name} must be one of: " . implode( ', ', $allowed ) . "\n" );
+     exit( 2 );
+ }
+
+ /**
+  * Collects the current commit, branch name, and tracked-file dirtiness by
+  * running git in the given directory.
+  *
+  * @return array{commit: ?string, branch: ?string, dirty: ?bool} Each entry is
+  *         null when its git command could not be started or exited non-zero.
+  */
+ public static function git_metadata( string $repo_root ): array {
+     $capture = static function ( array $command ) use ( $repo_root ): ?string {
+         // stdin and stderr go to /dev/null; only stdout is captured.
+         $descriptors = array(
+             0 => array( 'file', '/dev/null', 'r' ),
+             1 => array( 'pipe', 'w' ),
+             2 => array( 'file', '/dev/null', 'a' ),
+         );
+
+         $process = @proc_open( $command, $descriptors, $pipes, $repo_root );
+         if ( ! is_resource( $process ) ) {
+             return null;
+         }
+
+         $stdout = stream_get_contents( $pipes[1] );
+         fclose( $pipes[1] );
+
+         if ( 0 !== proc_close( $process ) ) {
+             return null;
+         }
+
+         return trim( (string) $stdout );
+     };
+
+     $commit = $capture( array( 'git', 'rev-parse', 'HEAD' ) );
+     $branch = $capture( array( 'git', 'rev-parse', '--abbrev-ref', 'HEAD' ) );
+     $status = $capture( array( 'git', 'status', '--porcelain', '--untracked-files=no' ) );
+
+     $dirty = null;
+     if ( null !== $status ) {
+         $dirty = '' !== $status;
+     }
+
+     return array(
+         'commit' => $commit,
+         'branch' => $branch,
+         'dirty'  => $dirty,
+     );
+ }
+
+ /**
+  * Reports the PHP version, the OS family, and the oracle names in use.
+  */
+ public static function environment_metadata( Oracles $oracles ): array {
+     $metadata = array();
+
+     $metadata['php']     = PHP_VERSION;
+     $metadata['os']      = PHP_OS_FAMILY;
+     $metadata['oracles'] = $oracles->names();
+
+     return $metadata;
+ }
+
+ /**
+  * Summarizes a payload: its byte length, its SHA-256, and the hex of its
+  * first 80 bytes, with '...' appended when the payload is longer.
+  */
+ public static function payload_preview( string $payload ): array {
+     $byte_count = strlen( $payload );
+
+     $hex = bin2hex( substr( $payload, 0, 80 ) );
+     if ( $byte_count > 80 ) {
+         $hex .= '...';
+     }
+
+     return array(
+         'bytes'  => $byte_count,
+         'sha256' => hash( 'sha256', $payload ),
+         'hex'    => $hex,
+     );
+ }
+}
diff --git a/tools/html-decoder-fuzz/lib/CoverageGuidance.php b/tools/html-decoder-fuzz/lib/CoverageGuidance.php
new file mode 100644
index 0000000000000..239abe787820d
--- /dev/null
+++ b/tools/html-decoder-fuzz/lib/CoverageGuidance.php
@@ -0,0 +1,268 @@
+ */
+ private array $seen_edges = array();
+
+ /** @var string[] */
+ private array $target_files;
+
+ /** @var array */
+ private array $target_file_set;
+
+ private string $provider;
+
+ /**
+  * Resolves the coverage target files and selects the coverage provider.
+  *
+  * The provider is 'fake' when fake coverage is enabled and 'pcov' otherwise.
+  */
+ public function __construct() {
+     $files = self::target_files();
+
+     $this->target_files    = $files;
+     $this->target_file_set = array_fill_keys( $files, true );
+
+     if ( self::fake_enabled() ) {
+         $this->provider = 'fake';
+     } else {
+         $this->provider = 'pcov';
+     }
+ }
+
+ /**
+  * Reports whether a coverage provider (fake or pcov) can be used.
+  */
+ public static function available(): bool {
+     if ( self::fake_enabled() ) {
+         return true;
+     }
+
+     return self::pcov_available();
+ }
+
+ /**
+  * Explains why pcov-based coverage cannot be used.
+  *
+  * Checks run in the same order as the availability test: the opt-out environment
+  * variable, the extension, the `pcov.enabled` ini setting, then the required functions.
+  *
+  * @return string Message for the first failing check, or a generic message when none fails.
+  */
+ public static function unavailable_reason(): string {
+     $reason = 'coverage mode is unavailable';
+
+     if ( getenv( 'HTML_DECODER_FUZZ_DISABLE_PCOV' ) ) {
+         $reason = 'coverage mode requires pcov; pcov was disabled by HTML_DECODER_FUZZ_DISABLE_PCOV';
+     } elseif ( ! extension_loaded( 'pcov' ) ) {
+         $reason = 'coverage mode requires the pcov extension';
+     } elseif ( '0' === (string) ini_get( 'pcov.enabled' ) ) {
+         $reason = 'coverage mode requires pcov.enabled=1';
+     } else {
+         foreach ( array( 'pcov\\start', 'pcov\\stop', 'pcov\\collect', 'pcov\\clear' ) as $required_function ) {
+             if ( ! function_exists( $required_function ) ) {
+                 $reason = 'coverage mode requires the pcov start, stop, collect, and clear functions';
+                 break;
+             }
+         }
+     }
+
+     return $reason;
+ }
+
+ /**
+ * Returns the coverage provider chosen at construction: 'fake' or 'pcov'.
+ */
+ public function provider(): string {
+ return $this->provider;
+ }
+
+ /**
+ * Resets and starts pcov collection for one fuzz case.
+ *
+ * Does nothing unless the provider is 'pcov'.
+ */
+ public function begin_case(): void {
+ if ( 'pcov' !== $this->provider ) {
+ return;
+ }
+
+ // Stop and clear first so data from earlier code is not attributed to this case.
+ \pcov\stop();
+ \pcov\clear();
+ \pcov\start();
+ }
+
+ /**
+ * Ends coverage collection for one case and returns the edges it touched.
+ *
+ * With the 'fake' provider the edges are derived from the inputs alone; otherwise
+ * pcov is stopped, queried for the target files, and cleared.
+ *
+ * @param string $payload  Payload that was executed (used only by the fake provider).
+ * @param string $context  Decoder context label (used only by the fake provider).
+ * @param string $strategy Generation strategy label (used only by the fake provider).
+ * @return array List of edge records as produced by edge().
+ */
+ public function finish_case( string $payload, string $context, string $strategy ): array {
+ if ( 'fake' === $this->provider ) {
+ return $this->fake_edges( $payload, $context, $strategy );
+ }
+
+ \pcov\stop();
+ // Fall back to the literal 1 when the pcov\inclusive constant is not defined.
+ $type = defined( 'pcov\\inclusive' ) ? constant( 'pcov\\inclusive' ) : 1;
+ $coverage = \pcov\collect( $type, $this->target_files );
+ \pcov\clear();
+
+ return $this->normalize_coverage( $coverage );
+ }
+
+ /**
+  * Filters a case's edges down to those never seen before and records them as seen.
+  *
+  * @param array $edges Edge records, each carrying a 'key' entry.
+  * @return array The subset of $edges whose key was not previously recorded, in input order.
+  */
+ public function new_edges( array $edges ): array {
+     $fresh = array();
+
+     foreach ( $edges as $candidate ) {
+         $key = $candidate['key'];
+         if ( ! isset( $this->seen_edges[ $key ] ) ) {
+             $this->seen_edges[ $key ] = true;
+             $fresh[]                  = $candidate;
+         }
+     }
+
+     return $fresh;
+ }
+
+ /**
+ * Returns how many distinct edge keys have been recorded so far.
+ */
+ public function seen_edge_count(): int {
+ return count( $this->seen_edges );
+ }
+
+ /**
+ * Writes a payload that produced new coverage into the on-disk coverage corpus.
+ *
+ * Artifacts live under `<output_dir>/coverage-corpus/payload-seed<seed>-case<case>-<hash16>`
+ * and consist of `payload.txt` plus a `coverage.json` manifest. If the artifact directory
+ * already exists, nothing is written and the artifact is reported as reused.
+ *
+ * @param string $output_dir Base output directory; an empty string disables retention.
+ * @param string $seed       Run seed; characters outside [A-Za-z0-9_-] become '_' in the path.
+ * @param int    $case       Case number used in the path and manifest.
+ * @param array{context: string, strategy: string, payload: string} $generated
+ * @param string $payload    Payload bytes to store.
+ * @param array $new_edges   Edge records recorded in the manifest.
+ * @return array{artifact_dir: ?string, artifact_retained: bool, artifact_reused: bool}
+ * @throws \RuntimeException When a corpus path is a symlink or non-directory, or when a
+ *                           directory or file cannot be created.
+ */
+ public function retain_payload( string $output_dir, string $seed, int $case, array $generated, string $payload, array $new_edges ): array {
+ if ( '' === $output_dir ) {
+ return array(
+ 'artifact_dir' => null,
+ 'artifact_retained' => false,
+ 'artifact_reused' => false,
+ );
+ }
+
+ $coverage_dir = rtrim( $output_dir, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR . 'coverage-corpus';
+ // Refuse symlinks and non-directories rather than writing through them.
+ if ( is_link( $coverage_dir ) || ( file_exists( $coverage_dir ) && ! is_dir( $coverage_dir ) ) ) {
+ throw new \RuntimeException( "coverage corpus path is not a directory: {$coverage_dir}" );
+ }
+ // The trailing is_dir() re-check tolerates the directory appearing between the check and mkdir().
+ if ( ! is_dir( $coverage_dir ) && ! mkdir( $coverage_dir, 0777, true ) && ! is_dir( $coverage_dir ) ) {
+ throw new \RuntimeException( "cannot create coverage corpus dir {$coverage_dir}" );
+ }
+
+ $payload_hash = hash( 'sha256', $payload );
+ $case_dir = sprintf(
+ '%s/payload-seed%s-case%d-%s',
+ $coverage_dir,
+ preg_replace( '/[^A-Za-z0-9_-]/', '_', $seed ),
+ $case,
+ substr( $payload_hash, 0, 16 )
+ );
+ if ( is_link( $case_dir ) || ( file_exists( $case_dir ) && ! is_dir( $case_dir ) ) ) {
+ throw new \RuntimeException( "coverage corpus artifact path is not a directory: {$case_dir}" );
+ }
+
+ // An existing directory counts as reused; its contents are not inspected.
+ $artifact_reused = is_dir( $case_dir );
+ if ( ! $artifact_reused && ! mkdir( $case_dir, 0777, false ) && ! is_dir( $case_dir ) ) {
+ throw new \RuntimeException( "cannot create coverage corpus artifact {$case_dir}" );
+ }
+
+ if ( ! $artifact_reused ) {
+ $manifest = array(
+ 'type' => 'coverage',
+ 'seed' => $seed,
+ 'case' => $case,
+ 'mode' => 'coverage',
+ 'context' => $generated['context'],
+ 'strategy' => $generated['strategy'],
+ 'input_size' => strlen( $payload ),
+ 'payload_base64' => base64_encode( $payload ),
+ 'payload_preview' => Cli::payload_preview( $payload ),
+ 'coverage_provider' => $this->provider,
+ 'new_edge_count' => count( $new_edges ),
+ 'new_edges' => $new_edges,
+ 'git' => Cli::git_metadata( Bootstrap::repo_root() ),
+ );
+ $manifest_json = json_encode( $manifest, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES );
+ // NOTE(review): a failure here leaves $case_dir on disk, so a later call with the same
+ // seed, case, and payload would report it as reused even though it holds no files.
+ if (
+ false === $manifest_json ||
+ ! Cli::write_file( "{$case_dir}/payload.txt", $payload ) ||
+ ! Cli::write_file( "{$case_dir}/coverage.json", $manifest_json )
+ ) {
+ throw new \RuntimeException( "cannot write coverage corpus artifact under {$case_dir}" );
+ }
+ }
+
+ return array(
+ 'artifact_dir' => $case_dir,
+ 'artifact_retained' => ! $artifact_reused,
+ 'artifact_reused' => $artifact_reused,
+ );
+ }
+
+ /**
+  * Lists the source files whose coverage guides the fuzzer.
+  *
+  * Paths are resolved with realpath(); files that cannot be resolved are omitted.
+  *
+  * @return string[] Resolved absolute paths, in declaration order.
+  */
+ public static function target_files(): array {
+     $root      = Bootstrap::repo_root();
+     $relatives = array(
+         '/src/wp-includes/html-api/class-wp-html-decoder.php',
+         '/src/wp-includes/class-wp-token-map.php',
+     );
+
+     $resolved = array();
+     foreach ( $relatives as $relative ) {
+         $real = realpath( $root . $relative );
+         if ( false !== $real ) {
+             $resolved[] = $real;
+         }
+     }
+
+     return $resolved;
+ }
+
+ /**
+  * Reports whether HTML_DECODER_FUZZ_FAKE_COVERAGE is set to something other than '' or '0'.
+  */
+ private static function fake_enabled(): bool {
+     $setting = getenv( 'HTML_DECODER_FUZZ_FAKE_COVERAGE' );
+
+     return ! in_array( $setting, array( false, '', '0' ), true );
+ }
+
+ /**
+  * Reports whether pcov is loaded, enabled, not opted out of, and exposes the functions used here.
+  */
+ private static function pcov_available(): bool {
+     if ( getenv( 'HTML_DECODER_FUZZ_DISABLE_PCOV' ) ) {
+         return false;
+     }
+     if ( ! extension_loaded( 'pcov' ) ) {
+         return false;
+     }
+     if ( '0' === (string) ini_get( 'pcov.enabled' ) ) {
+         return false;
+     }
+
+     foreach ( array( 'pcov\\start', 'pcov\\stop', 'pcov\\collect', 'pcov\\clear' ) as $required_function ) {
+         if ( ! function_exists( $required_function ) ) {
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ /**
+  * Converts raw collected coverage into a sorted, de-duplicated list of edge records.
+  *
+  * Only target files are kept, and only entries whose line number and hit count are both
+  * positive. Non-array input yields an empty list.
+  *
+  * @param mixed $coverage Expected shape: file path => ( line number => hit count ).
+  * @return array Edge records ordered by their key.
+  */
+ private function normalize_coverage( $coverage ): array {
+     $by_key = array();
+
+     if ( is_array( $coverage ) ) {
+         foreach ( $coverage as $reported_path => $line_hits ) {
+             $reported_path = (string) $reported_path;
+             $resolved      = realpath( $reported_path );
+             $path          = $resolved ? $resolved : $reported_path;
+
+             if ( ! is_array( $line_hits ) || ! isset( $this->target_file_set[ $path ] ) ) {
+                 continue;
+             }
+
+             foreach ( $line_hits as $line_number => $hit_count ) {
+                 $line_number = (int) $line_number;
+                 $hit_count   = (int) $hit_count;
+                 if ( $line_number > 0 && $hit_count > 0 ) {
+                     $record                   = $this->edge( $path, $line_number, $hit_count );
+                     $by_key[ $record['key'] ] = $record;
+                 }
+             }
+         }
+     }
+
+     ksort( $by_key, SORT_STRING );
+
+     return array_values( $by_key );
+ }
+
+ /**
+  * Produces deterministic pseudo-coverage for the 'fake' provider.
+  *
+  * One edge is emitted per target file. Its line is chosen from a SHA-256 digest of the
+  * context, strategy, and payload, so identical inputs always map to the same edges.
+  *
+  * @param string $payload  Payload bytes.
+  * @param string $context  Decoder context label.
+  * @param string $strategy Generation strategy label.
+  * @return array One edge record per target file, each with a hit count of 1.
+  */
+ private function fake_edges( string $payload, string $context, string $strategy ): array {
+     // Line counts are memoized per path so each target file is read once per process
+     // instead of once per fuzz case.
+     static $line_counts = array();
+
+     $digest = hash( 'sha256', $context . "\0" . $strategy . "\0" . $payload );
+     $edges  = array();
+
+     foreach ( $this->target_files as $index => $file ) {
+         if ( ! isset( $line_counts[ $file ] ) ) {
+             $lines = file( $file, FILE_IGNORE_NEW_LINES );
+             // Unreadable or empty files are treated as having a single line.
+             $line_counts[ $file ] = max( 1, false === $lines ? 0 : count( $lines ) );
+         }
+
+         // Each file consumes its own 8-hex-digit slice of the digest.
+         $offset  = hexdec( substr( $digest, $index * 8, 8 ) );
+         $edges[] = $this->edge( $file, 1 + ( $offset % $line_counts[ $file ] ), 1 );
+     }
+
+     return $edges;
+ }
+
+ /**
+  * Builds an edge record for one covered line.
+  *
+  * The key is a SHA-256 digest of the file path and line number; the hit count is not part of it.
+  *
+  * @return array{key: string, file: string, line: int, hits: int}
+  */
+ private function edge( string $file, int $line, int $hits ): array {
+     $key = hash( 'sha256', $file . "\0" . $line );
+
+     return compact( 'key', 'file', 'line', 'hits' );
+ }
+}
diff --git a/tools/html-decoder-fuzz/lib/Generator.php b/tools/html-decoder-fuzz/lib/Generator.php
new file mode 100644
index 0000000000000..85d455c0b526c
--- /dev/null
+++ b/tools/html-decoder-fuzz/lib/Generator.php
@@ -0,0 +1,1560 @@
+ */
+ private ?array $name_sweep_base_name_set = null;
+
+ /**
+  * Sets up the generator and partitions reference names by whether they end in a semicolon.
+  *
+  * @param Prng          $prng                  Random source for all generation decisions.
+  * @param int           $max_bytes             Payload size budget; values below 1 are raised to 1.
+  * @param string[]|null $named_reference_names Names to use; defaults to Bootstrap::named_reference_names().
+  */
+ public function __construct( Prng $prng, int $max_bytes = 4096, ?array $named_reference_names = null ) {
+     $this->prng      = $prng;
+     $this->max_bytes = max( 1, $max_bytes );
+
+     $with_semicolon    = array();
+     $without_semicolon = array();
+     foreach ( $named_reference_names ?? Bootstrap::named_reference_names() as $name ) {
+         if ( str_ends_with( $name, ';' ) ) {
+             $with_semicolon[] = $name;
+         } else {
+             $without_semicolon[] = $name;
+         }
+     }
+
+     // Fall back to the built-in preferred lists when a partition is empty; otherwise sort it.
+     $this->semicolon_names = $with_semicolon;
+     if ( array() !== $this->semicolon_names ) {
+         self::sort_reference_names( $this->semicolon_names );
+     } else {
+         $this->semicolon_names = self::PREFERRED_SEMICOLON;
+     }
+
+     $this->legacy_names = $without_semicolon;
+     if ( array() !== $this->legacy_names ) {
+         self::sort_reference_names( $this->legacy_names );
+     } else {
+         $this->legacy_names = self::PREFERRED_LEGACY;
+     }
+ }
+
+ /**
+ * Generates one payload using a weighted-random strategy.
+ *
+ * The strategy name maps to a `gen_*` method by replacing '-' with '_'. The result is
+ * trimmed with trim_to_safe_max() to the configured byte budget.
+ *
+ * @return array{context: string, strategy: string, payload: string} `context` is always 'both'.
+ */
+ public function generate(): array {
+ // Preserve seed-to-payload mapping from the former one-context lane.
+ // The draw below is discarded on purpose; it only advances the PRNG.
+ $this->prng->chance( 50 );
+ $strategy = $this->prng->weighted(
+ array(
+ 'plain-no-amp' => 8,
+ 'named-exact' => 16,
+ 'named-missing-semi' => 15,
+ 'attribute-discriminator' => 15,
+ 'numeric' => 22,
+ 'adjacency' => 10,
+ 'truncation-sweep' => 9,
+ 'reference-at-eof' => 12,
+ 'multibyte-around' => 9,
+ 'attribute-prefix' => 8,
+ 'lookalike' => 8,
+ 'composition' => 9,
+ 'case-mangled-name' => 8,
+ )
+ );
+
+ $method = 'gen_' . str_replace( '-', '_', $strategy );
+ $payload = $this->$method();
+
+ return array(
+ 'context' => 'both',
+ 'strategy' => $strategy,
+ 'payload' => self::trim_to_safe_max( $payload, $this->max_bytes ),
+ );
+ }
+
+ /**
+ * Generates one arbitrary-byte payload using a weighted-random `bytes-*` strategy.
+ *
+ * Unlike generate(), the result is cut with a plain byte-wise substr(), so it may end
+ * in the middle of a multi-byte sequence.
+ *
+ * @return array{context: string, strategy: string, payload: string} `context` is always 'both'.
+ */
+ public function generate_bytes(): array {
+ $strategy = $this->prng->weighted(
+ array(
+ 'bytes-uniform' => 35,
+ 'bytes-no-amp' => 20,
+ 'bytes-with-amp' => 20,
+ 'bytes-invalid-utf8' => 15,
+ 'bytes-delimiters' => 10,
+ )
+ );
+
+ $method = 'gen_' . str_replace( '-', '_', $strategy );
+ $payload = $this->$method();
+
+ return array(
+ 'context' => 'both',
+ 'strategy' => $strategy,
+ 'payload' => substr( $payload, 0, $this->max_bytes ),
+ );
+ }
+
+ /**
+  * Produces the deterministic name-sweep payload for a case index.
+  *
+  * Cases enumerate every base name with every follower, first without and then with a
+  * semicolon after the name. Indices past the period wrap around; negative indices act as 0.
+  *
+  * @param int $case_index Position in the sweep.
+  * @return array{context: string, strategy: string, payload: string}
+  */
+ public function generate_name_sweep( int $case_index ): array {
+     $names          = $this->name_sweep_base_names();
+     $followers      = self::name_sweep_followers();
+     $follower_count = count( $followers );
+     $variant_count  = 2 * $follower_count;
+     $index          = max( 0, $case_index );
+
+     $name    = $names[ intdiv( $index, $variant_count ) % count( $names ) ];
+     $variant = $index % $variant_count;
+
+     // The first half of the variants omits the semicolon; the second half adds it.
+     $reference = '&' . $name;
+     if ( $variant >= $follower_count ) {
+         $reference .= ';';
+     }
+     $reference .= $followers[ $variant % $follower_count ];
+
+     return array(
+         'context'  => 'both',
+         'strategy' => 'name-sweep',
+         'payload'  => self::trim_to_safe_max( $reference, $this->max_bytes ),
+     );
+ }
+
+ /**
+ * Returns the number of distinct name-sweep cases: base names x 2 semicolon variants x followers.
+ */
+ public function name_sweep_period(): int {
+ return count( $this->name_sweep_base_names() ) * 2 * count( self::name_sweep_followers() );
+ }
+
+ /**
+  * Produces the deterministic payload pairing a legacy (semicolon-less) name with a follower.
+  *
+  * Indices past the period wrap around; negative indices act as 0.
+  *
+  * @param int $case_index Position in the sweep.
+  * @return array{context: string, strategy: string, payload: string}
+  */
+ public function generate_legacy_follower_sweep( int $case_index ): array {
+     $followers      = self::legacy_follower_sweep_followers();
+     $follower_count = count( $followers );
+     $index          = max( 0, $case_index );
+
+     $name      = $this->legacy_names[ intdiv( $index, $follower_count ) % count( $this->legacy_names ) ];
+     $reference = '&' . $name . $followers[ $index % $follower_count ];
+
+     return array(
+         'context'  => 'both',
+         'strategy' => 'legacy-follower-sweep',
+         'payload'  => self::trim_to_safe_max( $reference, $this->max_bytes ),
+     );
+ }
+
+ /**
+ * Returns the number of distinct legacy-follower sweep cases: legacy names x followers.
+ */
+ public function legacy_follower_sweep_period(): int {
+ return count( $this->legacy_names ) * count( self::legacy_follower_sweep_followers() );
+ }
+
+ /**
+  * Produces the deterministic prefix-family payload for a case index.
+  *
+  * Each case cuts its reference at the case's split offset and appends the case's follower.
+  * Indices past the period wrap around; negative indices act as 0.
+  *
+  * @param int $case_index Position in the sweep.
+  * @return array{context: string, strategy: string, payload: string}
+  */
+ public function generate_prefix_family_sweep( int $case_index ): array {
+     $all_cases = $this->prefix_family_sweep_cases();
+     $selected  = $all_cases[ max( 0, $case_index ) % count( $all_cases ) ];
+
+     $truncated = substr( $selected['reference'], 0, $selected['split'] );
+     $candidate = $truncated . $selected['follower'];
+
+     return array(
+         'context'  => 'both',
+         'strategy' => 'prefix-family-sweep',
+         'payload'  => self::trim_to_safe_max( $candidate, $this->max_bytes ),
+     );
+ }
+
+ /**
+ * Returns the number of prefix-family sweep cases.
+ */
+ public function prefix_family_sweep_period(): int {
+ return count( $this->prefix_family_sweep_cases() );
+ }
+
+ /**
+  * Produces the deterministic numeric-boundary payload for a case index.
+  *
+  * Indices past the period wrap around; negative indices act as 0.
+  *
+  * @param int $case_index Position in the sweep.
+  * @return array{context: string, strategy: string, payload: string}
+  */
+ public function generate_numeric_boundary_sweep( int $case_index ): array {
+     $all_cases = self::numeric_boundary_sweep_cases();
+     $selected  = $all_cases[ max( 0, $case_index ) % count( $all_cases ) ];
+
+     return array(
+         'context'  => 'both',
+         'strategy' => 'numeric-boundary-sweep',
+         'payload'  => self::trim_to_safe_max( $selected, $this->max_bytes ),
+     );
+ }
+
+ /**
+ * Returns the number of numeric-boundary sweep cases.
+ */
+ public function numeric_boundary_sweep_period(): int {
+ return count( self::numeric_boundary_sweep_cases() );
+ }
+
+ /**
+ * Mutates a corpus payload chosen by case index.
+ *
+ * One mutation operation is always applied. With a 35% chance a second, independently
+ * chosen operation is applied on top. The reported strategy names only the first operation.
+ *
+ * @param int $case_index Selects the corpus entry (wraps around; negatives act as 0).
+ * @return array{context: string, strategy: string, payload: string}
+ */
+ public function generate_corpus_mutation( int $case_index ): array {
+ $corpus = self::corpus_payloads();
+ $case_index = max( 0, $case_index );
+ // The operation is drawn before the payload is looked up; keep this order so a seed
+ // keeps producing the same payloads.
+ $operation = $this->prng->weighted(
+ array(
+ 'splice' => 25,
+ 'byte-perturb' => 25,
+ 'semicolon-toggle' => 25,
+ 'reference-duplication' => 25,
+ )
+ );
+ $payload = $corpus[ $case_index % count( $corpus ) ];
+ $payload = $this->mutate_corpus_payload( $payload, $operation, $corpus );
+
+ if ( $this->prng->chance( 35 ) ) {
+ $payload = $this->mutate_corpus_payload(
+ $payload,
+ $this->prng->choice( array( 'splice', 'byte-perturb', 'semicolon-toggle', 'reference-duplication' ) ),
+ $corpus
+ );
+ }
+
+ return array(
+ 'context' => 'both',
+ 'strategy' => 'corpus-' . $operation,
+ 'payload' => self::trim_to_safe_max( $payload, $this->max_bytes ),
+ );
+ }
+
+ /**
+ * Returns the number of corpus payloads available for mutation.
+ */
+ public function corpus_period(): int {
+ return count( self::corpus_payloads() );
+ }
+
+ /**
+  * Produces the deterministic token-map structure payload for a case index.
+  *
+  * Indices past the period wrap around; negative indices act as 0.
+  *
+  * @param int $case_index Position in the sweep.
+  * @return array{context: string, strategy: string, payload: string}
+  */
+ public function generate_token_map_sweep( int $case_index ): array {
+     $all_cases = self::token_map_sweep_cases();
+     $selected  = $all_cases[ max( 0, $case_index ) % count( $all_cases ) ];
+
+     return array(
+         'context'  => 'both',
+         'strategy' => 'token-map-structure-sweep',
+         'payload'  => self::trim_to_safe_max( $selected['payload'], $this->max_bytes ),
+     );
+ }
+
+ /**
+ * Returns the number of token-map sweep cases.
+ */
+ public function token_map_period(): int {
+ return count( self::token_map_sweep_cases() );
+ }
+
+ /**
+  * Reports whether a payload is valid UTF-8 and free of '<', '"', CR, and NUL.
+  *
+  * @param string $payload Candidate payload.
+  */
+ public static function is_oracle_safe_payload( string $payload ): bool {
+     if ( ! mb_check_encoding( $payload, 'UTF-8' ) ) {
+         return false;
+     }
+
+     foreach ( array( '<', '"', "\r", "\x00" ) as $forbidden ) {
+         if ( str_contains( $payload, $forbidden ) ) {
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ /**
+ * Strategy 'plain-no-amp': returns plain_text( false ).
+ *
+ * NOTE(review): the flag presumably suppresses ampersands in the text; confirm in plain_text().
+ */
+ private function gen_plain_no_amp(): string {
+ return $this->plain_text( false );
+ }
+
+ /**
+ * Strategy 'named-exact': one named_exact() reference with plain text on both sides.
+ */
+ private function gen_named_exact(): string {
+ return $this->plain_text() . $this->named_exact() . $this->plain_text();
+ }
+
+ /**
+ * Strategy 'named-missing-semi': a legacy name after '&', followed by a chosen kind of follower.
+ *
+ * Follower kinds: nothing ('end'), one punctuation character, a short ASCII run, or '='
+ * plus an optional ASCII run. Plain text surrounds the result.
+ */
+ private function gen_named_missing_semi(): string {
+ $name = $this->pick_legacy_name();
+ $follower = $this->prng->weighted(
+ array(
+ 'end' => 35,
+ 'punct' => 35,
+ 'alpha' => 20,
+ 'eq' => 10,
+ )
+ );
+
+ // 'end' keeps the empty suffix.
+ $suffix = '';
+ if ( 'punct' === $follower ) {
+ // Includes ';', so this branch can also complete the reference with a semicolon.
+ $suffix = $this->prng->choice( array( ' ', '.', '/', ':', ';', '-' ) );
+ } elseif ( 'alpha' === $follower ) {
+ $suffix = $this->ascii_run( $this->prng->int( 1, 5 ) );
+ } elseif ( 'eq' === $follower ) {
+ $suffix = '=' . $this->ascii_run( $this->prng->int( 0, 4 ) );
+ }
+
+ return $this->plain_text() . '&' . $name . $suffix . $this->plain_text();
+ }
+
+ /**
+ * Strategy 'attribute-discriminator': a legacy name followed by '=', an alphanumeric, or 'later;'.
+ *
+ * Names come from the preferred legacy names that are also in the loaded legacy list,
+ * falling back to the whole legacy list when that intersection is empty.
+ */
+ private function gen_attribute_discriminator(): string {
+ $name = $this->prng->choice( array_values( array_intersect( self::PREFERRED_LEGACY, $this->legacy_names ) ) ?: $this->legacy_names );
+ $follower = $this->prng->choice( array( '=', 'x', 'Z', '0', 'later;' ) );
+
+ return $this->plain_text() . '&' . $name . $follower . $this->plain_text();
+ }
+
+ /**
+ * Strategy 'numeric': one numeric_reference() with plain text on both sides.
+ */
+ private function gen_numeric(): string {
+ return $this->plain_text() . $this->numeric_reference() . $this->plain_text();
+ }
+
+ /**
+ * Strategy 'adjacency': 2 to 8 references in a row, mostly with nothing between them.
+ *
+ * Each reference is named (48%) or numeric; after each one, plain text is inserted with
+ * a 20% chance. Plain text also leads and trails the run.
+ */
+ private function gen_adjacency(): string {
+ $count = $this->prng->int( 2, 8 );
+ $out = $this->plain_text();
+
+ for ( $i = 0; $i < $count; $i++ ) {
+ $out .= $this->prng->chance( 48 ) ? $this->named_exact() : $this->numeric_reference();
+ if ( $this->prng->chance( 20 ) ) {
+ $out .= $this->plain_text();
+ }
+ }
+
+ return $out . $this->plain_text();
+ }
+
+ /**
+ * Strategy 'truncation-sweep': a proper prefix of a reference, surrounded by plain text.
+ *
+ * The prefix is between 1 and length - 1 bytes long (1 byte when the reference is 1 byte).
+ * NOTE(review): the meaning of the `true` passed to numeric_reference() is not visible here.
+ */
+ private function gen_truncation_sweep(): string {
+ $reference = $this->prng->chance( 50 ) ? $this->named_exact() : $this->numeric_reference( true );
+ $length = strlen( $reference );
+ $prefix = substr( $reference, 0, $this->prng->int( 1, max( 1, $length - 1 ) ) );
+
+ return $this->plain_text() . $prefix . $this->plain_text();
+ }
+
+ /**
+  * Strategy 'reference-at-eof': plain text ending in an unterminated character reference.
+  *
+  * The trailing fragment is one of: a fixed incomplete reference, a proper prefix of a
+  * named reference, `&#` plus decimal digits, or `&#x` / `&#X` plus hex digits. None ends
+  * in a semicolon. Leading plain text fills at most the bytes left in the budget.
+  */
+ private function gen_reference_at_eof(): string {
+     $kind   = $this->prng->weighted(
+         array(
+             'fixed'          => 45,
+             'named-prefix'   => 25,
+             'decimal-digits' => 15,
+             'hex-digits'     => 15,
+         )
+     );
+     $suffix = '';
+
+     if ( 'named-prefix' === $kind ) {
+         // Semicolon names end in ';', so stopping at length - 1 always drops the terminator.
+         $name      = $this->pick_semicolon_name();
+         $reference = '&' . $name;
+         $suffix    = substr( $reference, 0, $this->prng->int( 1, strlen( $reference ) - 1 ) );
+     } elseif ( 'decimal-digits' === $kind ) {
+         $digits    = $this->ascii_digits( $this->prng->int( 1, 9 ) );
+         $reference = '&#' . $digits;
+         $suffix    = substr( $reference, 0, max( 1, min( strlen( $reference ), $this->max_bytes ) ) );
+     } elseif ( 'hex-digits' === $kind ) {
+         $prefix    = $this->prng->chance( 50 ) ? '&#x' : '&#X';
+         $digits    = $this->hex_digits( $this->prng->int( 1, 8 ) );
+         $reference = $prefix . $digits;
+         $suffix    = substr( $reference, 0, max( 1, min( strlen( $reference ), $this->max_bytes ) ) );
+     } else {
+         $suffix = $this->prng->choice(
+             array(
+                 '&',
+                 '&#',
+                 '&#x',
+                 '&#X',
+                 '&g',
+                 '&gt',
+                 '&not',
+                 '&noti',
+                 '&amp',
+                 '&#123',
+                 '&#x7B',
+             )
+         );
+         $suffix = substr( $suffix, 0, max( 1, min( strlen( $suffix ), $this->max_bytes ) ) );
+     }
+
+     return $this->plain_text_up_to( max( 0, $this->max_bytes - strlen( $suffix ) ) ) . $suffix;
+ }
+
+ /**
+ * Strategy 'multibyte-around': references interleaved with multi-byte UTF-8 text.
+ *
+ * Emits 2 to 7 (atom, reference) pairs and a closing atom. Atoms include 'e' and
+ * two-, three-, and four-byte UTF-8 characters; references are named (55%) or numeric.
+ */
+ private function gen_multibyte_around(): string {
+ $atoms = array( 'e', "\u{00E9}", "\u{96EA}", "\u{1F642}", "\u{03B2}", "\u{05E2}\u{05D1}", "\u{0928}\u{092E}" );
+ $out = '';
+ $count = $this->prng->int( 2, 7 );
+ for ( $i = 0; $i < $count; $i++ ) {
+ $out .= $this->prng->choice( $atoms );
+ $out .= $this->prng->chance( 55 ) ? $this->named_exact() : $this->numeric_reference();
+ }
+ return $out . $this->prng->choice( $atoms );
+ }
+
+ /**
+ * Strategy 'attribute-prefix': a payload that starts with an encoded prefix, then plain text.
+ *
+ * With a 72% chance the prefix is a target from ATTRIBUTE_PREFIX_TARGETS encoded by
+ * encode_attribute_prefix_target(). Otherwise it is one of a fixed set of prefixes.
+ */
+ private function gen_attribute_prefix(): string {
+ if ( $this->prng->chance( 72 ) ) {
+ $encoded = $this->encode_attribute_prefix_target( $this->prng->choice( self::ATTRIBUTE_PREFIX_TARGETS ) );
+ $suffix = $this->plain_text_up_to( max( 0, $this->max_bytes - strlen( $encoded['payload'] ) ) );
+ // If the encoding ended in a semicolon-less numeric reference, the first suffix byte must
+ // not extend or terminate it; swap that byte for '_' when it would.
+ if ( null !== $encoded['semicolonless_base'] && '' !== $suffix && self::would_extend_semicolonless_numeric( $encoded['semicolonless_base'], $suffix[0] ) ) {
+ $suffix = '_' . substr( $suffix, 1 );
+ }
+
+ return $encoded['payload'] . $suffix;
+ }
+
+ // NOTE(review): these literals look like already-decoded character references (the first
+ // contains a raw '<', which is_oracle_safe_payload() rejects). Confirm against the
+ // intended source; the entries were probably written as named or numeric references.
+ $prefix = $this->prng->choice(
+ array(
+ '<⃒',
+ '>⃒',
+ '≪̸',
+ '=⃥',
+ 'jav',
+ )
+ );
+
+ return $prefix . $this->plain_text_up_to( max( 0, $this->max_bytes - strlen( $prefix ) ) );
+ }
+
+ /**
+ * Strategy 'lookalike': a near-miss reference surrounded by plain text.
+ *
+ * Uses edit_distance_lookalike() 85% of the time and legacy_lookalike() otherwise.
+ */
+ private function gen_lookalike(): string {
+ $lookalike = $this->prng->chance( 85 )
+ ? $this->edit_distance_lookalike()
+ : $this->legacy_lookalike();
+
+ return $this->plain_text() . $lookalike . $this->plain_text();
+ }
+
+ /**
+ * Strategy 'case-mangled-name': one case_mangled_name() reference with plain text on both sides.
+ */
+ private function gen_case_mangled_name(): string {
+ return $this->plain_text() . $this->case_mangled_name() . $this->plain_text();
+ }
+
+ /**
+ * Strategy 'composition': joins 2 or 3 fragments from other strategies with COMPOSITION_SEPARATOR.
+ *
+ * Each fragment is generated under a byte budget that reserves at least one byte plus a
+ * separator for every fragment still to come. When the overall budget is below 3 bytes,
+ * a single trimmed named reference is returned instead.
+ */
+ private function gen_composition(): string {
+ if ( $this->max_bytes < 3 ) {
+ return self::trim_to_safe_max( $this->named_exact(), $this->max_bytes );
+ }
+
+ // 'plain-no-amp' and 'composition' itself are deliberately absent from this list.
+ $strategies = array(
+ 'named-exact',
+ 'named-missing-semi',
+ 'attribute-discriminator',
+ 'numeric',
+ 'adjacency',
+ 'truncation-sweep',
+ 'reference-at-eof',
+ 'multibyte-around',
+ 'attribute-prefix',
+ 'lookalike',
+ 'case-mangled-name',
+ );
+ // Cap the fragment count at what fits when every fragment is a single byte.
+ $max_count = min( 3, intdiv( $this->max_bytes + strlen( self::COMPOSITION_SEPARATOR ), 1 + strlen( self::COMPOSITION_SEPARATOR ) ) );
+ $count = $this->prng->int( 2, $max_count );
+ $original_max_bytes = $this->max_bytes;
+ $out = '';
+
+ for ( $i = 0; $i < $count; $i++ ) {
+ if ( $i > 0 ) {
+ $out .= self::COMPOSITION_SEPARATOR;
+ }
+
+ $remaining_fragments = $count - $i - 1;
+ $reserved_bytes = $remaining_fragments * ( 1 + strlen( self::COMPOSITION_SEPARATOR ) );
+ $fragment_max_bytes = max( 1, $original_max_bytes - strlen( $out ) - $reserved_bytes );
+
+ $out .= $this->composition_fragment( $strategies, $fragment_max_bytes );
+ }
+
+ return $out;
+ }
+
+ /**
+ * Generates one fragment for gen_composition() within a byte budget.
+ *
+ * Tries up to 8 randomly chosen strategies and returns the first trimmed result that is
+ * non-empty and does not contain COMPOSITION_SEPARATOR. Falls back to '&' if none qualifies.
+ *
+ * @param string[] $strategies Strategy names to choose from.
+ * @param int      $max_bytes  Byte budget applied while generating and when trimming.
+ */
+ private function composition_fragment( array $strategies, int $max_bytes ): string {
+ for ( $attempt = 0; $attempt < 8; $attempt++ ) {
+ $strategy = $this->prng->choice( $strategies );
+ $method = 'gen_' . str_replace( '-', '_', $strategy );
+ // Generate with the instance budget temporarily lowered to this fragment's share.
+ $fragment = $this->with_max_bytes(
+ $max_bytes,
+ function () use ( $method ): string {
+ return $this->$method();
+ }
+ );
+ $fragment = self::trim_to_safe_max( $fragment, $max_bytes );
+
+ if ( '' !== $fragment && ! str_contains( $fragment, self::COMPOSITION_SEPARATOR ) ) {
+ return $fragment;
+ }
+ }
+
+ return '&';
+ }
+
+ /**
+  * Applies one named mutation operation to a corpus payload.
+  *
+  * @param string   $payload   Payload to mutate.
+  * @param string   $operation One of 'splice', 'byte-perturb', 'semicolon-toggle',
+  *                            'reference-duplication'; anything else returns $payload unchanged.
+  * @param string[] $corpus    Full corpus, used as donor material by 'splice'.
+  */
+ private function mutate_corpus_payload( string $payload, string $operation, array $corpus ): string {
+     if ( 'splice' === $operation ) {
+         return $this->mutate_corpus_splice( $payload, $corpus );
+     }
+     if ( 'byte-perturb' === $operation ) {
+         return $this->mutate_corpus_byte_perturb( $payload );
+     }
+     if ( 'semicolon-toggle' === $operation ) {
+         return $this->mutate_corpus_semicolon_toggle( $payload );
+     }
+     if ( 'reference-duplication' === $operation ) {
+         return $this->mutate_corpus_reference_duplication( $payload );
+     }
+
+     return $payload;
+ }
+
+ /**
+ * Replaces a random span of the payload with a random span taken from another corpus entry.
+ *
+ * Both spans are bounded by offsets from utf8_boundary(). When the donor span is empty,
+ * a fixed fallback string is spliced in instead.
+ *
+ * @param string   $payload Payload to mutate.
+ * @param string[] $corpus  Corpus to draw the donor entry from.
+ */
+ private function mutate_corpus_splice( string $payload, array $corpus ): string {
+ $other = $this->prng->choice( $corpus );
+
+ // Draw two boundaries and order them so they delimit the span to replace.
+ $left_at = $this->utf8_boundary( $payload );
+ $right_at = $this->utf8_boundary( $payload );
+ if ( $right_at < $left_at ) {
+ list( $left_at, $right_at ) = array( $right_at, $left_at );
+ }
+
+ $other_left = $this->utf8_boundary( $other );
+ $other_right = $this->utf8_boundary( $other );
+ if ( $other_right < $other_left ) {
+ list( $other_left, $other_right ) = array( $other_right, $other_left );
+ }
+ $splice = substr( $other, $other_left, $other_right - $other_left );
+ if ( '' === $splice ) {
+ // NOTE(review): these fallbacks look like already-decoded character references, and
+ // one is empty, which defeats the fallback. Confirm the intended literals.
+ $splice = $this->prng->choice( array( '&', '', '∉', '>' ) );
+ }
+
+ return substr( $payload, 0, $left_at ) . $splice . substr( $payload, $right_at );
+ }
+
+ /**
+ * Inserts, replaces, or deletes one character of the payload.
+ *
+ * Inserted and replacement bytes come from safe_corpus_byte(). An empty payload always
+ * takes the insert path, whichever operation was drawn.
+ */
+ private function mutate_corpus_byte_perturb( string $payload ): string {
+ $operation = $this->prng->weighted(
+ array(
+ 'insert' => 35,
+ 'replace' => 45,
+ 'delete' => 20,
+ )
+ );
+
+ if ( '' === $payload || 'insert' === $operation ) {
+ $at = $this->utf8_boundary( $payload );
+ return substr( $payload, 0, $at ) . $this->safe_corpus_byte() . substr( $payload, $at );
+ }
+
+ // $at and $next bracket one whole character, so multi-byte characters are never split.
+ list( $at, $next ) = $this->utf8_character_span( $payload );
+ if ( 'delete' === $operation ) {
+ return substr( $payload, 0, $at ) . substr( $payload, $next );
+ }
+
+ return substr( $payload, 0, $at ) . $this->safe_corpus_byte() . substr( $payload, $next );
+ }
+
+ /**
+ * Adds or removes the trailing semicolon of one randomly chosen reference in the payload.
+ *
+ * When reference_matches() finds no reference, a fixed string is appended instead.
+ */
+ private function mutate_corpus_semicolon_toggle( string $payload ): string {
+ $matches = $this->reference_matches( $payload );
+ if ( array() === $matches ) {
+ // NOTE(review): the duplicated entries suggest these literals were character references
+ // (with and without semicolons) that have since been decoded. Confirm the intended values.
+ return $payload . $this->prng->choice( array( '&', '&', ':', ':' ) );
+ }
+
+ $match = $this->prng->choice( $matches );
+ $reference = $match['text'];
+ if ( str_ends_with( $reference, ';' ) ) {
+ $replacement = substr( $reference, 0, -1 );
+ } else {
+ $replacement = $reference . ';';
+ }
+
+ return substr( $payload, 0, $match['offset'] ) . $replacement . substr( $payload, $match['offset'] + strlen( $reference ) );
+ }
+
+ /**
+ * Duplicates one randomly chosen reference in place, directly after the original.
+ *
+ * When reference_matches() finds no reference, a fixed string is appended instead.
+ */
+ private function mutate_corpus_reference_duplication( string $payload ): string {
+ $matches = $this->reference_matches( $payload );
+ if ( array() === $matches ) {
+ // NOTE(review): these look like doubled character references that have been decoded, and
+ // one is empty, which makes that choice a no-op. Confirm the intended literals.
+ return $payload . $this->prng->choice( array( '>>', '', '∉∉' ) );
+ }
+
+ $match = $this->prng->choice( $matches );
+ return substr( $payload, 0, $match['offset'] + strlen( $match['text'] ) ) . $match['text'] . substr( $payload, $match['offset'] + strlen( $match['text'] ) );
+ }
+
+ /**
+ * Returns one random character from ASCII_ALPHABET.
+ */
+ private function safe_corpus_byte(): string {
+ return self::ASCII_ALPHABET[ $this->prng->int( 0, strlen( self::ASCII_ALPHABET ) - 1 ) ];
+ }
+
+ /**
+ * Returns a random byte offset from utf8_boundaries(): 0, the payload end, or between characters.
+ */
+ private function utf8_boundary( string $payload ): int {
+ return $this->prng->choice( self::utf8_boundaries( $payload ) );
+ }
+
+ /**
+ * Picks one random character of the payload and returns its byte span.
+ *
+ * @return array{0: int, 1: int} Start offset and end offset (exclusive); array( 0, 0 )
+ *                               when fewer than two boundaries exist.
+ */
+ private function utf8_character_span( string $payload ): array {
+ $boundaries = self::utf8_boundaries( $payload );
+ if ( count( $boundaries ) < 2 ) {
+ return array( 0, 0 );
+ }
+
+ // Adjacent boundaries delimit exactly one character.
+ $index = $this->prng->int( 0, count( $boundaries ) - 2 );
+ return array( $boundaries[ $index ], $boundaries[ $index + 1 ] );
+ }
+
+ /**
+  * Lists the byte offsets at which the payload can be cut without splitting a character.
+  *
+  * Offset 0 is always included. If the payload cannot be matched as UTF-8, only the two
+  * ends are returned.
+  *
+  * @return int[] Ascending byte offsets.
+  */
+ private static function utf8_boundaries( string $payload ): array {
+     if ( '' === $payload ) {
+         return array( 0 );
+     }
+
+     $found = preg_match_all( '/./us', $payload, $characters, PREG_OFFSET_CAPTURE );
+     if ( false === $found || 0 === $found ) {
+         return array( 0, strlen( $payload ) );
+     }
+
+     $offsets = array( 0 );
+     foreach ( $characters[0] as list( $text, $start ) ) {
+         // Record the offset just past each character.
+         $offsets[] = $start + strlen( $text );
+     }
+
+     return array_values( array_unique( $offsets ) );
+ }
+
+ /**
+  * Finds every substring of the payload that is shaped like a character reference.
+  *
+  * Matches '&' followed by a hex numeric body, a decimal numeric body, or a name of at
+  * least two alphanumerics starting with a letter, plus an optional trailing semicolon.
+  *
+  * @return array List of arrays with 'text' (matched string) and 'offset' (byte offset).
+  */
+ private function reference_matches( string $payload ): array {
+     $found = preg_match_all( '/&(?:#[xX][0-9A-Fa-f]+|#[0-9]+|[A-Za-z][A-Za-z0-9]+);?/', $payload, $captures, PREG_OFFSET_CAPTURE );
+     if ( false === $found || 0 === $found ) {
+         return array();
+     }
+
+     return array_map(
+         static function ( array $capture ): array {
+             return array(
+                 'text'   => $capture[0],
+                 'offset' => $capture[1],
+             );
+         },
+         $captures[0]
+     );
+ }
+
+ /**
+ * Encodes a target string byte by byte as a mix of literal characters and numeric references.
+ *
+ * At least one position, chosen at random, is always encoded as a reference. A literal is
+ * never emitted where it would extend or terminate a preceding semicolon-less reference.
+ * NOTE(review): encoding uses ord() per byte, so this assumes single-byte (ASCII) targets.
+ *
+ * @param string $target Text the decoded payload should start with.
+ * @return array{payload: string, semicolonless_base: ?string} `semicolonless_base` is
+ *         'decimal' or 'hex' when the payload ends in a semicolon-less reference, else null.
+ */
+ private function encode_attribute_prefix_target( string $target ): array {
+ if ( '' === $target ) {
+ return array(
+ 'payload' => '',
+ 'semicolonless_base' => null,
+ );
+ }
+
+ $length = strlen( $target );
+ $force_reference_at = $this->prng->int( 0, $length - 1 );
+ $out = '';
+ $previous_base = null;
+
+ for ( $i = 0; $i < $length; $i++ ) {
+ $char = $target[ $i ];
+ $allow_literal = $i !== $force_reference_at && self::is_oracle_safe_literal( $char ) && ! self::would_extend_semicolonless_numeric( $previous_base, $char );
+ $encoded = $this->encode_attribute_prefix_character( ord( $char ), $allow_literal );
+ $out .= $encoded['payload'];
+ $previous_base = $encoded['semicolonless_base'];
+ }
+
+ return array(
+ 'payload' => $out,
+ 'semicolonless_base' => $previous_base,
+ );
+ }
+
+ /**
+  * Encodes one code point as a literal byte or as a numeric character reference.
+  *
+  * Reference forms vary in base (decimal or hex), hex digit case, `x` / `X` marker,
+  * leading zeros, and whether the terminating semicolon is present.
+  *
+  * @param int  $code_point    Code point to encode; the literal form uses chr().
+  * @param bool $allow_literal Whether the literal form may be chosen.
+  * @return array{payload: string, semicolonless_base: ?string} `semicolonless_base` is
+  *         'decimal' or 'hex' for a reference emitted without a semicolon, else null.
+  */
+ private function encode_attribute_prefix_character( int $code_point, bool $allow_literal ): array {
+     $encoding = $this->prng->weighted(
+         array(
+             'literal'              => $allow_literal ? 34 : 0,
+             'decimal'              => 17,
+             'decimal-leading-zero' => 17,
+             'hex-lower'            => 14,
+             'hex-upper'            => 10,
+             'hex-leading-zero'     => 8,
+         )
+     );
+
+     if ( 'literal' === $encoding ) {
+         return array(
+             'payload'            => chr( $code_point ),
+             'semicolonless_base' => null,
+         );
+     }
+
+     $is_hex = str_starts_with( $encoding, 'hex' );
+     $digits = $is_hex ? dechex( $code_point ) : (string) $code_point;
+
+     if ( str_ends_with( $encoding, 'leading-zero' ) ) {
+         $digits = str_repeat( '0', $this->prng->int( 1, 4 ) ) . $digits;
+     }
+     if ( 'hex-upper' === $encoding || ( 'hex-leading-zero' === $encoding && $this->prng->chance( 50 ) ) ) {
+         $digits = strtoupper( $digits );
+     }
+
+     $semicolon = $this->prng->chance( 68 ) ? ';' : '';
+     // Every numeric reference needs its "&#" introducer; hex forms add an "x" or "X" marker.
+     $prefix = $is_hex
+         ? ( $this->prng->chance( 50 ) ? '&#x' : '&#X' )
+         : '&#';
+
+     return array(
+         'payload'            => $prefix . $digits . $semicolon,
+         'semicolonless_base' => '' === $semicolon ? ( $is_hex ? 'hex' : 'decimal' ) : null,
+     );
+ }
+
+ /**
+  * Reports whether a character may appear literally: anything except '<', '"', CR, and NUL.
+  */
+ private static function is_oracle_safe_literal( string $char ): bool {
+     $unsafe = array(
+         '<'    => true,
+         '"'    => true,
+         "\r"   => true,
+         "\x00" => true,
+     );
+
+     return ! isset( $unsafe[ $char ] );
+ }
+
+ /**
+  * Reports whether a character placed after a semicolon-less numeric reference would change it.
+  *
+  * A ';' terminates the reference, and a digit of the matching base extends it.
+  *
+  * @param string|null $base 'decimal', any other string (treated as hex), or null for no pending reference.
+  * @param string      $char Character that would follow.
+  */
+ private static function would_extend_semicolonless_numeric( ?string $base, string $char ): bool {
+     if ( null === $base ) {
+         return false;
+     }
+     if ( ';' === $char ) {
+         return true;
+     }
+
+     return 'decimal' === $base
+         ? self::is_ascii_digit( $char )
+         : self::is_ascii_hex_digit( $char );
+ }
+
+ /**
+  * Reports whether the first byte of $char is an ASCII digit ('0'-'9').
+  */
+ private static function is_ascii_digit( string $char ): bool {
+     $byte = ord( $char );
+
+     return ! ( $byte < 0x30 || $byte > 0x39 );
+ }
+
+ /**
+  * Reports whether the first byte of $char is an ASCII hex digit ('0'-'9', 'A'-'F', 'a'-'f').
+  */
+ private static function is_ascii_hex_digit( string $char ): bool {
+     $byte = ord( $char );
+
+     if ( $byte >= 0x30 && $byte <= 0x39 ) {
+         return true;
+     }
+     if ( $byte >= 0x41 && $byte <= 0x46 ) {
+         return true;
+     }
+
+     return $byte >= 0x61 && $byte <= 0x66;
+ }
+
+ /**
+  * Reports whether the first byte of $char is an ASCII letter ('A'-'Z' or 'a'-'z').
+  */
+ private static function is_ascii_alpha( string $char ): bool {
+     $byte = ord( $char );
+
+     if ( $byte >= 0x41 && $byte <= 0x5A ) {
+         return true;
+     }
+
+     return $byte >= 0x61 && $byte <= 0x7A;
+ }
+
+ /**
+ * Runs a callback with the instance byte budget temporarily replaced.
+ *
+ * The previous budget is restored in a `finally` block, so it is reset even if the
+ * callback throws.
+ *
+ * @param int $max_bytes Temporary budget; values below 1 are raised to 1.
+ * @param callable(): string $callback
+ * @return string Whatever the callback returns.
+ */
+ private function with_max_bytes( int $max_bytes, callable $callback ): string {
+ $previous_max_bytes = $this->max_bytes;
+ $this->max_bytes = max( 1, $max_bytes );
+
+ try {
+ return $callback();
+ } finally {
+ $this->max_bytes = $previous_max_bytes;
+ }
+ }
+
+ /**
+ * Builds a reference whose name is one edit away from a known base name but is not itself known.
+ *
+ * Makes up to 40 attempts with a random delete, insert, substitute, or transpose. The
+ * result carries a semicolon 80% of the time. Falls back to legacy_lookalike() when no
+ * attempt yields a usable name.
+ */
+ private function edit_distance_lookalike(): string {
+ for ( $attempt = 0; $attempt < 40; $attempt++ ) {
+ $base = $this->prng->choice( $this->name_sweep_base_names() );
+ $operation = $this->prng->weighted(
+ array(
+ 'delete' => 25,
+ 'insert' => 25,
+ 'substitute' => 25,
+ 'transpose' => 25,
+ )
+ );
+ $mutated = $this->mutate_name_base( $base, $operation );
+
+ // Reject failed mutations, no-ops, and mutations that land on another known name.
+ if ( '' === $mutated || $mutated === $base || isset( $this->name_sweep_base_name_set()[ $mutated ] ) ) {
+ continue;
+ }
+
+ return '&' . $mutated . ( $this->prng->chance( 80 ) ? ';' : '' );
+ }
+
+ return $this->legacy_lookalike();
+ }
+
+ /**
+ * Builds a semicolon-terminated reference whose name differs from a known name only in letter case.
+ *
+ * Makes up to 60 attempts, rejecting results that are empty, unchanged, or themselves
+ * known base names. Falls back to legacy_lookalike() when none is usable.
+ */
+ private function case_mangled_name(): string {
+ $base_set = $this->name_sweep_base_name_set();
+ for ( $attempt = 0; $attempt < 60; $attempt++ ) {
+ $base = $this->prng->choice( $this->name_sweep_base_names() );
+ $mutated = $this->case_mangle_name_base( $base );
+ if ( '' === $mutated || $mutated === $base || isset( $base_set[ $mutated ] ) ) {
+ continue;
+ }
+
+ return '&' . $mutated . ';';
+ }
+
+ return $this->legacy_lookalike();
+ }
+
+ /**
+ * Flips the case of one to three distinct ASCII letters in a name.
+ *
+ * @param string $base Name to mangle.
+ * @return string The mangled name, or '' when the name contains no ASCII letters.
+ */
+ private function case_mangle_name_base( string $base ): string {
+ $letter_offsets = array();
+ for ( $i = 0; $i < strlen( $base ); $i++ ) {
+ if ( self::is_ascii_alpha( $base[ $i ] ) ) {
+ $letter_offsets[] = $i;
+ }
+ }
+
+ if ( array() === $letter_offsets ) {
+ return '';
+ }
+
+ $mutated = $base;
+ $flips = $this->prng->int( 1, min( 3, count( $letter_offsets ) ) );
+ for ( $i = 0; $i < $flips; $i++ ) {
+ $index = $this->prng->int( 0, count( $letter_offsets ) - 1 );
+ $offset = $letter_offsets[ $index ];
+ // Remove the chosen offset so the same letter is never flipped back.
+ array_splice( $letter_offsets, $index, 1 );
+ $char = $mutated[ $offset ];
+ $mutated[ $offset ] = strtolower( $char ) === $char ? strtoupper( $char ) : strtolower( $char );
+ }
+
+ return $mutated;
+ }
+
+ /**
+  * Returns one hand-picked near-miss reference.
+  *
+  * The list covers unknown names, empty and whitespace names, legacy names followed by
+  * extra letters (such as `&noti;` and `&copyright;`), truncated names, and stray ampersands.
+  */
+ private function legacy_lookalike(): string {
+     return $this->prng->choice(
+         array(
+             '&bogus;',
+             '&NoSuchEntity',
+             '&;',
+             '& ;',
+             // Legacy names (not, copy, cent, divide) running straight into more letters.
+             '&noti;',
+             '&notit;',
+             '&copyright;',
+             '&centerdo;',
+             '&ngE',
+             '&divideontime;',
+             '&&',
+             '&>',
+             '&am',
+             '&',
+         )
+     );
+ }
+
+ /**
+ * Applies a single-edit mutation to a name.
+ *
+ * @param string $base      Name to mutate.
+ * @param string $operation 'delete', 'insert', 'substitute', or 'transpose'.
+ * @return string The mutated name, or '' when the operation is unknown or cannot apply
+ *                (for example, deleting from a name shorter than two bytes).
+ */
+ private function mutate_name_base( string $base, string $operation ): string {
+ $length = strlen( $base );
+
+ switch ( $operation ) {
+ case 'delete':
+ if ( $length < 2 ) {
+ return '';
+ }
+ $offset = $this->prng->int( 0, $length - 1 );
+ return substr( $base, 0, $offset ) . substr( $base, $offset + 1 );
+
+ case 'insert':
+ $offset = $this->prng->int( 0, $length );
+ return substr( $base, 0, $offset ) . $this->random_name_char() . substr( $base, $offset );
+
+ case 'substitute':
+ if ( 0 === $length ) {
+ return '';
+ }
+ $offset = $this->prng->int( 0, $length - 1 );
+ return substr( $base, 0, $offset ) . $this->random_name_char( $base[ $offset ] ) . substr( $base, $offset + 1 );
+
+ case 'transpose':
+ if ( $length < 2 ) {
+ return '';
+ }
+ // Only positions with differing neighbours qualify; swapping equal bytes is a no-op.
+ $offsets = array();
+ for ( $i = 0; $i < $length - 1; $i++ ) {
+ if ( $base[ $i ] !== $base[ $i + 1 ] ) {
+ $offsets[] = $i;
+ }
+ }
+ if ( array() === $offsets ) {
+ return '';
+ }
+
+ $offset = $this->prng->choice( $offsets );
+ return substr( $base, 0, $offset ) . $base[ $offset + 1 ] . $base[ $offset ] . substr( $base, $offset + 2 );
+ }
+
+ return '';
+ }
+
+ /**
+  * Draws one character from NAME_MUTATION_ALPHABET.
+  *
+  * @param string|null $except Character to avoid. When the first draw hits it
+  *                            and it is part of the alphabet, a second draw
+  *                            picks a different alphabet position.
+  * @return string A single character.
+  */
+ private function random_name_char( ?string $except = null ): string {
+ 	$alphabet  = self::NAME_MUTATION_ALPHABET;
+ 	$size      = strlen( $alphabet );
+ 	$candidate = $alphabet[ $this->prng->int( 0, $size - 1 ) ];
+
+ 	$collides = null !== $except && $candidate === $except;
+ 	if ( ! $collides ) {
+ 		return $candidate;
+ 	}
+
+ 	$position = strpos( $alphabet, $except );
+ 	if ( false === $position ) {
+ 		return $candidate;
+ 	}
+
+ 	// A shift of 1..size-1 can never land back on the excluded position.
+ 	$shift = $this->prng->int( 1, $size - 1 );
+ 	return $alphabet[ ( $position + $shift ) % $size ];
+ }
+
+ /**
+  * Produces at least one PRNG byte, with the length drawn from the biased
+  * length distribution capped at $this->max_bytes.
+  *
+  * @return string Raw bytes.
+  */
+ private function gen_bytes_uniform(): string {
+ 	$drawn = $this->prng->biased_length( $this->max_bytes );
+ 	return $this->prng->bytes( $drawn < 1 ? 1 : $drawn );
+ }
+
+ /**
+  * Produces at least one byte, none of which is an ampersand.
+  *
+  * Any drawn 0x26 byte is replaced by a NUL byte.
+  *
+  * @return string Raw bytes without '&'.
+  */
+ private function gen_bytes_no_amp(): string {
+ 	$target = max( 1, $this->prng->biased_length( $this->max_bytes ) );
+ 	$bytes  = '';
+ 	for ( $written = 0; $written < $target; $written++ ) {
+ 		$value  = $this->prng->int( 0, 255 );
+ 		$bytes .= 0x26 === $value ? chr( 0x00 ) : chr( $value );
+ 	}
+ 	return $bytes;
+ }
+
+ /**
+ * Concatenates random leading bytes, one entry chosen from `$prefixes`, and
+ * random trailing bytes.
+ *
+ * The leading run is 0..min(32, max_bytes) bytes and the trailing run is
+ * 0..min(64, max_bytes) bytes.
+ *
+ * NOTE(review): three `$prefixes` entries are empty strings and two are
+ * non-ASCII characters, so the chosen entry does not always contain '&'.
+ * The literals may have been altered in transit — confirm against the
+ * committed file.
+ *
+ * @return string Raw bytes.
+ */
+ private function gen_bytes_with_amp(): string {
+ $prefixes = array( '&', '', '', '', '&', '¬', '©', '&NoSuchEntity;' );
+ $payload = $this->prng->bytes( $this->prng->int( 0, min( 32, $this->max_bytes ) ) );
+ $payload .= $this->prng->choice( $prefixes );
+ $payload .= $this->prng->bytes( $this->prng->int( 0, min( 64, $this->max_bytes ) ) );
+ return $payload;
+ }
+
+ /**
+  * Builds a byte string from 1..12 rounds, each contributing 0..4 random
+  * bytes followed by one fixed byte sequence from the list below.
+  *
+  * @return string Raw bytes.
+  */
+ private function gen_bytes_invalid_utf8(): string {
+ 	$sequences = array(
+ 		"\x80",
+ 		"\xBF",
+ 		"\xC0\xAF",
+ 		"\xE0\x80\x80",
+ 		"\xF0\x80\x80\x80",
+ 		"\xF5\x80\x80\x80",
+ 		"\xED\xA0\x80",
+ 		"\xFE",
+ 		"\xFF",
+ 	);
+
+ 	$pieces = array();
+ 	$rounds = $this->prng->int( 1, 12 );
+ 	while ( $rounds-- > 0 ) {
+ 		$pieces[] = $this->prng->bytes( $this->prng->int( 0, 4 ) );
+ 		$pieces[] = $this->prng->choice( $sequences );
+ 	}
+
+ 	return implode( '', $pieces );
+ }
+
+ /**
+  * Builds a string of 1..24 delimiter bytes, each followed with a 35-in-100
+  * chance by 1..4 random bytes.
+  *
+  * @return string Raw bytes.
+  */
+ private function gen_bytes_delimiters(): string {
+ 	$choices = array( "\x00", "\r", '<', '"', '&', '=', "\n", "\t", "\f" );
+ 	$pieces  = array();
+ 	$rounds  = $this->prng->int( 1, 24 );
+ 	while ( $rounds-- > 0 ) {
+ 		$pieces[] = $this->prng->choice( $choices );
+ 		if ( ! $this->prng->chance( 35 ) ) {
+ 			continue;
+ 		}
+ 		$pieces[] = $this->prng->bytes( $this->prng->int( 1, 4 ) );
+ 	}
+ 	return implode( '', $pieces );
+ }
+
+ /**
+  * Returns '&' followed by a name from pick_semicolon_name().
+  *
+  * @return string Reference text.
+  */
+ private function named_exact(): string {
+ 	$name = $this->pick_semicolon_name();
+ 	return "&{$name}";
+ }
+
+ /**
+  * Picks a name from $this->semicolon_names, favouring (75 in 100) the ones
+  * also listed in PREFERRED_SEMICOLON when any of those are available.
+  *
+  * @return string The chosen name.
+  */
+ private function pick_semicolon_name(): string {
+ 	$favoured = array_values( array_intersect( self::PREFERRED_SEMICOLON, $this->semicolon_names ) );
+
+ 	// The chance roll is only consumed when there is a favoured pool.
+ 	$use_favoured = count( $favoured ) > 0 && $this->prng->chance( 75 );
+ 	$pool         = $use_favoured ? $favoured : $this->semicolon_names;
+
+ 	return $this->prng->choice( $pool );
+ }
+
+ /**
+  * Picks a name from $this->legacy_names, favouring (80 in 100) the ones also
+  * listed in PREFERRED_LEGACY when any of those are available.
+  *
+  * @return string The chosen name.
+  */
+ private function pick_legacy_name(): string {
+ 	$favoured = array_values( array_intersect( self::PREFERRED_LEGACY, $this->legacy_names ) );
+
+ 	// The chance roll is only consumed when there is a favoured pool.
+ 	$use_favoured = count( $favoured ) > 0 && $this->prng->chance( 80 );
+ 	$pool         = $use_favoured ? $favoured : $this->legacy_names;
+
+ 	return $this->prng->choice( $pool );
+ }
+
+ /**
+  * Sorts names in place: longest first, ties broken by strcmp() order.
+  *
+  * @param string[] $names List to sort; re-indexed by usort().
+  */
+ private static function sort_reference_names( array &$names ): void {
+ 	$longest_first = static function ( string $a, string $b ): int {
+ 		$by_length = strlen( $b ) <=> strlen( $a );
+ 		if ( 0 !== $by_length ) {
+ 			return $by_length;
+ 		}
+ 		return strcmp( $a, $b );
+ 	};
+
+ 	usort( $names, $longest_first );
+ }
+
+ /**
+  * Returns the unique names from the semicolon and legacy lists with any
+  * trailing ';' removed, memoised on the instance.
+  *
+  * @return string[]
+  */
+ private function name_sweep_base_names(): array {
+ 	if ( null === $this->name_sweep_base_names ) {
+ 		$seen = array();
+ 		foreach ( array( $this->semicolon_names, $this->legacy_names ) as $source ) {
+ 			foreach ( $source as $name ) {
+ 				$stripped = rtrim( $name, ';' );
+ 				if ( '' === $stripped ) {
+ 					continue;
+ 				}
+ 				// Keyed by name so duplicates collapse while first-seen order is kept.
+ 				$seen[ $stripped ] = true;
+ 			}
+ 		}
+ 		$this->name_sweep_base_names = array_keys( $seen );
+ 	}
+
+ 	return $this->name_sweep_base_names;
+ }
+
+ /**
+  * Returns name_sweep_base_names() as a lookup map (name => true), memoised
+  * on the instance.
+  *
+  * @return array
+  */
+ private function name_sweep_base_name_set(): array {
+ 	if ( null !== $this->name_sweep_base_name_set ) {
+ 		return $this->name_sweep_base_name_set;
+ 	}
+
+ 	$lookup                         = array_fill_keys( $this->name_sweep_base_names(), true );
+ 	$this->name_sweep_base_name_set = $lookup;
+ 	return $lookup;
+ }
+
+ /**
+ * Returns the fixed list of follower strings for the name sweep: the empty
+ * string, seven ASCII characters, and U+00E9.
+ *
+ * @return string[]
+ */
+ private static function name_sweep_followers(): array {
+ return array( '', 'x', 'X', '0', '=', '-', ' ', '/', "\u{00E9}" );
+ }
+
+ /**
+  * Returns the follower strings for the legacy-name sweep, built once per
+  * process.
+  *
+  * The list holds every single byte 0x01..0x7F except 0x0D, 0x22 and 0x3C,
+  * one multi-byte sequence per lead byte 0xC2..0xF4, and every two-byte
+  * sequence 0xC2 0x80..0xBF, with duplicates removed.
+  *
+  * @return string[]
+  */
+ private static function legacy_follower_sweep_followers(): array {
+ 	static $memo = null;
+ 	if ( null === $memo ) {
+ 		$list     = array();
+ 		$excluded = array(
+ 			0x0D => true,
+ 			0x22 => true,
+ 			0x3C => true,
+ 		);
+
+ 		for ( $code = 1; $code <= 0x7F; $code++ ) {
+ 			if ( ! isset( $excluded[ $code ] ) ) {
+ 				$list[] = chr( $code );
+ 			}
+ 		}
+
+ 		for ( $lead = 0xC2; $lead <= 0xF4; $lead++ ) {
+ 			// 0xE0 and 0xF0 get a different second byte than their neighbours.
+ 			if ( 0xE0 === $lead ) {
+ 				$tail = "\xA0\x80";
+ 			} elseif ( 0xF0 === $lead ) {
+ 				$tail = "\x90\x80\x80";
+ 			} elseif ( $lead < 0xE0 ) {
+ 				$tail = "\x80";
+ 			} elseif ( $lead < 0xF0 ) {
+ 				$tail = "\x80\x80";
+ 			} else {
+ 				$tail = "\x80\x80\x80";
+ 			}
+ 			$list[] = chr( $lead ) . $tail;
+ 		}
+
+ 		for ( $second = 0x80; $second <= 0xBF; $second++ ) {
+ 			$list[] = "\xC2" . chr( $second );
+ 		}
+
+ 		$memo = array_values( array_unique( $list ) );
+ 	}
+
+ 	return $memo;
+ }
+
+ /**
+  * Expands each known prefix-family reference into one case per split offset
+  * and follower.
+  *
+  * References whose name (without trailing ';') is absent from the base-name
+  * set are skipped. Split offsets run from 1 to one less than the length of
+  * '&' . reference.
+  *
+  * @return array List of arrays with 'reference', 'split' and 'follower' keys.
+  */
+ private function prefix_family_sweep_cases(): array {
+ 	$known  = $this->name_sweep_base_name_set();
+ 	$result = array();
+
+ 	foreach ( self::prefix_family_sweep_references() as $reference ) {
+ 		if ( ! isset( $known[ rtrim( $reference, ';' ) ] ) ) {
+ 			continue;
+ 		}
+
+ 		$text = '&' . $reference;
+ 		$end  = strlen( $text );
+ 		for ( $cut = 1; $cut < $end; $cut++ ) {
+ 			foreach ( self::prefix_family_sweep_followers() as $suffix ) {
+ 				$result[] = array(
+ 					'reference' => $text,
+ 					'split'     => $cut,
+ 					'follower'  => $suffix,
+ 				);
+ 			}
+ 		}
+ 	}
+
+ 	return $result;
+ }
+
+ /**
+ * Returns the fixed list of reference names (without the leading '&') used
+ * by prefix_family_sweep_cases(). Several entries are prefixes of others.
+ *
+ * @return string[]
+ */
+ private static function prefix_family_sweep_references(): array {
+ return array(
+ 'not',
+ 'not;',
+ 'notin;',
+ 'notinva;',
+ 'ngt;',
+ 'nGt;',
+ 'nGtv;',
+ 'nge;',
+ 'ngeq;',
+ 'ngeqq;',
+ );
+ }
+
+ /**
+ * Returns the fixed list of follower strings used by
+ * prefix_family_sweep_cases(): the empty string, four ASCII characters, and
+ * U+00E9.
+ *
+ * @return string[]
+ */
+ private static function prefix_family_sweep_followers(): array {
+ return array( '', 'x', 'X', '0', '=', "\u{00E9}" );
+ }
+
+ /**
+  * Returns the digit-count boundary references for every numeric kind, built
+  * once per process.
+  *
+  * For each kind the sweep covers the boundary digit count (7 for decimal,
+  * 6 for hex) and one more digit, each with and without a leading zero and
+  * with and without a trailing semicolon.
+  *
+  * The de-duplicated, re-indexed list is what gets cached, so every call
+  * returns the same array. Previously only the first call was de-duplicated
+  * and later calls returned the raw list.
+  *
+  * @return string[]
+  */
+ private static function numeric_boundary_sweep_cases(): array {
+ 	static $cases = null;
+ 	if ( null !== $cases ) {
+ 		return $cases;
+ 	}
+
+ 	$built = array();
+ 	foreach ( array( 'decimal', 'hex-lower', 'hex-upper', 'hex-mixed' ) as $kind ) {
+ 		$max_digits = 'decimal' === $kind ? 7 : 6;
+ 		foreach ( array( $max_digits, $max_digits + 1 ) as $digit_count ) {
+ 			foreach ( array( false, true ) as $leading_zero ) {
+ 				foreach ( array( false, true ) as $semicolon ) {
+ 					$built[] = self::numeric_boundary_reference( $kind, $digit_count, $leading_zero, $semicolon );
+ 				}
+ 			}
+ 		}
+ 	}
+
+ 	// Assign only the finished list so the cache never holds a partial or raw result.
+ 	$cases = array_values( array_unique( $built ) );
+ 	return $cases;
+ }
+
+ /**
+  * Builds one numeric character reference for the digit-count boundary sweep.
+  *
+  * The prefix was empty in every branch, so the result was a bare digit
+  * string rather than a reference. It is now '&#' for decimal, '&#X' for
+  * 'hex-upper' and '&#x' for the other hex kinds.
+  *
+  * @param string $kind         'decimal', 'hex-lower', 'hex-upper' or 'hex-mixed'.
+  * @param int    $digit_count  Number of digits before any leading zero is added.
+  * @param bool   $leading_zero Whether to prepend a single '0' to the digits.
+  * @param bool   $semicolon    Whether to terminate the reference with ';'.
+  * @return string The reference text.
+  */
+ private static function numeric_boundary_reference( string $kind, int $digit_count, bool $leading_zero, bool $semicolon ): string {
+ 	if ( 'decimal' === $kind ) {
+ 		$prefix = '&#';
+ 		// 1114111 is 0x10FFFF in decimal; other digit counts are filled with nines.
+ 		$digits = 7 === $digit_count ? '1114111' : str_repeat( '9', $digit_count );
+ 	} else {
+ 		$prefix = 'hex-upper' === $kind ? '&#X' : '&#x';
+ 		$digits = 6 === $digit_count
+ 			? '10ffee'
+ 			: substr( str_repeat( 'abcdef', (int) ceil( $digit_count / 6 ) ), 0, $digit_count );
+
+ 		if ( 'hex-upper' === $kind ) {
+ 			$digits = strtoupper( $digits );
+ 		} elseif ( 'hex-mixed' === $kind ) {
+ 			// Upper-case every even-indexed digit to mix the letter case.
+ 			$chars = str_split( $digits );
+ 			foreach ( $chars as $i => $char ) {
+ 				if ( 0 === $i % 2 ) {
+ 					$chars[ $i ] = strtoupper( $char );
+ 				}
+ 			}
+ 			$digits = implode( '', $chars );
+ 		}
+ 	}
+
+ 	if ( $leading_zero ) {
+ 		$digits = '0' . $digits;
+ 	}
+
+ 	return $prefix . $digits . ( $semicolon ? ';' : '' );
+ }
+
+ /**
+ * Returns the seed corpus, built once per process.
+ *
+ * The list starts from the literals below, then appends the payload (index 1)
+ * of every Oracles::battery() vector and every html5lib payload. It is then
+ * filtered through is_oracle_safe_payload(), de-duplicated and re-indexed.
+ * If nothing survives, the corpus is the single payload '&'.
+ *
+ * NOTE(review): some literals below are empty or repeated and may have lost
+ * character-reference text in transit — confirm against the committed file.
+ *
+ * @return string[]
+ */
+ private static function corpus_payloads(): array {
+ static $payloads = null;
+ if ( null !== $payloads ) {
+ return $payloads;
+ }
+
+ $payloads = array(
+ '',
+ 'plain text',
+ 'FOO>BAR',
+ 'FOO>BAR',
+ 'FOO>;;BAR',
+ 'FOO&&&>BAR',
+ "I'm ¬it; I tell you",
+ "I'm ∉ I tell you",
+ '&ammmp;',
+ '&',
+ '∉∉¬',
+ 'ZZ>=YY',
+ 'ZZ>0YY',
+ 'ZZ> YY',
+ 'ZZ>',
+ 'javascript:alert(1)',
+ 'javascript:alert(1)',
+ 'javascript:alert(1)',
+ '',
+ '',
+ '<⃒tail',
+ '&NoSuchEntity;&',
+ );
+
+ foreach ( Oracles::battery() as $vector ) {
+ $payloads[] = $vector[1];
+ }
+
+ foreach ( self::html5lib_entity_payloads() as $payload ) {
+ $payloads[] = $payload;
+ }
+
+ $payloads = array_values(
+ array_unique(
+ array_filter(
+ $payloads,
+ static fn( string $payload ): bool => self::is_oracle_safe_payload( $payload )
+ )
+ )
+ );
+
+ // Never hand back an empty corpus.
+ if ( array() === $payloads ) {
+ $payloads = array( '&' );
+ }
+
+ return $payloads;
+ }
+
+ /**
+  * Returns the sweep cases derived from Bootstrap::named_reference_structure(),
+  * built once per process.
+  *
+  * Three families are produced: a divergent-suffix payload per group prefix,
+  * exact and 'Q;'-extended payloads per small name, and the same pair for
+  * each large name whose length is key_length + 1. Cases whose payload fails
+  * is_oracle_safe_payload() are dropped.
+  *
+  * The single fallback case is cached as well. Previously it was returned
+  * without being stored, so a second call returned an empty array.
+  *
+  * @return array List of case arrays; each has 'shape' and 'payload' keys.
+  */
+ private static function token_map_sweep_cases(): array {
+ 	static $cases = null;
+ 	if ( null !== $cases ) {
+ 		return $cases;
+ 	}
+
+ 	$structure  = Bootstrap::named_reference_structure();
+ 	$key_length = $structure['key_length'];
+ 	$built      = array();
+
+ 	foreach ( $structure['group_prefixes'] as $prefix ) {
+ 		$built[] = array(
+ 			'shape'   => 'large-prefix-divergent',
+ 			'prefix'  => $prefix,
+ 			'payload' => '&' . $prefix . self::token_map_divergent_suffix( $prefix, $structure['large_names_by_prefix'][ $prefix ] ?? array() ),
+ 		);
+ 	}
+
+ 	foreach ( $structure['small_names'] as $name ) {
+ 		$built[] = array(
+ 			'shape'   => 'small-boundary-exact',
+ 			'name'    => $name,
+ 			'payload' => '&' . $name,
+ 		);
+ 		$built[] = array(
+ 			'shape'   => 'small-boundary-extended',
+ 			'name'    => $name,
+ 			'payload' => '&' . $name . 'Q;',
+ 		);
+ 	}
+
+ 	foreach ( $structure['large_names'] as $name ) {
+ 		// Only names exactly one byte longer than the key length are swept here.
+ 		if ( strlen( $name ) !== $key_length + 1 ) {
+ 			continue;
+ 		}
+
+ 		$built[] = array(
+ 			'shape'   => 'large-boundary-exact',
+ 			'name'    => $name,
+ 			'payload' => '&' . $name,
+ 		);
+ 		$built[] = array(
+ 			'shape'   => 'large-boundary-extended',
+ 			'name'    => $name,
+ 			'payload' => '&' . $name . 'Q;',
+ 		);
+ 	}
+
+ 	$built = array_values(
+ 		array_filter(
+ 			$built,
+ 			static fn( array $case ): bool => self::is_oracle_safe_payload( $case['payload'] )
+ 		)
+ 	);
+
+ 	if ( array() === $built ) {
+ 		$built = array(
+ 			array(
+ 				'shape'   => 'fallback',
+ 				'payload' => '&NoSuchEntity;',
+ 			),
+ 		);
+ 	}
+
+ 	// Store the final list, fallback included, so every call agrees.
+ 	$cases = $built;
+ 	return $cases;
+ }
+
+ /**
+  * Builds a suffix whose first character differs from the character that
+  * follows $prefix in every given name.
+  *
+  * @param string   $prefix Prefix shared by $names.
+  * @param string[] $names  Names whose byte after the prefix must be avoided.
+  * @return string An unused NAME_MUTATION_ALPHABET character followed by
+  *                'QQ;', or '_QQ;' when every alphabet character is taken.
+  */
+ private static function token_map_divergent_suffix( string $prefix, array $names ): string {
+ 	$skip  = strlen( $prefix );
+ 	$taken = array();
+ 	foreach ( $names as $name ) {
+ 		$remainder = substr( $name, $skip );
+ 		if ( '' === $remainder ) {
+ 			continue;
+ 		}
+ 		$taken[ $remainder[0] ] = true;
+ 	}
+
+ 	$alphabet = self::NAME_MUTATION_ALPHABET;
+ 	$size     = strlen( $alphabet );
+ 	for ( $position = 0; $position < $size; $position++ ) {
+ 		$candidate = $alphabet[ $position ];
+ 		if ( isset( $taken[ $candidate ] ) ) {
+ 			continue;
+ 		}
+ 		return $candidate . 'QQ;';
+ 	}
+
+ 	return '_QQ;';
+ }
+
+ /**
+  * Collects payloads from the html5lib entity test files, when present.
+  *
+  * For each '#data' marker line, the following line is passed through
+  * html5lib_entity_payload_from_data_line(). Missing or unreadable files are
+  * skipped.
+  *
+  * Payloads longer than 512 bytes are shortened with mb_strcut() so the cut
+  * never falls inside a multi-byte UTF-8 character. A plain substr() could
+  * leave a truncated, invalid sequence at the end.
+  *
+  * @return string[]
+  */
+ private static function html5lib_entity_payloads(): array {
+ 	$payloads = array();
+ 	foreach ( array( 'entities01.dat', 'entities02.dat' ) as $file ) {
+ 		$path = Bootstrap::repo_root() . '/tests/phpunit/data/html5lib-tests/tree-construction/' . $file;
+ 		if ( ! is_file( $path ) ) {
+ 			continue;
+ 		}
+
+ 		$lines = file( $path, FILE_IGNORE_NEW_LINES );
+ 		if ( ! is_array( $lines ) ) {
+ 			continue;
+ 		}
+
+ 		// A '#data' marker on the very last line has no payload line after it.
+ 		$last = count( $lines ) - 1;
+ 		for ( $i = 0; $i < $last; $i++ ) {
+ 			if ( '#data' !== $lines[ $i ] ) {
+ 				continue;
+ 			}
+
+ 			$payload = self::html5lib_entity_payload_from_data_line( $lines[ $i + 1 ] );
+ 			if ( strlen( $payload ) > 512 ) {
+ 				$payload = mb_strcut( $payload, 0, 512, 'UTF-8' );
+ 			}
+ 			$payloads[] = $payload;
+ 		}
+ 	}
+
+ 	return $payloads;
+ }
+
+ /**
+ * Extracts the payload from one html5lib '#data' line.
+ *
+ * Tries two patterns in order and returns the first non-empty capture;
+ * when neither matches, the line is returned unchanged.
+ *
+ * NOTE(review): both pattern literals look damaged here (the first has
+ * unbalanced brackets and the second spans two lines). Confirm them against
+ * the committed file before editing.
+ *
+ * @param string $line Line that follows a '#data' marker.
+ * @return string Extracted payload, or $line itself.
+ */
+ private static function html5lib_entity_payload_from_data_line( string $line ): string {
+ if ( 1 === preg_match( '/^\s]+))><\/div>$/', $line, $match ) ) {
+ foreach ( array( 1, 2, 3 ) as $index ) {
+ if ( isset( $match[ $index ] ) && '' !== $match[ $index ] ) {
+ return $match[ $index ];
+ }
+ }
+ }
+
+ if ( 1 === preg_match( '/^
(.*)<\/div>$/', $line, $match ) ) {
+ return $match[1];
+ }
+
+ return $line;
+ }
+
+ /**
+ * Generates one numeric character reference string.
+ *
+ * Picks decimal or hex with equal weight, or, when allowed, a
+ * 'missing digits' form with weight 10. The value comes from
+ * numeric_code_point(). Hex digits are upper-cased half the time, 1..10
+ * leading zeros are added 35 times in 100, and a ';' is appended 82 times
+ * in 100.
+ *
+ * NOTE(review): every prefix literal and every 'missing' choice below is an
+ * empty string, so the output carries no '&#' marker. They look like they
+ * lost text in transit — confirm against the committed file.
+ *
+ * @param bool $allow_missing_digits Whether the 'missing' form may be chosen.
+ * @return string Generated reference text.
+ */
+ private function numeric_reference( bool $allow_missing_digits = false ): string {
+ $kind = $this->prng->weighted(
+ array(
+ 'decimal' => 45,
+ 'hex' => 45,
+ 'missing' => $allow_missing_digits ? 10 : 0,
+ )
+ );
+
+ if ( 'missing' === $kind ) {
+ return $this->prng->choice( array( '', '', '' ) );
+ }
+
+ $value = $this->numeric_code_point( 'hex' === $kind ? 16 : 10 );
+
+ if ( 'hex' === $kind ) {
+ $digits = dechex( $value );
+ if ( $this->prng->chance( 50 ) ) {
+ $digits = strtoupper( $digits );
+ }
+ $prefix = $this->prng->chance( 50 ) ? '' : '';
+ } else {
+ $digits = (string) $value;
+ $prefix = '';
+ }
+
+ if ( $this->prng->chance( 35 ) ) {
+ $digits = str_repeat( '0', $this->prng->int( 1, 10 ) ) . $digits;
+ }
+
+ return $prefix . $digits . ( $this->prng->chance( 82 ) ? ';' : '' );
+ }
+
+ /**
+  * Draws a code point value from a weighted set of ranges.
+  *
+  * The ranges cover zero, C0 and C1 controls, ASCII, BMP, surrogates,
+  * noncharacters, astral planes, values above 0x10FFFF, and values that need
+  * one more digit than the previous range's upper bound.
+  *
+  * @param int $numeric_base 16 selects the hex upper bounds; anything else
+  *                          selects the decimal ones.
+  * @return int The drawn value.
+  */
+ private function numeric_code_point( int $numeric_base ): int {
+ 	$range = $this->prng->weighted(
+ 		array(
+ 			'zero'                       => 5,
+ 			'c0-control'                 => 8,
+ 			'ascii'                      => 10,
+ 			'c1-control'                 => 14,
+ 			'bmp'                        => 12,
+ 			'surrogate'                  => 12,
+ 			'bmp-noncharacter'           => 8,
+ 			'plane-noncharacter'         => 10,
+ 			'astral'                     => 10,
+ 			'above-unicode-legal-digits' => 8,
+ 			'digit-count-overflow'       => 5,
+ 		)
+ 	);
+ 	$is_hex = 16 === $numeric_base;
+
+ 	if ( 'zero' === $range ) {
+ 		return 0;
+ 	}
+ 	if ( 'c0-control' === $range ) {
+ 		return $this->prng->int( 1, 0x1F );
+ 	}
+ 	if ( 'ascii' === $range ) {
+ 		return $this->prng->int( 0x20, 0x7F );
+ 	}
+ 	if ( 'c1-control' === $range ) {
+ 		return $this->prng->int( 0x80, 0x9F );
+ 	}
+ 	if ( 'bmp' === $range ) {
+ 		// Three sub-ranges that skip the surrogates and U+FDD0..U+FDEF.
+ 		if ( $this->prng->chance( 50 ) ) {
+ 			return $this->prng->int( 0xA0, 0xD7FF );
+ 		}
+ 		return $this->prng->chance( 50 )
+ 			? $this->prng->int( 0xE000, 0xFDCF )
+ 			: $this->prng->int( 0xFDF0, 0xFFFD );
+ 	}
+ 	if ( 'surrogate' === $range ) {
+ 		return $this->prng->int( 0xD800, 0xDFFF );
+ 	}
+ 	if ( 'bmp-noncharacter' === $range ) {
+ 		return $this->prng->chance( 75 )
+ 			? $this->prng->int( 0xFDD0, 0xFDEF )
+ 			: $this->prng->choice( array( 0xFFFE, 0xFFFF ) );
+ 	}
+ 	if ( 'plane-noncharacter' === $range ) {
+ 		$plane = $this->prng->int( 1, 16 ) << 16;
+ 		return $plane + $this->prng->choice( array( 0xFFFE, 0xFFFF ) );
+ 	}
+ 	if ( 'astral' === $range ) {
+ 		$plane = $this->prng->int( 1, 16 ) << 16;
+ 		return $plane + $this->prng->int( 0, 0xFFFD );
+ 	}
+ 	if ( 'above-unicode-legal-digits' === $range ) {
+ 		return $this->prng->int( 0x110000, $is_hex ? 0xFFFFFF : 9999999 );
+ 	}
+ 	if ( 'digit-count-overflow' === $range ) {
+ 		$low  = $is_hex ? 0x1000000 : 10000000;
+ 		$high = $is_hex ? 0xFFFFFFF : 99999999;
+ 		return $this->prng->int( $low, $high );
+ 	}
+
+ 	return 0x41;
+ }
+
+ /**
+  * Generates plain text of at most min(128, max_bytes) bytes.
+  *
+  * @param bool $allow_amp Forwarded to plain_text_up_to().
+  * @return string Generated text.
+  */
+ private function plain_text( bool $allow_amp = false ): string {
+ 	$cap = min( 128, $this->max_bytes );
+ 	return $this->plain_text_up_to( $cap, $allow_amp );
+ }
+
+ /**
+  * Generates text from ASCII_ALPHABET with a biased random length.
+  *
+  * @param int  $max_bytes Upper bound for the length; negatives count as 0.
+  * @param bool $allow_amp When true, each position has a 3-in-100 chance of
+  *                        being '&' instead of an alphabet character.
+  * @return string Generated text, possibly empty.
+  */
+ private function plain_text_up_to( int $max_bytes, bool $allow_amp = false ): string {
+ 	$target = $this->prng->biased_length( max( 0, $max_bytes ) );
+ 	$text   = '';
+
+ 	for ( $written = 0; $written < $target; $written++ ) {
+ 		// The chance roll is only consumed when ampersands are allowed.
+ 		if ( $allow_amp && $this->prng->chance( 3 ) ) {
+ 			$text .= '&';
+ 		} else {
+ 			$text .= self::ASCII_ALPHABET[ $this->prng->int( 0, strlen( self::ASCII_ALPHABET ) - 1 ) ];
+ 		}
+ 	}
+
+ 	return $text;
+ }
+
+ /**
+  * Generates $length characters drawn from ASCII_ALPHABET.
+  *
+  * @param int $length Number of characters; values below 1 yield ''.
+  * @return string Generated run.
+  */
+ private function ascii_run( int $length ): string {
+ 	$last = strlen( self::ASCII_ALPHABET ) - 1;
+ 	$run  = '';
+ 	while ( $length-- > 0 ) {
+ 		$run .= self::ASCII_ALPHABET[ $this->prng->int( 0, $last ) ];
+ 	}
+ 	return $run;
+ }
+
+ /**
+  * Generates $length random decimal digits.
+  *
+  * @param int $length Number of digits; values below 1 yield ''.
+  * @return string Generated digits.
+  */
+ private function ascii_digits( int $length ): string {
+ 	$digits = '';
+ 	while ( $length-- > 0 ) {
+ 		$digits .= (string) $this->prng->int( 0, 9 );
+ 	}
+ 	return $digits;
+ }
+
+ /**
+  * Generates $length random hex digits in mixed case.
+  *
+  * @param int $length Number of digits; values below 1 yield ''.
+  * @return string Generated digits.
+  */
+ private function hex_digits( int $length ): string {
+ 	$pool   = '0123456789abcdefABCDEF';
+ 	$last   = strlen( $pool ) - 1;
+ 	$result = '';
+ 	while ( $length-- > 0 ) {
+ 		$result .= $pool[ $this->prng->int( 0, $last ) ];
+ 	}
+ 	return $result;
+ }
+
+ /**
+  * Rewrites the bytes listed in the replacement map and trims the payload to
+  * valid UTF-8 of at most $max_bytes bytes.
+  *
+  * '<' and NUL are removed, '"' becomes "'", and CR becomes LF. Trailing
+  * bytes are then dropped until the string is valid UTF-8, both before and
+  * after the length cut.
+  *
+  * @param string $payload   Input bytes.
+  * @param int    $max_bytes Maximum length of the result in bytes.
+  * @return string The sanitised payload.
+  */
+ private static function trim_to_safe_max( string $payload, int $max_bytes ): string {
+ 	$drop_invalid_tail = static function ( string $bytes ): string {
+ 		while ( '' !== $bytes && ! mb_check_encoding( $bytes, 'UTF-8' ) ) {
+ 			$bytes = substr( $bytes, 0, -1 );
+ 		}
+ 		return $bytes;
+ 	};
+
+ 	$clean = $drop_invalid_tail(
+ 		strtr(
+ 			$payload,
+ 			array(
+ 				'<'    => '',
+ 				'"'    => "'",
+ 				"\r"   => "\n",
+ 				"\x00" => '',
+ 			)
+ 		)
+ 	);
+
+ 	if ( strlen( $clean ) > $max_bytes ) {
+ 		// The cut may split a multi-byte character, so re-validate the tail.
+ 		return $drop_invalid_tail( substr( $clean, 0, $max_bytes ) );
+ 	}
+
+ 	return $clean;
+ }
+}
diff --git a/tools/html-decoder-fuzz/lib/Oracles.php b/tools/html-decoder-fuzz/lib/Oracles.php
new file mode 100644
index 0000000000000..77b4b61f7cc7c
--- /dev/null
+++ b/tools/html-decoder-fuzz/lib/Oracles.php
@@ -0,0 +1,304 @@
+ */
+ private array $events = array();
+
+ private bool $dom_available = false;
+ private bool $entity_decode_available = true;
+ private bool $mb_available = false;
+
+ /**
+  * Creates an instance, probes which oracles this PHP build offers, and runs
+  * the self-check batteries for the ones that are present.
+  *
+  * One 'oracle-unavailable' event is queued per missing oracle, in the order
+  * dom, entity-decode, mb.
+  *
+  * @return self
+  */
+ public static function build(): self {
+ 	$instance = new self();
+
+ 	$instance->dom_available           = class_exists( \Dom\HTMLDocument::class );
+ 	$instance->entity_decode_available = function_exists( 'html_entity_decode' ) && defined( 'ENT_HTML5' ) && defined( 'ENT_QUOTES' );
+ 	$instance->mb_available            = function_exists( 'mb_check_encoding' );
+
+ 	$probes = array(
+ 		'dom'           => array( $instance->dom_available, 'PHP 8.4 Dom\\HTMLDocument is required' ),
+ 		'entity-decode' => array( $instance->entity_decode_available, 'html_entity_decode with ENT_HTML5 and ENT_QUOTES is required' ),
+ 		'mb'            => array( $instance->mb_available, 'mb_check_encoding is required for UTF-8 output checks' ),
+ 	);
+ 	foreach ( $probes as $oracle => $probe ) {
+ 		if ( $probe[0] ) {
+ 			continue;
+ 		}
+ 		$instance->events[] = array(
+ 			'type'   => 'oracle-unavailable',
+ 			'oracle' => $oracle,
+ 			'detail' => $probe[1],
+ 		);
+ 	}
+
+ 	// The batteries may switch an oracle off again if it disagrees with the known answers.
+ 	if ( $instance->dom_available ) {
+ 		$instance->verify_battery();
+ 	}
+ 	if ( $instance->entity_decode_available ) {
+ 		$instance->verify_entity_decode_battery();
+ 	}
+
+ 	return $instance;
+ }
+
+ /**
+ * Lists the known-answer vectors that verify_battery() replays against the
+ * DOM oracle.
+ *
+ * NOTE(review): several payloads and expectations below are empty strings or
+ * identical to their neighbours; they may have lost character-reference text
+ * in transit — confirm against the committed file.
+ *
+ * @return array List of [context, payload, expected] triples.
+ */
+ public static function battery(): array {
+ return array(
+ array( 'text', '', '' ),
+ array( 'attribute', '', '' ),
+ array( 'text', 'plain text', 'plain text' ),
+ array( 'attribute', 'plain text', 'plain text' ),
+ array( 'text', '&', '&' ),
+ array( 'attribute', '&', '&' ),
+ array( 'text', '&', '&' ),
+ array( 'attribute', '&', '&' ),
+ array( 'text', '&', '&' ),
+ array( 'attribute', '&', '&' ),
+ array( 'text', '&x', '&x' ),
+ array( 'attribute', '&x', '&x' ),
+ array( 'text', '∉', "\u{2209}" ),
+ array( 'attribute', '∉', "\u{2209}" ),
+ array( 'text', '¬in', "\u{00AC}" . 'in' ),
+ array( 'attribute', '¬in', '¬in' ),
+ array( 'text', '&NoSuchEntity;', '&NoSuchEntity;' ),
+ array( 'attribute', '&NoSuchEntity;', '&NoSuchEntity;' ),
+ array( 'text', '', "\u{20AC}" ),
+ array( 'attribute', '', "\u{20AC}" ),
+ array( 'text', '', "\u{20AC}" ),
+ array( 'attribute', '', "\u{20AC}" ),
+ array( 'text', '', "\u{FFFD}" ),
+ array( 'attribute', '', "\u{FFFD}" ),
+ array( 'text', '', "\u{FFFD}" ),
+ array( 'attribute', '', "\u{FFFD}" ),
+ array( 'text', '', "\u{FFFD}" ),
+ array( 'attribute', '', "\u{FFFD}" ),
+ array( 'text', '', '' ),
+ array( 'attribute', '', '' ),
+ array( 'text', '', '' ),
+ array( 'attribute', '', '' ),
+ array( 'text', 'a:b', 'a:b' ),
+ array( 'attribute', 'a:b', 'a:b' ),
+ );
+ }
+
+ /**
+  * Reports whether the dom, entity-decode and mb oracles are all available.
+  *
+  * @return bool
+  */
+ public function has_required(): bool {
+ 	if ( ! $this->dom_available ) {
+ 		return false;
+ 	}
+ 	if ( ! $this->entity_decode_available ) {
+ 		return false;
+ 	}
+ 	return $this->mb_available;
+ }
+
+ /**
+  * Lists the identifiers of the currently available oracles, in the order
+  * dom, entity-decode, mb.
+  *
+  * @return string[]
+  */
+ public function names(): array {
+ 	$availability = array(
+ 		'dom'           => $this->dom_available,
+ 		'entity-decode' => $this->entity_decode_available,
+ 		'mb'            => $this->mb_available,
+ 	);
+
+ 	return array_keys( array_filter( $availability ) );
+ }
+
+ /**
+  * Returns all queued events and empties the queue.
+  *
+  * @return array
+  */
+ public function drain_events(): array {
+ 	$pending      = $this->events;
+ 	$this->events = array();
+
+ 	return $pending;
+ }
+
+ /**
+  * Decodes a payload with the DOM oracle for the given context.
+  *
+  * @param string $context 'text' or 'attribute'.
+  * @param string $payload Input to decode.
+  * @return string Decoded value.
+  * @throws \InvalidArgumentException When $context is neither supported value.
+  */
+ public function decode( string $context, string $payload ): string {
+ 	switch ( $context ) {
+ 		case 'text':
+ 			return $this->decode_text( $payload );
+
+ 		case 'attribute':
+ 			return $this->decode_attribute( $payload );
+
+ 		default:
+ 			throw new \InvalidArgumentException( "Unknown context {$context}" );
+ 	}
+ }
+
+ /**
+  * Decodes a text payload with html_entity_decode(), when applicable.
+  *
+  * @param string $payload Input to decode.
+  * @return string|null Decoded text, or null when the oracle is unavailable
+  *                     or the payload is outside the supported subset.
+  */
+ public function decode_text_with_entity_decode( string $payload ): ?string {
+ 	$usable = $this->entity_decode_available && self::supports_entity_decode_text_payload( $payload );
+ 	if ( $usable ) {
+ 		return html_entity_decode( $payload, ENT_HTML5 | ENT_QUOTES, 'UTF-8' );
+ 	}
+
+ 	return null;
+ }
+
+ /**
+ * Decodes a payload as text content by parsing it inside a wrapper and
+ * reading back the textContent of the element with id 'fuzz'.
+ *
+ * NOTE(review): the wrapper markup in the string literals appears to be
+ * missing here — confirm against the committed file.
+ *
+ * @param string $payload Input placed in the wrapper.
+ * @return string Text content reported by the DOM.
+ * @throws \RuntimeException When the wrapper element cannot be found.
+ */
+ private function decode_text( string $payload ): string {
+ $document = $this->parse( '' . $payload . '
' );
+ $div = $document->getElementById( 'fuzz' );
+ if ( null === $div ) {
+ throw new \RuntimeException( 'DOM oracle could not find text wrapper element.' );
+ }
+
+ return $div->textContent;
+ }
+
+ /**
+ * Decodes a payload as an attribute value by reading the 'title' attribute
+ * of the element with id 'fuzz' from a parsed document.
+ *
+ * NOTE(review): the markup literal passed to parse() is empty here and
+ * $payload is not referenced; the literal appears to have been lost in
+ * transit — confirm against the committed file.
+ *
+ * @param string $payload Input to decode.
+ * @return string Attribute value reported by the DOM.
+ * @throws \RuntimeException When the wrapper element cannot be found.
+ */
+ private function decode_attribute( string $payload ): string {
+ $document = $this->parse( '' );
+ $div = $document->getElementById( 'fuzz' );
+ if ( null === $div ) {
+ throw new \RuntimeException( 'DOM oracle could not find attribute wrapper element.' );
+ }
+
+ return $div->getAttribute( 'title' );
+ }
+
+ /**
+  * Parses an HTML string with the PHP 8.4 DOM HTML parser.
+  *
+  * Fuzz inputs routinely contain markup errors. LIBXML_NOERROR silences the
+  * parser's error reporting specifically, rather than hiding every
+  * diagnostic from the call with the `@` operator.
+  *
+  * @param string $html Markup to parse.
+  * @return \Dom\HTMLDocument Parsed document.
+  * @throws \RuntimeException When the parser does not return a document.
+  */
+ private function parse( string $html ): \Dom\HTMLDocument {
+ 	$document = \Dom\HTMLDocument::createFromString( $html, \LIBXML_NOERROR );
+ 	if ( ! $document instanceof \Dom\HTMLDocument ) {
+ 		throw new \RuntimeException( 'DOM oracle parse failed.' );
+ 	}
+
+ 	return $document;
+ }
+
+ /**
+  * Decides whether a payload stays within the subset that the
+  * html_entity_decode() oracle is used for.
+  *
+  * Every '&' must be the last byte, be followed by a non-alphanumeric byte
+  * other than '#', or start a semicolon-terminated name found in
+  * entity_decode_named_reference_set().
+  *
+  * @param string $payload Candidate payload.
+  * @return bool True when the payload is supported.
+  */
+ private static function supports_entity_decode_text_payload( string $payload ): bool {
+ 	$alnum  = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
+ 	$end    = strlen( $payload );
+ 	$cursor = 0;
+
+ 	for ( ;; ) {
+ 		$amp = strpos( $payload, '&', $cursor );
+ 		if ( false === $amp ) {
+ 			return true;
+ 		}
+
+ 		$start = $amp + 1;
+ 		if ( $start >= $end ) {
+ 			// A trailing ampersand has nothing after it to interpret.
+ 			return true;
+ 		}
+
+ 		if ( '#' === $payload[ $start ] ) {
+ 			return false;
+ 		}
+
+ 		$run = strspn( $payload, $alnum, $start );
+ 		if ( 0 === $run ) {
+ 			$cursor = $start;
+ 			continue;
+ 		}
+
+ 		$stop           = $start + $run;
+ 		$has_terminator = $stop < $end && ';' === $payload[ $stop ];
+ 		if ( ! $has_terminator ) {
+ 			return false;
+ 		}
+
+ 		$name = substr( $payload, $start, $run + 1 );
+ 		if ( ! isset( self::entity_decode_named_reference_set()[ $name ] ) ) {
+ 			return false;
+ 		}
+
+ 		$cursor = $stop + 1;
+ 	}
+ }
+
+ /**
+  * Returns a lookup map (name => true) of the names from
+  * Bootstrap::named_reference_names() that end in ';', built once per
+  * process.
+  *
+  * @return array
+  */
+ private static function entity_decode_named_reference_set(): array {
+ 	static $lookup = null;
+ 	if ( null === $lookup ) {
+ 		$collected = array();
+ 		foreach ( Bootstrap::named_reference_names() as $candidate ) {
+ 			if ( ! str_ends_with( $candidate, ';' ) ) {
+ 				continue;
+ 			}
+ 			$collected[ $candidate ] = true;
+ 		}
+ 		$lookup = $collected;
+ 	}
+
+ 	return $lookup;
+ }
+
+ /**
+  * Replays battery() against the DOM oracle and disables the oracle on the
+  * first vector that throws or returns an unexpected value.
+  *
+  * A single 'oracle-disabled' event describing the failure is queued.
+  */
+ private function verify_battery(): void {
+ 	foreach ( self::battery() as $index => $vector ) {
+ 		[ $context, $payload, $expected ] = $vector;
+
+ 		$failure = null;
+ 		$actual  = null;
+ 		try {
+ 			$actual = $this->decode( $context, $payload );
+ 		} catch ( \Throwable $error ) {
+ 			$failure = "battery vector {$index} threw " . get_class( $error ) . ': ' . $error->getMessage();
+ 		}
+
+ 		if ( null === $failure && $actual !== $expected ) {
+ 			// Hex-encode so control and non-ASCII bytes stay readable in the event.
+ 			$failure = sprintf(
+ 				'battery vector %d (%s, %s): expected %s, got %s',
+ 				$index,
+ 				$context,
+ 				bin2hex( $payload ),
+ 				bin2hex( $expected ),
+ 				bin2hex( $actual )
+ 			);
+ 		}
+
+ 		if ( null === $failure ) {
+ 			continue;
+ 		}
+
+ 		$this->dom_available = false;
+ 		$this->events[]      = array(
+ 			'type'   => 'oracle-disabled',
+ 			'oracle' => 'dom',
+ 			'detail' => $failure,
+ 		);
+ 		return;
+ 	}
+ }
+
+ /**
+ * Replays a small known-answer battery against the html_entity_decode()
+ * oracle and disables it on the first vector that throws or returns an
+ * unexpected value, queueing one 'oracle-disabled' event.
+ *
+ * NOTE(review): some payload literals below look already decoded (one even
+ * contains a raw line break) — confirm against the committed file.
+ */
+ private function verify_entity_decode_battery(): void {
+ $battery = array(
+ array( '', '' ),
+ array( 'plain text', 'plain text' ),
+ array( 'a&b', 'a&b' ),
+ array( '"'', "\"'" ),
+ array( '∉', "\u{2209}" ),
+ array( '<⃒', "<\u{20D2}" ),
+ array( '
', "\n" ),
+ );
+
+ foreach ( $battery as $i => $vector ) {
+ list( $payload, $expected ) = $vector;
+ try {
+ $got = $this->decode_text_with_entity_decode( $payload );
+ } catch ( \Throwable $error ) {
+ $this->entity_decode_available = false;
+ $this->events[] = array(
+ 'type' => 'oracle-disabled',
+ 'oracle' => 'entity-decode',
+ 'detail' => "battery vector {$i} threw " . get_class( $error ) . ': ' . $error->getMessage(),
+ );
+ return;
+ }
+
+ if ( $got !== $expected ) {
+ $this->entity_decode_available = false;
+ $this->events[] = array(
+ 'type' => 'oracle-disabled',
+ 'oracle' => 'entity-decode',
+ 'detail' => sprintf(
+ 'battery vector %d (%s): expected %s, got %s',
+ $i,
+ bin2hex( $payload ),
+ bin2hex( $expected ),
+ bin2hex( is_string( $got ) ? $got : '' )
+ ),
+ );
+ return;
+ }
+ }
+ }
+}
diff --git a/tools/html-decoder-fuzz/lib/Prng.php b/tools/html-decoder-fuzz/lib/Prng.php
new file mode 100644
index 0000000000000..09622543344a4
--- /dev/null
+++ b/tools/html-decoder-fuzz/lib/Prng.php
@@ -0,0 +1,84 @@
+seed = $seed;
+ }
+
+ /**
+  * Returns the next $length bytes of the deterministic stream.
+  *
+  * The stream is the concatenation of raw SHA-256 digests of
+  * "<seed>:<counter>" for an incrementing counter; unused bytes stay
+  * buffered for the next call.
+  *
+  * @param int $length Number of bytes wanted.
+  * @return string Raw bytes.
+  */
+ public function bytes( int $length ): string {
+ 	for ( ; strlen( $this->buffer ) < $length; $this->counter++ ) {
+ 		$this->buffer .= hash( 'sha256', $this->seed . ':' . $this->counter, true );
+ 	}
+
+ 	$taken        = substr( $this->buffer, 0, $length );
+ 	$this->buffer = (string) substr( $this->buffer, $length );
+
+ 	return $taken;
+ }
+
+ /**
+  * Reads four stream bytes as an unsigned 32-bit big-endian integer.
+  *
+  * @return int
+  */
+ public function uint32(): int {
+ 	$decoded = unpack( 'Nword', $this->bytes( 4 ) );
+
+ 	return (int) $decoded['word'];
+ }
+
+ /**
+  * Draws an integer in [$min, $max] by reducing a 32-bit draw modulo the
+  * range size.
+  *
+  * No stream bytes are consumed when $max is not greater than $min; $min is
+  * returned directly.
+  *
+  * @param int $min Lower bound, inclusive.
+  * @param int $max Upper bound, inclusive.
+  * @return int
+  */
+ public function int( int $min, int $max ): int {
+ 	if ( $max > $min ) {
+ 		$span = $max - $min + 1;
+ 		return $min + ( $this->uint32() % $span );
+ 	}
+
+ 	return $min;
+ }
+
+ /**
+  * Returns true when a draw from 1..$denominator is at most $numerator.
+  *
+  * @param int $numerator   Number of winning outcomes.
+  * @param int $denominator Total number of outcomes.
+  * @return bool
+  */
+ public function chance( int $numerator, int $denominator = 100 ): bool {
+ 	$roll = $this->int( 1, $denominator );
+
+ 	return $roll <= $numerator;
+ }
+
+ /**
+  * Returns one element of a list, selected by a PRNG draw.
+  *
+  * The element is looked up by integer position, so $values must be indexed
+  * 0..count-1.
+  *
+  * @param array $values Non-empty list to pick from.
+  * @return mixed The selected element.
+  * @throws \InvalidArgumentException When $values is empty. Previously this
+  *                                   read an undefined key and returned null.
+  */
+ public function choice( array $values ) {
+ 	if ( array() === $values ) {
+ 		throw new \InvalidArgumentException( 'Prng::choice() requires a non-empty array.' );
+ 	}
+
+ 	return $values[ $this->int( 0, count( $values ) - 1 ) ];
+ }
+
+ /**
+  * Picks a key with probability proportional to its weight.
+  *
+  * @param array $weights Map of value to integer weight.
+  * @return mixed The chosen key, or the first key when no entry is reached.
+  */
+ public function weighted( array $weights ) {
+ 	$sum  = (int) array_sum( $weights );
+ 	$roll = $this->int( 1, max( 1, $sum ) );
+
+ 	// Walk the cumulative weights until the roll falls inside an entry.
+ 	$running = 0;
+ 	foreach ( $weights as $candidate => $share ) {
+ 		$running += $share;
+ 		if ( $roll <= $running ) {
+ 			return $candidate;
+ 		}
+ 	}
+
+ 	return array_key_first( $weights );
+ }
+
+ /**
+  * Draws a length in [0, $max] that is biased toward small values.
+  *
+  * A weighted draw first selects a cap of 8, 64, 1024 or no extra cap; the
+  * length is then drawn between 0 and the smaller of that cap and $max.
+  *
+  * @param int $max Largest length allowed.
+  * @return int
+  */
+ public function biased_length( int $max ): int {
+ 	$tier = $this->weighted(
+ 		array(
+ 			'tiny'  => 38,
+ 			'short' => 38,
+ 			'mid'   => 20,
+ 			'large' => 4,
+ 		)
+ 	);
+
+ 	$caps = array(
+ 		'tiny'  => 8,
+ 		'short' => 64,
+ 		'mid'   => 1024,
+ 	);
+
+ 	// Any tier without a cap entry uses the full range.
+ 	$limit = isset( $caps[ $tier ] ) ? min( $caps[ $tier ], $max ) : $max;
+
+ 	return $this->int( 0, $limit );
+ }
+}
diff --git a/tools/html-decoder-fuzz/lib/Targets.php b/tools/html-decoder-fuzz/lib/Targets.php
new file mode 100644
index 0000000000000..519f951f37db0
--- /dev/null
+++ b/tools/html-decoder-fuzz/lib/Targets.php
@@ -0,0 +1,256 @@
+
+ */
+ public static function resolve(): array {
+ $targets = self::real();
+
+ switch ( getenv( 'HTML_DECODER_FUZZ_FAULT' ) ) {
+ case 'skip-c1-remap':
+ $targets['decode_text'] = static fn( string $text ): string => self::undo_c1_remap( \WP_HTML_Decoder::decode_text_node( $text ) );
+ $targets['decode_attribute'] = static fn( string $text ): string => self::undo_c1_remap( \WP_HTML_Decoder::decode_attribute( $text ) );
+ break;
+
+ case 'attribute-semicolonless':
+ $targets['decode_attribute'] = static fn( string $text ): string => \WP_HTML_Decoder::decode_text_node( $text );
+ $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ return \WP_HTML_Decoder::read_character_reference( 'attribute' === $context ? 'data' : $context, $text, $at, $match_byte_length );
+ };
+ break;
+
+ case 'match-length-off-by-one':
+ $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
+ if ( null !== $result ) {
+ ++$match_byte_length;
+ }
+ return $result;
+ };
+ break;
+
+ case 'reader-empty-chunk':
+ $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
+ if ( null !== $result && str_starts_with( substr( $text, $at ), '&' ) ) {
+ return '';
+ }
+ return $result;
+ };
+ break;
+
+ case 'reader-short-match-length':
+ $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
+ if ( null !== $result && str_starts_with( substr( $text, $at ), '&' ) ) {
+ $match_byte_length = 1;
+ }
+ return $result;
+ };
+ break;
+
+ case 'reader-substring-composition':
+ $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
+ if ( null !== $result && 0 === $at && ':' === $text ) {
+ return '.';
+ }
+ return $result;
+ };
+ break;
+
+ case 'reader-null-mutates-match-length':
+ $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
+ if ( null === $result && str_starts_with( substr( $text, $at ), '&' ) ) {
+ $match_byte_length = 0;
+ }
+ return $result;
+ };
+ break;
+
+ case 'reader-non-amp-match':
+ $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
+ if ( isset( $text[ $at ] ) && '&' !== $text[ $at ] ) {
+ $match_byte_length = 1;
+ return $text[ $at ];
+ }
+ return $result;
+ };
+ break;
+
+ case 'reader-gapless-drop-span':
+ $targets['reader_span_filter'] = static function ( array $spans ): array {
+ foreach ( $spans as $index => $span ) {
+ if ( ( $span['end'] ?? 0 ) > ( $span['start'] ?? 0 ) ) {
+ unset( $spans[ $index ] );
+ return array_values( $spans );
+ }
+ }
+ return $spans;
+ };
+ break;
+
+ case 'numeric-invalid-not-replacement':
+ $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
+ if ( null !== $result && is_int( $match_byte_length ) && self::is_invalid_numeric_replacement_reference( substr( $text, $at, $match_byte_length ) ) ) {
+ return '?';
+ }
+ return $result;
+ };
+ break;
+
+ case 'numeric-c1-not-remapped':
+ $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
+ if ( null !== $result && is_int( $match_byte_length ) ) {
+ $value = self::numeric_c1_reference_value( substr( $text, $at, $match_byte_length ) );
+ if ( null !== $value ) {
+ $replacement = mb_chr( $value, 'UTF-8' );
+ return false === $replacement ? $result : $replacement;
+ }
+ }
+ return $result;
+ };
+ break;
+
+ case 'raw-c1-not-pass-through':
+ $targets['decode_text'] = static fn( string $text ): string => self::rewrite_raw_c1_bytes( \WP_HTML_Decoder::decode_text_node( $text ) );
+ $targets['decode_attribute'] = static fn( string $text ): string => self::rewrite_raw_c1_bytes( \WP_HTML_Decoder::decode_attribute( $text ) );
+ break;
+
+ case 'text-secondary-oracle':
+ $targets['decode_text'] = static function ( string $text ): string {
+ $decoded = \WP_HTML_Decoder::decode_text_node( $text );
+ return str_contains( $text, '&' ) ? '!' . $decoded : $decoded;
+ };
+ break;
+
+ case 'single-level-overdecode':
+ $targets['decode_text'] = static function ( string $text ): string {
+ return \WP_HTML_Decoder::decode_text_node( \WP_HTML_Decoder::decode_text_node( $text ) );
+ };
+ $targets['decode_attribute'] = static function ( string $text ): string {
+ return \WP_HTML_Decoder::decode_attribute( \WP_HTML_Decoder::decode_attribute( $text ) );
+ };
+ break;
+
+ case 'byte-no-amp-identity':
+ $targets['decode_text'] = static fn( string $text ): string => str_replace( "\x00", '', \WP_HTML_Decoder::decode_text_node( $text ) );
+ $targets['decode_attribute'] = static fn( string $text ): string => str_replace( "\x00", '', \WP_HTML_Decoder::decode_attribute( $text ) );
+ break;
+
+ case 'attribute-no-amp-identity':
+ $targets['decode_attribute'] = static function ( string $text ): string {
+ $decoded = \WP_HTML_Decoder::decode_attribute( $text );
+ return str_contains( $text, '&' ) ? $decoded : '!' . $decoded;
+ };
+ break;
+
+ case 'attribute-prefix-monotonicity':
+ $attribute_starts_with = $targets['attribute_starts_with'];
+ $targets['attribute_starts_with'] = static function ( string $haystack, string $search, string $case_sensitivity ) use ( $attribute_starts_with ): bool {
+ if ( 'jav' === $search ) {
+ return false;
+ }
+ return $attribute_starts_with( $haystack, $search, $case_sensitivity );
+ };
+ break;
+
+ case 'attribute-extension-monotonicity':
+ $attribute_starts_with = $targets['attribute_starts_with'];
+ $targets['attribute_starts_with'] = static function ( string $haystack, string $search, string $case_sensitivity ) use ( $attribute_starts_with ): bool {
+ if ( str_ends_with( $search, "\x7F" ) ) {
+ return true;
+ }
+ return $attribute_starts_with( $haystack, $search, $case_sensitivity );
+ };
+ break;
+
+ case 'attribute-case-monotonicity':
+ $attribute_starts_with = $targets['attribute_starts_with'];
+ $targets['attribute_starts_with'] = static function ( string $haystack, string $search, string $case_sensitivity ) use ( $attribute_starts_with ): bool {
+ if ( 'ascii-case-insensitive' === $case_sensitivity && 'jav' === $search ) {
+ return false;
+ }
+ return $attribute_starts_with( $haystack, $search, $case_sensitivity );
+ };
+ break;
+
+ case 'attribute-multicodepoint-prefix':
+ $attribute_starts_with = $targets['attribute_starts_with'];
+ $targets['attribute_starts_with'] = static function ( string $haystack, string $search, string $case_sensitivity ) use ( $attribute_starts_with ): bool {
+ if ( str_starts_with( $haystack, '<⃒' ) && "<\xE2" === $search ) {
+ return false;
+ }
+ return $attribute_starts_with( $haystack, $search, $case_sensitivity );
+ };
+ break;
+ }
+
+ return $targets;
+ }
+
+ /**
+ * Builds the unmutated target table.
+ *
+ * Every entry is a closure that forwards its arguments straight to the
+ * matching static method on WP_HTML_Decoder.
+ *
+ * @return array Closures keyed by 'decode_text', 'decode_attribute',
+ * 'read_character_reference' and 'attribute_starts_with'.
+ */
+ public static function real(): array {
+ $decode_text = static function ( string $text ): string {
+ return \WP_HTML_Decoder::decode_text_node( $text );
+ };
+
+ $decode_attribute = static function ( string $text ): string {
+ return \WP_HTML_Decoder::decode_attribute( $text );
+ };
+
+ // The match length is an out-parameter and must stay by-reference.
+ $read_character_reference = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ return \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
+ };
+
+ $attribute_starts_with = static function ( string $haystack, string $search, string $case_sensitivity ): bool {
+ return \WP_HTML_Decoder::attribute_starts_with( $haystack, $search, $case_sensitivity );
+ };
+
+ return array(
+ 'decode_text' => $decode_text,
+ 'decode_attribute' => $decode_attribute,
+ 'read_character_reference' => $read_character_reference,
+ 'attribute_starts_with' => $attribute_starts_with,
+ );
+ }
+
+ /**
+ * Swaps every U+20AC in the decoded text for U+0080.
+ *
+ * @param string $decoded Decoder output.
+ * @return string Text with the substitution applied.
+ */
+ private static function undo_c1_remap( string $decoded ): string {
+ return strtr( $decoded, array( "\u{20AC}" => "\u{0080}" ) );
+ }
+
+ /**
+ * Replaces each byte in the range 0x80-0x9F with '?'.
+ *
+ * The pattern has no `u` modifier, so it is applied byte-wise. If the
+ * regex engine reports an error the input is returned unchanged.
+ *
+ * @param string $decoded Decoder output.
+ * @return string Rewritten text, or the input when preg_replace() fails.
+ */
+ private static function rewrite_raw_c1_bytes( string $decoded ): string {
+ $rewritten = preg_replace( '/[\x80-\x9F]/', '?', $decoded );
+ if ( null === $rewritten ) {
+ return $decoded;
+ }
+
+ return $rewritten;
+ }
+
+ /**
+ * Returns the value of a numeric reference only when it lies in 0x80-0x9F.
+ *
+ * @param string $reference Candidate numeric reference.
+ * @return int|null The value when it is within 0x80..0x9F, otherwise null.
+ */
+ private static function numeric_c1_reference_value( string $reference ): ?int {
+ $code_point = self::numeric_reference_value( $reference );
+ if ( null === $code_point ) {
+ return null;
+ }
+
+ if ( $code_point < 0x80 || $code_point > 0x9F ) {
+ return null;
+ }
+
+ return $code_point;
+ }
+
+ /**
+ * Reports whether a numeric reference names zero, a surrogate
+ * (0xD800-0xDFFF), or a value above 0x10FFFF.
+ *
+ * @param string $reference Candidate numeric reference.
+ * @return bool True for those values; false otherwise, including when the
+ * reference cannot be parsed.
+ */
+ private static function is_invalid_numeric_replacement_reference( string $reference ): bool {
+ $code_point = self::numeric_reference_value( $reference );
+
+ if ( null === $code_point ) {
+ return false;
+ }
+
+ if ( 0 === $code_point || $code_point > 0x10FFFF ) {
+ return true;
+ }
+
+ return $code_point >= 0xD800 && $code_point <= 0xDFFF;
+ }
+
+ /**
+ * Parses the numeric value out of a numeric character reference.
+ *
+ * Accepts the full reference as it appears in text (`&#x80;`, `&#128`) as
+ * well as only the part after `&#` (`x80;`, `128`). The trailing semicolon
+ * is optional in both forms.
+ *
+ * @param string $reference Candidate numeric reference.
+ * @return int|null The parsed value; 0 when every digit is zero; null when
+ * the input is not a numeric reference or has more
+ * significant digits than allowed (6 hex, 7 decimal).
+ */
+ private static function numeric_reference_value( string $reference ): ?int {
+ // Callers pass the span reported by read_character_reference(), which is
+ // expected to start at the ampersand, so the `&#` prefix must be tolerated.
+ if ( 1 !== preg_match( '/^(?:&#)?(?:([xX])([0-9A-Fa-f]+)|([0-9]+));?$/', $reference, $match ) ) {
+ return null;
+ }
+
+ $is_hex = '' !== ( $match[1] ?? '' );
+ $digits = $is_hex ? $match[2] : $match[3];
+ $base = $is_hex ? 16 : 10;
+ $max_digits = $is_hex ? 6 : 7;
+ // Leading zeros do not count toward the digit limit.
+ $significant_digits = substr( $digits, strspn( $digits, '0' ) );
+
+ if ( '' === $significant_digits ) {
+ return 0;
+ }
+
+ // Capping the digit count keeps intval() well inside the integer range.
+ if ( strlen( $significant_digits ) > $max_digits ) {
+ return null;
+ }
+
+ return intval( $significant_digits, $base );
+ }
+}
diff --git a/tools/html-decoder-fuzz/lib/autoload.php b/tools/html-decoder-fuzz/lib/autoload.php
new file mode 100644
index 0000000000000..9a967a473ba5f
--- /dev/null
+++ b/tools/html-decoder-fuzz/lib/autoload.php
@@ -0,0 +1,17 @@
+ '',
+ 'input' => '',
+ 'context' => 'both',
+ 'mode' => 'oracle',
+ 'signature' => '',
+ 'output-dir' => '',
+ )
+);
+
+// Reject unsupported --context and --mode values up front.
+Cli::require_one_of( $options, 'context', array( 'text', 'attribute', 'both' ) );
+Cli::require_one_of( $options, 'mode', Cli::valid_modes() );
+
+Bootstrap::load_targets();
+
+// Start from the command-line values; a failure manifest may override
+// context and mode, and may fill in signature and output directory.
+$payload = null;
+$context = $options['context'];
+$mode = $options['mode'];
+$signature = $options['signature'];
+$source_dir = $options['output-dir'];
+
+// The payload comes from --failure (a JSON manifest) or --input (raw bytes).
+if ( '' !== $options['failure'] ) {
+ $manifest = json_decode( (string) file_get_contents( $options['failure'] ), true );
+ if ( ! is_array( $manifest ) || ! isset( $manifest['payload_base64'] ) ) {
+ fwrite( STDERR, "Cannot read failure manifest {$options['failure']}\n" );
+ exit( 2 );
+ }
+ // Strict decoding yields false on malformed input; caught by the is_string() check below.
+ $payload = base64_decode( $manifest['payload_base64'], true );
+ $context = $manifest['context'] ?? $context;
+ // A manifest without a mode is treated as 'oracle', not as the --mode option.
+ $mode = $manifest['mode'] ?? 'oracle';
+ if ( ! in_array( $context, array( 'text', 'attribute', 'both' ), true ) ) {
+ fwrite( STDERR, "Invalid context in failure manifest: {$context}\n" );
+ exit( 2 );
+ }
+ if ( ! in_array( $mode, Cli::valid_modes(), true ) ) {
+ fwrite( STDERR, "Invalid mode in failure manifest: {$mode}\n" );
+ exit( 2 );
+ }
+ // An explicit --signature wins; otherwise use the manifest's first signature.
+ if ( '' === $signature ) {
+ $signature = $manifest['signatures'][0] ?? '';
+ }
+ // Without --output-dir, artifacts are written next to the manifest.
+ if ( '' === $source_dir ) {
+ $source_dir = dirname( $options['failure'] );
+ }
+} elseif ( '' !== $options['input'] ) {
+ $payload = file_get_contents( $options['input'] );
+ if ( false === $payload ) {
+ fwrite( STDERR, "Cannot read input file {$options['input']}\n" );
+ exit( 2 );
+ }
+ // Without --output-dir, artifacts are written next to the input file.
+ if ( '' === $source_dir ) {
+ $source_dir = dirname( $options['input'] );
+ }
+} else {
+ fwrite( STDERR, "Provide --failure or --input.\n" );
+ exit( 2 );
+}
+
+if ( ! is_string( $payload ) ) {
+ fwrite( STDERR, "Payload could not be loaded.\n" );
+ exit( 2 );
+}
+
+// Minimization targets one specific signature, so it is mandatory.
+if ( '' === $signature ) {
+ fwrite( STDERR, "No signature given and none found in the manifest.\n" );
+ exit( 2 );
+}
+
+$oracles = Oracles::build();
+if ( Cli::mode_uses_oracle( $mode ) && ! $oracles->has_required() ) {
+ fwrite( STDERR, "Required oracle unavailable; cannot minimize.\n" );
+ exit( 2 );
+}
+
+$checks = new Checks( $oracles );
+
+// Predicate: does running the checks on $candidate still yield the target signature?
+$reproduces = static function ( string $candidate ) use ( $checks, $context, $mode, $signature ): bool {
+ if ( 'bytes' === $mode ) {
+ $found = $checks->run_without_oracle( $context, $candidate );
+ } else {
+ $found = $checks->run( $context, $candidate );
+ }
+
+ foreach ( $found as $entry ) {
+ if ( $signature === $entry['signature'] ) {
+ return true;
+ }
+ }
+
+ return false;
+};
+
+// Nothing to minimize unless the original payload shows the signature.
+if ( ! $reproduces( $payload ) ) {
+ fwrite( STDERR, "Signature {$signature} does not reproduce on the given payload.\n" );
+ exit( 1 );
+}
+
+$current = $payload;
+$tries = 0;
+
+// Pass 1: chunk deletion. Start with chunks of half the payload length. The
+// chunk size is halved only after a full sweep removes nothing, and the loop
+// ends when a sweep at size 1 removes nothing.
+$chunk = (int) ceil( max( 1, strlen( $current ) ) / 2 );
+while ( $chunk >= 1 ) {
+ $progress = false;
+
+ // $at is advanced manually: after a successful deletion the same offset is
+ // retried against the shortened payload.
+ for ( $at = 0; $at < strlen( $current ); ) {
+ $candidate = substr( $current, 0, $at ) . substr( $current, $at + $chunk );
+ ++$tries;
+
+ if ( strlen( $candidate ) < strlen( $current ) && $reproduces( $candidate ) ) {
+ $current = $candidate;
+ $progress = true;
+ } else {
+ // Step by half a chunk (at least one byte) so deletion windows overlap.
+ $at += max( 1, intdiv( $chunk, 2 ) );
+ }
+ }
+
+ if ( ! $progress && $chunk > 1 ) {
+ $chunk = intdiv( $chunk, 2 );
+ } elseif ( ! $progress ) {
+ break;
+ }
+}
+
+// Pass 2: replace each remaining byte with 'a', keeping the replacement
+// whenever the signature still reproduces.
+for ( $at = 0; $at < strlen( $current ); $at++ ) {
+ if ( 'a' === $current[ $at ] ) {
+ continue;
+ }
+
+ $candidate = $current;
+ $candidate[ $at ] = 'a';
+ ++$tries;
+
+ if ( $reproduces( $candidate ) ) {
+ $current = $candidate;
+ }
+}
+
+// Fall back to the current directory when no output location was resolved.
+$out_dir = '' !== $source_dir ? $source_dir : '.';
+if ( ! is_dir( $out_dir ) && ! mkdir( $out_dir, 0777, true ) ) {
+ fwrite( STDERR, "Cannot create output dir {$out_dir}\n" );
+ exit( 2 );
+}
+
+// Two artifacts are written: the raw minimized bytes and a JSON manifest.
+$payload_path = "{$out_dir}/minimized-payload.txt";
+$manifest_path = "{$out_dir}/minimized.json";
+$manifest = json_encode(
+ array(
+ 'mode' => $mode,
+ 'context' => $context,
+ 'signature' => $signature,
+ 'original_size' => strlen( $payload ),
+ 'minimized_size' => strlen( $current ),
+ 'tries' => $tries,
+ 'payload_base64' => base64_encode( $current ),
+ // The hex form is included only for payloads of at most 256 bytes.
+ 'payload_hex' => strlen( $current ) <= 256 ? bin2hex( $current ) : null,
+ 'environment' => Cli::environment_metadata( $oracles ),
+ 'git' => Cli::git_metadata( Bootstrap::repo_root() ),
+ ),
+ JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES
+);
+if ( false === $manifest || ! Cli::write_file( $payload_path, $current ) || ! Cli::write_file( $manifest_path, $manifest ) ) {
+ fwrite( STDERR, "Cannot write minimized artifacts under {$out_dir}\n" );
+ exit( 2 );
+}
+
+// Report sizes, a hex preview capped at 128 bytes, and the artifact paths.
+echo "Minimized {$signature}: " . strlen( $payload ) . ' -> ' . strlen( $current ) . " bytes in {$tries} tries.\n";
+echo 'Hex: ' . bin2hex( substr( $current, 0, 128 ) ) . ( strlen( $current ) > 128 ? '...' : '' ) . "\n";
+echo "Artifacts: {$payload_path}, {$manifest_path}\n";
+
+exit( 0 );
diff --git a/tools/html-decoder-fuzz/replay.php b/tools/html-decoder-fuzz/replay.php
new file mode 100644
index 0000000000000..20bb7e298334f
--- /dev/null
+++ b/tools/html-decoder-fuzz/replay.php
@@ -0,0 +1,132 @@
+ '',
+ 'input' => '',
+ 'seed' => -1,
+ 'case' => -1,
+ 'context' => 'both',
+ 'mode' => 'oracle',
+ 'max-bytes' => 4096,
+ )
+);
+
+// Validate option values before loading anything.
+Cli::require_int_at_least( $options, 'max-bytes', 1 );
+Cli::require_one_of( $options, 'context', array( 'text', 'attribute', 'both' ) );
+Cli::require_one_of( $options, 'mode', Cli::valid_modes() );
+
+Bootstrap::load_targets();
+
+// Defaults taken from the command line; the payload source may override
+// context and mode. $source is a human-readable label used in the report.
+$payload = null;
+$context = $options['context'];
+$mode = $options['mode'];
+$source = null;
+
+// Resolve the payload from exactly one source, checked in this order:
+// a --failure manifest, an --input file, then --seed/--case regeneration.
+if ( '' !== $options['failure'] ) {
+ $manifest = json_decode( (string) file_get_contents( $options['failure'] ), true );
+ if ( ! is_array( $manifest ) || ! isset( $manifest['payload_base64'] ) ) {
+ fwrite( STDERR, "Cannot read failure manifest {$options['failure']}\n" );
+ exit( 2 );
+ }
+ // Strict decoding yields false on malformed input; caught by the is_string() check below.
+ $payload = base64_decode( $manifest['payload_base64'], true );
+ $context = $manifest['context'] ?? $context;
+ // A manifest without a mode is treated as 'oracle', not as the --mode option.
+ $mode = $manifest['mode'] ?? 'oracle';
+ if ( ! in_array( $context, array( 'text', 'attribute', 'both' ), true ) ) {
+ fwrite( STDERR, "Invalid context in failure manifest: {$context}\n" );
+ exit( 2 );
+ }
+ if ( ! in_array( $mode, Cli::valid_modes(), true ) ) {
+ fwrite( STDERR, "Invalid mode in failure manifest: {$mode}\n" );
+ exit( 2 );
+ }
+ $source = "failure manifest {$options['failure']}";
+} elseif ( '' !== $options['input'] ) {
+ $payload = file_get_contents( $options['input'] );
+ if ( false === $payload ) {
+ fwrite( STDERR, "Cannot read input file {$options['input']}\n" );
+ exit( 2 );
+ }
+ $source = "input file {$options['input']}";
+} elseif ( $options['seed'] >= 0 && $options['case'] >= 0 ) {
+ // The PRNG is keyed on "seed:case", so one case can be regenerated on its own.
+ $generator = new Generator( new Prng( "{$options['seed']}:{$options['case']}" ), $options['max-bytes'], Bootstrap::named_reference_names() );
+ if ( 'bytes' === $mode ) {
+ $generated = $generator->generate_bytes();
+ } elseif ( 'names' === $mode ) {
+ $generated = $generator->generate_name_sweep( $options['case'] );
+ } elseif ( 'legacy-followers' === $mode ) {
+ $generated = $generator->generate_legacy_follower_sweep( $options['case'] );
+ } elseif ( 'prefix-families' === $mode ) {
+ $generated = $generator->generate_prefix_family_sweep( $options['case'] );
+ } elseif ( 'numeric-boundaries' === $mode ) {
+ $generated = $generator->generate_numeric_boundary_sweep( $options['case'] );
+ } elseif ( 'corpus' === $mode ) {
+ $generated = $generator->generate_corpus_mutation( $options['case'] );
+ } elseif ( 'token-map' === $mode ) {
+ $generated = $generator->generate_token_map_sweep( $options['case'] );
+ } else {
+ // Every remaining mode, 'coverage' included, uses the default generator.
+ $generated = $generator->generate();
+ }
+ $payload = $generated['payload'];
+ // The generated case dictates its own context, overriding --context.
+ $context = $generated['context'];
+ $source = "seed {$options['seed']} case {$options['case']} (mode {$mode}, strategy {$generated['strategy']}, context {$context})";
+} else {
+ fwrite( STDERR, "Provide --failure, --input, or --seed with --case.\n" );
+ exit( 2 );
+}
+
+if ( ! is_string( $payload ) ) {
+ fwrite( STDERR, "Payload could not be loaded.\n" );
+ exit( 2 );
+}
+
+$oracles = Oracles::build();
+// Surface anything the oracle layer recorded while it was being built.
+foreach ( $oracles->drain_events() as $event ) {
+ fwrite( STDERR, "oracle event: {$event['oracle']}: {$event['detail']}\n" );
+}
+if ( Cli::mode_uses_oracle( $mode ) && ! $oracles->has_required() ) {
+ fwrite( STDERR, "Required oracle unavailable; cannot replay.\n" );
+ exit( 2 );
+}
+
+$checks = new Checks( $oracles );
+// 'bytes' mode runs the oracle-free checks; every other mode runs the full set.
+$failures = 'bytes' === $mode ? $checks->run_without_oracle( $context, $payload ) : $checks->run( $context, $payload );
+
+echo "Replaying {$source}\n";
+echo "Mode: {$mode}\n";
+echo "Context: {$context}\n";
+echo 'Payload: ' . strlen( $payload ) . ' bytes, sha256 ' . hash( 'sha256', $payload ) . "\n";
+// The preview shows at most the first 96 bytes.
+echo 'Hex preview: ' . bin2hex( substr( $payload, 0, 96 ) ) . ( strlen( $payload ) > 96 ? '...' : '' ) . "\n";
+echo 'Oracles: ' . implode( ', ', $oracles->names() ) . "\n\n";
+
+// Exit status: 0 when every check passed, 1 when any failure was reported.
+if ( array() === $failures ) {
+ echo "All checks passed.\n";
+ exit( 0 );
+}
+
+echo count( $failures ) . " failure(s):\n";
+foreach ( $failures as $failure ) {
+ echo "- {$failure['signature']}\n";
+ echo ' ' . json_encode( $failure['detail'], JSON_UNESCAPED_SLASHES ) . "\n";
+}
+
+exit( 1 );
diff --git a/tools/html-decoder-fuzz/runner.php b/tools/html-decoder-fuzz/runner.php
new file mode 100644
index 0000000000000..d5bfd3602766a
--- /dev/null
+++ b/tools/html-decoder-fuzz/runner.php
@@ -0,0 +1,945 @@
+ 4,
+ 'duration-seconds' => 60,
+ 'max-cases' => 0,
+ 'cases-per-batch' => 2000,
+ 'seed-base' => 0,
+ 'max-bytes' => 4096,
+ 'mode' => 'oracle',
+ 'output-dir' => '',
+ 'stall-timeout' => 120,
+ 'artifact-retention' => 'bounded',
+ 'max-artifacts-per-signature' => 5,
+ 'summary-mode' => 'failures',
+ 'max-stderr-bytes' => 65536,
+ )
+);
+
+// Validate every numeric and enumerated option before doing any work.
+Cli::require_int_at_least( $options, 'lanes', 1 );
+Cli::require_int_at_least( $options, 'duration-seconds', 0 );
+Cli::require_int_at_least( $options, 'max-cases', 0 );
+Cli::require_int_at_least( $options, 'cases-per-batch', 1 );
+Cli::require_int_at_least( $options, 'seed-base', 0 );
+Cli::require_int_at_least( $options, 'max-bytes', 1 );
+Cli::require_int_at_least( $options, 'stall-timeout', 1 );
+Cli::require_int_at_least( $options, 'max-artifacts-per-signature', 0 );
+Cli::require_int_at_least( $options, 'max-stderr-bytes', 0 );
+Cli::require_one_of( $options, 'mode', Cli::valid_modes() );
+Cli::require_one_of( $options, 'artifact-retention', array( 'bounded', 'all', 'none' ) );
+Cli::require_one_of( $options, 'summary-mode', array( 'all', 'failures', 'none' ) );
+
+$repo_root = Bootstrap::repo_root();
+$output_dir = $options['output-dir'];
+// Without --output-dir, build a per-run directory name from the UTC
+// timestamp, the microsecond fraction, and the process id.
+if ( '' === $output_dir ) {
+ $now = microtime( true );
+ $output_dir = sprintf(
+ '%s/artifacts/html-decoder-fuzz/run-%s-%06d-p%d',
+ $repo_root,
+ gmdate( 'Ymd-His', (int) $now ),
+ (int) ( ( $now - floor( $now ) ) * 1000000 ),
+ getmypid()
+ );
+}
+if ( ! is_dir( $output_dir ) && ! mkdir( $output_dir, 0777, true ) ) {
+ fwrite( STDERR, "Cannot create output dir {$output_dir}\n" );
+ exit( 2 );
+}
+if ( ! is_writable( $output_dir ) ) {
+ fwrite( STDERR, "Output dir is not writable: {$output_dir}\n" );
+ exit( 2 );
+}
+if ( ! is_readable( $output_dir ) ) {
+ fwrite( STDERR, "Output dir is not readable: {$output_dir}\n" );
+ exit( 2 );
+}
+
+// A seed base of 0 means "pick one": wall-clock milliseconds modulo 1e9.
+$seed_base = $options['seed-base'];
+if ( 0 === $seed_base ) {
+ $seed_base = (int) ( microtime( true ) * 1000 ) % 1000000000;
+}
+
+// Per-lane stderr accounting, keyed by lane id.
+$stderr_bytes_by_lane = array();
+$stderr_truncated_lanes = array();
+$startup_truncated_stderr_logs = array();
+// Lane ids to inspect: every configured lane plus any lane whose stderr log
+// already exists in the output directory.
+$stderr_lane_ids = array();
+for ( $lane_id = 0; $lane_id < max( 1, $options['lanes'] ); $lane_id++ ) {
+ $stderr_lane_ids[ $lane_id ] = true;
+}
+$output_items = new \FilesystemIterator( $output_dir, \FilesystemIterator::SKIP_DOTS );
+foreach ( $output_items as $output_item ) {
+ if ( 1 === preg_match( '/^lane-(\d+)-stderr\.log$/', $output_item->getBasename(), $match ) ) {
+ $stderr_lane_ids[ (int) $match[1] ] = true;
+ }
+}
+ksort( $stderr_lane_ids, SORT_NUMERIC );
+
+foreach ( array_keys( $stderr_lane_ids ) as $lane_id ) {
+ $stderr_path = "{$output_dir}/lane-{$lane_id}-stderr.log";
+ // Abort when Cli::is_linked_file() flags the log path.
+ if ( Cli::is_linked_file( $stderr_path ) ) {
+ fwrite( STDERR, "Lane stderr log is a linked file: {$stderr_path}\n" );
+ exit( 2 );
+ }
+ if ( ! is_file( $stderr_path ) ) {
+ continue;
+ }
+
+ $stderr_size = filesize( $stderr_path );
+ if ( ! is_int( $stderr_size ) ) {
+ fwrite( STDERR, "Cannot stat lane stderr log {$stderr_path}\n" );
+ exit( 2 );
+ }
+ // Trim an oversized existing log to the byte budget, keeping its leading
+ // bytes, and record the truncation for the state file.
+ if ( $stderr_size > $options['max-stderr-bytes'] ) {
+ $truncated = $options['max-stderr-bytes'] > 0
+ ? file_get_contents( $stderr_path, false, null, 0, $options['max-stderr-bytes'] )
+ : '';
+ if ( ! is_string( $truncated ) || ! Cli::write_file( $stderr_path, $truncated ) ) {
+ fwrite( STDERR, "Cannot truncate lane stderr log {$stderr_path}\n" );
+ exit( 2 );
+ }
+ $startup_truncated_stderr_logs[] = array(
+ 'lane' => $lane_id,
+ 'bytes' => $options['max-stderr-bytes'],
+ 'was_bytes' => $stderr_size,
+ );
+ $stderr_size = $options['max-stderr-bytes'];
+ }
+
+ // Bytes already on disk count against the lane's budget for this run.
+ $stderr_bytes_by_lane[ $lane_id ] = $stderr_size;
+}
+
+// The summary stream is opened in append mode unless summaries are disabled,
+// in which case $summary stays null.
+$summary_path = "{$output_dir}/summary.ndjson";
+if ( 'none' !== $options['summary-mode'] && Cli::is_linked_file( $summary_path ) ) {
+ fwrite( STDERR, "Summary file is a linked file: {$summary_path}\n" );
+ exit( 2 );
+}
+$summary = 'none' === $options['summary-mode'] ? null : fopen( $summary_path, 'ab' );
+if ( false === $summary ) {
+ fwrite( STDERR, "Cannot open summary file {$summary_path}\n" );
+ exit( 2 );
+}
+$started_at = microtime( true );
+// A duration of 0 disables the time limit.
+$deadline = $options['duration-seconds'] > 0 ? $started_at + $options['duration-seconds'] : null;
+
+// Bookkeeping for the startup scan of failure artifacts already on disk.
+$retained_artifacts_by_signature = array();
+$retained_artifact_dirs = array();
+$startup_pruned_artifacts = 0;
+$startup_pruned_partial_artifacts = 0;
+$startup_verification_unavailable = false;
+$existing_artifacts_by_signature = array();
+$unverified_artifact_signatures = array();
+$partial_artifact_dirs = array();
+// Per-mode caches used by the startup verifier closures.
+$startup_checks = array();
+$startup_checks_available = array();
+// Structural validation of a decoded failure manifest. It must carry all
+// required keys, a known context (and mode, if present), a payload whose
+// decoded length equals input_size, and a failures list whose signature set
+// matches the signatures list.
+$is_replayable_failure_manifest = static function ( $manifest ): bool {
+ if ( ! is_array( $manifest ) ) {
+ return false;
+ }
+ foreach ( array( 'signatures', 'payload_base64', 'context', 'failures', 'input_size' ) as $required_key ) {
+ if ( ! isset( $manifest[ $required_key ] ) ) {
+ return false;
+ }
+ }
+
+ $listed_signatures = $manifest['signatures'];
+ if ( ! is_array( $listed_signatures ) || 0 === count( $listed_signatures ) ) {
+ return false;
+ }
+ if ( ! is_string( $manifest['payload_base64'] ) ) {
+ return false;
+ }
+
+ if ( ! in_array( $manifest['context'], array( 'text', 'attribute', 'both' ), true ) ) {
+ return false;
+ }
+ if ( isset( $manifest['mode'] ) && ! in_array( $manifest['mode'], Cli::valid_modes(), true ) ) {
+ return false;
+ }
+
+ $decoded = base64_decode( $manifest['payload_base64'], true );
+ if ( ! is_string( $decoded ) || '' === $decoded ) {
+ return false;
+ }
+ if ( ! is_int( $manifest['input_size'] ) || strlen( $decoded ) !== $manifest['input_size'] ) {
+ return false;
+ }
+
+ $recorded_failures = $manifest['failures'];
+ if ( ! is_array( $recorded_failures ) || array() === $recorded_failures ) {
+ return false;
+ }
+
+ $seen_signatures = array();
+ foreach ( $recorded_failures as $entry ) {
+ if ( ! is_array( $entry ) ) {
+ return false;
+ }
+ if ( ! isset( $entry['signature'] ) || ! is_string( $entry['signature'] ) ) {
+ return false;
+ }
+ $seen_signatures[] = $entry['signature'];
+ }
+
+ // Compare as de-duplicated, sorted sets.
+ $expected = array_values( array_unique( array_map( 'strval', $listed_signatures ) ) );
+ $actual = array_values( array_unique( $seen_signatures ) );
+ sort( $expected, SORT_STRING );
+ sort( $actual, SORT_STRING );
+
+ return $expected === $actual;
+};
+// Lazily builds (once per mode) the Checks instance used to re-verify
+// artifacts found at startup, and reports whether verification is possible.
+$startup_verifier_available = static function ( string $mode ) use ( &$startup_checks, &$startup_checks_available ): bool {
+ if ( isset( $startup_checks_available[ $mode ] ) ) {
+ return $startup_checks_available[ $mode ] && null !== $startup_checks[ $mode ];
+ }
+
+ Bootstrap::load_targets();
+ $oracles = Oracles::build();
+
+ // Verification is possible when the mode needs no oracle or the required
+ // oracle is present.
+ $usable = ! Cli::mode_uses_oracle( $mode ) || $oracles->has_required();
+ $startup_checks_available[ $mode ] = $usable;
+ $startup_checks[ $mode ] = $usable ? new Checks( $oracles ) : null;
+
+ return $usable && null !== $startup_checks[ $mode ];
+};
+// Re-runs the checks on a manifest's payload. Returns true when the produced
+// signature set equals the manifest's, false when it differs, and null when
+// the verdict cannot be determined (no verifier, or an undecodable payload).
+$failure_manifest_reproduces = static function ( array $manifest ) use ( &$startup_checks, $startup_verifier_available ): ?bool {
+ $mode = $manifest['mode'] ?? 'oracle';
+ if ( ! $startup_verifier_available( $mode ) ) {
+ return null;
+ }
+
+ $payload = base64_decode( $manifest['payload_base64'], true );
+ if ( ! is_string( $payload ) ) {
+ return null;
+ }
+
+ if ( 'bytes' === $mode ) {
+ $found = $startup_checks[ $mode ]->run_without_oracle( $manifest['context'], $payload );
+ } else {
+ $found = $startup_checks[ $mode ]->run( $manifest['context'], $payload );
+ }
+
+ $signature_of = static function ( array $failure ): string {
+ return $failure['signature'];
+ };
+
+ $actual = array_values( array_unique( array_map( $signature_of, $found ) ) );
+ $expected = array_values( array_unique( array_map( 'strval', $manifest['signatures'] ) ) );
+ sort( $actual, SORT_STRING );
+ sort( $expected, SORT_STRING );
+
+ return $expected === $actual;
+};
+// Collect existing `failure-*` entries from the output directory. Symlinks
+// are never followed; they go straight to the partial list.
+$startup_artifact_dirs = array();
+$output_items = new \FilesystemIterator( $output_dir, \FilesystemIterator::SKIP_DOTS );
+foreach ( $output_items as $output_item ) {
+ if ( 0 !== strncmp( $output_item->getBasename(), 'failure-', 8 ) ) {
+ continue;
+ }
+
+ if ( $output_item->isLink() ) {
+ $partial_artifact_dirs[] = $output_item->getPathname();
+ continue;
+ }
+
+ if ( $output_item->isDir() ) {
+ $startup_artifact_dirs[] = $output_item->getPathname();
+ }
+}
+// Sorted so the scan order does not depend on directory iteration order.
+sort( $startup_artifact_dirs, SORT_STRING );
+foreach ( $startup_artifact_dirs as $artifact_dir ) {
+ $failure_file = "{$artifact_dir}/failure.json";
+ if ( ! is_file( $failure_file ) ) {
+ $partial_artifact_dirs[] = $artifact_dir;
+ continue;
+ }
+
+ $manifest = json_decode( (string) file_get_contents( $failure_file ), true );
+ if ( $is_replayable_failure_manifest( $manifest ) ) {
+ $reproduces = $failure_manifest_reproduces( $manifest );
+ $signature_key = Cli::failure_signature_key( $manifest['signatures'], $manifest['mode'] ?? 'oracle' );
+ // null means the artifact could not be re-verified: keep it, and mark its
+ // signature so bounded retention will not prune it.
+ if ( null === $reproduces ) {
+ $startup_verification_unavailable = true;
+ $unverified_artifact_signatures[ $signature_key ] = true;
+ }
+ // Verified and unverifiable artifacts are both kept; only an explicit
+ // false falls through to the partial list.
+ if ( false !== $reproduces ) {
+ $existing_artifacts_by_signature[ $signature_key ][] = $artifact_dir;
+ continue;
+ }
+ }
+
+ // Invalid manifest, or a payload that no longer reproduces.
+ $partial_artifact_dirs[] = $artifact_dir;
+}
+
+// Partial artifacts are removed unless retention is 'all'.
+if ( 'all' !== $options['artifact-retention'] ) {
+ foreach ( $partial_artifact_dirs as $artifact_dir ) {
+ if ( ! Cli::remove_tree( $artifact_dir, $output_dir ) ) {
+ fwrite( STDERR, "Cannot prune partial failure artifact {$artifact_dir}\n" );
+ exit( 2 );
+ }
+ ++$startup_pruned_artifacts;
+ ++$startup_pruned_partial_artifacts;
+ }
+}
+
+// Apply the retention policy to the surviving artifacts of each signature.
+foreach ( $existing_artifacts_by_signature as $signature_key => $artifact_dirs ) {
+ sort( $artifact_dirs, SORT_STRING );
+ // Default ('all'): keep everything. 'none': keep nothing. 'bounded': keep
+ // the first max-artifacts-per-signature entries, unless the signature
+ // could not be verified, in which case all of its artifacts are kept.
+ $keep = count( $artifact_dirs );
+ if ( 'none' === $options['artifact-retention'] ) {
+ $keep = 0;
+ } elseif ( 'bounded' === $options['artifact-retention'] && isset( $unverified_artifact_signatures[ $signature_key ] ) ) {
+ $keep = count( $artifact_dirs );
+ } elseif ( 'bounded' === $options['artifact-retention'] ) {
+ $keep = min( $keep, $options['max-artifacts-per-signature'] );
+ }
+
+ foreach ( $artifact_dirs as $index => $artifact_dir ) {
+ if ( $index < $keep ) {
+ $retained_artifacts_by_signature[ $signature_key ] = ( $retained_artifacts_by_signature[ $signature_key ] ?? 0 ) + 1;
+ // Key by the resolved path when available, else by the path as scanned.
+ $artifact_key = realpath( $artifact_dir );
+ $retained_artifact_dirs[ false === $artifact_key ? $artifact_dir : $artifact_key ] = $signature_key;
+ continue;
+ }
+
+ if ( ! Cli::remove_tree( $artifact_dir, $output_dir ) ) {
+ fwrite( STDERR, "Cannot prune existing failure artifact {$artifact_dir}\n" );
+ exit( 2 );
+ }
+ ++$startup_pruned_artifacts;
+ }
+}
+
+// Run-wide state. The $write_state closure serialises it to state.json.
+$state = array(
+ 'started_at' => gmdate( 'c' ),
+ 'seed_base' => $seed_base,
+ 'options' => $options,
+ 'git' => Cli::git_metadata( $repo_root ),
+ 'cases' => 0,
+ 'failures' => 0,
+ 'bytes' => 0,
+ 'by_strategy' => array(),
+ 'by_context' => array(),
+ 'failure_seeds' => array(),
+ 'stalled_seeds' => array(),
+ 'worker_errors' => array(),
+ 'worker_stderr_truncated' => array(),
+ 'worker_stderr_startup_truncated' => $startup_truncated_stderr_logs,
+ 'harness_errors' => 0,
+ 'oracle_events' => array(),
+ 'batches' => 0,
+ 'coverage' => array(
+ 'edges' => 0,
+ 'payloads' => 0,
+ 'pruned_duplicate_payloads' => 0,
+ 'by_file' => array(),
+ 'edge_keys' => array(),
+ 'corpus' => array(),
+ ),
+ 'artifact_retention' => array(
+ 'mode' => $options['artifact-retention'],
+ 'max_per_signature' => $options['max-artifacts-per-signature'],
+ 'retained_by_signature' => $retained_artifacts_by_signature,
+ // 'pruned' starts at the startup count; 'startup_pruned' records that same count.
+ 'pruned' => $startup_pruned_artifacts,
+ 'startup_pruned' => $startup_pruned_artifacts,
+ 'startup_pruned_partial' => $startup_pruned_partial_artifacts,
+ 'startup_verification_unavailable' => $startup_verification_unavailable,
+ ),
+ 'stop_reason' => null,
+);
+
+// Cursors consumed by $spawn_lane when handing out work.
+$next_seed = $seed_base;
+$next_start_case = 0;
+$lanes = array();
+
+// Starts one worker process for the given lane and returns its lane record:
+// the process handle, non-blocking stdout/stderr pipes, and bookkeeping.
+// Exits the runner with status 2 when the worker cannot be started.
+$spawn_lane = static function ( int $lane_id ) use ( &$next_seed, &$next_start_case, &$stderr_bytes_by_lane, &$stderr_truncated_lanes, $seed_base, $options, $output_dir ): array {
+ // Windowed modes keep the seed fixed and advance the starting case by one
+ // batch per spawn; all other modes take the next seed and start at case 0.
+ if ( Cli::mode_uses_start_case_windows( $options['mode'] ) ) {
+ $seed = $seed_base;
+ $start_case = $next_start_case;
+ $next_start_case += $options['cases-per-batch'];
+ } else {
+ $seed = $next_seed++;
+ $start_case = 0;
+ }
+
+ // Passed to proc_open() as an argument array, so no shell quoting is needed.
+ $command = array(
+ PHP_BINARY,
+ __DIR__ . '/worker.php',
+ '--seed',
+ (string) $seed,
+ '--start-case',
+ (string) $start_case,
+ '--cases',
+ (string) $options['cases-per-batch'],
+ '--max-bytes',
+ (string) $options['max-bytes'],
+ '--mode',
+ $options['mode'],
+ '--output-dir',
+ $output_dir,
+ '--progress-every',
+ '500',
+ );
+
+ $stderr_path = "{$output_dir}/lane-{$lane_id}-stderr.log";
+ if ( Cli::is_linked_file( $stderr_path ) ) {
+ fwrite( STDERR, "Lane stderr log is a linked file: {$stderr_path}\n" );
+ exit( 2 );
+ }
+
+ // stdin reads from /dev/null; stdout and stderr are pipes read by the runner.
+ $process = proc_open(
+ $command,
+ array(
+ 0 => array( 'file', '/dev/null', 'r' ),
+ 1 => array( 'pipe', 'w' ),
+ 2 => array( 'pipe', 'w' ),
+ ),
+ $pipes
+ );
+ if ( ! is_resource( $process ) || ! isset( $pipes[1], $pipes[2] ) || ! is_resource( $pipes[1] ) || ! is_resource( $pipes[2] ) ) {
+ fwrite( STDERR, "Cannot spawn worker lane {$lane_id}\n" );
+ exit( 2 );
+ }
+
+ // Reads on these pipes must not block the runner.
+ stream_set_blocking( $pipes[1], false );
+ stream_set_blocking( $pipes[2], false );
+
+ return array(
+ 'id' => $lane_id,
+ 'seed' => $seed,
+ 'start_case' => $start_case,
+ 'process' => $process,
+ 'stdout' => $pipes[1],
+ 'stderr' => $pipes[2],
+ 'stderr_path' => $stderr_path,
+ // Carry over the lane's existing stderr byte count and truncation flag.
+ 'stderr_bytes' => $stderr_bytes_by_lane[ $lane_id ] ?? 0,
+ 'stderr_truncated' => isset( $stderr_truncated_lanes[ $lane_id ] ),
+ 'buffer' => '',
+ 'last_output' => microtime( true ),
+ 'reported_failures' => 0,
+ );
+};
+
+// Stamps the elapsed time onto $state and writes it to state.json.
+// Returns false when the state cannot be encoded as JSON.
+$write_state = static function () use ( &$state, $output_dir, $started_at ): bool {
+ $state['elapsed_sec'] = round( microtime( true ) - $started_at, 1 );
+
+ $encoded = json_encode( $state, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES );
+ if ( is_string( $encoded ) ) {
+ return Cli::write_file( "{$output_dir}/state.json", $encoded );
+ }
+
+ return false;
+};
+
+$stop_requested = false;
+$summary_write_failed = false;
+
+// Appends one record to the summary stream as a JSON line. The first failure
+// counts a harness error, requests a stop, and logs to STDERR; later failures
+// only return false.
+$write_summary_record = static function ( array $record ) use ( &$state, &$stop_requested, &$summary_write_failed, $summary, $summary_path ): bool {
+ // A null stream means summaries are disabled.
+ if ( null === $summary ) {
+ return true;
+ }
+
+ $line = json_encode( $record, JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE );
+ $written = false !== $line && Cli::write_stream( $summary, $line . "\n" );
+ if ( $written ) {
+ return true;
+ }
+
+ if ( ! $summary_write_failed ) {
+ $summary_write_failed = true;
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "Cannot write summary file {$summary_path}\n" );
+ }
+
+ return false;
+};
+
+// Filters worker records by --summary-mode before they reach the summary
+// file. Returns true when the record was skipped or written, and false only
+// when a write was attempted and failed.
+$summarize_record = static function ( array $record ) use ( $options, $write_summary_record ): bool {
+ $summary_mode = $options['summary-mode'];
+ if ( 'none' === $summary_mode ) {
+ return true;
+ }
+ if ( 'all' === $summary_mode ) {
+ return $write_summary_record( $record );
+ }
+
+ // Remaining mode: keep only the record types that matter for triage.
+ $type = $record['type'] ?? '';
+ if ( 'failure' === $type ) {
+ // Only failures whose artifact was newly retained are recorded.
+ $wanted = ! empty( $record['artifact_retained'] ) && empty( $record['artifact_reused'] );
+ } elseif ( 'coverage' === $type ) {
+ $wanted = ! empty( $record['coverage_retained'] );
+ } else {
+ $wanted = in_array( $type, array( 'fatal', 'oracle-event', 'invalid-worker-output', 'malformed-worker-record', 'unknown-worker-record' ), true );
+ }
+
+ if ( ! $wanted ) {
+ return true;
+ }
+
+ return $write_summary_record( $record );
+};
+
+// Reads whatever is pending on a lane's stderr pipe and appends it to the
+// lane log, up to the max-stderr-bytes budget. Output beyond the budget is
+// discarded and the lane is flagged as truncated.
+$drain_lane_stderr = static function ( array &$lane ) use ( &$state, &$stop_requested, &$stderr_bytes_by_lane, &$stderr_truncated_lanes, $options ): void {
+ $incoming = stream_get_contents( $lane['stderr'] );
+ if ( false === $incoming || '' === $incoming ) {
+ return;
+ }
+
+ $budget = $options['max-stderr-bytes'] - $lane['stderr_bytes'];
+ $kept = $budget > 0 ? substr( $incoming, 0, $budget ) : '';
+
+ if ( $budget <= 0 || strlen( $kept ) < strlen( $incoming ) ) {
+ $lane['stderr_truncated'] = true;
+ $stderr_truncated_lanes[ $lane['id'] ] = true;
+ }
+ if ( $budget <= 0 ) {
+ return;
+ }
+
+ if ( ! Cli::append_file( $lane['stderr_path'], $kept ) ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "Cannot write lane stderr log {$lane['stderr_path']}\n" );
+ return;
+ }
+
+ $lane['stderr_bytes'] += strlen( $kept );
+ $stderr_bytes_by_lane[ $lane['id'] ] = $lane['stderr_bytes'];
+};
+
+// Decides whether a failure record's artifact directory is kept. It annotates
+// the record with signature_key, artifact_retained, artifact_pruned and,
+// where relevant, artifact_reused or artifact_replaced_signature_key.
+// Returns the directory path the caller should delete, or null when nothing
+// needs deleting. This closure never deletes anything itself.
+$apply_artifact_retention = static function ( array &$record ) use ( &$state, &$retained_artifact_dirs, $options ): ?string {
+ $signature_key = Cli::failure_signature_key( $record['signatures'], $record['mode'] ?? 'oracle' );
+ $record['signature_key'] = $signature_key;
+
+ // No artifact directory on disk: nothing to retain or prune.
+ $artifact_dir = $record['artifact_dir'] ?? null;
+ if ( ! is_string( $artifact_dir ) || '' === $artifact_dir || ! is_dir( $artifact_dir ) ) {
+ $record['artifact_retained'] = false;
+ $record['artifact_pruned'] = false;
+ return null;
+ }
+
+ // Key by the resolved path when available, else by the path as reported.
+ $artifact_key = realpath( $artifact_dir );
+ $artifact_key = false === $artifact_key ? $artifact_dir : $artifact_key;
+ if ( isset( $retained_artifact_dirs[ $artifact_key ] ) ) {
+ // Same directory, same signature: already counted, so report it as reused.
+ if ( $signature_key === $retained_artifact_dirs[ $artifact_key ] ) {
+ $record['artifact_retained'] = true;
+ $record['artifact_pruned'] = false;
+ $record['artifact_reused'] = true;
+ return null;
+ }
+
+ // Same directory, different signature: release the old signature's slot
+ // before the new signature is evaluated against the policy below.
+ $previous_signature_key = $retained_artifact_dirs[ $artifact_key ];
+ $state['artifact_retention']['retained_by_signature'][ $previous_signature_key ] =
+ max( 0, ( $state['artifact_retention']['retained_by_signature'][ $previous_signature_key ] ?? 1 ) - 1 );
+ if ( 0 === $state['artifact_retention']['retained_by_signature'][ $previous_signature_key ] ) {
+ unset( $state['artifact_retention']['retained_by_signature'][ $previous_signature_key ] );
+ }
+ unset( $retained_artifact_dirs[ $artifact_key ] );
+ $record['artifact_replaced_signature_key'] = $previous_signature_key;
+ }
+
+ // 'all' always retains, 'none' never does, and 'bounded' retains while the
+ // signature is still under its per-signature cap.
+ $retain = 'all' === $options['artifact-retention'];
+ if ( 'bounded' === $options['artifact-retention'] ) {
+ $retained = $state['artifact_retention']['retained_by_signature'][ $signature_key ] ?? 0;
+ $retain = $retained < $options['max-artifacts-per-signature'];
+ }
+
+ if ( $retain ) {
+ $record['artifact_retained'] = true;
+ $record['artifact_pruned'] = false;
+ $state['artifact_retention']['retained_by_signature'][ $signature_key ] =
+ ( $state['artifact_retention']['retained_by_signature'][ $signature_key ] ?? 0 ) + 1;
+ $retained_artifact_dirs[ $artifact_key ] = $signature_key;
+ return null;
+ }
+
+ // Not retained: clear the path from the record and return it for removal.
+ $record['artifact_dir'] = null;
+ $record['artifact_retained'] = false;
+ $record['artifact_pruned'] = true;
+ return $artifact_dir;
+};
+
+// Processes one line of worker stdout for the given lane.
+//
+// The line is decoded as JSON and dispatched on its `type`. Aggregated counters in $state are
+// updated, the record is forwarded to $summarize_record, and $stop_requested is set (with
+// stop_reason 'harness-error') whenever the record is malformed or of an unknown type.
+//
+// Returns a tag describing how the line was handled: 'invalid', 'failure', 'coverage',
+// 'oracle-event', 'fatal', 'done', or the record's own type for 'progress' and 'start'.
+$handle_line = static function ( string $line, int $lane_id ) use ( &$state, &$stop_requested, $summarize_record, $apply_artifact_retention, $output_dir ): ?string {
+ $record = json_decode( $line, true );
+ if ( ! is_array( $record ) ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "invalid worker output on lane {$lane_id}\n" );
+ // The undecodable line is preserved in the summary as base64.
+ $summarize_record(
+ array(
+ 'type' => 'invalid-worker-output',
+ 'lane' => $lane_id,
+ 'raw_base64' => base64_encode( $line ),
+ )
+ );
+ return 'invalid';
+ }
+
+ // The lane id is always the one supplied by the caller, overwriting any value in the record.
+ $record['lane'] = $lane_id;
+
+ switch ( $record['type'] ?? '' ) {
+ case 'failure':
+ if ( ! isset( $record['seed'], $record['case'], $record['context'], $record['signatures'] ) || ! is_array( $record['signatures'] ) ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "malformed failure record on lane {$lane_id}\n" );
+ $record['type'] = 'malformed-worker-record';
+ $summarize_record( $record );
+ return 'invalid';
+ }
+ // A failure record without an explicit mode is treated as 'oracle'.
+ $record['mode'] = $record['mode'] ?? 'oracle';
+ if ( ! in_array( $record['mode'], Cli::valid_modes(), true ) ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "malformed failure mode on lane {$lane_id}\n" );
+ $record['type'] = 'malformed-worker-record';
+ $summarize_record( $record );
+ return 'invalid';
+ }
+ // Retention may hand back a directory to delete; deletion waits until the record is summarized.
+ $prune_artifact_dir = $apply_artifact_retention( $record );
+ ++$state['failures'];
+ fwrite( STDERR, "FAILURE lane {$lane_id} seed {$record['seed']} case {$record['case']}: " . implode( ', ', $record['signatures'] ) . "\n" );
+ $summary_written = $summarize_record( $record );
+ if ( null !== $prune_artifact_dir ) {
+ if ( $summary_written && Cli::remove_tree( $prune_artifact_dir, $output_dir ) ) {
+ ++$state['artifact_retention']['pruned'];
+ } else {
+ // Only a failed removal after a successful summary write counts as a harness error here.
+ if ( $summary_written ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "Cannot prune failure artifact {$prune_artifact_dir}\n" );
+ }
+ // The directory was not removed, so flag it as retained again and count it for its signature.
+ $record['artifact_dir'] = $prune_artifact_dir;
+ $record['artifact_retained'] = true;
+ $record['artifact_pruned'] = false;
+ $state['artifact_retention']['retained_by_signature'][ $record['signature_key'] ] =
+ ( $state['artifact_retention']['retained_by_signature'][ $record['signature_key'] ] ?? 0 ) + 1;
+ }
+ }
+ // Only failures with a newly retained (not reused) artifact are listed in failure_seeds.
+ if ( ! empty( $record['artifact_retained'] ) && empty( $record['artifact_reused'] ) ) {
+ $state['failure_seeds'][] = array(
+ 'seed' => $record['seed'],
+ 'case' => $record['case'],
+ 'mode' => $record['mode'],
+ 'context' => $record['context'],
+ 'signatures' => $record['signatures'],
+ 'signature_key' => $record['signature_key'],
+ 'artifact' => $record['artifact_dir'] ?? null,
+ 'artifact_retained' => $record['artifact_retained'],
+ 'artifact_pruned' => $record['artifact_pruned'],
+ );
+ }
+ return 'failure';
+
+ case 'coverage':
+ if ( ! isset( $record['seed'], $record['case'], $record['context'], $record['strategy'], $record['new_edges'] ) || ! is_array( $record['new_edges'] ) ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "malformed coverage record on lane {$lane_id}\n" );
+ $record['type'] = 'malformed-worker-record';
+ $summarize_record( $record );
+ return 'invalid';
+ }
+ $record['mode'] = $record['mode'] ?? 'coverage';
+ if ( 'coverage' !== $record['mode'] ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "malformed coverage mode on lane {$lane_id}\n" );
+ $record['type'] = 'malformed-worker-record';
+ $summarize_record( $record );
+ return 'invalid';
+ }
+
+ // Keep only the edges whose key is not yet present in the run-wide edge set.
+ $global_new_edges = array();
+ foreach ( $record['new_edges'] as $edge ) {
+ if ( ! is_array( $edge ) || ! isset( $edge['key'], $edge['file'], $edge['line'] ) || ! is_string( $edge['key'] ) || ! is_string( $edge['file'] ) || ! is_int( $edge['line'] ) ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "malformed coverage edge on lane {$lane_id}\n" );
+ $record['type'] = 'malformed-worker-record';
+ $summarize_record( $record );
+ return 'invalid';
+ }
+ if ( isset( $state['coverage']['edge_keys'][ $edge['key'] ] ) ) {
+ continue;
+ }
+
+ $state['coverage']['edge_keys'][ $edge['key'] ] = true;
+ $state['coverage']['by_file'][ $edge['file'] ] = ( $state['coverage']['by_file'][ $edge['file'] ] ?? 0 ) + 1;
+ $global_new_edges[] = $edge;
+ }
+
+ $artifact_dir = $record['artifact_dir'] ?? null;
+ if ( array() === $global_new_edges ) {
+ // Nothing new for the run as a whole: record it as a duplicate and try to delete its artifact.
+ $record['new_edges'] = array();
+ $record['new_edge_count'] = 0;
+ $record['coverage_retained'] = false;
+ $record['coverage_duplicate'] = true;
+ $record['coverage_pruned'] = false;
+ if ( is_string( $artifact_dir ) && '' !== $artifact_dir && is_dir( $artifact_dir ) ) {
+ if ( Cli::remove_tree( $artifact_dir, $output_dir ) ) {
+ $record['artifact_dir'] = null;
+ $record['artifact_pruned'] = true;
+ $record['coverage_pruned'] = true;
+ ++$state['coverage']['pruned_duplicate_payloads'];
+ } else {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "Cannot prune duplicate coverage artifact {$artifact_dir}\n" );
+ }
+ }
+ $summarize_record( $record );
+ return 'coverage';
+ }
+
+ $record['new_edges'] = $global_new_edges;
+ $record['new_edge_count'] = count( $global_new_edges );
+ $state['coverage']['edges'] += count( $global_new_edges );
+ $record['coverage_duplicate'] = false;
+ $record['coverage_pruned'] = false;
+ // A payload counts as retained only when its artifact is a real directory, not a symlink.
+ $record['coverage_retained'] = is_string( $artifact_dir ) && '' !== $artifact_dir && is_dir( $artifact_dir ) && ! is_link( $artifact_dir );
+ if ( $record['coverage_retained'] ) {
+ // Strict base64 decoding; a missing or undecodable payload leaves the corpus sha256 as null.
+ $payload = isset( $record['payload_base64'] ) && is_string( $record['payload_base64'] )
+ ? base64_decode( $record['payload_base64'], true )
+ : null;
+ ++$state['coverage']['payloads'];
+ $state['coverage']['corpus'][] = array(
+ 'seed' => $record['seed'],
+ 'case' => $record['case'],
+ 'context' => $record['context'],
+ 'strategy' => $record['strategy'],
+ 'edges' => count( $global_new_edges ),
+ 'artifact' => $artifact_dir,
+ 'sha256' => is_string( $payload ) ? hash( 'sha256', $payload ) : null,
+ );
+ }
+ $summarize_record( $record );
+ return 'coverage';
+
+ case 'oracle-event':
+ $state['oracle_events'][] = $record;
+ $oracle = $record['oracle'] ?? 'unknown';
+ $detail = $record['detail'] ?? 'no detail';
+ fwrite( STDERR, "oracle event: {$oracle}: {$detail}\n" );
+ $summarize_record( $record );
+ return 'oracle-event';
+
+ case 'fatal':
+ // A worker-reported fatal stops the run and is also kept alongside the oracle events.
+ ++$state['harness_errors'];
+ $state['oracle_events'][] = $record;
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ $reason = $record['reason'] ?? 'unknown';
+ fwrite( STDERR, "worker fatal: {$reason}\n" );
+ $summarize_record( $record );
+ return 'fatal';
+
+ case 'done':
+ if ( ! isset( $record['stats'] ) || ! is_array( $record['stats'] ) ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "malformed done record on lane {$lane_id}\n" );
+ $record['type'] = 'malformed-worker-record';
+ $summarize_record( $record );
+ return 'invalid';
+ }
+ $stats = $record['stats'];
+ if ( ! isset( $stats['cases'], $stats['bytes'], $stats['by_strategy'], $stats['by_context'] ) || ! is_array( $stats['by_strategy'] ) || ! is_array( $stats['by_context'] ) ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "malformed done stats on lane {$lane_id}\n" );
+ $record['type'] = 'malformed-worker-record';
+ $summarize_record( $record );
+ return 'invalid';
+ }
+ // Fold the batch totals into the run-wide totals.
+ $state['cases'] += $stats['cases'];
+ $state['bytes'] += $stats['bytes'];
+ foreach ( $stats['by_strategy'] as $strategy => $count ) {
+ $state['by_strategy'][ $strategy ] = ( $state['by_strategy'][ $strategy ] ?? 0 ) + $count;
+ }
+ foreach ( $stats['by_context'] as $context => $count ) {
+ $state['by_context'][ $context ] = ( $state['by_context'][ $context ] ?? 0 ) + $count;
+ }
+ $summarize_record( $record );
+ return 'done';
+
+ case 'progress':
+ case 'start':
+ $summarize_record( $record );
+ return $record['type'];
+ }
+
+ // Every recognized type returned above; anything reaching this point is an unknown record type.
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "unknown worker record type on lane {$lane_id}\n" );
+ $record['type'] = 'unknown-worker-record';
+ $summarize_record( $record );
+ return 'invalid';
+};
+
+// Start one worker per configured lane (at least one); each spawn counts as a batch.
+for ( $i = 0; $i < max( 1, $options['lanes'] ); $i++ ) {
+ $lanes[ $i ] = $spawn_lane( $i );
+ ++$state['batches'];
+}
+
+$last_state_write = 0.0;
+
+// Supervision loop: runs until no lane is left. Once $stop_requested is set, exited lanes
+// are no longer respawned, so the lane set drains.
+while ( array() !== $lanes ) {
+ $now = microtime( true );
+
+ // The first stop condition that fires determines stop_reason.
+ if ( ! $stop_requested && null !== $deadline && $now >= $deadline ) {
+ $state['stop_reason'] = 'duration';
+ $stop_requested = true;
+ }
+
+ if ( ! $stop_requested && $options['max-cases'] > 0 && $state['cases'] >= $options['max-cases'] ) {
+ $state['stop_reason'] = 'max-cases';
+ $stop_requested = true;
+ }
+
+ $streams = array();
+ foreach ( $lanes as $lane_id => $lane ) {
+ $streams[ "{$lane_id}:stdout" ] = $lane['stdout'];
+ $streams[ "{$lane_id}:stderr" ] = $lane['stderr'];
+ }
+
+ // Wait up to 250 ms for any lane stream to become readable.
+ $read = array_values( $streams );
+ $write = null;
+ $except = null;
+ if ( stream_select( $read, $write, $except, 0, 250000 ) > 0 ) {
+ foreach ( $lanes as $lane_id => &$lane ) {
+ $chunk = stream_get_contents( $lane['stdout'] );
+ if ( false === $chunk || '' === $chunk ) {
+ continue;
+ }
+
+ // Only stdout activity refreshes the stall timer.
+ $lane['last_output'] = microtime( true );
+ $lane['buffer'] .= $chunk;
+
+ // Handle every complete line; an unterminated remainder stays in the buffer.
+ while ( false !== ( $newline = strpos( $lane['buffer'], "\n" ) ) ) {
+ $line = substr( $lane['buffer'], 0, $newline );
+ $lane['buffer'] = substr( $lane['buffer'], $newline + 1 );
+ if ( '' !== $line && 'failure' === $handle_line( $line, $lane_id ) ) {
+ ++$lane['reported_failures'];
+ }
+ }
+ }
+ unset( $lane );
+ }
+
+ foreach ( $lanes as &$lane ) {
+ $drain_lane_stderr( $lane );
+ }
+ unset( $lane );
+
+ // Reap lanes that exited or stalled. $lane is a by-value copy in this loop; the entry in
+ // $lanes is removed (and possibly replaced) at the end of each iteration.
+ foreach ( $lanes as $lane_id => $lane ) {
+ $status = proc_get_status( $lane['process'] );
+ $stalled = ( microtime( true ) - $lane['last_output'] ) > $options['stall-timeout'];
+
+ if ( $status['running'] && $stalled ) {
+ // Signal 9 is sent to the stalled worker and its seed is remembered.
+ proc_terminate( $lane['process'], 9 );
+ $state['stalled_seeds'][] = $lane['seed'];
+ fwrite( STDERR, "STALL lane {$lane_id} seed {$lane['seed']}: no output for {$options['stall-timeout']}s, killed\n" );
+ } elseif ( $status['running'] ) {
+ continue;
+ }
+
+ $rest = stream_get_contents( $lane['stdout'] );
+ if ( false === $rest ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "cannot read remaining worker output on lane {$lane_id}\n" );
+ $rest = '';
+ }
+ // Flush whatever is buffered plus the final read, including a last line with no newline.
+ $tail = $lane['buffer'] . $rest;
+ if ( '' !== $tail ) {
+ foreach ( explode( "\n", $tail ) as $line ) {
+ if ( '' !== $line && 'failure' === $handle_line( $line, $lane_id ) ) {
+ ++$lane['reported_failures'];
+ }
+ }
+ }
+ $drain_lane_stderr( $lane );
+ fclose( $lane['stdout'] );
+ fclose( $lane['stderr'] );
+ $close_code = proc_close( $lane['process'] );
+ // Prefer the exit code from proc_get_status(); use proc_close()'s value when it is absent or -1.
+ $exit_code = $status['exitcode'] ?? $close_code;
+ if ( -1 === $exit_code ) {
+ $exit_code = $close_code;
+ }
+ // Exit code 1 is accepted when the lane reported at least one failure record.
+ $accepted_failure_exit = 1 === $exit_code && $lane['reported_failures'] > 0;
+ if ( 0 !== $exit_code && ! $accepted_failure_exit && ! in_array( $lane['seed'], $state['stalled_seeds'], true ) ) {
+ $state['worker_errors'][] = array(
+ 'lane' => $lane_id,
+ 'seed' => $lane['seed'],
+ 'exit_code' => $exit_code,
+ );
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "worker error lane {$lane_id} seed {$lane['seed']}: exit {$exit_code}\n" );
+ }
+ if ( $lane['stderr_truncated'] ) {
+ $state['worker_stderr_truncated'][ $lane_id ] = array(
+ 'lane' => $lane_id,
+ 'seed' => $lane['seed'],
+ 'bytes' => $lane['stderr_bytes'],
+ );
+ }
+ unset( $lanes[ $lane_id ] );
+
+ // Replace the finished worker with a fresh batch unless the run is winding down.
+ if ( ! $stop_requested ) {
+ $lanes[ $lane_id ] = $spawn_lane( $lane_id );
+ ++$state['batches'];
+ }
+ }
+
+ // Persist the state file when more than five seconds have passed since the last attempt.
+ if ( microtime( true ) - $last_state_write > 5 ) {
+ if ( ! $write_state() ) {
+ ++$state['harness_errors'];
+ $state['stop_reason'] = 'harness-error';
+ $stop_requested = true;
+ fwrite( STDERR, "Cannot write state file {$output_dir}/state.json\n" );
+ }
+ $last_state_write = microtime( true );
+ }
+}
+
+// Finalize the run: record why and when it ended, persist the state one last time,
+// print the summary line, and exit with 2 (harness/worker error), 1 (failures or
+// stalls), or 0 (clean).
+if ( null === $state['stop_reason'] ) {
+	$state['stop_reason'] = 'lanes-exited';
+}
+$state['finished_at'] = gmdate( 'c' );
+
+$final_state_written = $write_state();
+if ( ! $final_state_written ) {
+	fwrite( STDERR, "Cannot write state file {$output_dir}/state.json\n" );
+}
+if ( is_resource( $summary ) ) {
+	fclose( $summary );
+}
+if ( ! $final_state_written ) {
+	exit( 2 );
+}
+
+$elapsed      = round( microtime( true ) - $started_at, 1 );
+$done_message = sprintf(
+	"Done: %d cases, %d failures, %d stalled, %s bytes in %ss. Artifacts: %s\n",
+	$state['cases'],
+	$state['failures'],
+	count( $state['stalled_seeds'] ),
+	number_format( $state['bytes'] ),
+	$elapsed,
+	$output_dir
+);
+fwrite( STDERR, $done_message );
+
+$infrastructure_broke = $state['harness_errors'] > 0 || array() !== $state['worker_errors'];
+if ( $infrastructure_broke ) {
+	exit( 2 );
+}
+
+$found_problems = $state['failures'] > 0 || array() !== $state['stalled_seeds'];
+exit( $found_problems ? 1 : 0 );
diff --git a/tools/html-decoder-fuzz/tests/harness-smoke.php b/tools/html-decoder-fuzz/tests/harness-smoke.php
new file mode 100644
index 0000000000000..515a9639b357d
--- /dev/null
+++ b/tools/html-decoder-fuzz/tests/harness-smoke.php
@@ -0,0 +1,4315 @@
+ array( 'file', '/dev/null', 'r' ),
+ 1 => array( 'pipe', 'w' ),
+ 2 => array( 'pipe', 'w' ),
+ ),
+ $pipes,
+ Bootstrap::repo_root(),
+ array_merge( getenv() ?: array(), $env )
+ );
+
+ if ( ! is_resource( $process ) ) {
+ return array(
+ 'code' => 127,
+ 'stdout' => '',
+ 'stderr' => 'proc_open failed',
+ );
+ }
+
+ $stdout = stream_get_contents( $pipes[1] );
+ $stderr = stream_get_contents( $pipes[2] );
+ fclose( $pipes[1] );
+ fclose( $pipes[2] );
+
+ return array(
+ 'code' => proc_close( $process ),
+ 'stdout' => (string) $stdout,
+ 'stderr' => (string) $stderr,
+ );
+}
+
+/**
+ * Recursively deletes a directory and everything beneath it.
+ *
+ * Does nothing when $path is not a directory. Children are visited before their
+ * parents; symlinks are unlinked rather than treated as directories.
+ */
+function remove_tree( string $path ): void {
+	if ( ! is_dir( $path ) ) {
+		return;
+	}
+
+	$walker = new \RecursiveIteratorIterator(
+		new \RecursiveDirectoryIterator( $path, \FilesystemIterator::SKIP_DOTS ),
+		\RecursiveIteratorIterator::CHILD_FIRST
+	);
+
+	foreach ( $walker as $entry ) {
+		$entry_path = $entry->getPathname();
+		if ( $entry->isDir() && ! $entry->isLink() ) {
+			rmdir( $entry_path );
+		} else {
+			unlink( $entry_path );
+		}
+	}
+
+	rmdir( $path );
+}
+
+/**
+ * Collects the case windows announced by `start` records of one mode.
+ *
+ * Reads `$dir/summary.ndjson` (when present), keeps records whose `type` is
+ * `start` and whose `mode` equals $mode, and returns their `start_case` and
+ * `cases` values sorted by start (a missing start sorts as -1).
+ *
+ * @return array List of `array( 'start' => ..., 'cases' => ... )` entries.
+ */
+function summary_start_windows( string $dir, string $mode ): array {
+	$path  = $dir . '/summary.ndjson';
+	$lines = is_file( $path )
+		? file( $path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES )
+		: array();
+
+	if ( ! is_array( $lines ) ) {
+		$lines = array();
+	}
+
+	$windows = array();
+	foreach ( $lines as $raw ) {
+		$decoded = json_decode( $raw, true );
+		if ( ! is_array( $decoded ) ) {
+			continue;
+		}
+		if ( 'start' !== ( $decoded['type'] ?? null ) || $mode !== ( $decoded['mode'] ?? null ) ) {
+			continue;
+		}
+
+		$windows[] = array(
+			'start' => $decoded['start_case'] ?? null,
+			'cases' => $decoded['cases'] ?? null,
+		);
+	}
+
+	usort(
+		$windows,
+		static function ( array $left, array $right ): int {
+			return ( $left['start'] ?? -1 ) <=> ( $right['start'] ?? -1 );
+		}
+	);
+
+	return $windows;
+}
+
+/**
+ * Tells whether the given start windows form at least two non-overlapping batches.
+ *
+ * Every window must have an integer start aligned to $cases_per_batch, exactly
+ * $cases_per_batch cases, and must not begin before the previous window ended.
+ * The windows are checked in the order given.
+ */
+function start_windows_are_distinct( array $windows, int $cases_per_batch ): bool {
+	if ( count( $windows ) < 2 ) {
+		return false;
+	}
+
+	$covered_until = null;
+	foreach ( $windows as $window ) {
+		if ( ! is_int( $window['start'] ) ) {
+			return false;
+		}
+		if ( $cases_per_batch !== $window['cases'] ) {
+			return false;
+		}
+		if ( 0 !== $window['start'] % $cases_per_batch ) {
+			return false;
+		}
+		if ( null !== $covered_until && $window['start'] < $covered_until ) {
+			return false;
+		}
+
+		$covered_until = $window['start'] + $window['cases'];
+	}
+
+	return true;
+}
+
+// Build the oracle set and collect the events it queued while being built.
+$oracles = Oracles::build();
+$events = $oracles->drain_events();
+$skip_c1_fault_seed = 2;
+$skip_c1_fault_case = 36;
+
+check( 'required oracles available', $oracles->has_required(), json_encode( $events ) );
+check( 'secondary entity-decode oracle available', in_array( 'entity-decode', $oracles->names(), true ), implode( ',', $oracles->names() ) );
+// None of the drained events may be of type 'oracle-disabled'.
+check(
+ 'no oracle disabled by battery',
+ array() === array_filter( $events, static fn( $e ) => 'oracle-disabled' === $e['type'] ),
+ json_encode( $events )
+);
+
+// Run every battery vector through the checks with default targets; any reported failure is collected.
+$checks = new Checks( $oracles );
+$battery_fails = array();
+foreach ( Oracles::battery() as $i => $vector ) {
+ list( $context, $payload ) = $vector;
+ foreach ( $checks->run( $context, $payload ) as $failure ) {
+ $battery_fails[] = "vector {$i}: {$failure['signature']}";
+ }
+}
+check( 'real targets clean on oracle battery', array() === $battery_fails, implode( '; ', $battery_fails ) );
+
+$real_targets = Targets::real();
+
+/**
+ * Runs the oracle battery plus a few extra vectors against deliberately broken targets.
+ *
+ * The targets are $real_targets with $overrides merged on top.
+ *
+ * @return string[] Distinct check names observed.
+ */
+function broken_run( Oracles $oracles, array $real_targets, array $overrides ): array {
+	$runner = new Checks( $oracles, array_merge( $real_targets, $overrides ) );
+
+	// NOTE(review): the literals '¬x', 'jav' and '<⃒tail' contain no '&' and look like
+	// character references that were decoded somewhere upstream (e.g. '&notx',
+	// '&nvlt;tail') — confirm against the original file before relying on them.
+	$extra_vectors = array(
+		array( 'text', 'a&b' ),
+		array( 'attribute', '¬x' ),
+		array( 'attribute', 'jav' ),
+		array( 'attribute', 'javascript:alert(1)' ),
+		array( 'attribute', '<⃒tail' ),
+	);
+
+	$observed = array();
+	foreach ( array_merge( Oracles::battery(), $extra_vectors ) as $vector ) {
+		foreach ( $runner->run( $vector[0], $vector[1] ) as $failure ) {
+			$observed[ $failure['check'] ] = true;
+		}
+	}
+
+	return array_keys( $observed );
+}
+
+/**
+ * Runs one payload through the checks while a named fault is injected.
+ *
+ * The fault is selected through the HTML_DECODER_FUZZ_FAULT environment variable,
+ * which is restored to its previous value (or removed) before returning.
+ *
+ * @return string[] Distinct check names observed.
+ */
+function fault_run( Oracles $oracles, string $fault, string $payload = 'javascript:alert(1)', string $context = 'attribute' ): array {
+	$previous_fault = getenv( 'HTML_DECODER_FUZZ_FAULT' );
+	putenv( "HTML_DECODER_FUZZ_FAULT={$fault}" );
+
+	try {
+		$runner   = new Checks( $oracles, Targets::resolve() );
+		$observed = array();
+
+		foreach ( $runner->run( $context, $payload ) as $failure ) {
+			$observed[ $failure['check'] ] = true;
+		}
+
+		return array_keys( $observed );
+	} finally {
+		putenv(
+			false === $previous_fault
+				? 'HTML_DECODER_FUZZ_FAULT'
+				: "HTML_DECODER_FUZZ_FAULT={$previous_fault}"
+		);
+	}
+}
+
+/**
+ * Runs one payload through the oracle-free checks (context 'both') under an injected fault.
+ *
+ * The fault is selected through the HTML_DECODER_FUZZ_FAULT environment variable,
+ * which is restored to its previous value (or removed) before returning.
+ *
+ * @return string[] Distinct check names observed.
+ */
+function fault_run_without_oracle( Oracles $oracles, string $fault, string $payload ): array {
+	$previous_fault = getenv( 'HTML_DECODER_FUZZ_FAULT' );
+	putenv( "HTML_DECODER_FUZZ_FAULT={$fault}" );
+
+	try {
+		$runner   = new Checks( $oracles, Targets::resolve() );
+		$observed = array();
+
+		foreach ( $runner->run_without_oracle( 'both', $payload ) as $failure ) {
+			$observed[ $failure['check'] ] = true;
+		}
+
+		return array_keys( $observed );
+	} finally {
+		putenv(
+			false === $previous_fault
+				? 'HTML_DECODER_FUZZ_FAULT'
+				: "HTML_DECODER_FUZZ_FAULT={$previous_fault}"
+		);
+	}
+}
+
+/**
+ * Runs a fixed set of raw-byte payloads through the oracle-free checks against broken targets.
+ *
+ * The targets are $real_targets with $overrides merged on top.
+ *
+ * @return string[] Distinct check names observed.
+ */
+function broken_oracle_free_run( Oracles $oracles, array $real_targets, array $overrides ): array {
+	$runner  = new Checks( $oracles, array_merge( $real_targets, $overrides ) );
+	$vectors = array(
+		array( 'both', "raw\x00bytes" ),
+		array( 'both', "\xFF\xFE<\"\r" ),
+		array( 'both', "a¬x\x00z" ),
+	);
+
+	$observed = array();
+	foreach ( $vectors as $vector ) {
+		foreach ( $runner->run_without_oracle( $vector[0], $vector[1] ) as $failure ) {
+			$observed[ $failure['check'] ] = true;
+		}
+	}
+
+	return array_keys( $observed );
+}
+
+/**
+ * Classifies the unterminated character reference, if any, that ends the payload.
+ *
+ * The numeric patterns are anchored on the `&#` introducer. Without it the
+ * "partial numeric introducer" pattern matches the empty string at the end of
+ * every payload and hides all later classifications.
+ *
+ * @param string $payload Payload to inspect.
+ * @return string|null One of 'bare-introducer', 'partial-numeric-introducer',
+ *                     'decimal-digits', 'hex-digits', 'named-prefix', or null
+ *                     when the payload does not end inside a reference.
+ */
+function reference_at_eof_shape( string $payload ): ?string {
+	// Ordered from the shortest introducer to the most specific shape; the first match wins.
+	$shapes = array(
+		'bare-introducer'            => '/&\z/',
+		'partial-numeric-introducer' => '/&#(?:[xX])?\z/',
+		'decimal-digits'             => '/&#[0-9]+\z/',
+		'hex-digits'                 => '/&#[xX][0-9A-Fa-f]+\z/',
+		'named-prefix'               => '/&[A-Za-z][A-Za-z0-9]*\z/',
+	);
+
+	foreach ( $shapes as $shape => $pattern ) {
+		if ( 1 === preg_match( $pattern, $payload ) ) {
+			return $shape;
+		}
+	}
+
+	return null;
+}
+
+/**
+ * Reports which code point ranges the numeric character references in a payload fall into.
+ *
+ * Only text introduced by `&#` (decimal) or `&#x` / `&#X` (hexadecimal) is
+ * considered; bare digit runs elsewhere in the payload are not references and
+ * are ignored. Leading zeros are stripped before the digit count and value are
+ * evaluated.
+ *
+ * @param string $payload Payload to scan.
+ * @return array<string, true> Set of range names keyed by name.
+ */
+function numeric_reference_ranges( string $payload ): array {
+	$ranges      = array();
+	$match_count = preg_match_all( '/&#(?:([xX])([0-9A-Fa-f]+)|([0-9]+));?/', $payload, $matches, PREG_SET_ORDER );
+	if ( false === $match_count || 0 === $match_count ) {
+		return $ranges;
+	}
+
+	foreach ( $matches as $match ) {
+		$is_hex             = '' !== ( $match[1] ?? '' );
+		$digits             = $is_hex ? $match[2] : $match[3];
+		$base               = $is_hex ? 16 : 10;
+		$max_digits         = $is_hex ? 6 : 7;
+		$zero_count         = strspn( $digits, '0' );
+		$significant_digits = substr( $digits, $zero_count );
+
+		if ( '' === $significant_digits ) {
+			$ranges['zero-only'] = true;
+			continue;
+		}
+
+		// Too many significant digits: classified by length alone, without converting the value.
+		if ( strlen( $significant_digits ) > $max_digits ) {
+			$ranges['digit-count-overflow'] = true;
+			continue;
+		}
+
+		$value = intval( $significant_digits, $base );
+		if ( $value <= 0x1F ) {
+			$ranges['c0-control'] = true;
+		} elseif ( $value >= 0x80 && $value <= 0x9F ) {
+			$ranges['c1-control'] = true;
+		} elseif ( $value >= 0xA0 && $value <= 0xD7FF ) {
+			$ranges['bmp-pre-surrogate'] = true;
+		} elseif ( $value >= 0xD800 && $value <= 0xDFFF ) {
+			$ranges['surrogate'] = true;
+		} elseif ( ( $value >= 0xFDD0 && $value <= 0xFDEF ) || 0xFFFE === $value || 0xFFFF === $value ) {
+			// Tested before the post-surrogate range so 0xFDD0-0xFDEF and 0xFFFE/0xFFFF are not misfiled.
+			$ranges['bmp-noncharacter'] = true;
+		} elseif ( $value >= 0xE000 && $value <= 0xFFFD ) {
+			$ranges['bmp-post-surrogate'] = true;
+		} elseif ( $value >= 0x1FFFE && $value <= 0x10FFFF && ( $value & 0xFFFF ) >= 0xFFFE ) {
+			// The last two code points of each supplementary plane.
+			$ranges['plane-noncharacter'] = true;
+		} elseif ( $value > 0x10FFFF ) {
+			$ranges['above-unicode-legal-digits'] = true;
+		} elseif ( $value >= 0x10000 ) {
+			$ranges['astral'] = true;
+		}
+	}
+
+	return $ranges;
+}
+
+/**
+ * Reduces reference names to their distinct semicolon-less base names.
+ *
+ * Trailing semicolons are stripped, empty results are dropped, and duplicates
+ * are collapsed while keeping first-seen order.
+ *
+ * @param string[] $names
+ * @return string[]
+ */
+function name_sweep_base_names( array $names ): array {
+	$stripped = array_map(
+		static function ( $name ) {
+			return rtrim( $name, ';' );
+		},
+		$names
+	);
+
+	$non_empty = array_filter(
+		$stripped,
+		static function ( $base ) {
+			return '' !== $base;
+		}
+	);
+
+	// Using the names as array keys both de-duplicates them and preserves first-seen order.
+	return array_keys( array_fill_keys( $non_empty, true ) );
+}
+
+/**
+ * Lists the follower strings expected after a legacy name in the follower sweep.
+ *
+ * The list holds every byte from 0x01 to 0x7F except CR, the double quote and
+ * `<`, then one multi-byte sequence per lead byte 0xC2-0xF4, then 0xC2 paired
+ * with every continuation byte 0x80-0xBF. Duplicates are removed.
+ *
+ * @return string[]
+ */
+function legacy_follower_sweep_followers(): array {
+	$followers = array();
+
+	foreach ( range( 0x01, 0x7F ) as $byte ) {
+		if ( 0x0D === $byte || 0x22 === $byte || 0x3C === $byte ) {
+			continue;
+		}
+		$followers[] = chr( $byte );
+	}
+
+	foreach ( range( 0xC2, 0xF4 ) as $lead ) {
+		if ( 0xE0 === $lead ) {
+			$trail = "\xA0\x80";
+		} elseif ( 0xF0 === $lead ) {
+			$trail = "\x90\x80\x80";
+		} elseif ( $lead < 0xE0 ) {
+			$trail = "\x80";
+		} elseif ( $lead < 0xF0 ) {
+			$trail = "\x80\x80";
+		} else {
+			$trail = "\x80\x80\x80";
+		}
+		$followers[] = chr( $lead ) . $trail;
+	}
+
+	foreach ( range( 0x80, 0xBF ) as $continuation ) {
+		$followers[] = "\xC2" . chr( $continuation );
+	}
+
+	return array_values( array_unique( $followers ) );
+}
+
+/**
+ * Lists the reference names, without the leading ampersand, used by the prefix-family sweep.
+ *
+ * @return string[]
+ */
+function prefix_family_sweep_references(): array {
+	$not_family = array( 'not', 'not;', 'notin;', 'notinva;' );
+	$ngt_family = array( 'ngt;', 'nGt;', 'nGtv;' );
+	$nge_family = array( 'nge;', 'ngeq;', 'ngeqq;' );
+
+	return array_merge( $not_family, $ngt_family, $nge_family );
+}
+
+/**
+ * Lists the follower strings used by the prefix-family sweep.
+ *
+ * @return string[] The empty string, 'x', 'X', '0', '=' and U+00E9.
+ */
+function prefix_family_sweep_followers(): array {
+	$followers = array( '', 'x', 'X', '0', '=', "\u{00E9}" );
+
+	return $followers;
+}
+
+/**
+ * Enumerates the expected prefix-family sweep cases for the known base names.
+ *
+ * For each sweep reference whose semicolon-less name is in $base_names, one case
+ * is produced per split position (1 up to, but not including, the length of the
+ * `&`-prefixed reference) and per follower.
+ *
+ * @param string[] $base_names
+ * @return array List of `array( 'reference' => ..., 'split' => ..., 'follower' => ... )`.
+ */
+function prefix_family_sweep_cases( array $base_names ): array {
+	$known     = array_fill_keys( $base_names, true );
+	$followers = prefix_family_sweep_followers();
+	$cases     = array();
+
+	foreach ( prefix_family_sweep_references() as $reference ) {
+		if ( ! isset( $known[ rtrim( $reference, ';' ) ] ) ) {
+			continue;
+		}
+
+		$with_ampersand = '&' . $reference;
+		$length         = strlen( $with_ampersand );
+
+		for ( $split = 1; $split < $length; ++$split ) {
+			foreach ( $followers as $follower ) {
+				$cases[] = array(
+					'reference' => $with_ampersand,
+					'split'     => $split,
+					'follower'  => $follower,
+				);
+			}
+		}
+	}
+
+	return $cases;
+}
+
+/**
+ * Returns the result of Generator's non-public static `token_map_sweep_cases()` method.
+ *
+ * @return array
+ */
+function token_map_sweep_cases(): array {
+	$method = new \ReflectionMethod( Generator::class, 'token_map_sweep_cases' );
+
+	// setAccessible() is a no-op as of PHP 8.1 (and deprecated as of PHP 8.5), so it is
+	// only called on the older versions that still need it.
+	if ( PHP_VERSION_ID < 80100 ) {
+		$method->setAccessible( true );
+	}
+
+	return $method->invoke( null );
+}
+
+/**
+ * Enumerates the expected digit-count boundary references.
+ *
+ * Covers decimal and three hexadecimal spellings, at the maximum digit count and
+ * one past it, with and without a leading zero and a trailing semicolon.
+ *
+ * @return string[]
+ */
+function numeric_boundary_sweep_cases(): array {
+	$cases = array();
+	foreach ( array( 'decimal', 'hex-lower', 'hex-upper', 'hex-mixed' ) as $kind ) {
+		$is_decimal = 'decimal' === $kind;
+		$max_digits = $is_decimal ? 7 : 6;
+		foreach ( array( $max_digits, $max_digits + 1 ) as $digit_count ) {
+			foreach ( array( false, true ) as $leading_zero ) {
+				foreach ( array( false, true ) as $semicolon ) {
+					$cases[] = numeric_boundary_reference( $kind, $digit_count, $leading_zero, $semicolon );
+				}
+			}
+		}
+	}
+
+	return array_values( array_unique( $cases ) );
+}
+
+/**
+ * Builds one numeric character reference for the boundary sweep.
+ *
+ * The reference always carries its introducer: `&#` for decimal, `&#X` for
+ * 'hex-upper' and `&#x` for the other hexadecimal kinds. Without the introducer
+ * the result is a bare digit run that no decoder treats as a reference.
+ *
+ * @param string $kind         'decimal', 'hex-lower', 'hex-upper' or 'hex-mixed'.
+ * @param int    $digit_count  Number of significant digits to emit.
+ * @param bool   $leading_zero Whether to prepend a single '0' to the digits.
+ * @param bool   $semicolon    Whether to terminate the reference with ';'.
+ * @return string
+ */
+function numeric_boundary_reference( string $kind, int $digit_count, bool $leading_zero, bool $semicolon ): string {
+	if ( 'decimal' === $kind ) {
+		$prefix = '&#';
+		// Seven digits uses 1114111 (0x10FFFF); any other count is a run of nines.
+		$digits = 7 === $digit_count ? '1114111' : str_repeat( '9', $digit_count );
+	} else {
+		$prefix = 'hex-upper' === $kind ? '&#X' : '&#x';
+		$digits = 6 === $digit_count ? '10ffee' : substr( str_repeat( 'abcdef', (int) ceil( $digit_count / 6 ) ), 0, $digit_count );
+		if ( 'hex-upper' === $kind ) {
+			$digits = strtoupper( $digits );
+		} elseif ( 'hex-mixed' === $kind ) {
+			// Upper-case every even-indexed digit to mix the letter case.
+			$chars = str_split( $digits );
+			foreach ( $chars as $i => $char ) {
+				if ( 0 === $i % 2 ) {
+					$chars[ $i ] = strtoupper( $char );
+				}
+			}
+			$digits = implode( '', $chars );
+		}
+	}
+
+	if ( $leading_zero ) {
+		$digits = '0' . $digits;
+	}
+
+	return $prefix . $digits . ( $semicolon ? ';' : '' );
+}
+
+/**
+ * Describes the shape of a payload that consists of exactly one numeric character reference.
+ *
+ * The payload must start with `&#`; anything else is reported with base 'invalid'.
+ *
+ * @return array{base: string, significant_digits: int, leading_zero: bool, semicolon: bool, mixed_hex: bool}
+ */
+function numeric_boundary_shape( string $payload ): array {
+	if ( 1 !== preg_match( '/^&#(?:(x|X)([0-9A-Fa-f]+)|([0-9]+))(;?)$/', $payload, $match ) ) {
+		return array(
+			'base'               => 'invalid',
+			'significant_digits' => 0,
+			'leading_zero'       => false,
+			'semicolon'          => false,
+			'mixed_hex'          => false,
+		);
+	}
+
+	$is_hex      = '' !== ( $match[1] ?? '' );
+	$digits      = $is_hex ? $match[2] : $match[3];
+	$significant = substr( $digits, strspn( $digits, '0' ) );
+	$letters     = preg_replace( '/[^A-Fa-f]/', '', $digits );
+
+	return array(
+		'base'               => $is_hex ? 'hex' : 'decimal',
+		'significant_digits' => strlen( $significant ),
+		'leading_zero'       => strlen( $digits ) > strlen( $significant ),
+		'semicolon'          => ';' === ( $match[4] ?? '' ),
+		// Mixed means the hex letters are neither all lower-case nor all upper-case.
+		'mixed_hex'          => $is_hex && '' !== $letters && strtolower( $letters ) !== $letters && strtoupper( $letters ) !== $letters,
+	);
+}
+
+/**
+ * Lists the prefixes the attribute-prefix smoke checks look for.
+ *
+ * @return string[]
+ */
+function attribute_prefix_smoke_targets(): array {
+	$targets = array(
+		'javascript:',
+		'JaVaScRiPt:',
+		'http://',
+		'https://',
+		'mailto:user@example.com',
+		'data:text/plain,',
+		'urn:wp:html5:',
+		'ftp://',
+	);
+
+	return $targets;
+}
+
+/**
+ * Reports which encoding forms appear in an attribute-prefix payload.
+ *
+ * Every numeric pattern is anchored on the `&#` introducer so that plain digits
+ * in the payload are not mistaken for references. The leading-zero pattern must
+ * start with `&#0`: a pattern that opens with a bare `+` quantifier is rejected
+ * by PCRE and can never match.
+ *
+ * @param string $payload Payload to inspect.
+ * @return string[] Any of 'literal', 'decimal', 'leading-zero', 'hex',
+ *                  'semicolonless', in that order.
+ */
+function attribute_prefix_encoding_forms( string $payload ): array {
+	$forms = array();
+
+	// A payload that does not open with '&' starts with a literal character.
+	if ( '' !== $payload && '&' !== $payload[0] ) {
+		$forms['literal'] = true;
+	}
+
+	$patterns = array(
+		'decimal'       => '/&#[1-9][0-9]*;?/',
+		'leading-zero'  => '/&#0+[0-9]+;?/',
+		'hex'           => '/&#[xX][0-9A-Fa-f]+;?/',
+		// A numeric reference whose digits end at the end of input or before a non-digit, non-';' byte.
+		'semicolonless' => '/&#(?:[0-9]+(?:$|[^0-9;])|[xX][0-9A-Fa-f]+(?:$|[^0-9A-Fa-f;]))/',
+	);
+
+	foreach ( $patterns as $form => $pattern ) {
+		if ( 1 === preg_match( $pattern, $payload ) ) {
+			$forms[ $form ] = true;
+		}
+	}
+
+	return array_keys( $forms );
+}
+
+/**
+ * Lists the strategy names expected from weighted generation, in alphabetical order.
+ *
+ * @return string[]
+ */
+function expected_weighted_strategies(): array {
+	$strategies = array(
+		'adjacency',
+		'attribute-discriminator',
+		'attribute-prefix',
+		'case-mangled-name',
+		'composition',
+		'lookalike',
+		'multibyte-around',
+		'named-exact',
+		'named-missing-semi',
+		'numeric',
+		'plain-no-amp',
+		'reference-at-eof',
+		'truncation-sweep',
+	);
+
+	return $strategies;
+}
+
+/**
+ * Lists the strategy names expected from corpus (mutation) mode.
+ *
+ * @return string[]
+ */
+function expected_corpus_strategies(): array {
+	$strategies = array(
+		'corpus-byte-perturb',
+		'corpus-reference-duplication',
+		'corpus-semicolon-toggle',
+		'corpus-splice',
+	);
+
+	return $strategies;
+}
+
+/**
+ * Returns the result of Generator's non-public static `corpus_payloads()` method.
+ *
+ * @return string[]
+ */
+function corpus_seed_payloads(): array {
+	$method = new \ReflectionMethod( Generator::class, 'corpus_payloads' );
+
+	// setAccessible() is a no-op as of PHP 8.1 (and deprecated as of PHP 8.5), so it is
+	// only called on the older versions that still need it.
+	if ( PHP_VERSION_ID < 80100 ) {
+		$method->setAccessible( true );
+	}
+
+	return $method->invoke( null );
+}
+
+/**
+ * Precomputes lookup tables for recognising edit-distance-1 mutations of the base names.
+ *
+ * - `base_set`: the base names as keys.
+ * - `delete`: every non-empty one-byte deletion that is not itself a base name.
+ * - `substitution`: per name length, each name with one byte replaced by NUL as a wildcard.
+ * - `transpose`: every swap of two adjacent, different bytes that is not itself a base name.
+ *
+ * @param string[] $base_names
+ * @return array{base_set: array, delete: array, substitution: array, transpose: array}
+ */
+function lookalike_mutation_indexes( array $base_names ): array {
+	$known          = array_fill_keys( $base_names, true );
+	$deletions      = array();
+	$wildcards      = array();
+	$transpositions = array();
+
+	foreach ( $base_names as $name ) {
+		$size = strlen( $name );
+
+		for ( $at = 0; $at < $size; ++$at ) {
+			$shortened = substr_replace( $name, '', $at, 1 );
+			if ( '' !== $shortened && ! isset( $known[ $shortened ] ) ) {
+				$deletions[ $shortened ] = true;
+			}
+
+			$wildcards[ $size ][ substr_replace( $name, "\0", $at, 1 ) ] = true;
+
+			// A swap needs a right-hand neighbour that differs from the current byte.
+			if ( $at + 1 >= $size || $name[ $at ] === $name[ $at + 1 ] ) {
+				continue;
+			}
+
+			$swapped = substr( $name, 0, $at ) . strrev( substr( $name, $at, 2 ) ) . substr( $name, $at + 2 );
+			if ( ! isset( $known[ $swapped ] ) ) {
+				$transpositions[ $swapped ] = true;
+			}
+		}
+	}
+
+	return array(
+		'base_set'     => $known,
+		'delete'       => $deletions,
+		'substitution' => $wildcards,
+		'transpose'    => $transpositions,
+	);
+}
+
+/**
+ * Names the edit-distance-1 mutation classes a candidate belongs to.
+ *
+ * An empty candidate, or one that is itself a base name, belongs to no class.
+ *
+ * @param array{base_set: array, delete: array, substitution: array, transpose: array} $indexes
+ * @return string[] Any of 'delete', 'insert', 'substitute', 'transpose', in that order.
+ */
+function lookalike_candidate_classes( string $candidate, array $indexes ): array {
+	if ( '' === $candidate || isset( $indexes['base_set'][ $candidate ] ) ) {
+		return array();
+	}
+
+	$size      = strlen( $candidate );
+	$wildcards = $indexes['substitution'][ $size ] ?? array();
+
+	// Insert: removing one byte from the candidate yields a base name.
+	$is_insert = false;
+	for ( $at = 0; $at < $size && ! $is_insert; ++$at ) {
+		$is_insert = isset( $indexes['base_set'][ substr_replace( $candidate, '', $at, 1 ) ] );
+	}
+
+	// Substitute: wildcarding one byte of the candidate matches a wildcarded base name.
+	$is_substitute = false;
+	for ( $at = 0; $at < $size && ! $is_substitute; ++$at ) {
+		$is_substitute = isset( $wildcards[ substr_replace( $candidate, "\0", $at, 1 ) ] );
+	}
+
+	$membership = array(
+		'delete'     => isset( $indexes['delete'][ $candidate ] ),
+		'insert'     => $is_insert,
+		'substitute' => $is_substitute,
+		'transpose'  => isset( $indexes['transpose'][ $candidate ] ),
+	);
+
+	return array_keys( array_filter( $membership ) );
+}
+
+/**
+ * Names the single edit that turns $base into $candidate, if there is one.
+ *
+ * @param string $candidate Possibly mutated name.
+ * @param string $base      Name it is compared against.
+ * @return string|null 'delete', 'insert', 'substitute' or 'transpose'; null when
+ *                     the two are equal or differ by more than one such edit.
+ */
+function sparse_lookalike_operation( string $candidate, string $base ): ?string {
+	$candidate_size = strlen( $candidate );
+	$base_size      = strlen( $base );
+	$growth         = $candidate_size - $base_size;
+
+	if ( -1 === $growth ) {
+		// One byte shorter: some single deletion from the base must give the candidate.
+		for ( $at = 0; $at < $base_size; ++$at ) {
+			if ( substr_replace( $base, '', $at, 1 ) === $candidate ) {
+				return 'delete';
+			}
+		}
+		return null;
+	}
+
+	if ( 1 === $growth ) {
+		// One byte longer: some single deletion from the candidate must give the base.
+		for ( $at = 0; $at < $candidate_size; ++$at ) {
+			if ( substr_replace( $candidate, '', $at, 1 ) === $base ) {
+				return 'insert';
+			}
+		}
+		return null;
+	}
+
+	if ( 0 !== $growth ) {
+		return null;
+	}
+
+	$mismatches = array();
+	for ( $at = 0; $at < $base_size; ++$at ) {
+		if ( $candidate[ $at ] !== $base[ $at ] ) {
+			$mismatches[] = $at;
+		}
+	}
+
+	$mismatch_count = count( $mismatches );
+	if ( 1 === $mismatch_count ) {
+		return 'substitute';
+	}
+	if ( 2 !== $mismatch_count ) {
+		return null;
+	}
+
+	// Two mismatches count as a transposition only when adjacent and exactly swapped.
+	list( $first, $second ) = $mismatches;
+	$is_adjacent_swap       = $second === $first + 1
+		&& $candidate[ $first ] === $base[ $second ]
+		&& $candidate[ $second ] === $base[ $first ];
+
+	return $is_adjacent_swap ? 'transpose' : null;
+}
+ // Harness self-checks: swap in deliberately broken targets and confirm the expected check name appears in $seen (broken_run() and check() are defined outside this chunk). First broken target: decoder output has U+20AC rewritten to U+0080.
+ $seen = broken_run(
+ $oracles,
+ $real_targets,
+ array(
+ 'decode_text' => static fn( string $text ): string => str_replace( "\u{20AC}", "\u{0080}", \WP_HTML_Decoder::decode_text_node( $text ) ),
+ 'decode_attribute' => static fn( string $text ): string => str_replace( "\u{20AC}", "\u{0080}", \WP_HTML_Decoder::decode_attribute( $text ) ),
+ )
+ );
+ check( 'catches decoder skipping C1 remap', in_array( 'decode-mismatch', $seen, true ), implode( ',', $seen ) );
+ // Broken target: attribute decoding delegates to the text-node decoder, and the reader is called with 'data' whenever the context is 'attribute'.
+ $seen = broken_run(
+ $oracles,
+ $real_targets,
+ array(
+ 'decode_attribute' => static fn( string $text ): string => \WP_HTML_Decoder::decode_text_node( $text ),
+ 'read_character_reference' => static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ return \WP_HTML_Decoder::read_character_reference( 'attribute' === $context ? 'data' : $context, $text, $at, $match_byte_length );
+ },
+ )
+ );
+ check( 'catches semicolonless refs decoded in attributes', in_array( 'decode-mismatch', $seen, true ), implode( ',', $seen ) );
+ // Broken target: the reader reports a match length one byte too long whenever it returns a non-null result.
+ $seen = broken_run(
+ $oracles,
+ $real_targets,
+ array(
+ 'read_character_reference' => static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
+ $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
+ if ( null !== $result ) {
+ ++$match_byte_length;
+ }
+ return $result;
+ },
+ )
+ );
+ check(
+ 'catches off-by-one match length',
+ in_array( 'reader-decode-mismatch', $seen, true ) || in_array( 'reader-overran-input', $seen, true ), // Either check name counts as detection.
+ implode( ',', $seen )
+ );
+ // Fault-injection self-checks: each fault_run() call names a fault target (and usually a payload), and the following check() looks for a specific check name in $seen. fault_run() is defined outside this chunk.
+ $seen = fault_run( $oracles, 'reader-empty-chunk', 'a&b' );
+ check( 'fault target reader-empty-chunk exposes empty chunks', in_array( 'reader-returned-empty-chunk', $seen, true ), implode( ',', $seen ) );
+
+ $seen = fault_run( $oracles, 'reader-short-match-length', 'a&b' );
+ check( 'fault target reader-short-match-length exposes one-byte matches', in_array( 'reader-match-too-short', $seen, true ), implode( ',', $seen ) );
+
+ $seen = fault_run( $oracles, 'reader-substring-composition' );
+ check( 'fault target reader-substring-composition exposes local-reader mismatches', in_array( 'reader-composition-mismatch', $seen, true ), implode( ',', $seen ) );
+
+ $seen = fault_run( $oracles, 'reader-null-mutates-match-length', 'a&bogus;b' );
+ check( 'fault target reader-null-mutates-match-length exposes null match-length mutation', in_array( 'reader-mutated-match-length-on-null', $seen, true ), implode( ',', $seen ) );
+
+ $seen = fault_run( $oracles, 'reader-non-amp-match', 'a&b' );
+ check( 'fault target reader-non-amp-match exposes non-amp reader matches', in_array( 'reader-non-amp-match', $seen, true ), implode( ',', $seen ) );
+
+ $seen = fault_run( $oracles, 'reader-gapless-drop-span', 'a&b' );
+ check( 'fault target reader-gapless-drop-span exposes non-gapless reader walks', in_array( 'reader-walk-not-gapless', $seen, true ), implode( ',', $seen ) );
+ // NOTE(review): the 'ab' payloads in the two numeric checks below, and in the later "skips numeric references" check, contain no numeric reference as written; presumably a reference was lost to entity decoding before this text was captured — confirm against the original file.
+ $seen = fault_run( $oracles, 'numeric-invalid-not-replacement', 'ab' );
+ check( 'fault target numeric-invalid-not-replacement exposes invalid numeric replacements', in_array( 'numeric-invalid-not-replacement', $seen, true ), implode( ',', $seen ) );
+
+ $seen = fault_run( $oracles, 'numeric-c1-not-remapped', 'ab' );
+ check( 'fault target numeric-c1-not-remapped exposes skipped numeric C1 remaps', in_array( 'numeric-c1-not-remapped', $seen, true ), implode( ',', $seen ) );
+ // This check uses the oracle-free runner with the two raw bytes 0x80 and 0x9F as payload.
+ $seen = fault_run_without_oracle( $oracles, 'raw-c1-not-pass-through', "\x80\x9F" );
+ check( 'fault target raw-c1-not-pass-through exposes raw C1 byte rewrites', in_array( 'raw-c1-not-pass-through', $seen, true ), implode( ',', $seen ) );
+ // The fourth fault_run() argument ('text' or 'attribute') is presumably the decoding context — confirm against fault_run().
+ $seen = fault_run( $oracles, 'text-secondary-oracle', 'a&b', 'text' );
+ check( 'fault target text-secondary-oracle exposes secondary text-oracle mismatches', in_array( 'text-secondary-oracle-mismatch', $seen, true ), implode( ',', $seen ) );
+ // The next two checks are negative: they pass only when the secondary-oracle mismatch is NOT reported.
+ $seen = fault_run( $oracles, 'text-secondary-oracle', 'ab', 'text' );
+ check( 'secondary text oracle skips numeric references unsupported by html_entity_decode', ! in_array( 'text-secondary-oracle-mismatch', $seen, true ), implode( ',', $seen ) );
+ // NOTE(review): the payload below shows a raw 'Æ' although the label describes an unknown name with a legacy prefix; presumably an encoded reference was decoded in transit — confirm.
+ $seen = fault_run( $oracles, 'text-secondary-oracle', 'aÆlater;b', 'text' );
+ check( 'secondary text oracle skips unknown names with legacy prefixes', ! in_array( 'text-secondary-oracle-mismatch', $seen, true ), implode( ',', $seen ) );
+ // NOTE(review): the "nested ampersand reference" payloads below are a bare '&'; presumably an encoded (possibly doubly encoded) reference was decoded in transit — confirm.
+ $single_level_failures = $checks->run( 'both', '&' );
+ check( 'single-level decode keeps nested ampersand reference literal', array() === $single_level_failures, json_encode( $single_level_failures ) );
+
+ $seen = fault_run( $oracles, 'single-level-overdecode', '&', 'text' );
+ check( 'fault target single-level-overdecode exposes text double decodes', in_array( 'single-level-decode-overdecoded', $seen, true ), implode( ',', $seen ) );
+
+ $seen = fault_run( $oracles, 'single-level-overdecode', '&', 'attribute' );
+ check( 'fault target single-level-overdecode exposes attribute double decodes', in_array( 'single-level-decode-overdecoded', $seen, true ), implode( ',', $seen ) );
+ // Scenario: the primary oracle and the decode_text target both return the same fixed wrong string, so only the secondary text oracle can notice.
+ $wrong_text = '!a&b';
+ $wrong_primary_oracles = new class( $wrong_text ) extends Oracles {
+ private string $wrong_text;
+ // NOTE(review): this constructor does not call a parent constructor; confirm Oracles needs no construction-time setup.
+ public function __construct( string $wrong_text ) {
+ $this->wrong_text = $wrong_text;
+ }
+ // Returns the fixed wrong text for the 'text' context and defers to Oracles::decode() for every other context.
+ public function decode( string $context, string $payload ): string {
+ if ( 'text' === $context ) {
+ return $this->wrong_text;
+ }
+
+ return parent::decode( $context, $payload );
+ }
+ };
+ $wrong_agreement_checks = new Checks(
+ $wrong_primary_oracles,
+ array_merge(
+ $real_targets,
+ array(
+ 'decode_text' => static fn( string $text ): string => $wrong_text, // Arrow function captures $wrong_text by value.
+ )
+ )
+ );
+ $wrong_agreement_seen = array();
+ foreach ( $wrong_agreement_checks->run( 'text', 'a&b' ) as $failure ) {
+ $wrong_agreement_seen[ $failure['check'] ] = true; // Collect failing check names as a set.
+ }
+ check(
+ 'secondary text oracle catches primary and target agreement on wrong text',
+ isset( $wrong_agreement_seen['text-secondary-oracle-mismatch'] ) &&
+ ! isset( $wrong_agreement_seen['decode-mismatch'] ), // Primary oracle and target agree, so no primary mismatch is expected.
+ implode( ',', array_keys( $wrong_agreement_seen ) )
+ );
+ // Broken target: attribute decoding prepends '!' to its output whenever the input contains no '&'.
+ $seen = broken_run(
+ $oracles,
+ $real_targets,
+ array(
+ 'decode_attribute' => static function ( string $text ): string {
+ $decoded = \WP_HTML_Decoder::decode_attribute( $text );
+ return str_contains( $text, '&' ) ? $decoded : '!' . $decoded;
+ },
+ )
+ );
+ check( 'catches attribute no-amp identity violations in oracle mode', in_array( 'attribute-without-ampersand-not-identity', $seen, true ), implode( ',', $seen ) );
+ // The same check name is expected from the built-in fault target, using an ampersand-free payload.
+ $seen = fault_run( $oracles, 'attribute-no-amp-identity', 'plain' );
+ check( 'fault target attribute-no-amp-identity exposes attribute no-amp identity violations', in_array( 'attribute-without-ampersand-not-identity', $seen, true ), implode( ',', $seen ) );
+ // Broken attribute_starts_with targets. First: ignores case sensitivity, and returns true for an empty search or whenever the raw haystack is shorter (in bytes) than the search.
+ $seen = broken_run(
+ $oracles,
+ $real_targets,
+ array(
+ 'attribute_starts_with' => static function ( string $haystack, string $search, string $case_sensitivity ): bool {
+ unset( $case_sensitivity );
+ return '' === $search || strlen( $haystack ) < strlen( $search ) || str_starts_with( \WP_HTML_Decoder::decode_attribute( $haystack ), $search );
+ },
+ )
+ );
+ check( 'catches partial-prefix attribute matcher', in_array( 'attribute-starts-with-mismatch', $seen, true ), implode( ',', $seen ) );
+ // Broken target: returns false for the search "<\xE2" on one specific haystack prefix. NOTE(review): that prefix literal is a raw '<' plus U+20D2; presumably it was the encoded reference for that pair (e.g. `&nvlt;`) before entity decoding — confirm. The same applies to the '<⃒tail' payload at the end of this section.
+ $seen = broken_run(
+ $oracles,
+ $real_targets,
+ array(
+ 'attribute_starts_with' => static function ( string $haystack, string $search, string $case_sensitivity ) use ( $real_targets ): bool {
+ if ( str_starts_with( $haystack, '<⃒' ) && "<\xE2" === $search ) {
+ return false;
+ }
+ return $real_targets['attribute_starts_with']( $haystack, $search, $case_sensitivity );
+ },
+ )
+ );
+ check( 'catches partial multi-code-point attribute matcher', in_array( 'attribute-starts-with-mismatch', $seen, true ), implode( ',', $seen ) );
+ // Broken target: always rejects the search 'jav', regardless of haystack or case mode.
+ $seen = broken_run(
+ $oracles,
+ $real_targets,
+ array(
+ 'attribute_starts_with' => static function ( string $haystack, string $search, string $case_sensitivity ) use ( $real_targets ): bool {
+ if ( 'jav' === $search ) {
+ return false;
+ }
+ return $real_targets['attribute_starts_with']( $haystack, $search, $case_sensitivity );
+ },
+ )
+ );
+ check( 'catches attribute_starts_with prefix monotonicity violations', in_array( 'attribute-starts-with-prefix-monotonicity', $seen, true ), implode( ',', $seen ) );
+ // Broken target: always accepts any search that ends in the byte 0x7F.
+ $seen = broken_run(
+ $oracles,
+ $real_targets,
+ array(
+ 'attribute_starts_with' => static function ( string $haystack, string $search, string $case_sensitivity ) use ( $real_targets ): bool {
+ if ( str_ends_with( $search, "\x7F" ) ) {
+ return true;
+ }
+ return $real_targets['attribute_starts_with']( $haystack, $search, $case_sensitivity );
+ },
+ )
+ );
+ check( 'catches attribute_starts_with extension monotonicity violations', in_array( 'attribute-starts-with-extension-monotonicity', $seen, true ), implode( ',', $seen ) );
+ // Broken target: rejects the search 'jav' only in 'ascii-case-insensitive' mode.
+ $seen = broken_run(
+ $oracles,
+ $real_targets,
+ array(
+ 'attribute_starts_with' => static function ( string $haystack, string $search, string $case_sensitivity ) use ( $real_targets ): bool {
+ if ( 'ascii-case-insensitive' === $case_sensitivity && 'jav' === $search ) {
+ return false;
+ }
+ return $real_targets['attribute_starts_with']( $haystack, $search, $case_sensitivity );
+ },
+ )
+ );
+ check( 'catches attribute_starts_with case monotonicity violations', in_array( 'attribute-starts-with-case-monotonicity', $seen, true ), implode( ',', $seen ) );
+ // Map of built-in fault target name => check name that fault is expected to trigger.
+ $attribute_faults = array(
+ 'attribute-prefix-monotonicity' => 'attribute-starts-with-prefix-monotonicity',
+ 'attribute-extension-monotonicity' => 'attribute-starts-with-extension-monotonicity',
+ 'attribute-case-monotonicity' => 'attribute-starts-with-case-monotonicity',
+ );
+ foreach ( $attribute_faults as $fault => $expected_check ) {
+ $seen = fault_run( $oracles, $fault );
+ check( "fault target {$fault} exposes {$expected_check}", in_array( $expected_check, $seen, true ), implode( ',', $seen ) );
+ }
+
+ $seen = fault_run( $oracles, 'attribute-multicodepoint-prefix', '<⃒tail' );
+ check( 'fault target attribute-multicodepoint-prefix exposes partial replacement prefixes', in_array( 'attribute-starts-with-mismatch', $seen, true ), implode( ',', $seen ) );
+ // Oracle-free run with both decoders stripping NUL bytes from their output; both no-ampersand identity checks must be reported.
+ $seen = broken_oracle_free_run(
+ $oracles,
+ $real_targets,
+ array(
+ 'decode_text' => static fn( string $text ): string => str_replace( "\x00", '', \WP_HTML_Decoder::decode_text_node( $text ) ),
+ 'decode_attribute' => static fn( string $text ): string => str_replace( "\x00", '', \WP_HTML_Decoder::decode_attribute( $text ) ),
+ )
+ );
+ check(
+ 'catches oracle-free no-amp byte identity violations',
+ in_array( 'text-without-ampersand-not-identity', $seen, true ) &&
+ in_array( 'attribute-without-ampersand-not-identity', $seen, true ),
+ implode( ',', $seen )
+ );
+ // Load the named-reference list and require more than 2000 entries.
+ $names = Bootstrap::named_reference_names();
+ check( 'uses generated named-reference map', count( $names ) > 2000, (string) count( $names ) );
+ // Two generators built from the same Prng seed string must produce identical output.
+ $a = ( new Generator( new Prng( '7:3' ), 4096, $names ) )->generate();
+ $b = ( new Generator( new Prng( '7:3' ), 4096, $names ) )->generate();
+ check( 'generator deterministic for (seed, case)', $a === $b );
+ // Order independence: for 80 seeds, a generator fed a name list and one fed the reversed list must agree for generate(), generate_name_sweep() and generate_legacy_follower_sweep(). The loop stops at the first disagreement.
+ $custom_names = array( 'zz;', 'amp;', 'LongName;', 'abc', 'copy', 'z' );
+ $reversed_custom_names = array_reverse( $custom_names );
+ $order_stable_error = '';
+ for ( $i = 0; $i < 80; $i++ ) {
+ $ordered_generator = new Generator( new Prng( "order-stable:{$i}" ), 4096, $custom_names );
+ $reversed_generator = new Generator( new Prng( "order-stable:{$i}" ), 4096, $reversed_custom_names );
+ if ( $ordered_generator->generate() !== $reversed_generator->generate() ) {
+ $order_stable_error = "weighted case {$i}";
+ break;
+ }
+
+ $ordered_sweep = new Generator( new Prng( "order-stable-name:{$i}" ), 4096, $custom_names );
+ $reversed_sweep = new Generator( new Prng( "order-stable-name:{$i}" ), 4096, $reversed_custom_names );
+ if ( $ordered_sweep->generate_name_sweep( $i ) !== $reversed_sweep->generate_name_sweep( $i ) ) {
+ $order_stable_error = "name sweep case {$i}";
+ break;
+ }
+
+ $ordered_legacy = new Generator( new Prng( "order-stable-legacy:{$i}" ), 4096, $custom_names );
+ $reversed_legacy = new Generator( new Prng( "order-stable-legacy:{$i}" ), 4096, $reversed_custom_names );
+ if ( $ordered_legacy->generate_legacy_follower_sweep( $i ) !== $reversed_legacy->generate_legacy_follower_sweep( $i ) ) {
+ $order_stable_error = "legacy follower case {$i}";
+ break;
+ }
+ }
+ check( 'generator sorts injected named-reference lists deterministically', '' === $order_stable_error, $order_stable_error );
+ // Name sweep: walk one full period and recompute every expected payload locally. Period = base names x 2 (without/with ';') x followers.
+ $name_sweep_generator = new Generator( new Prng( 'name-sweep' ), 4096, $names );
+ $name_sweep_base_names = name_sweep_base_names( $names );
+ $name_sweep_followers = array( '', 'x', 'X', '0', '=', '-', ' ', '/', "\u{00E9}" );
+ $name_sweep_period = count( $name_sweep_base_names ) * 2 * count( $name_sweep_followers );
+ $name_sweep_mismatch = '';
+ $name_sweep_contexts = array();
+ $name_sweep_strategies = array();
+ $name_sweep_unsafe = 0;
+ for ( $i = 0; $i < $name_sweep_period; $i++ ) {
+ $generated = $name_sweep_generator->generate_name_sweep( $i );
+ $name_sweep_contexts[ $generated['context'] ] = true;
+ $name_sweep_strategies[ $generated['strategy'] ] = true;
+ if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) {
+ ++$name_sweep_unsafe;
+ }
+ // Only the first mismatch is recorded. The first half of the variants omits ';', the second half appends it.
+ if ( '' === $name_sweep_mismatch ) {
+ $variant = $i % ( 2 * count( $name_sweep_followers ) );
+ $expected = '&' . $name_sweep_base_names[ intdiv( $i, 2 * count( $name_sweep_followers ) ) ] .
+ ( $variant >= count( $name_sweep_followers ) ? ';' : '' ) .
+ $name_sweep_followers[ $variant % count( $name_sweep_followers ) ];
+ if ( $generated['payload'] !== $expected ) {
+ $name_sweep_mismatch = "case {$i}: expected " . bin2hex( $expected ) . ' got ' . bin2hex( $generated['payload'] );
+ }
+ }
+ }
+ check( 'name-sweep period covers every base/semicolon/follower case', $name_sweep_generator->name_sweep_period() === $name_sweep_period && $name_sweep_period > count( $names ), (string) $name_sweep_period );
+ check( 'name-sweep generator maps cases deterministically', '' === $name_sweep_mismatch, $name_sweep_mismatch );
+ check( 'name-sweep cases run both contexts', array( 'both' ) === array_keys( $name_sweep_contexts ), implode( ',', array_keys( $name_sweep_contexts ) ) );
+ check( 'name-sweep uses one strategy label', array( 'name-sweep' ) === array_keys( $name_sweep_strategies ), implode( ',', array_keys( $name_sweep_strategies ) ) );
+ check( 'name-sweep payloads are oracle-safe', 0 === $name_sweep_unsafe, (string) $name_sweep_unsafe );
+ // Legacy follower sweep: legacy names are the entries of $names that do not end in ';'. Period = legacy names x followers; each expected payload is '&' . name . follower.
+ $legacy_follower_generator = new Generator( new Prng( 'legacy-follower-sweep' ), 4096, $names );
+ $legacy_names = array_values( array_filter( $names, static fn( string $name ): bool => ! str_ends_with( $name, ';' ) ) );
+ $legacy_followers = legacy_follower_sweep_followers();
+ $legacy_period = count( $legacy_names ) * count( $legacy_followers );
+ $legacy_mismatch = '';
+ $legacy_contexts = array();
+ $legacy_strategies = array();
+ $legacy_unsafe = 0;
+ $legacy_seen_names = array();
+ $legacy_seen_followers = array();
+ $legacy_ascii_followers = array();
+ $legacy_utf8_leads = array();
+ $legacy_utf8_continuations = array();
+ for ( $i = 0; $i < $legacy_period; $i++ ) {
+ $generated = $legacy_follower_generator->generate_legacy_follower_sweep( $i );
+ $name = $legacy_names[ intdiv( $i, count( $legacy_followers ) ) ];
+ $follower = $legacy_followers[ $i % count( $legacy_followers ) ];
+ $expected = '&' . $name . $follower;
+
+ $legacy_contexts[ $generated['context'] ] = true;
+ $legacy_strategies[ $generated['strategy'] ] = true;
+ $legacy_seen_names[ $name ] = true;
+ $legacy_seen_followers[ $follower ] = true;
+ if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) {
+ ++$legacy_unsafe;
+ }
+ if ( '' === $legacy_mismatch && $expected !== $generated['payload'] ) {
+ $legacy_mismatch = "case {$i}: expected " . bin2hex( $expected ) . ' got ' . bin2hex( $generated['payload'] );
+ }
+ // One-byte followers are recorded by byte value. Longer followers contribute their first byte as a lead byte and the remaining bytes as continuation bytes.
+ if ( 1 === strlen( $follower ) ) {
+ $legacy_ascii_followers[ ord( $follower ) ] = true;
+ } else {
+ $legacy_utf8_leads[ ord( $follower[0] ) ] = true;
+ for ( $j = 1; $j < strlen( $follower ); $j++ ) {
+ $legacy_utf8_continuations[ ord( $follower[ $j ] ) ] = true;
+ }
+ }
+ }
+ $expected_ascii_followers = array_values(
+ array_filter(
+ range( 1, 0x7F ),
+ static fn( int $byte ): bool => ! in_array( $byte, array( 0x0D, 0x22, 0x3C ), true ) // Every byte 0x01-0x7F except CR, '"' and '<'.
+ )
+ );
+ check( 'legacy-follower period covers every legacy name and follower', $legacy_follower_generator->legacy_follower_sweep_period() === $legacy_period && count( $legacy_seen_names ) === count( $legacy_names ) && count( $legacy_seen_followers ) === count( $legacy_followers ), (string) $legacy_period );
+ check( 'legacy-follower generator maps cases deterministically', '' === $legacy_mismatch, $legacy_mismatch );
+ check( 'legacy-follower cases run both contexts', array( 'both' ) === array_keys( $legacy_contexts ), implode( ',', array_keys( $legacy_contexts ) ) );
+ check( 'legacy-follower uses one strategy label', array( 'legacy-follower-sweep' ) === array_keys( $legacy_strategies ), implode( ',', array_keys( $legacy_strategies ) ) );
+ check( 'legacy-follower payloads are oracle-safe', 0 === $legacy_unsafe, (string) $legacy_unsafe );
+ check( 'legacy-follower covers every oracle-safe ASCII follower byte', array() === array_diff( $expected_ascii_followers, array_keys( $legacy_ascii_followers ) ), implode( ',', array_keys( $legacy_ascii_followers ) ) );
+ check( 'legacy-follower covers valid UTF-8 lead bytes', array() === array_diff( range( 0xC2, 0xF4 ), array_keys( $legacy_utf8_leads ) ), implode( ',', array_map( static fn( int $byte ): string => dechex( $byte ), array_keys( $legacy_utf8_leads ) ) ) );
+ check( 'legacy-follower covers UTF-8 continuation bytes', array() === array_diff( range( 0x80, 0xBF ), array_keys( $legacy_utf8_continuations ) ), implode( ',', array_map( static fn( int $byte ): string => dechex( $byte ), array_keys( $legacy_utf8_continuations ) ) ) );
+ // Prefix-family sweep: each case is a (reference, split, follower) triple, and the expected payload is the first `split` bytes of the reference followed by the follower.
+ $prefix_family_generator = new Generator( new Prng( 'prefix-family-sweep' ), 4096, $names );
+ $prefix_family_cases = prefix_family_sweep_cases( $name_sweep_base_names );
+ $prefix_family_mismatch = '';
+ $prefix_family_contexts = array();
+ $prefix_family_strategies = array();
+ $prefix_family_unsafe = 0;
+ $prefix_family_references = array();
+ $prefix_family_split_keys = array();
+ $prefix_family_followers = array();
+ for ( $i = 0; $i < count( $prefix_family_cases ); $i++ ) {
+ $generated = $prefix_family_generator->generate_prefix_family_sweep( $i );
+ $case = $prefix_family_cases[ $i ];
+ $expected = substr( $case['reference'], 0, $case['split'] ) . $case['follower'];
+
+ $prefix_family_contexts[ $generated['context'] ] = true;
+ $prefix_family_strategies[ $generated['strategy'] ] = true;
+ $prefix_family_references[ $case['reference'] ] = true;
+ $prefix_family_split_keys[ $case['reference'] . ':' . $case['split'] ] = true;
+ $prefix_family_followers[ $case['follower'] ] = true;
+ if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) {
+ ++$prefix_family_unsafe;
+ }
+ if ( '' === $prefix_family_mismatch && $expected !== $generated['payload'] ) {
+ $prefix_family_mismatch = "case {$i}: expected " . bin2hex( $expected ) . ' got ' . bin2hex( $generated['payload'] );
+ }
+ }
+ $expected_prefix_split_count = 0;
+ foreach ( array_keys( $prefix_family_references ) as $reference ) {
+ $expected_prefix_split_count += strlen( $reference ) - 1; // One expected split per byte of the reference, minus one.
+ }
+ check(
+ 'prefix-family period covers every reference split and follower',
+ $prefix_family_generator->prefix_family_sweep_period() === count( $prefix_family_cases ) &&
+ array() === array_diff(
+ array_map( static fn( string $reference ): string => '&' . $reference, prefix_family_sweep_references() ), // Case references carry a leading '&'; the helper's list does not.
+ array_keys( $prefix_family_references )
+ ) &&
+ count( $prefix_family_references ) === count( prefix_family_sweep_references() ) &&
+ count( $prefix_family_split_keys ) === $expected_prefix_split_count &&
+ count( $prefix_family_followers ) === count( prefix_family_sweep_followers() ),
+ (string) count( $prefix_family_cases ) . ' ' . implode( ',', array_keys( $prefix_family_references ) )
+ );
+ check( 'prefix-family generator maps cases deterministically', '' === $prefix_family_mismatch, $prefix_family_mismatch );
+ check( 'prefix-family cases run both contexts', array( 'both' ) === array_keys( $prefix_family_contexts ), implode( ',', array_keys( $prefix_family_contexts ) ) );
+ check( 'prefix-family uses one strategy label', array( 'prefix-family-sweep' ) === array_keys( $prefix_family_strategies ), implode( ',', array_keys( $prefix_family_strategies ) ) );
+ check( 'prefix-family payloads are oracle-safe', 0 === $prefix_family_unsafe, (string) $prefix_family_unsafe );
+ check( 'prefix-family covers expected ambiguous followers', array() === array_diff( prefix_family_sweep_followers(), array_keys( $prefix_family_followers ) ), implode( ',', array_keys( $prefix_family_followers ) ) );
+ // Token-map sweep: compare generated payloads with token_map_sweep_cases() and validate each case shape against the structure returned by Bootstrap::named_reference_structure().
+ $token_map_generator = new Generator( new Prng( 'token-map-sweep' ), 4096, $names );
+ $token_map_cases = token_map_sweep_cases();
+ $token_map_structure = Bootstrap::named_reference_structure();
+ $token_map_minimal_large_names = array_values(
+ array_filter(
+ $token_map_structure['large_names'],
+ static fn( string $name ): bool => strlen( $name ) === $token_map_structure['key_length'] + 1 // Large names exactly one byte longer than the key length.
+ )
+ );
+ $token_map_large_name_set = array_fill_keys( $token_map_structure['large_names'], true );
+ $token_map_small_name_set = array_fill_keys( $token_map_structure['small_names'], true );
+ $token_map_mismatch = '';
+ $token_map_contexts = array();
+ $token_map_strategies = array();
+ $token_map_shapes = array();
+ $token_map_prefixes = array();
+ $token_map_small_exact_names = array();
+ $token_map_small_extended_names = array();
+ $token_map_large_exact_names = array();
+ $token_map_large_extended_names = array();
+ $token_map_divergence_errors = array();
+ $token_map_unsafe = 0;
+ $token_map_fault_case_index = null;
+ for ( $i = 0; $i < count( $token_map_cases ); $i++ ) {
+ $case = $token_map_cases[ $i ];
+ $generated = $token_map_generator->generate_token_map_sweep( $i );
+ $shape = $case['shape'];
+
+ $token_map_contexts[ $generated['context'] ] = true;
+ $token_map_strategies[ $generated['strategy'] ] = true;
+ $token_map_shapes[ $shape ] = true;
+ if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) {
+ ++$token_map_unsafe;
+ }
+ if ( '' === $token_map_mismatch && $case['payload'] !== $generated['payload'] ) {
+ $token_map_mismatch = "case {$i}: expected " . bin2hex( $case['payload'] ) . ' got ' . bin2hex( $generated['payload'] );
+ }
+ // Divergent probes: gather the first byte after the prefix for every real large name under that prefix; the probe must use a byte outside that set.
+ if ( 'large-prefix-divergent' === $shape ) {
+ $prefix = $case['prefix'] ?? '';
+ $token_map_prefixes[ $prefix ] = true;
+ $payload_name = substr( $case['payload'], 1 );
+ $rest = substr( $payload_name, strlen( $prefix ) );
+ $first_rest = '' === $rest ? '' : $rest[0];
+ $used_first_rest_chars = array();
+ foreach ( $token_map_structure['large_names_by_prefix'][ $prefix ] ?? array() as $name ) {
+ $name_rest = substr( $name, strlen( $prefix ) );
+ if ( '' !== $name_rest ) {
+ $used_first_rest_chars[ $name_rest[0] ] = true;
+ }
+ }
+ // A probe is recorded as an error when any of the conditions below holds.
+ if (
+ strlen( $prefix ) !== $token_map_structure['key_length'] ||
+ ! str_starts_with( $case['payload'], '&' . $prefix ) ||
+ ! str_ends_with( $case['payload'], ';' ) ||
+ isset( $token_map_large_name_set[ $payload_name ] ) ||
+ isset( $token_map_small_name_set[ $payload_name ] ) ||
+ '' === $first_rest ||
+ isset( $used_first_rest_chars[ $first_rest ] )
+ ) {
+ $token_map_divergence_errors[] = "{$i}:" . bin2hex( $case['payload'] );
+ }
+ } elseif ( 'small-boundary-exact' === $shape ) {
+ $token_map_small_exact_names[ $case['name'] ?? '' ] = true;
+ } elseif ( 'small-boundary-extended' === $shape ) {
+ $token_map_small_extended_names[ $case['name'] ?? '' ] = true;
+ if ( null === $token_map_fault_case_index ) {
+ $token_map_fault_case_index = $i; // Remember the first 'small-boundary-extended' case index.
+ }
+ } elseif ( 'large-boundary-exact' === $shape ) {
+ $token_map_large_exact_names[ $case['name'] ?? '' ] = true;
+ } elseif ( 'large-boundary-extended' === $shape ) {
+ $token_map_large_extended_names[ $case['name'] ?? '' ] = true;
+ }
+ }
+ $expected_token_map_shapes = array(
+ 'large-prefix-divergent',
+ 'small-boundary-exact',
+ 'small-boundary-extended',
+ 'large-boundary-exact',
+ 'large-boundary-extended',
+ );
+ check(
+ 'token-map structure exposes two-byte large-word prefixes',
+ 2 === $token_map_structure['key_length'] &&
+ count( $token_map_structure['group_prefixes'] ) > 0 &&
+ count( $token_map_structure['group_prefixes'] ) === count( $token_map_structure['large_names_by_prefix'] ),
+ json_encode(
+ array(
+ 'key_length' => $token_map_structure['key_length'],
+ 'prefixes' => count( $token_map_structure['group_prefixes'] ),
+ )
+ )
+ );
+ check(
+ 'token-map period covers prefix divergences and boundary names',
+ $token_map_generator->token_map_period() === count( $token_map_cases ) &&
+ array() === array_diff( $token_map_structure['group_prefixes'], array_keys( $token_map_prefixes ) ) &&
+ count( $token_map_prefixes ) === count( $token_map_structure['group_prefixes'] ) &&
+ array() === array_diff( $token_map_structure['small_names'], array_keys( $token_map_small_exact_names ) ) &&
+ array() === array_diff( $token_map_structure['small_names'], array_keys( $token_map_small_extended_names ) ) &&
+ array() === array_diff( $token_map_minimal_large_names, array_keys( $token_map_large_exact_names ) ) &&
+ array() === array_diff( $token_map_minimal_large_names, array_keys( $token_map_large_extended_names ) ),
+ (string) count( $token_map_cases )
+ );
+ check( 'token-map generator maps cases deterministically', '' === $token_map_mismatch, $token_map_mismatch );
+ check( 'token-map cases run both contexts', array( 'both' ) === array_keys( $token_map_contexts ), implode( ',', array_keys( $token_map_contexts ) ) );
+ check( 'token-map uses one strategy label', array( 'token-map-structure-sweep' ) === array_keys( $token_map_strategies ), implode( ',', array_keys( $token_map_strategies ) ) );
+ check( 'token-map payloads are oracle-safe', 0 === $token_map_unsafe, (string) $token_map_unsafe );
+ check(
+ 'token-map emits expected structure-aware shapes',
+ array() === array_diff( $expected_token_map_shapes, array_keys( $token_map_shapes ) ) &&
+ array() === array_diff( array_keys( $token_map_shapes ), $expected_token_map_shapes ), // Both directions: no shape missing and none unexpected.
+ implode( ',', array_keys( $token_map_shapes ) )
+ );
+ check( 'token-map large-prefix probes diverge after the shared map prefix', array() === $token_map_divergence_errors, implode( ',', $token_map_divergence_errors ) );
+ check( 'token-map has semicolonless boundary fault case', null !== $token_map_fault_case_index, json_encode( $token_map_cases ) );
+ // Numeric boundary sweep: compare generated payloads with numeric_boundary_sweep_cases(), and decode each one with the real text-node decoder to test the digit-count boundaries.
+ $numeric_boundary_generator = new Generator( new Prng( 'numeric-boundary-sweep' ), 4096, $names );
+ $numeric_boundary_cases = numeric_boundary_sweep_cases();
+ $numeric_boundary_mismatch = '';
+ $numeric_boundary_contexts = array();
+ $numeric_boundary_strategies = array();
+ $numeric_boundary_unsafe = 0;
+ $numeric_boundary_shapes = array();
+ $numeric_boundary_mixed_hex = false;
+ $numeric_boundary_exact_max_replacements = array();
+ $numeric_boundary_overflow_non_replacements = array();
+ for ( $i = 0; $i < count( $numeric_boundary_cases ); $i++ ) {
+ $generated = $numeric_boundary_generator->generate_numeric_boundary_sweep( $i );
+ $expected = $numeric_boundary_cases[ $i ];
+ $shape = numeric_boundary_shape( $generated['payload'] );
+ $decoded = \WP_HTML_Decoder::decode_text_node( $generated['payload'] );
+
+ $numeric_boundary_contexts[ $generated['context'] ] = true;
+ $numeric_boundary_strategies[ $generated['strategy'] ] = true;
+ if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) {
+ ++$numeric_boundary_unsafe;
+ }
+ if ( '' === $numeric_boundary_mismatch && $expected !== $generated['payload'] ) {
+ $numeric_boundary_mismatch = "case {$i}: expected {$expected} got {$generated['payload']}";
+ }
+ $numeric_boundary_shapes[] = $shape['base'] . ':' . $shape['significant_digits'] . ':' . ( $shape['leading_zero'] ? 'zero' : 'plain' ) . ':' . ( $shape['semicolon'] ? 'semi' : 'nosemi' );
+ $numeric_boundary_mixed_hex = $numeric_boundary_mixed_hex || $shape['mixed_hex'];
+ if ( ( 'decimal' === $shape['base'] && 7 === $shape['significant_digits'] ) || ( 'hex' === $shape['base'] && 6 === $shape['significant_digits'] ) ) { // Exact-max digit counts: a whole-output U+FFFD is recorded as an error.
+ if ( "\u{FFFD}" === $decoded ) {
+ $numeric_boundary_exact_max_replacements[] = $generated['payload'];
+ }
+ } elseif ( ( 'decimal' === $shape['base'] && 8 === $shape['significant_digits'] ) || ( 'hex' === $shape['base'] && 7 === $shape['significant_digits'] ) ) { // One digit past the max: anything other than U+FFFD is recorded as an error.
+ if ( "\u{FFFD}" !== $decoded ) {
+ $numeric_boundary_overflow_non_replacements[] = $generated['payload'] . ':' . bin2hex( $decoded );
+ }
+ }
+ }
+ $expected_numeric_boundary_shapes = array();
+ foreach ( array( 'decimal' => 7, 'hex' => 6 ) as $base => $max_digits ) { // Expected shape labels: base x {max, max + 1} digits x leading-zero flag x semicolon flag.
+ foreach ( array( $max_digits, $max_digits + 1 ) as $digit_count ) {
+ foreach ( array( 'plain', 'zero' ) as $zero ) {
+ foreach ( array( 'nosemi', 'semi' ) as $semicolon ) {
+ $expected_numeric_boundary_shapes[] = "{$base}:{$digit_count}:{$zero}:{$semicolon}";
+ }
+ }
+ }
+ }
+ check( 'numeric-boundary period covers digit count, leading zero, and semicolon variants', $numeric_boundary_generator->numeric_boundary_sweep_period() === count( $numeric_boundary_cases ) && array() === array_diff( $expected_numeric_boundary_shapes, array_unique( $numeric_boundary_shapes ) ), implode( ',', array_unique( $numeric_boundary_shapes ) ) );
+ check( 'numeric-boundary period keeps decimal and hex casing variants distinct', 32 === count( $numeric_boundary_cases ), (string) count( $numeric_boundary_cases ) );
+ check( 'numeric-boundary exact-max digit cases stay in Unicode range', array() === $numeric_boundary_exact_max_replacements, implode( ',', $numeric_boundary_exact_max_replacements ) );
+ check( 'numeric-boundary max-plus-one digit cases decode as invalid', array() === $numeric_boundary_overflow_non_replacements, implode( ',', $numeric_boundary_overflow_non_replacements ) );
+ check( 'numeric-boundary generator maps cases deterministically', '' === $numeric_boundary_mismatch, $numeric_boundary_mismatch );
+ check( 'numeric-boundary cases run both contexts', array( 'both' ) === array_keys( $numeric_boundary_contexts ), implode( ',', array_keys( $numeric_boundary_contexts ) ) );
+ check( 'numeric-boundary uses one strategy label', array( 'numeric-boundary-sweep' ) === array_keys( $numeric_boundary_strategies ), implode( ',', array_keys( $numeric_boundary_strategies ) ) );
+ check( 'numeric-boundary payloads are oracle-safe', 0 === $numeric_boundary_unsafe, (string) $numeric_boundary_unsafe );
+ check( 'numeric-boundary emits mixed-case hex digits', $numeric_boundary_mixed_hex, implode( ',', $numeric_boundary_cases ) );
+ // Corpus mutation: run 600 cases, each on a fresh Generator seeded "1:{$i}", and collect strategies, contexts, distinct payloads and the count of oracle-unsafe payloads.
+ $corpus_period_generator = new Generator( new Prng( 'corpus-period' ), 4096, $names ); // Used below only to read corpus_period().
+ $corpus_strategies = array();
+ $corpus_contexts = array();
+ $corpus_payloads = array();
+ $corpus_unsafe = 0;
+ for ( $i = 0; $i < 600; $i++ ) {
+ $generated = ( new Generator( new Prng( "1:{$i}" ), 4096, $names ) )->generate_corpus_mutation( $i );
+ $corpus_strategies[ $generated['strategy'] ] = true;
+ $corpus_contexts[ $generated['context'] ] = true;
+ $corpus_payloads[ $generated['payload'] ] = true;
+ if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) {
+ ++$corpus_unsafe;
+ }
+ }
+ $seen_corpus_strategies = array_keys( $corpus_strategies );
+ sort( $seen_corpus_strategies ); // Sorted so the strict === comparison with expected_corpus_strategies() does not depend on discovery order; presumably that helper returns a sorted list — confirm.
+ $corpus_seed_payloads = corpus_seed_payloads();
+ $required_corpus_payloads = array( // NOTE(review): these entries show already-decoded characters ('>', '£', '∏', 'Æ') although the check label calls them entity vectors; presumably the literals were encoded references before entity decoding — confirm against the original file.
+ 'FOO>BAR',
+ 'ZZ>9YY',
+ 'ZZ>aYY',
+ 'ZZ£_id=23',
+ 'ZZ∏_id=23',
+ 'ZZÆ=',
+ );
+ check( 'corpus mutation seed corpus includes retained and external vectors', $corpus_period_generator->corpus_period() >= 40, (string) $corpus_period_generator->corpus_period() );
+ check( 'corpus seed retains html5lib text and attribute entity vectors', array() === array_diff( $required_corpus_payloads, $corpus_seed_payloads ), implode( ',', array_diff( $required_corpus_payloads, $corpus_seed_payloads ) ) );
+ check( 'corpus mutation generator emits every mutation strategy', expected_corpus_strategies() === $seen_corpus_strategies, implode( ',', $seen_corpus_strategies ) );
+ check( 'corpus mutation cases run both contexts', array( 'both' ) === array_keys( $corpus_contexts ), implode( ',', array_keys( $corpus_contexts ) ) );
+ check( 'corpus mutation payloads are oracle-safe', 0 === $corpus_unsafe, (string) $corpus_unsafe );
+ check( 'corpus mutation diversifies retained payload shapes', count( $corpus_payloads ) > 300, (string) count( $corpus_payloads ) );
+// Reflection handles for four Generator corpus-mutation helpers; setAccessible( true ) allows invoking them directly below even if they are not public.
+$semicolon_toggle_method = new \ReflectionMethod( Generator::class, 'mutate_corpus_semicolon_toggle' );
+$semicolon_toggle_method->setAccessible( true );
+$duplication_method = new \ReflectionMethod( Generator::class, 'mutate_corpus_reference_duplication' );
+$duplication_method->setAccessible( true );
+$byte_perturb_method = new \ReflectionMethod( Generator::class, 'mutate_corpus_byte_perturb' );
+$byte_perturb_method->setAccessible( true );
+$splice_method = new \ReflectionMethod( Generator::class, 'mutate_corpus_splice' );
+$splice_method->setAccessible( true );
+// Direct checks of individual mutation helpers on fixed inputs, each with its own freshly seeded Generator.
+check(
+ 'corpus semicolon toggle adds and removes semicolons', // NOTE(review): both operands below are the identical '&' round-trip; the add/remove inputs may have been entity-decoded — confirm.
+ '&' === $semicolon_toggle_method->invoke( new Generator( new Prng( 'corpus-toggle-remove' ), 4096, $names ), '&' ) &&
+ '&' === $semicolon_toggle_method->invoke( new Generator( new Prng( 'corpus-toggle-add' ), 4096, $names ), '&' )
+);
+check(
+ 'corpus reference duplication duplicates matched reference text',
+ 'x∉∉y' === $duplication_method->invoke( new Generator( new Prng( 'corpus-duplication' ), 4096, $names ), 'x∉y' )
+);
+$corpus_utf8_mutation_errors = array(); // Collects "kind:index:hex" entries for every mutated payload that fails the UTF-8 validity check.
+for ( $i = 0; $i < 100; $i++ ) {
+ $byte_payload = $byte_perturb_method->invoke( new Generator( new Prng( "corpus-utf8-byte:{$i}" ), 4096, $names ), "\u{00E9}&\u{2603}" );
+ $splice_payload = $splice_method->invoke(
+ new Generator( new Prng( "corpus-utf8-splice:{$i}" ), 4096, $names ),
+ "A\u{00E9}B",
+ array( "\u{2603}&\u{00E9}", 'plain >' )
+ );
+ if ( ! mb_check_encoding( $byte_payload, 'UTF-8' ) ) {
+ $corpus_utf8_mutation_errors[] = 'byte:' . $i . ':' . bin2hex( $byte_payload );
+ }
+ if ( ! mb_check_encoding( $splice_payload, 'UTF-8' ) ) {
+ $corpus_utf8_mutation_errors[] = 'splice:' . $i . ':' . bin2hex( $splice_payload );
+ }
+}
+check( 'corpus byte perturb and splice preserve UTF-8 boundaries', array() === $corpus_utf8_mutation_errors, implode( ',', $corpus_utf8_mutation_errors ) );
+// Lookalike smoke: scan 6000 generated cases, keep those labelled 'lookalike', and collect the distinct names that lookalike_candidate_classes() classifies.
+$lookalike_indexes = lookalike_mutation_indexes( $name_sweep_base_names );
+$lookalike_candidates = array();
+for ( $i = 0; $i < 6000; $i++ ) {
+ $generated = ( new Generator( new Prng( "lookalike-smoke:{$i}" ), 4096, $names ) )->generate();
+ if ( 'lookalike' !== $generated['strategy'] ) {
+ continue;
+ }
+ if ( 1 !== preg_match( '/&([A-Za-z0-9]+);?/', $generated['payload'], $match ) ) {
+ continue;
+ }
+// $match[1] is the alphanumeric run after the first matching '&'; an optional trailing ';' is not captured.
+ $candidate = $match[1];
+ $classes = lookalike_candidate_classes( $candidate, $lookalike_indexes );
+ if ( array() === $classes ) {
+ continue;
+ }
+// Only candidates with at least one class are counted; keying by name de-duplicates them.
+ $lookalike_candidates[ $candidate ] = true;
+}
+check( 'lookalike generator emits edit-distance-1 name misses', count( $lookalike_candidates ) >= 100, (string) count( $lookalike_candidates ) );
+// Sparse lookalike smoke: with only two names given to the Generator, record which edit operation sparse_lookalike_operation() reports per candidate.
+$sparse_lookalike_names = array( 'abcde;', 'vwxyz' );
+$sparse_lookalike_bases = name_sweep_base_names( $sparse_lookalike_names );
+$sparse_lookalike_classes = array();
+for ( $i = 0; $i < 6000; $i++ ) {
+ $generated = ( new Generator( new Prng( "lookalike-sparse-smoke:{$i}" ), 4096, $sparse_lookalike_names ) )->generate();
+ if ( 'lookalike' !== $generated['strategy'] || 1 !== preg_match( '/&([A-Za-z0-9]+);?/', $generated['payload'], $match ) ) {
+ continue;
+ }
+// Record the operation from the first base that yields a non-null result, then stop checking further bases.
+ foreach ( $sparse_lookalike_bases as $base ) {
+ $operation = sparse_lookalike_operation( $match[1], $base );
+ if ( null !== $operation ) {
+ $sparse_lookalike_classes[ $operation ] = true;
+ break;
+ }
+ }
+}
+check(
+ 'lookalike generator exercises every edit operation branch',
+ array() === array_diff( array( 'delete', 'insert', 'substitute', 'transpose' ), array_keys( $sparse_lookalike_classes ) ),
+ implode( ',', array_keys( $sparse_lookalike_classes ) )
+);
+// Case-mangled smoke: each candidate must differ from every exact base name yet equal some base name once lowercased.
+$case_mangled_candidates = array();
+$case_mangled_invalid = array();
+$base_names_by_lowercase = array();
+foreach ( $name_sweep_base_names as $base ) {
+ $base_names_by_lowercase[ strtolower( $base ) ][] = $base;
+}
+for ( $i = 0; $i < 8000; $i++ ) {
+ $generated = ( new Generator( new Prng( "case-mangled-smoke:{$i}" ), 4096, $names ) )->generate();
+ if ( 'case-mangled-name' !== $generated['strategy'] ) {
+ continue;
+ }
+ if ( 1 !== preg_match( '/&([A-Za-z0-9]+);/', $generated['payload'], $match ) ) {
+ continue;
+ }
+// Unlike the lookalike patterns above, this pattern requires the terminating ';'.
+ $candidate = $match[1];
+ $case_mangled_candidates[ $candidate ] = true;
+ if ( isset( $lookalike_indexes['base_set'][ $candidate ] ) || ! isset( $base_names_by_lowercase[ strtolower( $candidate ) ] ) ) {
+ $case_mangled_invalid[] = $candidate;
+ }
+}
+check( 'case-mangled generator emits case-only name misses', count( $case_mangled_candidates ) >= 100 && array() === $case_mangled_invalid, implode( ',', array_slice( $case_mangled_invalid, 0, 20 ) ) . ':' . count( $case_mangled_candidates ) );
+// Call case_mangle_name_base() directly on 'amp' and 'AMP' with 50 seeds each: the result must change, but only in letter case.
+$case_mangle_method = new \ReflectionMethod( Generator::class, 'case_mangle_name_base' );
+$case_mangle_method->setAccessible( true );
+$case_mangle_direct_errors = array();
+for ( $i = 0; $i < 50; $i++ ) {
+ $lower_mutated = $case_mangle_method->invoke( new Generator( new Prng( "case-mangle-lower:{$i}" ), 4096, $names ), 'amp' );
+ $upper_mutated = $case_mangle_method->invoke( new Generator( new Prng( "case-mangle-upper:{$i}" ), 4096, $names ), 'AMP' );
+ if ( 'amp' === $lower_mutated || 'amp' !== strtolower( $lower_mutated ) ) {
+ $case_mangle_direct_errors[] = 'lower:' . $lower_mutated;
+ }
+ if ( 'AMP' === $upper_mutated || 'AMP' !== strtoupper( $upper_mutated ) ) {
+ $case_mangle_direct_errors[] = 'upper:' . $upper_mutated;
+ }
+}
+check(
+ 'case-mangle helper flips lowercase and uppercase source letters directly',
+ array() === $case_mangle_direct_errors,
+ implode( ',', array_slice( $case_mangle_direct_errors, 0, 20 ) ) // Only the first 20 offending values are reported.
+);
+// Read Generator::ASCII_ALPHABET through reflection (empty string when the constant is missing) and require space, tab, LF, and FF in it.
+$generator_reflection = new \ReflectionClass( Generator::class );
+$alphabet_constant = $generator_reflection->getReflectionConstant( 'ASCII_ALPHABET' );
+$ascii_alphabet = null === $alphabet_constant ? '' : (string) $alphabet_constant->getValue();
+check(
+ 'oracle-safe generator alphabet includes space, tab, LF, and FF followers',
+ str_contains( $ascii_alphabet, ' ' ) &&
+ str_contains( $ascii_alphabet, "\t" ) &&
+ str_contains( $ascii_alphabet, "\n" ) &&
+ str_contains( $ascii_alphabet, "\f" ) &&
+ Generator::is_oracle_safe_payload( $ascii_alphabet ), // The alphabet as a whole must itself pass the oracle-safe test.
+ bin2hex( $ascii_alphabet )
+);
+// Weighted-strategy smoke: run generate() $total times with seeds "smoke:{$i}" and gather per-strategy statistics for the checks that follow.
+$strategies = array();
+$contexts = array();
+$unsafe = 0;
+$reference_at_eof = 0;
+$reference_at_eof_bad = 0;
+$reference_at_eof_shapes = array();
+$attribute_multicodepoint_prefix = 0;
+$composition = 0;
+$composition_bad_shape = 0;
+$composition_multi_reference_fragments = 0;
+$total = 1200;
+for ( $i = 0; $i < $total; $i++ ) {
+ $generated = ( new Generator( new Prng( "smoke:{$i}" ), 4096, $names ) )->generate();
+ $strategies[ $generated['strategy'] ] = true;
+ $contexts[ $generated['context'] ] = true;
+ if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) {
+ ++$unsafe;
+ }
+ if ( 'reference-at-eof' === $generated['strategy'] ) {
+ ++$reference_at_eof;
+ $shape = reference_at_eof_shape( $generated['payload'] );
+ if ( null === $shape ) {
+ ++$reference_at_eof_bad;
+ } else {
+ $reference_at_eof_shapes[ $shape ] = true;
+ }
+ }
+ if (
+ 'attribute-prefix' === $generated['strategy'] && // NOTE(review): the prefixes below are literal decoded characters; confirm they were not entity text (a literal '<' appears in the unsafe sample used later in this script).
+ (
+ str_starts_with( $generated['payload'], '<⃒' ) ||
+ str_starts_with( $generated['payload'], '>⃒' ) ||
+ str_starts_with( $generated['payload'], '≪̸' ) ||
+ str_starts_with( $generated['payload'], '=⃥' )
+ )
+ ) {
+ ++$attribute_multicodepoint_prefix;
+ }
+ if ( 'composition' === $generated['strategy'] ) {
+ ++$composition;
+ $fragments = explode( '|', $generated['payload'] );
+ if ( count( $fragments ) < 2 || count( $fragments ) > 3 || in_array( '', $fragments, true ) ) {
+ ++$composition_bad_shape;
+ }
+// Count the fragments that contain an '&'; two or more marks a multi-reference composition.
+ $reference_fragments = 0;
+ foreach ( $fragments as $fragment ) {
+ if ( str_contains( $fragment, '&' ) ) {
+ ++$reference_fragments;
+ }
+ }
+ if ( $reference_fragments >= 2 ) {
+ ++$composition_multi_reference_fragments;
+ }
+ }
+}
+$seen_strategies = array_keys( $strategies );
+sort( $seen_strategies ); // Sorted because the check below compares with expected_weighted_strategies() using ===.
+check( 'all weighted strategies appear', expected_weighted_strategies() === $seen_strategies, implode( ',', $seen_strategies ) );
+check( 'generated cases run both contexts', array( 'both' ) === array_keys( $contexts ), implode( ',', array_keys( $contexts ) ) );
+check( 'generated payloads are oracle-safe', 0 === $unsafe, (string) $unsafe );
+check( 'attribute-prefix generator emits multi-code-point references', $attribute_multicodepoint_prefix > 0, (string) $attribute_multicodepoint_prefix );
+check( 'composition generator emits 2-3 separated fragments', $composition > 0 && 0 === $composition_bad_shape, "{$composition_bad_shape}/{$composition}" );
+check( 'composition generator splices multiple reference-bearing fragments', $composition_multi_reference_fragments > 0, "{$composition_multi_reference_fragments}/{$composition}" );
+check( 'reference-at-EOF cases end inside a reference', $reference_at_eof > 0 && 0 === $reference_at_eof_bad, "{$reference_at_eof_bad}/{$reference_at_eof}" );
+check(
+ 'reference-at-EOF covers expected suffix shapes',
+ array() === array_diff(
+ array( 'bare-introducer', 'partial-numeric-introducer', 'decimal-digits', 'hex-digits', 'named-prefix' ),
+ array_keys( $reference_at_eof_shapes )
+ ),
+ implode( ',', array_keys( $reference_at_eof_shapes ) )
+);
+// Small-budget composition smoke: with the Generator's second argument set to 3, 5, 7, and 12, composition payloads must stay within that byte length and keep 2-3 nonempty fragments.
+$small_compositions = 0;
+$small_composition_bad = array();
+foreach ( array( 3, 5, 7, 12 ) as $max_bytes ) {
+ for ( $i = 0; $i < 1200; $i++ ) {
+ $generated = ( new Generator( new Prng( "composition-small:{$max_bytes}:{$i}" ), $max_bytes, $names ) )->generate();
+ if ( 'composition' !== $generated['strategy'] ) {
+ continue;
+ }
+// Every composition case is counted; offenders are recorded as "max_bytes:index:hex payload".
+ ++$small_compositions;
+ $fragments = explode( '|', $generated['payload'] );
+ if (
+ strlen( $generated['payload'] ) > $max_bytes ||
+ count( $fragments ) < 2 ||
+ count( $fragments ) > 3 ||
+ in_array( '', $fragments, true )
+ ) {
+ $small_composition_bad[] = "{$max_bytes}:{$i}:" . bin2hex( $generated['payload'] );
+ }
+ }
+}
+check( 'composition generator keeps small max-bytes fragments nonempty', $small_compositions > 0 && array() === $small_composition_bad, implode( ',', $small_composition_bad ) );
+// Attribute-prefix smoke: for each 'attribute-prefix' case, find the smoke target its oracle-decoded value starts with and confirm attribute_starts_with() agrees.
+$attribute_prefix_targets = array();
+$attribute_prefix_forms = array();
+$attribute_prefix_bad_targets = array();
+for ( $i = 0; $i < 8000; $i++ ) {
+ $generated = ( new Generator( new Prng( "attribute-prefix-smoke:{$i}" ), 4096, $names ) )->generate();
+ if ( 'attribute-prefix' !== $generated['strategy'] ) {
+ continue;
+ }
+// Decode in the 'attribute' context, then use only the first target that prefixes the decoded value (note the break below).
+ $decoded = $oracles->decode( 'attribute', $generated['payload'] );
+ foreach ( attribute_prefix_smoke_targets() as $target ) {
+ if ( ! str_starts_with( $decoded, $target ) ) {
+ continue;
+ }
+// Record the matched target and every encoding form reported for this payload.
+ $attribute_prefix_targets[ $target ] = true;
+ foreach ( attribute_prefix_encoding_forms( $generated['payload'] ) as $form ) {
+ $attribute_prefix_forms[ $form ] = true;
+ }
+ if ( ! \WP_HTML_Decoder::attribute_starts_with( $generated['payload'], $target, 'case-sensitive' ) ) {
+ $attribute_prefix_bad_targets[] = $target . ':' . bin2hex( substr( $generated['payload'], 0, 64 ) );
+ }
+ break;
+ }
+}
+check(
+ 'attribute-prefix encoder covers every target string',
+ array() === array_diff( attribute_prefix_smoke_targets(), array_keys( $attribute_prefix_targets ) ),
+ implode( ',', array_keys( $attribute_prefix_targets ) )
+);
+check(
+ 'attribute-prefix encoder covers literal, numeric, zero, hex, and semicolonless forms',
+ array() === array_diff( array( 'literal', 'decimal', 'leading-zero', 'hex', 'semicolonless' ), array_keys( $attribute_prefix_forms ) ),
+ implode( ',', array_keys( $attribute_prefix_forms ) )
+);
+check( 'attribute-prefix encoded targets satisfy attribute_starts_with', array() === $attribute_prefix_bad_targets, implode( ',', $attribute_prefix_bad_targets ) );
+// Spot-check Generator::would_extend_semicolonless_numeric() through reflection, invoking it statically (null object).
+$semicolonless_guard = new \ReflectionMethod( Generator::class, 'would_extend_semicolonless_numeric' );
+$semicolonless_guard->setAccessible( true ); // Needed before PHP 8.1 if the method is not public; mirrors the other reflection handles in this script.
+check(
+	'attribute-prefix semicolonless numeric guard protects terminators and digits',
+	true === $semicolonless_guard->invoke( null, 'decimal', ';' ) && true === $semicolonless_guard->invoke( null, 'hex', ';' ) &&
+	true === $semicolonless_guard->invoke( null, 'decimal', '7' ) &&
+	true === $semicolonless_guard->invoke( null, 'hex', 'A' ) &&
+	false === $semicolonless_guard->invoke( null, 'decimal', 'A' ) &&
+	false === $semicolonless_guard->invoke( null, null, ';' )
+);
+// Numeric-range smoke: scan up to 6000 generated cases and record which range buckets, C1 values, and noncharacters appear.
+$numeric_ranges = array();
+$numeric_c1_values = array();
+$numeric_bmp_terminal_noncharacters = array();
+$numeric_noncharacter_planes = array();
+
+// Range buckets that must all be observed. Defined once so the early-exit test and the final
+// check cannot drift apart, and so the list is not rebuilt on every loop iteration.
+$expected_numeric_range_buckets = array(
+	'zero-only',
+	'c0-control',
+	'c1-control',
+	'bmp-pre-surrogate',
+	'bmp-post-surrogate',
+	'surrogate',
+	'bmp-noncharacter',
+	'plane-noncharacter',
+	'astral',
+	'above-unicode-legal-digits',
+	'digit-count-overflow',
+);
+
+for ( $i = 0; $i < 6000; $i++ ) {
+	$generated = ( new Generator( new Prng( "numeric-range-smoke:{$i}" ), 4096, $names ) )->generate();
+	foreach ( numeric_reference_ranges( $generated['payload'] ) as $range => $_ ) {
+		$numeric_ranges[ $range ] = true;
+	}
+
+	// NOTE(review): this pattern has no leading "&#", so it also matches bare digit runs (and
+	// "x" followed by hex digits) anywhere in the payload — confirm that over-counting is acceptable.
+	$match_count = preg_match_all( '/(?:([xX])([0-9A-Fa-f]+)|([0-9]+));?/', $generated['payload'], $matches, PREG_SET_ORDER );
+	if ( false !== $match_count && $match_count > 0 ) {
+		foreach ( $matches as $match ) {
+			// Group 1 is the hex marker; group 2 holds hex digits and group 3 holds decimal digits.
+			$is_hex = '' !== ( $match[1] ?? '' );
+			$digits = $is_hex ? $match[2] : $match[3];
+			$significant_digits = substr( $digits, strspn( $digits, '0' ) );
+			// Skip all-zero values and digit strings with more than 6 hex or 7 decimal significant digits.
+			if ( '' === $significant_digits || strlen( $significant_digits ) > ( $is_hex ? 6 : 7 ) ) {
+				continue;
+			}
+
+			$value = intval( $significant_digits, $is_hex ? 16 : 10 );
+			if ( $value >= 0x80 && $value <= 0x9F ) {
+				$numeric_c1_values[ $value ] = true;
+			}
+			if ( 0xFFFE === $value || 0xFFFF === $value ) {
+				$numeric_bmp_terminal_noncharacters[ $value ] = true;
+			}
+			// Values in planes 1-16 whose low 16 bits are FFFE or FFFF; keyed by plane number.
+			if ( $value >= 0x1FFFE && $value <= 0x10FFFF && ( $value & 0xFFFF ) >= 0xFFFE ) {
+				$numeric_noncharacter_planes[ $value >> 16 ] = true;
+			}
+		}
+	}
+
+	// Stop early once every bucket, 32 C1 values, both BMP terminal noncharacters, and 16 planes have been seen.
+	if (
+		array() === array_diff( $expected_numeric_range_buckets, array_keys( $numeric_ranges ) ) &&
+		32 === count( $numeric_c1_values ) &&
+		2 === count( $numeric_bmp_terminal_noncharacters ) &&
+		16 === count( $numeric_noncharacter_planes )
+	) {
+		break;
+	}
+}
+
+// Each check reports the observed keys as its detail so a failure shows what was actually covered.
+check(
+	'numeric generator covers range buckets',
+	array() === array_diff( $expected_numeric_range_buckets, array_keys( $numeric_ranges ) ),
+	implode( ',', array_keys( $numeric_ranges ) )
+);
+
+$expected_c1_values = range( 0x80, 0x9F );
+check(
+	'numeric generator covers all C1 remap rows',
+	array() === array_diff( $expected_c1_values, array_keys( $numeric_c1_values ) ),
+	implode( ',', array_map( static fn( int $value ): string => dechex( $value ), array_keys( $numeric_c1_values ) ) )
+);
+
+check(
+	'numeric generator covers BMP terminal noncharacters',
+	array() === array_diff( array( 0xFFFE, 0xFFFF ), array_keys( $numeric_bmp_terminal_noncharacters ) ),
+	implode( ',', array_map( static fn( int $value ): string => dechex( $value ), array_keys( $numeric_bmp_terminal_noncharacters ) ) )
+);
+
+check(
+	'numeric generator covers per-plane noncharacters',
+	array() === array_diff( range( 1, 16 ), array_keys( $numeric_noncharacter_planes ) ),
+	implode( ',', array_keys( $numeric_noncharacter_planes ) )
+);
+// Byte-space smoke: run generate_bytes() $total times with seeds "byte-smoke:{$i}" and tally strategies, contexts, unsafe payloads, and NUL bytes.
+$byte_nul = 0;
+$byte_unsafe = 0;
+$byte_contexts = array();
+$byte_strategies = array();
+
+for ( $i = 0; $total > $i; ++$i ) {
+	$generated = ( new Generator( new Prng( "byte-smoke:{$i}" ), 4096, $names ) )->generate_bytes();
+	// Each counter grows by one when its condition holds for this payload.
+	$byte_unsafe += Generator::is_oracle_safe_payload( $generated['payload'] ) ? 0 : 1;
+	$byte_nul += str_contains( $generated['payload'], "\x00" ) ? 1 : 0;
+	$byte_strategies[ $generated['strategy'] ] = true;
+	$byte_contexts[ $generated['context'] ] = true;
+}
+
+// This lane is required to produce at least one oracle-unsafe payload and at least one NUL byte.
+check( 'all 5 byte-space strategies appear', 5 === count( $byte_strategies ), implode( ',', array_keys( $byte_strategies ) ) );
+check( 'byte-space cases run both contexts', array( 'both' ) === array_keys( $byte_contexts ), implode( ',', array_keys( $byte_contexts ) ) );
+check( 'byte-space generator emits unsafe payloads', 0 < $byte_unsafe, (string) $byte_unsafe );
+check( 'byte-space generator emits NUL bytes', 0 < $byte_nul, (string) $byte_nul );
+// Oracle trap: an Oracles subclass whose decode() always throws, so any oracle call made during run_without_oracle() would raise.
+$trap_oracles = new class() extends Oracles {
+ public function decode( string $context, string $payload ): string {
+ throw new \RuntimeException( "oracle trap called for {$context} " . bin2hex( $payload ) );
+ }
+};
+$unsafe_byte_failures = ( new Checks( $trap_oracles ) )->run_without_oracle( 'both', "\xFF\x00<\"\r" ); // Payload mixes 0xFF, NUL, '<', '"', and CR.
+check(
+ 'oracle-free byte checks accept unsafe payloads',
+ array() === $unsafe_byte_failures,
+ json_encode( $unsafe_byte_failures )
+);
+$raw_c1_failures = ( new Checks( $trap_oracles ) )->run_without_oracle( 'both', "\x80\x9F" ); // Raw bytes 0x80 and 0x9F, not numeric references.
+check(
+ 'oracle-free byte checks pass raw C1 bytes through unchanged',
+ array() === $raw_c1_failures,
+ json_encode( $raw_c1_failures )
+);
+// In-process fuzz run: 300 generated cases go through $checks->run(); each reported failure is printed and counted.
+$fuzz_failures = 0;
+for ( $i = 0; 300 > $i; ++$i ) {
+	$generated = ( new Generator( new Prng( "smoke-run:{$i}" ), 4096, $names ) )->generate();
+	$failures = $checks->run( $generated['context'], $generated['payload'] );
+	foreach ( $failures as $failure ) {
+		$fuzz_failures += 1;
+		printf( " finding: %s on %s\n", $failure['signature'], bin2hex( substr( $generated['payload'], 0, 48 ) ) );
+	}
+}
+check( '300-case fuzz run clean', 0 === $fuzz_failures );
+// Worker smoke runs: launch ../worker.php once per mode as a subprocess (seed 1) and check its exit code and, where asserted, its stdout.
+$byte_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'bytes', '--seed', '1', '--cases', '200', '--progress-every', '200' ) );
+check( '200-case byte-space worker clean', 0 === $byte_worker['code'], $byte_worker['stdout'] . $byte_worker['stderr'] );
+// The sweep modes below additionally require stdout to contain the exact "strategy":count pair for the requested case count.
+$name_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'names', '--seed', '1', '--cases', '300', '--progress-every', '300' ) );
+check(
+ '300-case name-sweep worker clean',
+ 0 === $name_worker['code'] && str_contains( $name_worker['stdout'], '"name-sweep":300' ),
+ $name_worker['stdout'] . $name_worker['stderr']
+);
+
+$legacy_follower_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'legacy-followers', '--seed', '1', '--cases', '300', '--progress-every', '300' ) );
+check(
+ '300-case legacy-follower worker clean',
+ 0 === $legacy_follower_worker['code'] && str_contains( $legacy_follower_worker['stdout'], '"legacy-follower-sweep":300' ),
+ $legacy_follower_worker['stdout'] . $legacy_follower_worker['stderr']
+);
+
+$prefix_family_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'prefix-families', '--seed', '1', '--cases', '300', '--progress-every', '300' ) );
+check(
+ '300-case prefix-family worker clean',
+ 0 === $prefix_family_worker['code'] && str_contains( $prefix_family_worker['stdout'], '"prefix-family-sweep":300' ),
+ $prefix_family_worker['stdout'] . $prefix_family_worker['stderr']
+);
+
+$numeric_boundary_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'numeric-boundaries', '--seed', '1', '--cases', '64', '--progress-every', '64' ) );
+check(
+ '64-case numeric-boundary worker clean',
+ 0 === $numeric_boundary_worker['code'] && str_contains( $numeric_boundary_worker['stdout'], '"numeric-boundary-sweep":64' ),
+ $numeric_boundary_worker['stdout'] . $numeric_boundary_worker['stderr']
+);
+// Corpus mode reports several strategies, so require every expected strategy name to appear (double-quoted) in stdout.
+$corpus_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'corpus', '--seed', '1', '--cases', '300', '--progress-every', '300' ) );
+$corpus_worker_has_strategies = true;
+foreach ( expected_corpus_strategies() as $strategy ) {
+ $corpus_worker_has_strategies = $corpus_worker_has_strategies && str_contains( $corpus_worker['stdout'], '"' . $strategy . '"' );
+}
+check(
+ '300-case corpus mutation worker clean',
+ 0 === $corpus_worker['code'] && $corpus_worker_has_strategies,
+ $corpus_worker['stdout'] . $corpus_worker['stderr']
+);
+
+$token_map_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'token-map', '--seed', '1', '--cases', '300', '--progress-every', '300' ) );
+check(
+ '300-case token-map worker clean',
+ 0 === $token_map_worker['code'] && str_contains( $token_map_worker['stdout'], '"token-map-structure-sweep":300' ),
+ $token_map_worker['stdout'] . $token_map_worker['stderr']
+);
+// Coverage mode, run with the two variables below passed as run_process()'s second argument (presumably environment overrides — confirm), must exit with code 2 and say pcov is required.
+$coverage_unavailable_worker = run_process(
+ array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'coverage', '--seed', '1', '--cases', '1', '--progress-every', '1' ),
+ array(
+ 'HTML_DECODER_FUZZ_DISABLE_PCOV' => '1',
+ 'HTML_DECODER_FUZZ_FAKE_COVERAGE' => '0',
+ )
+);
+check(
+ 'coverage worker reports unavailable pcov',
+ 2 === $coverage_unavailable_worker['code'] && str_contains( $coverage_unavailable_worker['stdout'], 'coverage mode requires pcov' ),
+ $coverage_unavailable_worker['stdout'] . $coverage_unavailable_worker['stderr']
+);
+// Coverage worker with HTML_DECODER_FUZZ_FAKE_COVERAGE=1: it must exit 0, emit coverage records, and write at least one coverage.json manifest under a per-PID temp directory.
+$coverage_worker_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-coverage-worker-' . getmypid();
+remove_tree( $coverage_worker_dir );
+$coverage_worker = run_process(
+ array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'coverage', '--seed', '1', '--cases', '8', '--progress-every', '8', '--output-dir', $coverage_worker_dir ),
+ array( 'HTML_DECODER_FUZZ_FAKE_COVERAGE' => '1' )
+);
+$coverage_worker_manifests = glob( $coverage_worker_dir . '/coverage-corpus/payload-*/coverage.json' );
+$coverage_worker_manifest = is_array( $coverage_worker_manifests ) && array() !== $coverage_worker_manifests // Decode only the first manifest found; otherwise fall back to an empty array.
+ ? json_decode( (string) file_get_contents( $coverage_worker_manifests[0] ), true )
+ : array();
+check(
+ 'coverage worker retains fake new-edge payloads',
+ 0 === $coverage_worker['code'] &&
+ str_contains( $coverage_worker['stdout'], '"type":"coverage"' ) &&
+ str_contains( $coverage_worker['stdout'], '"coverage_new_edges"' ) &&
+ is_array( $coverage_worker_manifests ) &&
+ count( $coverage_worker_manifests ) > 0,
+ $coverage_worker['stdout'] . $coverage_worker['stderr']
+);
+check(
+ 'coverage corpus manifest records payload and target edges',
+ is_array( $coverage_worker_manifest ) && // json_decode() returns null on invalid JSON, which this guard rejects.
+ 'coverage' === ( $coverage_worker_manifest['mode'] ?? null ) &&
+ 'fake' === ( $coverage_worker_manifest['coverage_provider'] ?? null ) &&
+ is_string( $coverage_worker_manifest['payload_base64'] ?? null ) &&
+ ( $coverage_worker_manifest['new_edge_count'] ?? 0 ) > 0 &&
+ is_array( $coverage_worker_manifest['new_edges'] ?? null ),
+ json_encode( $coverage_worker_manifest )
+);
+remove_tree( $coverage_worker_dir );
+// Runner smoke for --mode bytes: 1 lane, --max-cases 200 in one batch of 200, output in a per-PID temp directory cleared before and removed after.
+$byte_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-byte-runner-' . getmypid();
+remove_tree( $byte_runner_dir );
+$byte_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--mode',
+ 'bytes',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--cases-per-batch',
+ '200',
+ '--summary-mode',
+ 'none',
+ '--output-dir',
+ $byte_runner_dir,
+ )
+);
+$byte_runner_state = is_file( $byte_runner_dir . '/state.json' ) // Falls back to an empty array when state.json was not written.
+ ? json_decode( (string) file_get_contents( $byte_runner_dir . '/state.json' ), true )
+ : array();
+check(
+ '200-case byte-space runner clean',
+ 0 === $byte_runner['code'] &&
+ 200 === ( $byte_runner_state['cases'] ?? 0 ) && // Exactly 200 cases overall, all of them under the 'both' context.
+ 200 === ( $byte_runner_state['by_context']['both'] ?? 0 ),
+ $byte_runner['stdout'] . $byte_runner['stderr'] . json_encode( $byte_runner_state )
+);
+remove_tree( $byte_runner_dir );
+// Runner smoke for --mode names: 2 lanes, --max-cases 200 in batches of 100, output in a per-PID temp directory cleared before and removed after.
+$name_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-name-runner-' . getmypid();
+remove_tree( $name_runner_dir );
+$name_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--mode',
+ 'names',
+ '--lanes',
+ '2',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--cases-per-batch',
+ '100',
+ '--summary-mode',
+ 'all',
+ '--output-dir',
+ $name_runner_dir,
+ )
+);
+$name_runner_state = is_file( $name_runner_dir . '/state.json' ) // Falls back to an empty array when state.json was not written.
+ ? json_decode( (string) file_get_contents( $name_runner_dir . '/state.json' ), true )
+ : array();
+check(
+ 'name-sweep runner clean',
+ 0 === $name_runner['code'] &&
+ ( $name_runner_state['cases'] ?? 0 ) >= 200 && // At least 200 cases, every one counted under 'name-sweep' and under 'both'.
+ ( $name_runner_state['cases'] ?? null ) === ( $name_runner_state['by_strategy']['name-sweep'] ?? null ) &&
+ ( $name_runner_state['cases'] ?? null ) === ( $name_runner_state['by_context']['both'] ?? null ),
+ $name_runner['stdout'] . $name_runner['stderr'] . json_encode( $name_runner_state )
+);
+$name_runner_windows = summary_start_windows( $name_runner_dir, 'names' ); // Opaque helper; presumably reads batch start cases from the runner's summary output — confirm.
+check(
+ 'name-sweep runner uses distinct start-case windows',
+ start_windows_are_distinct( $name_runner_windows, 100 ), // 100 equals --cases-per-batch above.
+ json_encode( $name_runner_windows )
+);
+remove_tree( $name_runner_dir );
+// Runner smoke for --mode legacy-followers: 2 lanes, --max-cases 200 in batches of 100, output in a per-PID temp directory cleared before and removed after.
+$legacy_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-legacy-follower-runner-' . getmypid();
+remove_tree( $legacy_runner_dir );
+$legacy_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--mode',
+ 'legacy-followers',
+ '--lanes',
+ '2',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--cases-per-batch',
+ '100',
+ '--summary-mode',
+ 'all',
+ '--output-dir',
+ $legacy_runner_dir,
+ )
+);
+$legacy_runner_state = is_file( $legacy_runner_dir . '/state.json' ) // Falls back to an empty array when state.json was not written.
+ ? json_decode( (string) file_get_contents( $legacy_runner_dir . '/state.json' ), true )
+ : array();
+check(
+ 'legacy-follower runner clean',
+ 0 === $legacy_runner['code'] &&
+ ( $legacy_runner_state['cases'] ?? 0 ) >= 200 && // At least 200 cases, every one counted under 'legacy-follower-sweep' and under 'both'.
+ ( $legacy_runner_state['cases'] ?? null ) === ( $legacy_runner_state['by_strategy']['legacy-follower-sweep'] ?? null ) &&
+ ( $legacy_runner_state['cases'] ?? null ) === ( $legacy_runner_state['by_context']['both'] ?? null ),
+ $legacy_runner['stdout'] . $legacy_runner['stderr'] . json_encode( $legacy_runner_state )
+);
+$legacy_runner_windows = summary_start_windows( $legacy_runner_dir, 'legacy-followers' ); // Opaque helper; presumably reads batch start cases from the runner's summary output — confirm.
+check(
+ 'legacy-follower runner uses distinct start-case windows',
+ start_windows_are_distinct( $legacy_runner_windows, 100 ), // 100 equals --cases-per-batch above.
+ json_encode( $legacy_runner_windows )
+);
+remove_tree( $legacy_runner_dir );
+// Runner smoke for --mode prefix-families: 2 lanes, --max-cases 200 in batches of 100, output in a per-PID temp directory cleared before and removed after.
+$prefix_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-prefix-family-runner-' . getmypid();
+remove_tree( $prefix_runner_dir );
+$prefix_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--mode',
+ 'prefix-families',
+ '--lanes',
+ '2',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--cases-per-batch',
+ '100',
+ '--summary-mode',
+ 'all',
+ '--output-dir',
+ $prefix_runner_dir,
+ )
+);
+$prefix_runner_state = is_file( $prefix_runner_dir . '/state.json' ) // Falls back to an empty array when state.json was not written.
+ ? json_decode( (string) file_get_contents( $prefix_runner_dir . '/state.json' ), true )
+ : array();
+check(
+ 'prefix-family runner clean',
+ 0 === $prefix_runner['code'] &&
+ ( $prefix_runner_state['cases'] ?? 0 ) >= 200 && // At least 200 cases, every one counted under 'prefix-family-sweep' and under 'both'.
+ ( $prefix_runner_state['cases'] ?? null ) === ( $prefix_runner_state['by_strategy']['prefix-family-sweep'] ?? null ) &&
+ ( $prefix_runner_state['cases'] ?? null ) === ( $prefix_runner_state['by_context']['both'] ?? null ),
+ $prefix_runner['stdout'] . $prefix_runner['stderr'] . json_encode( $prefix_runner_state )
+);
+$prefix_runner_windows = summary_start_windows( $prefix_runner_dir, 'prefix-families' ); // Opaque helper; presumably reads batch start cases from the runner's summary output — confirm.
+check(
+ 'prefix-family runner uses distinct start-case windows',
+ start_windows_are_distinct( $prefix_runner_windows, 100 ), // 100 equals --cases-per-batch above.
+ json_encode( $prefix_runner_windows )
+);
+remove_tree( $prefix_runner_dir );
+// Runner smoke for --mode numeric-boundaries: 2 lanes, --max-cases 200 in batches of 100, output in a per-PID temp directory cleared before and removed after.
+$numeric_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-numeric-boundary-runner-' . getmypid();
+remove_tree( $numeric_runner_dir );
+$numeric_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--mode',
+ 'numeric-boundaries',
+ '--lanes',
+ '2',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--cases-per-batch',
+ '100',
+ '--summary-mode',
+ 'all',
+ '--output-dir',
+ $numeric_runner_dir,
+ )
+);
+$numeric_runner_state = is_file( $numeric_runner_dir . '/state.json' ) // Falls back to an empty array when state.json was not written.
+ ? json_decode( (string) file_get_contents( $numeric_runner_dir . '/state.json' ), true )
+ : array();
+check(
+ 'numeric-boundary runner clean',
+ 0 === $numeric_runner['code'] &&
+ ( $numeric_runner_state['cases'] ?? 0 ) >= 200 && // At least 200 cases, every one counted under 'numeric-boundary-sweep' and under 'both'.
+ ( $numeric_runner_state['cases'] ?? null ) === ( $numeric_runner_state['by_strategy']['numeric-boundary-sweep'] ?? null ) &&
+ ( $numeric_runner_state['cases'] ?? null ) === ( $numeric_runner_state['by_context']['both'] ?? null ),
+ $numeric_runner['stdout'] . $numeric_runner['stderr'] . json_encode( $numeric_runner_state )
+);
+$numeric_runner_windows = summary_start_windows( $numeric_runner_dir, 'numeric-boundaries' ); // Opaque helper; presumably reads batch start cases from the runner's summary output — confirm.
+check(
+ 'numeric-boundary runner uses distinct start-case windows',
+ start_windows_are_distinct( $numeric_runner_windows, 100 ), // 100 equals --cases-per-batch above.
+ json_encode( $numeric_runner_windows )
+);
+remove_tree( $numeric_runner_dir );
+// Runner smoke for --mode corpus: 2 lanes, --max-cases 200 in batches of 100, output in a per-PID temp directory cleared before and removed after.
+$corpus_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-corpus-runner-' . getmypid();
+remove_tree( $corpus_runner_dir );
+$corpus_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--mode',
+ 'corpus',
+ '--lanes',
+ '2',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--cases-per-batch',
+ '100',
+ '--summary-mode',
+ 'all',
+ '--output-dir',
+ $corpus_runner_dir,
+ )
+);
+$corpus_runner_state = is_file( $corpus_runner_dir . '/state.json' ) // Falls back to an empty array when state.json was not written.
+ ? json_decode( (string) file_get_contents( $corpus_runner_dir . '/state.json' ), true )
+ : array();
+$corpus_runner_strategies = array_keys( $corpus_runner_state['by_strategy'] ?? array() );
+sort( $corpus_runner_strategies ); // Sorted because the check below compares with expected_corpus_strategies() using ===.
+check(
+ 'corpus mutation runner clean',
+ 0 === $corpus_runner['code'] &&
+ ( $corpus_runner_state['cases'] ?? 0 ) >= 200 &&
+ ( $corpus_runner_state['cases'] ?? null ) === ( $corpus_runner_state['by_context']['both'] ?? null ) &&
+ ( $corpus_runner_state['cases'] ?? null ) === array_sum( $corpus_runner_state['by_strategy'] ?? array() ) && // Per-strategy counts must add up to the total case count.
+ expected_corpus_strategies() === $corpus_runner_strategies,
+ $corpus_runner['stdout'] . $corpus_runner['stderr'] . json_encode( $corpus_runner_state )
+);
+$corpus_runner_windows = summary_start_windows( $corpus_runner_dir, 'corpus' ); // Opaque helper; presumably reads batch start cases from the runner's summary output — confirm.
+check(
+ 'corpus mutation runner uses distinct start-case windows',
+ start_windows_are_distinct( $corpus_runner_windows, 100 ), // 100 equals --cases-per-batch above.
+ json_encode( $corpus_runner_windows )
+);
+remove_tree( $corpus_runner_dir );
+// Runner smoke for --mode token-map: 2 lanes, --max-cases 200 in batches of 100, output in a per-PID temp directory cleared before and removed after.
+$token_map_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-token-map-runner-' . getmypid();
+remove_tree( $token_map_runner_dir );
+$token_map_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--mode',
+ 'token-map',
+ '--lanes',
+ '2',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--cases-per-batch',
+ '100',
+ '--summary-mode',
+ 'all',
+ '--output-dir',
+ $token_map_runner_dir,
+ )
+);
+$token_map_runner_state = is_file( $token_map_runner_dir . '/state.json' ) // Falls back to an empty array when state.json was not written.
+ ? json_decode( (string) file_get_contents( $token_map_runner_dir . '/state.json' ), true )
+ : array();
+check(
+ 'token-map runner clean',
+ 0 === $token_map_runner['code'] &&
+ ( $token_map_runner_state['cases'] ?? 0 ) >= 200 && // At least 200 cases, every one counted under 'token-map-structure-sweep' and under 'both'.
+ ( $token_map_runner_state['cases'] ?? null ) === ( $token_map_runner_state['by_strategy']['token-map-structure-sweep'] ?? null ) &&
+ ( $token_map_runner_state['cases'] ?? null ) === ( $token_map_runner_state['by_context']['both'] ?? null ),
+ $token_map_runner['stdout'] . $token_map_runner['stderr'] . json_encode( $token_map_runner_state )
+);
+$token_map_runner_windows = summary_start_windows( $token_map_runner_dir, 'token-map' ); // Opaque helper; presumably reads batch start cases from the runner's summary output — confirm.
+check(
+ 'token-map runner uses distinct start-case windows',
+ start_windows_are_distinct( $token_map_runner_windows, 100 ), // 100 equals --cases-per-batch above.
+ json_encode( $token_map_runner_windows )
+);
+remove_tree( $token_map_runner_dir );
+
+$coverage_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-coverage-runner-' . getmypid(); // Smoke check: runner.php in coverage mode with HTML_DECODER_FUZZ_FAKE_COVERAGE set.
+remove_tree( $coverage_runner_dir );
+$coverage_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--mode',
+ 'coverage',
+ '--lanes',
+ '2',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '40',
+ '--cases-per-batch',
+ '20',
+ '--summary-mode',
+ 'failures',
+ '--output-dir',
+ $coverage_runner_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAKE_COVERAGE' => '1' ) // Second run_process() argument; presumably environment overrides for the child process.
+);
+$coverage_runner_state = is_file( $coverage_runner_dir . '/state.json' )
+ ? json_decode( (string) file_get_contents( $coverage_runner_dir . '/state.json' ), true )
+ : array();
+$coverage_runner_manifests = glob( $coverage_runner_dir . '/coverage-corpus/payload-*/coverage.json' ); // glob() may return false, hence the is_array() guard in the check.
+$coverage_summary = is_file( $coverage_runner_dir . '/summary.ndjson' )
+ ? file( $coverage_runner_dir . '/summary.ndjson', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES )
+ : array();
+$coverage_summary_retained = 0; // Counts summary records of type 'coverage' whose coverage_retained field is non-empty.
+if ( is_array( $coverage_summary ) ) { // file() returns false on failure; in that case the count stays 0.
+ foreach ( $coverage_summary as $line ) {
+ $record = json_decode( $line, true );
+ if ( is_array( $record ) && 'coverage' === ( $record['type'] ?? null ) && ! empty( $record['coverage_retained'] ) ) {
+ ++$coverage_summary_retained;
+ }
+ }
+}
+check(
+ 'coverage runner aggregates fake new-edge corpus',
+ 0 === $coverage_runner['code'] &&
+ ( $coverage_runner_state['cases'] ?? 0 ) >= 40 &&
+ ( $coverage_runner_state['cases'] ?? null ) === ( $coverage_runner_state['by_context']['both'] ?? null ) &&
+ ( $coverage_runner_state['cases'] ?? null ) === array_sum( $coverage_runner_state['by_strategy'] ?? array() ) &&
+ ( $coverage_runner_state['coverage']['edges'] ?? 0 ) > 0 &&
+ ( $coverage_runner_state['coverage']['payloads'] ?? 0 ) > 0 &&
+ is_array( $coverage_runner_manifests ) &&
+ count( $coverage_runner_manifests ) === ( $coverage_runner_state['coverage']['payloads'] ?? -1 ) && // Payload manifests on disk must match the payload count recorded in state.json.
+ $coverage_summary_retained === ( $coverage_runner_state['coverage']['payloads'] ?? -1 ), // Retained summary records must match that same payload count.
+ $coverage_runner['stdout'] . $coverage_runner['stderr'] . json_encode( $coverage_runner_state )
+);
+remove_tree( $coverage_runner_dir );
+
+$name_replay = run_process( array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'names', '--seed', '1', '--case', '0' ) ); // Replay checks: replay.php regenerates a case from mode, seed and case index alone.
+check( 'name-sweep replay regenerates clean case', 0 === $name_replay['code'], $name_replay['stdout'] . $name_replay['stderr'] );
+
+$legacy_replay = run_process( array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'legacy-followers', '--seed', '1', '--case', '0' ) );
+check( 'legacy-follower replay regenerates clean case', 0 === $legacy_replay['code'], $legacy_replay['stdout'] . $legacy_replay['stderr'] );
+
+$prefix_replay = run_process( array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'prefix-families', '--seed', '1', '--case', '37' ) );
+check(
+ 'prefix-family replay regenerates clean case',
+ 0 === $prefix_replay['code'] &&
+ str_contains( $prefix_replay['stdout'], 'mode prefix-families, strategy prefix-family-sweep' ) &&
+ str_contains( $prefix_replay['stdout'], 'Hex preview: 266e6f7478' ), // Hex for the ASCII bytes "&notx".
+ $prefix_replay['stdout'] . $prefix_replay['stderr']
+);
+
+$prefix_fault_seed_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'prefix-families', '--seed', '1', '--case', '37' ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) // Same case as above, now with a fault name supplied; exit code 1 is expected.
+);
+check( 'faulted prefix-family seed replay reproduces generated case', 1 === $prefix_fault_seed_replay['code'], $prefix_fault_seed_replay['stdout'] . $prefix_fault_seed_replay['stderr'] );
+
+$numeric_replay = run_process( array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'numeric-boundaries', '--seed', '1', '--case', '25' ) );
+check(
+ 'numeric-boundary replay regenerates mixed-case hex case',
+ 0 === $numeric_replay['code'] &&
+ str_contains( $numeric_replay['stdout'], 'mode numeric-boundaries, strategy numeric-boundary-sweep' ) &&
+ str_contains( $numeric_replay['stdout'], 'Hex preview: 2623783130466645653b' ), // Hex for the ASCII bytes "&#x10FfEe;".
+ $numeric_replay['stdout'] . $numeric_replay['stderr']
+);
+
+$numeric_fault_seed_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'numeric-boundaries', '--seed', '1', '--case', '0' ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+);
+check( 'faulted numeric-boundary seed replay reproduces generated case', 1 === $numeric_fault_seed_replay['code'], $numeric_fault_seed_replay['stdout'] . $numeric_fault_seed_replay['stderr'] );
+
+$corpus_replay = run_process( array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'corpus', '--seed', '1', '--case', '0' ) );
+check(
+ 'corpus mutation replay regenerates clean case',
+ 0 === $corpus_replay['code'] &&
+ str_contains( $corpus_replay['stdout'], 'mode corpus, strategy corpus-byte-perturb' ) &&
+ str_contains( $corpus_replay['stdout'], 'Hex preview: 64262335383b' ), // Hex for the ASCII bytes "d&#58;".
+ $corpus_replay['stdout'] . $corpus_replay['stderr']
+);
+
+$single_level_replay = run_process( array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'corpus', '--seed', '1', '--case', '11875' ) );
+check(
+ 'corpus replay regenerates single-level decode fixture',
+ 0 === $single_level_replay['code'] &&
+ str_contains( $single_level_replay['stdout'], 'mode corpus, strategy corpus-splice' ) &&
+ str_contains( $single_level_replay['stdout'], 'Hex preview: 26616d703b616d703b5a' ), // Hex for the ASCII bytes "&amp;amp;Z".
+ $single_level_replay['stdout'] . $single_level_replay['stderr']
+);
+
+$corpus_fault_seed_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'corpus', '--seed', '1', '--case', '0' ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+);
+check( 'faulted corpus mutation seed replay reproduces generated case', 1 === $corpus_fault_seed_replay['code'], $corpus_fault_seed_replay['stdout'] . $corpus_fault_seed_replay['stderr'] );
+
+$token_map_replay = run_process( array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'token-map', '--seed', '1', '--case', '0' ) );
+check(
+ 'token-map replay regenerates clean case',
+ 0 === $token_map_replay['code'] &&
+ str_contains( $token_map_replay['stdout'], 'mode token-map, strategy token-map-structure-sweep' ),
+ $token_map_replay['stdout'] . $token_map_replay['stderr']
+);
+
+if ( null !== $token_map_fault_case_index ) { // $token_map_fault_case_index is set earlier in the file; when it is null this faulted replay is skipped without recording a check.
+ $token_map_fault_seed_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'token-map', '--seed', '1', '--case', (string) $token_map_fault_case_index ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+ );
+ check( 'faulted token-map seed replay reproduces generated case', 1 === $token_map_fault_seed_replay['code'], $token_map_fault_seed_replay['stdout'] . $token_map_fault_seed_replay['stderr'] );
+}
+
+$coverage_replay = run_process( array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', 'coverage', '--seed', '1', '--case', '0' ) );
+check(
+ 'coverage replay regenerates clean generated case',
+ 0 === $coverage_replay['code'] &&
+ str_contains( $coverage_replay['stdout'], 'mode coverage, strategy numeric' ), // Substring match: any strategy name that begins with "numeric" satisfies it.
+ $coverage_replay['stdout'] . $coverage_replay['stderr']
+);
+
+$name_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-name-fault-' . getmypid(); // Fault pipeline for names mode: worker -> failure artifact -> replay -> minimize.
+remove_tree( $name_pipeline_dir );
+$faulted_name_worker = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'names',
+ '--seed',
+ '1',
+ '--start-case',
+ '11593', // Exactly this one case is run (--cases 1); the manifest check below pins the same index.
+ '--cases',
+ '1',
+ '--output-dir',
+ $name_pipeline_dir,
+ '--progress-every',
+ '1',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) // Fault name handed to the worker; presumably an environment override that injects a deliberate defect.
+);
+check( 'faulted name-sweep worker reports findings', 1 === $faulted_name_worker['code'], $faulted_name_worker['stdout'] . $faulted_name_worker['stderr'] ); // Exit code 1 is the expected "findings" result here.
+
+$name_failure_files = glob( $name_pipeline_dir . '/failure-*/failure.json' );
+check( 'faulted name-sweep worker writes failure artifact', is_array( $name_failure_files ) && array() !== $name_failure_files );
+
+$name_failure_file = is_array( $name_failure_files ) && array() !== $name_failure_files ? $name_failure_files[0] : null; // First artifact only; null skips the dependent checks below.
+if ( null !== $name_failure_file ) {
+ $name_manifest = json_decode( (string) file_get_contents( $name_failure_file ), true );
+ check(
+ 'name-sweep failure artifact records mode and signature',
+ 'names' === ( $name_manifest['mode'] ?? null ) &&
+ 'name-sweep' === ( $name_manifest['strategy'] ?? null ) &&
+ 11593 === ( $name_manifest['case'] ?? null ) &&
+ in_array( 'decode-mismatch:attribute', $name_manifest['signatures'] ?? array(), true ),
+ json_encode( $name_manifest )
+ );
+
+ $name_fault_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $name_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+ );
+ check( 'faulted name-sweep replay reproduces finding', 1 === $name_fault_replay['code'], $name_fault_replay['stdout'] . $name_fault_replay['stderr'] ); // Replaying the artifact under the same fault must exit 1 again.
+
+ $name_fault_minimize = run_process(
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $name_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+ );
+ check( 'faulted name-sweep minimizer preserves signature', 0 === $name_fault_minimize['code'], $name_fault_minimize['stdout'] . $name_fault_minimize['stderr'] ); // The minimizer is expected to exit 0.
+}
+remove_tree( $name_pipeline_dir );
+
+$legacy_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-legacy-follower-fault-' . getmypid(); // Fault pipeline for legacy-followers mode: worker -> failure artifact -> replay -> minimize.
+remove_tree( $legacy_pipeline_dir );
+$faulted_legacy_worker = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'legacy-followers',
+ '--seed',
+ '1',
+ '--start-case',
+ '0',
+ '--cases',
+ '80', // This pipeline scans 80 cases from case 0, where the other fault pipelines in this file run a single case.
+ '--output-dir',
+ $legacy_pipeline_dir,
+ '--progress-every',
+ '80',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+);
+check( 'faulted legacy-follower worker reports findings', 1 === $faulted_legacy_worker['code'], $faulted_legacy_worker['stdout'] . $faulted_legacy_worker['stderr'] );
+
+$legacy_failure_files = glob( $legacy_pipeline_dir . '/failure-*/failure.json' );
+check( 'faulted legacy-follower worker writes failure artifact', is_array( $legacy_failure_files ) && array() !== $legacy_failure_files );
+
+$legacy_failure_file = is_array( $legacy_failure_files ) && array() !== $legacy_failure_files ? $legacy_failure_files[0] : null; // First artifact only; null skips the dependent checks below.
+if ( null !== $legacy_failure_file ) {
+ $legacy_manifest = json_decode( (string) file_get_contents( $legacy_failure_file ), true );
+ check(
+ 'legacy-follower failure artifact records mode and signature', // No case index is asserted here; only mode, strategy and signature are checked.
+ 'legacy-followers' === ( $legacy_manifest['mode'] ?? null ) &&
+ 'legacy-follower-sweep' === ( $legacy_manifest['strategy'] ?? null ) &&
+ in_array( 'decode-mismatch:attribute', $legacy_manifest['signatures'] ?? array(), true ),
+ json_encode( $legacy_manifest )
+ );
+
+ $legacy_fault_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $legacy_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+ );
+ check( 'faulted legacy-follower replay reproduces finding', 1 === $legacy_fault_replay['code'], $legacy_fault_replay['stdout'] . $legacy_fault_replay['stderr'] );
+
+ $legacy_fault_minimize = run_process(
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $legacy_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+ );
+ check( 'faulted legacy-follower minimizer preserves signature', 0 === $legacy_fault_minimize['code'], $legacy_fault_minimize['stdout'] . $legacy_fault_minimize['stderr'] );
+}
+remove_tree( $legacy_pipeline_dir );
+
+$prefix_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-prefix-family-fault-' . getmypid(); // Fault pipeline for prefix-families mode: worker -> failure artifact -> replay -> minimize.
+remove_tree( $prefix_pipeline_dir );
+$faulted_prefix_worker = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'prefix-families',
+ '--seed',
+ '1',
+ '--start-case',
+ '37', // Exactly this one case is run (--cases 1); the manifest check below pins the same index.
+ '--cases',
+ '1',
+ '--output-dir',
+ $prefix_pipeline_dir,
+ '--progress-every',
+ '1',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+);
+check( 'faulted prefix-family worker reports findings', 1 === $faulted_prefix_worker['code'], $faulted_prefix_worker['stdout'] . $faulted_prefix_worker['stderr'] );
+
+$prefix_failure_files = glob( $prefix_pipeline_dir . '/failure-*/failure.json' );
+check( 'faulted prefix-family worker writes failure artifact', is_array( $prefix_failure_files ) && array() !== $prefix_failure_files );
+
+$prefix_failure_file = is_array( $prefix_failure_files ) && array() !== $prefix_failure_files ? $prefix_failure_files[0] : null; // First artifact only; null skips the dependent checks below.
+if ( null !== $prefix_failure_file ) {
+ $prefix_manifest = json_decode( (string) file_get_contents( $prefix_failure_file ), true );
+ check(
+ 'prefix-family failure artifact records mode and signature',
+ 'prefix-families' === ( $prefix_manifest['mode'] ?? null ) &&
+ 'prefix-family-sweep' === ( $prefix_manifest['strategy'] ?? null ) &&
+ 37 === ( $prefix_manifest['case'] ?? null ) &&
+ in_array( 'decode-mismatch:attribute', $prefix_manifest['signatures'] ?? array(), true ),
+ json_encode( $prefix_manifest )
+ );
+
+ $prefix_fault_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $prefix_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+ );
+ check( 'faulted prefix-family replay reproduces finding', 1 === $prefix_fault_replay['code'], $prefix_fault_replay['stdout'] . $prefix_fault_replay['stderr'] );
+
+ $prefix_fault_minimize = run_process(
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $prefix_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+ );
+ check( 'faulted prefix-family minimizer preserves signature', 0 === $prefix_fault_minimize['code'], $prefix_fault_minimize['stdout'] . $prefix_fault_minimize['stderr'] );
+}
+remove_tree( $prefix_pipeline_dir );
+
+$numeric_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-numeric-boundary-fault-' . getmypid(); // Fault pipeline for numeric-boundaries mode: worker -> failure artifact -> replay -> minimize.
+remove_tree( $numeric_pipeline_dir );
+$faulted_numeric_worker = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'numeric-boundaries',
+ '--seed',
+ '1',
+ '--start-case',
+ '0',
+ '--cases',
+ '1',
+ '--output-dir',
+ $numeric_pipeline_dir,
+ '--progress-every',
+ '1',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' ) // This pipeline uses the match-length fault; the manifest check below expects a reader-overran-input:text signature.
+);
+check( 'faulted numeric-boundary worker reports findings', 1 === $faulted_numeric_worker['code'], $faulted_numeric_worker['stdout'] . $faulted_numeric_worker['stderr'] );
+
+$numeric_failure_files = glob( $numeric_pipeline_dir . '/failure-*/failure.json' );
+check( 'faulted numeric-boundary worker writes failure artifact', is_array( $numeric_failure_files ) && array() !== $numeric_failure_files );
+
+$numeric_failure_file = is_array( $numeric_failure_files ) && array() !== $numeric_failure_files ? $numeric_failure_files[0] : null; // First artifact only; null skips the dependent checks below.
+if ( null !== $numeric_failure_file ) {
+ $numeric_manifest = json_decode( (string) file_get_contents( $numeric_failure_file ), true );
+ check(
+ 'numeric-boundary failure artifact records mode and signature',
+ 'numeric-boundaries' === ( $numeric_manifest['mode'] ?? null ) &&
+ 'numeric-boundary-sweep' === ( $numeric_manifest['strategy'] ?? null ) &&
+ 0 === ( $numeric_manifest['case'] ?? null ) &&
+ in_array( 'reader-overran-input:text', $numeric_manifest['signatures'] ?? array(), true ),
+ json_encode( $numeric_manifest )
+ );
+
+ $numeric_fault_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $numeric_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+ );
+ check( 'faulted numeric-boundary replay reproduces finding', 1 === $numeric_fault_replay['code'], $numeric_fault_replay['stdout'] . $numeric_fault_replay['stderr'] );
+
+ $numeric_fault_minimize = run_process(
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $numeric_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+ );
+ check( 'faulted numeric-boundary minimizer preserves signature', 0 === $numeric_fault_minimize['code'], $numeric_fault_minimize['stdout'] . $numeric_fault_minimize['stderr'] );
+}
+remove_tree( $numeric_pipeline_dir );
+
+$corpus_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-corpus-fault-' . getmypid(); // Fault pipeline for corpus mode (case 0): worker -> failure artifact -> replay -> minimize.
+remove_tree( $corpus_pipeline_dir );
+$faulted_corpus_worker = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'corpus',
+ '--seed',
+ '1',
+ '--start-case',
+ '0',
+ '--cases',
+ '1',
+ '--output-dir',
+ $corpus_pipeline_dir,
+ '--progress-every',
+ '1',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+);
+check( 'faulted corpus mutation worker reports findings', 1 === $faulted_corpus_worker['code'], $faulted_corpus_worker['stdout'] . $faulted_corpus_worker['stderr'] );
+
+$corpus_failure_files = glob( $corpus_pipeline_dir . '/failure-*/failure.json' );
+check( 'faulted corpus mutation worker writes failure artifact', is_array( $corpus_failure_files ) && array() !== $corpus_failure_files );
+
+$corpus_failure_file = is_array( $corpus_failure_files ) && array() !== $corpus_failure_files ? $corpus_failure_files[0] : null; // First artifact only; null skips the dependent checks below.
+if ( null !== $corpus_failure_file ) {
+ $corpus_manifest = json_decode( (string) file_get_contents( $corpus_failure_file ), true );
+ check(
+ 'corpus mutation failure artifact records mode and signature',
+ 'corpus' === ( $corpus_manifest['mode'] ?? null ) &&
+ 'corpus-byte-perturb' === ( $corpus_manifest['strategy'] ?? null ) &&
+ 0 === ( $corpus_manifest['case'] ?? null ) &&
+ in_array( 'reader-overran-input:text', $corpus_manifest['signatures'] ?? array(), true ),
+ json_encode( $corpus_manifest )
+ );
+
+ $corpus_fault_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $corpus_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+ );
+ check( 'faulted corpus mutation replay reproduces finding', 1 === $corpus_fault_replay['code'], $corpus_fault_replay['stdout'] . $corpus_fault_replay['stderr'] );
+
+ $corpus_fault_minimize = run_process(
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $corpus_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+ );
+ check( 'faulted corpus mutation minimizer preserves signature', 0 === $corpus_fault_minimize['code'], $corpus_fault_minimize['stdout'] . $corpus_fault_minimize['stderr'] );
+}
+remove_tree( $corpus_pipeline_dir );
+
+$single_level_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-single-level-fault-' . getmypid(); // Fault pipeline for the single-level-overdecode fault, run on corpus case 11875.
+remove_tree( $single_level_pipeline_dir );
+$faulted_single_level_worker = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'corpus',
+ '--seed',
+ '1',
+ '--start-case',
+ '11875', // The same corpus case is replayed elsewhere in this script with the expected preview bytes "&amp;amp;Z".
+ '--cases',
+ '1',
+ '--output-dir',
+ $single_level_pipeline_dir,
+ '--progress-every',
+ '1',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'single-level-overdecode' )
+);
+check( 'faulted single-level corpus worker reports findings', 1 === $faulted_single_level_worker['code'], $faulted_single_level_worker['stdout'] . $faulted_single_level_worker['stderr'] );
+
+$single_level_failure_files = glob( $single_level_pipeline_dir . '/failure-*/failure.json' );
+check( 'faulted single-level corpus worker writes failure artifact', is_array( $single_level_failure_files ) && array() !== $single_level_failure_files );
+
+$single_level_failure_file = is_array( $single_level_failure_files ) && array() !== $single_level_failure_files ? $single_level_failure_files[0] : null; // First artifact only; null skips the dependent checks below.
+if ( null !== $single_level_failure_file ) {
+ $single_level_manifest = json_decode( (string) file_get_contents( $single_level_failure_file ), true );
+ check(
+ 'single-level corpus failure artifact records mode and signature',
+ 'corpus' === ( $single_level_manifest['mode'] ?? null ) &&
+ 'corpus-splice' === ( $single_level_manifest['strategy'] ?? null ) &&
+ 11875 === ( $single_level_manifest['case'] ?? null ) &&
+ in_array( 'single-level-decode-overdecoded:text', $single_level_manifest['signatures'] ?? array(), true ) && // Both the :text and the :attribute signature must be recorded.
+ in_array( 'single-level-decode-overdecoded:attribute', $single_level_manifest['signatures'] ?? array(), true ),
+ json_encode( $single_level_manifest )
+ );
+
+ $single_level_fault_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $single_level_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'single-level-overdecode' )
+ );
+ check( 'faulted single-level corpus replay reproduces finding', 1 === $single_level_fault_replay['code'], $single_level_fault_replay['stdout'] . $single_level_fault_replay['stderr'] );
+
+ $single_level_fault_minimize = run_process(
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $single_level_failure_file, '--signature', 'single-level-decode-overdecoded:text' ), // The artifact carries two signatures; --signature names the one passed to the minimizer.
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'single-level-overdecode' )
+ );
+ check( 'faulted single-level corpus minimizer preserves signature', 0 === $single_level_fault_minimize['code'], $single_level_fault_minimize['stdout'] . $single_level_fault_minimize['stderr'] );
+}
+remove_tree( $single_level_pipeline_dir );
+
+if ( null !== $token_map_fault_case_index ) { // Fault pipeline for token-map mode; the whole pipeline is skipped when no fault case index was set earlier in the file.
+ $token_map_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-token-map-fault-' . getmypid();
+ remove_tree( $token_map_pipeline_dir );
+ $faulted_token_map_worker = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'token-map',
+ '--seed',
+ '1',
+ '--start-case',
+ (string) $token_map_fault_case_index, // Cast to string for the argument list; the manifest check below compares against the uncast value.
+ '--cases',
+ '1',
+ '--output-dir',
+ $token_map_pipeline_dir,
+ '--progress-every',
+ '1',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+ );
+ check( 'faulted token-map worker reports findings', 1 === $faulted_token_map_worker['code'], $faulted_token_map_worker['stdout'] . $faulted_token_map_worker['stderr'] );
+
+ $token_map_failure_files = glob( $token_map_pipeline_dir . '/failure-*/failure.json' );
+ check( 'faulted token-map worker writes failure artifact', is_array( $token_map_failure_files ) && array() !== $token_map_failure_files );
+
+ $token_map_failure_file = is_array( $token_map_failure_files ) && array() !== $token_map_failure_files ? $token_map_failure_files[0] : null; // First artifact only; null skips the dependent checks below.
+ if ( null !== $token_map_failure_file ) {
+ $token_map_manifest = json_decode( (string) file_get_contents( $token_map_failure_file ), true );
+ check(
+ 'token-map failure artifact records mode and signature',
+ 'token-map' === ( $token_map_manifest['mode'] ?? null ) &&
+ 'token-map-structure-sweep' === ( $token_map_manifest['strategy'] ?? null ) &&
+ $token_map_fault_case_index === ( $token_map_manifest['case'] ?? null ) && // Strict comparison: passes only if the index has the same type as the decoded JSON value.
+ in_array( 'decode-mismatch:attribute', $token_map_manifest['signatures'] ?? array(), true ),
+ json_encode( $token_map_manifest )
+ );
+
+ $token_map_fault_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $token_map_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+ );
+ check( 'faulted token-map replay reproduces finding', 1 === $token_map_fault_replay['code'], $token_map_fault_replay['stdout'] . $token_map_fault_replay['stderr'] );
+
+ $token_map_fault_minimize = run_process(
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $token_map_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
+ );
+ check( 'faulted token-map minimizer preserves signature', 0 === $token_map_fault_minimize['code'], $token_map_fault_minimize['stdout'] . $token_map_fault_minimize['stderr'] );
+ }
+ remove_tree( $token_map_pipeline_dir );
+}
+
+$coverage_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-coverage-fault-' . getmypid(); // Fault pipeline for coverage mode: worker -> failure artifact -> replay -> minimize.
+remove_tree( $coverage_pipeline_dir );
+$faulted_coverage_worker = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'coverage',
+ '--seed',
+ '1',
+ '--start-case',
+ '57',
+ '--cases',
+ '1',
+ '--output-dir',
+ $coverage_pipeline_dir,
+ '--progress-every',
+ '1',
+ ),
+ array(
+ 'HTML_DECODER_FUZZ_FAKE_COVERAGE' => '1', // Only the worker receives the fake-coverage flag; the replay and minimize runs below set the fault alone.
+ 'HTML_DECODER_FUZZ_FAULT' => 'reader-empty-chunk',
+ )
+);
+check( 'faulted coverage worker reports findings', 1 === $faulted_coverage_worker['code'], $faulted_coverage_worker['stdout'] . $faulted_coverage_worker['stderr'] );
+
+$coverage_failure_files = glob( $coverage_pipeline_dir . '/failure-*/failure.json' );
+check( 'faulted coverage worker writes failure artifact', is_array( $coverage_failure_files ) && array() !== $coverage_failure_files );
+
+$coverage_failure_file = is_array( $coverage_failure_files ) && array() !== $coverage_failure_files ? $coverage_failure_files[0] : null; // First artifact only; null skips the dependent checks below.
+if ( null !== $coverage_failure_file ) {
+ $coverage_manifest = json_decode( (string) file_get_contents( $coverage_failure_file ), true );
+ check(
+ 'coverage failure artifact records mode and signature', // Unlike the other pipelines' manifest checks, no strategy name is asserted here.
+ 'coverage' === ( $coverage_manifest['mode'] ?? null ) &&
+ 57 === ( $coverage_manifest['case'] ?? null ) &&
+ in_array( 'reader-returned-empty-chunk:text', $coverage_manifest['signatures'] ?? array(), true ),
+ json_encode( $coverage_manifest )
+ );
+
+ $coverage_fault_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $coverage_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'reader-empty-chunk' )
+ );
+ check( 'faulted coverage replay reproduces finding', 1 === $coverage_fault_replay['code'], $coverage_fault_replay['stdout'] . $coverage_fault_replay['stderr'] );
+
+ $coverage_fault_minimize = run_process(
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $coverage_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'reader-empty-chunk' )
+ );
+ check( 'faulted coverage minimizer preserves signature', 0 === $coverage_fault_minimize['code'], $coverage_fault_minimize['stdout'] . $coverage_fault_minimize['stderr'] );
+}
+remove_tree( $coverage_pipeline_dir );
+
+$reader_fault_pipelines = array( // Table of fault pipelines: fault name, the seed-1 case index to run, and the signature the artifact must contain.
+ array(
+ 'fault' => 'reader-empty-chunk',
+ 'case' => 57,
+ 'signature' => 'reader-returned-empty-chunk:text',
+ ),
+ array(
+ 'fault' => 'reader-short-match-length',
+ 'case' => 57,
+ 'signature' => 'reader-match-too-short:text',
+ ),
+ array(
+ 'fault' => 'reader-substring-composition',
+ 'case' => 97,
+ 'signature' => 'reader-composition-mismatch:text',
+ ),
+ array(
+ 'fault' => 'reader-null-mutates-match-length',
+ 'case' => 7,
+ 'signature' => 'reader-mutated-match-length-on-null:text',
+ ),
+ array(
+ 'fault' => 'reader-non-amp-match',
+ 'case' => 0,
+ 'signature' => 'reader-non-amp-match:text',
+ ),
+ array(
+ 'fault' => 'reader-gapless-drop-span',
+ 'case' => 0,
+ 'signature' => 'reader-walk-not-gapless:text',
+ ),
+ array(
+ 'fault' => 'numeric-invalid-not-replacement',
+ 'case' => 0,
+ 'signature' => 'numeric-invalid-not-replacement:text',
+ ),
+ array(
+ 'fault' => 'numeric-c1-not-remapped',
+ 'case' => 2,
+ 'signature' => 'numeric-c1-not-remapped:text',
+ ),
+ array(
+ 'fault' => 'text-secondary-oracle',
+ 'case' => 4,
+ 'signature' => 'text-secondary-oracle-mismatch:text',
+ 'minimize_signature' => 'text-secondary-oracle-mismatch:text', // Optional key: when present it is passed to minimize.php via --signature.
+ ),
+ array(
+ 'fault' => 'attribute-no-amp-identity',
+ 'case' => 38,
+ 'signature' => 'attribute-without-ampersand-not-identity:attribute',
+ ),
+);
+foreach ( $reader_fault_pipelines as $reader_pipeline ) { // Each entry runs the same sequence: worker -> failure artifact -> replay -> minimize.
+ $reader_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-' . $reader_pipeline['fault'] . '-' . getmypid(); // Scratch dir is keyed by fault name and PID.
+ remove_tree( $reader_pipeline_dir );
+ $faulted_reader_worker = run_process(
+ array( // No --mode argument is passed here; the manifest check below expects mode 'oracle'.
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--seed',
+ '1',
+ '--start-case',
+ (string) $reader_pipeline['case'],
+ '--cases',
+ '1',
+ '--output-dir',
+ $reader_pipeline_dir,
+ '--progress-every',
+ '1',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => $reader_pipeline['fault'] )
+ );
+ check( "faulted {$reader_pipeline['fault']} worker reports findings", 1 === $faulted_reader_worker['code'], $faulted_reader_worker['stdout'] . $faulted_reader_worker['stderr'] );
+
+ $reader_failure_files = glob( $reader_pipeline_dir . '/failure-*/failure.json' );
+ check( "faulted {$reader_pipeline['fault']} worker writes failure artifact", is_array( $reader_failure_files ) && array() !== $reader_failure_files );
+
+ $reader_failure_file = is_array( $reader_failure_files ) && array() !== $reader_failure_files ? $reader_failure_files[0] : null; // First artifact only; null skips the dependent checks below.
+ if ( null !== $reader_failure_file ) {
+ $reader_manifest = json_decode( (string) file_get_contents( $reader_failure_file ), true );
+ check(
+ "{$reader_pipeline['fault']} failure artifact records mode and signature",
+ 'oracle' === ( $reader_manifest['mode'] ?? null ) &&
+ $reader_pipeline['case'] === ( $reader_manifest['case'] ?? null ) && // Table cases are integers, so the decoded JSON value must be an integer too.
+ in_array( $reader_pipeline['signature'], $reader_manifest['signatures'] ?? array(), true ),
+ json_encode( $reader_manifest )
+ );
+
+ $reader_fault_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $reader_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => $reader_pipeline['fault'] )
+ );
+ check( "faulted {$reader_pipeline['fault']} replay reproduces finding", 1 === $reader_fault_replay['code'], $reader_fault_replay['stdout'] . $reader_fault_replay['stderr'] );
+
+ $reader_fault_minimize_command = array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $reader_failure_file );
+ if ( isset( $reader_pipeline['minimize_signature'] ) ) { // Append --signature only for entries that name one.
+ $reader_fault_minimize_command[] = '--signature';
+ $reader_fault_minimize_command[] = $reader_pipeline['minimize_signature'];
+ }
+ $reader_fault_minimize = run_process(
+ $reader_fault_minimize_command,
+ array( 'HTML_DECODER_FUZZ_FAULT' => $reader_pipeline['fault'] )
+ );
+ check( "faulted {$reader_pipeline['fault']} minimizer preserves signature", 0 === $reader_fault_minimize['code'], $reader_fault_minimize['stdout'] . $reader_fault_minimize['stderr'] );
+ }
+ remove_tree( $reader_pipeline_dir );
+}
+
+$zero_cases = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--cases', '0' ) ); // Argument validation: a zero case count must be rejected with exit code 2.
+check( 'worker rejects zero cases', 2 === $zero_cases['code'], $zero_cases['stdout'] . $zero_cases['stderr'] );
+
+$zero_batch = run_process( array( PHP_BINARY, __DIR__ . '/../runner.php', '--cases-per-batch', '0', '--duration-seconds', '1', '--output-dir', sys_get_temp_dir() . '/html-decoder-fuzz-smoke-bad-runner-' . getmypid() ) ); // NOTE(review): this output dir is never removed here; confirm the runner rejects the arguments before creating it.
+check( 'runner rejects zero cases per batch', 2 === $zero_batch['code'], $zero_batch['stdout'] . $zero_batch['stderr'] );
+
+// A read-only (0555) output directory is expected to make the runner exit with
+// status 2.
+$unwritable_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-unwritable-' . getmypid();
+remove_tree( $unwritable_dir );
+mkdir( $unwritable_dir, 0555, true );
+chmod( $unwritable_dir, 0555 );
+clearstatcache( true, $unwritable_dir );
+// Mode bits are not enforced for every process (a privileged user can still
+// write), so the scenario only runs when this process really cannot write to
+// the directory. Otherwise it is recorded as unavailable, like the link-based
+// scenarios in this script.
+$unwritable_enforced = ! is_writable( $unwritable_dir );
+$unwritable_runner   = $unwritable_enforced
+	? run_process( array( PHP_BINARY, __DIR__ . '/../runner.php', '--duration-seconds', '1', '--output-dir', $unwritable_dir ) )
+	: null;
+// Restore write permission so the directory can be removed again.
+chmod( $unwritable_dir, 0755 );
+clearstatcache( true, $unwritable_dir );
+remove_tree( $unwritable_dir );
+if ( $unwritable_enforced ) {
+	check( 'runner rejects unwritable output dir', 2 === $unwritable_runner['code'], $unwritable_runner['stdout'] . $unwritable_runner['stderr'] );
+} else {
+	check( 'runner rejects unwritable output dir', true, 'unwritable directory unavailable' );
+}
+
+// A write/search-only (0333) output directory is expected to make the runner
+// exit with status 2.
+$unreadable_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-unreadable-' . getmypid();
+remove_tree( $unreadable_dir );
+mkdir( $unreadable_dir, 0333, true );
+chmod( $unreadable_dir, 0333 );
+clearstatcache( true, $unreadable_dir );
+// Mode bits are not enforced for every process (a privileged user can still
+// read), so the scenario only runs when this process really cannot read the
+// directory. Otherwise it is recorded as unavailable, like the link-based
+// scenarios in this script.
+$unreadable_enforced = ! is_readable( $unreadable_dir );
+$unreadable_runner   = $unreadable_enforced
+	? run_process( array( PHP_BINARY, __DIR__ . '/../runner.php', '--duration-seconds', '1', '--output-dir', $unreadable_dir ) )
+	: null;
+// Restore read permission so the directory can be removed again.
+chmod( $unreadable_dir, 0755 );
+clearstatcache( true, $unreadable_dir );
+remove_tree( $unreadable_dir );
+if ( $unreadable_enforced ) {
+	check( 'runner rejects unreadable output dir', 2 === $unreadable_runner['code'], $unreadable_runner['stdout'] . $unreadable_runner['stderr'] );
+} else {
+	check( 'runner rejects unreadable output dir', true, 'unreadable directory unavailable' );
+}
+
+// Occupy the state.json path with a directory, then expect a single-case
+// runner invocation to exit with status 2.
+$bad_state_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-bad-state-' . getmypid();
+remove_tree( $bad_state_dir );
+mkdir( $bad_state_dir, 0777, true );
+mkdir( $bad_state_dir . '/state.json' );
+$bad_state_command = array(
+	PHP_BINARY,
+	__DIR__ . '/../runner.php',
+	'--lanes',
+	'1',
+	'--duration-seconds',
+	'0',
+	'--max-cases',
+	'1',
+	'--cases-per-batch',
+	'1',
+	'--output-dir',
+	$bad_state_dir,
+);
+$bad_state_runner  = run_process( $bad_state_command );
+remove_tree( $bad_state_dir );
+check(
+	'runner reports state write failures',
+	2 === $bad_state_runner['code'],
+	$bad_state_runner['stdout'] . $bad_state_runner['stderr']
+);
+
+// Pre-create state.json as a hard link to a sentinel file that lives outside
+// the output directory. The runner is expected to exit with status 2 and the
+// sentinel content must be unchanged afterwards.
+$state_hardlink_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-state-hardlink-' . getmypid();
+remove_tree( $state_hardlink_dir );
+mkdir( $state_hardlink_dir, 0777, true );
+$state_hardlink_target = $state_hardlink_dir . '-target';
+file_put_contents( $state_hardlink_target, "sentinel\n" );
+$state_hardlink_created = @link( $state_hardlink_target, $state_hardlink_dir . '/state.json' );
+if ( ! $state_hardlink_created ) {
+	// The link could not be created, so the scenario is recorded as passing.
+	check( 'runner rejects hardlinked state output', true, 'hardlink unavailable' );
+} else {
+	$state_hardlink_command = array(
+		PHP_BINARY,
+		__DIR__ . '/../runner.php',
+		'--lanes',
+		'1',
+		'--duration-seconds',
+		'0',
+		'--max-cases',
+		'1',
+		'--cases-per-batch',
+		'1',
+		'--summary-mode',
+		'none',
+		'--output-dir',
+		$state_hardlink_dir,
+	);
+	$state_hardlink_runner  = run_process( $state_hardlink_command );
+	$state_hardlink_passed  = 2 === $state_hardlink_runner['code'] && "sentinel\n" === file_get_contents( $state_hardlink_target );
+	check(
+		'runner rejects hardlinked state output',
+		$state_hardlink_passed,
+		$state_hardlink_runner['stdout'] . $state_hardlink_runner['stderr']
+	);
+}
+remove_tree( $state_hardlink_dir );
+@unlink( $state_hardlink_target );
+
+// Occupy the summary.ndjson path with a directory, then expect a single-case
+// runner invocation to exit with status 2.
+$bad_summary_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-bad-summary-' . getmypid();
+remove_tree( $bad_summary_dir );
+mkdir( $bad_summary_dir, 0777, true );
+mkdir( $bad_summary_dir . '/summary.ndjson' );
+$bad_summary_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--output-dir',
+ $bad_summary_dir,
+ )
+);
+remove_tree( $bad_summary_dir );
+check( 'runner reports summary open failures', 2 === $bad_summary_runner['code'], $bad_summary_runner['stdout'] . $bad_summary_runner['stderr'] );
+
+// Pre-create summary.ndjson as a symlink to a sentinel file outside the output
+// directory. The runner is expected to exit with status 2 and the sentinel
+// content must be unchanged afterwards.
+$summary_symlink_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-summary-symlink-' . getmypid();
+remove_tree( $summary_symlink_dir );
+mkdir( $summary_symlink_dir, 0777, true );
+$summary_symlink_target = $summary_symlink_dir . '-target';
+file_put_contents( $summary_symlink_target, "sentinel\n" );
+$summary_symlink_created = @symlink( $summary_symlink_target, $summary_symlink_dir . '/summary.ndjson' );
+if ( $summary_symlink_created ) {
+ $summary_symlink_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--output-dir',
+ $summary_symlink_dir,
+ )
+ );
+ check(
+ 'runner rejects symlinked summary output',
+ 2 === $summary_symlink_runner['code'] && "sentinel\n" === file_get_contents( $summary_symlink_target ),
+ $summary_symlink_runner['stdout'] . $summary_symlink_runner['stderr']
+ );
+} else {
+ // The symlink could not be created, so the scenario is recorded as passing.
+ check( 'runner rejects symlinked summary output', true, 'symlink unavailable' );
+}
+remove_tree( $summary_symlink_dir );
+@unlink( $summary_symlink_target );
+
+// Pre-create summary.ndjson as a hard link to a sentinel file outside the
+// output directory. The runner is expected to exit with status 2 and the
+// sentinel content must be unchanged afterwards.
+$summary_hardlink_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-summary-hardlink-' . getmypid();
+remove_tree( $summary_hardlink_dir );
+mkdir( $summary_hardlink_dir, 0777, true );
+$summary_hardlink_target = $summary_hardlink_dir . '-target';
+file_put_contents( $summary_hardlink_target, "sentinel\n" );
+$summary_hardlink_created = @link( $summary_hardlink_target, $summary_hardlink_dir . '/summary.ndjson' );
+if ( $summary_hardlink_created ) {
+ $summary_hardlink_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--output-dir',
+ $summary_hardlink_dir,
+ )
+ );
+ check(
+ 'runner rejects hardlinked summary output',
+ 2 === $summary_hardlink_runner['code'] && "sentinel\n" === file_get_contents( $summary_hardlink_target ),
+ $summary_hardlink_runner['stdout'] . $summary_hardlink_runner['stderr']
+ );
+} else {
+ // The hard link could not be created, so the scenario is recorded as passing.
+ check( 'runner rejects hardlinked summary output', true, 'hardlink unavailable' );
+}
+remove_tree( $summary_hardlink_dir );
+@unlink( $summary_hardlink_target );
+
+// Pre-create lane-0-stderr.log as a symlink to a sentinel file outside the
+// output directory. The runner is expected to exit with status 2 and the
+// sentinel content must be unchanged afterwards.
+$lane_stderr_symlink_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-symlink-' . getmypid();
+remove_tree( $lane_stderr_symlink_dir );
+mkdir( $lane_stderr_symlink_dir, 0777, true );
+$lane_stderr_symlink_target = $lane_stderr_symlink_dir . '-target';
+file_put_contents( $lane_stderr_symlink_target, "sentinel\n" );
+$lane_stderr_symlink_created = @symlink( $lane_stderr_symlink_target, $lane_stderr_symlink_dir . '/lane-0-stderr.log' );
+if ( $lane_stderr_symlink_created ) {
+ $lane_stderr_symlink_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--output-dir',
+ $lane_stderr_symlink_dir,
+ )
+ );
+ check(
+ 'runner rejects symlinked lane stderr output',
+ 2 === $lane_stderr_symlink_runner['code'] && "sentinel\n" === file_get_contents( $lane_stderr_symlink_target ),
+ $lane_stderr_symlink_runner['stdout'] . $lane_stderr_symlink_runner['stderr']
+ );
+} else {
+ // The symlink could not be created, so the scenario is recorded as passing.
+ check( 'runner rejects symlinked lane stderr output', true, 'symlink unavailable' );
+}
+remove_tree( $lane_stderr_symlink_dir );
+@unlink( $lane_stderr_symlink_target );
+
+// Pre-create lane-0-stderr.log as a hard link to a sentinel file outside the
+// output directory. The runner is expected to exit with status 2 and the
+// sentinel content must be unchanged afterwards.
+$lane_stderr_hardlink_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-hardlink-' . getmypid();
+remove_tree( $lane_stderr_hardlink_dir );
+mkdir( $lane_stderr_hardlink_dir, 0777, true );
+$lane_stderr_hardlink_target = $lane_stderr_hardlink_dir . '-target';
+file_put_contents( $lane_stderr_hardlink_target, "sentinel\n" );
+$lane_stderr_hardlink_created = @link( $lane_stderr_hardlink_target, $lane_stderr_hardlink_dir . '/lane-0-stderr.log' );
+if ( $lane_stderr_hardlink_created ) {
+ $lane_stderr_hardlink_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--output-dir',
+ $lane_stderr_hardlink_dir,
+ )
+ );
+ check(
+ 'runner rejects hardlinked lane stderr output',
+ 2 === $lane_stderr_hardlink_runner['code'] && "sentinel\n" === file_get_contents( $lane_stderr_hardlink_target ),
+ $lane_stderr_hardlink_runner['stdout'] . $lane_stderr_hardlink_runner['stderr']
+ );
+} else {
+ // The hard link could not be created, so the scenario is recorded as passing.
+ check( 'runner rejects hardlinked lane stderr output', true, 'hardlink unavailable' );
+}
+remove_tree( $lane_stderr_hardlink_dir );
+@unlink( $lane_stderr_hardlink_target );
+
+// Pre-create lane-0-stderr.log as a FIFO and expect the runner to exit with
+// status 2. The scenario needs posix_mkfifo() to exist and to succeed.
+$lane_stderr_fifo_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-fifo-' . getmypid();
+remove_tree( $lane_stderr_fifo_dir );
+mkdir( $lane_stderr_fifo_dir, 0777, true );
+$lane_stderr_fifo_created = function_exists( 'posix_mkfifo' ) && @posix_mkfifo( $lane_stderr_fifo_dir . '/lane-0-stderr.log', 0600 );
+if ( ! $lane_stderr_fifo_created ) {
+	// No FIFO could be created, so the scenario is recorded as passing.
+	check( 'runner rejects non-regular lane stderr output', true, 'fifo unavailable' );
+} else {
+	$lane_stderr_fifo_command = array(
+		PHP_BINARY,
+		__DIR__ . '/../runner.php',
+		'--lanes',
+		'1',
+		'--duration-seconds',
+		'0',
+		'--max-cases',
+		'1',
+		'--cases-per-batch',
+		'1',
+		'--output-dir',
+		$lane_stderr_fifo_dir,
+	);
+	$lane_stderr_fifo_runner  = run_process( $lane_stderr_fifo_command );
+	check(
+		'runner rejects non-regular lane stderr output',
+		2 === $lane_stderr_fifo_runner['code'],
+		$lane_stderr_fifo_runner['stdout'] . $lane_stderr_fifo_runner['stderr']
+	);
+}
+remove_tree( $lane_stderr_fifo_dir );
+
+// Run 40 cases with `--max-stderr-bytes 128` and check that the lane stderr log
+// stays within 128 bytes and that state.json lists exactly one entry under
+// `worker_stderr_truncated`.
+// NOTE(review): HTML_DECODER_FUZZ_STDERR_BYTES_PER_CASE presumably makes the
+// worker emit that many stderr bytes per case; confirm against the worker.
+$lane_stderr_cap_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-cap-' . getmypid();
+remove_tree( $lane_stderr_cap_dir );
+$lane_stderr_cap_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '40',
+ '--cases-per-batch',
+ '20',
+ '--max-stderr-bytes',
+ '128',
+ '--output-dir',
+ $lane_stderr_cap_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_STDERR_BYTES_PER_CASE' => '100' )
+);
+// A missing state.json or stderr log is treated as empty state / zero bytes.
+$lane_stderr_cap_state = is_file( $lane_stderr_cap_dir . '/state.json' )
+ ? json_decode( (string) file_get_contents( $lane_stderr_cap_dir . '/state.json' ), true )
+ : array();
+$lane_stderr_cap_size = is_file( $lane_stderr_cap_dir . '/lane-0-stderr.log' ) ? filesize( $lane_stderr_cap_dir . '/lane-0-stderr.log' ) : 0;
+check(
+ 'runner caps per-lane stderr logs',
+ 0 === $lane_stderr_cap_runner['code'] &&
+ $lane_stderr_cap_size <= 128 &&
+ 1 === count( $lane_stderr_cap_state['worker_stderr_truncated'] ?? array() ),
+ $lane_stderr_cap_runner['stdout'] . $lane_stderr_cap_runner['stderr'] . ' stderr_size=' . $lane_stderr_cap_size . ' state=' . json_encode( $lane_stderr_cap_state['worker_stderr_truncated'] ?? null )
+);
+// Second run with the same options into the same, not yet removed, output
+// directory: the log must still be within the cap afterwards.
+$lane_stderr_cap_reuse_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '40',
+ '--cases-per-batch',
+ '20',
+ '--max-stderr-bytes',
+ '128',
+ '--output-dir',
+ $lane_stderr_cap_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_STDERR_BYTES_PER_CASE' => '100' )
+);
+$lane_stderr_cap_reuse_size = is_file( $lane_stderr_cap_dir . '/lane-0-stderr.log' ) ? filesize( $lane_stderr_cap_dir . '/lane-0-stderr.log' ) : 0;
+check(
+ 'runner preserves per-lane stderr cap on reused output dirs',
+ 0 === $lane_stderr_cap_reuse_runner['code'] && $lane_stderr_cap_reuse_size <= 128,
+ $lane_stderr_cap_reuse_runner['stdout'] . $lane_stderr_cap_reuse_runner['stderr'] . ' stderr_size=' . $lane_stderr_cap_reuse_size
+);
+remove_tree( $lane_stderr_cap_dir );
+
+// Seed lane-0-stderr.log with 512 bytes, then run with `--max-stderr-bytes 128`.
+// The run must exit 0, the log must end up within 128 bytes, and state.json
+// must carry a non-empty `worker_stderr_startup_truncated` entry.
+$lane_stderr_oversize_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-oversize-' . getmypid();
+remove_tree( $lane_stderr_oversize_dir );
+mkdir( $lane_stderr_oversize_dir, 0777, true );
+file_put_contents( $lane_stderr_oversize_dir . '/lane-0-stderr.log', str_repeat( 'X', 512 ) );
+$lane_stderr_oversize_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--max-stderr-bytes',
+ '128',
+ '--output-dir',
+ $lane_stderr_oversize_dir,
+ )
+);
+// A missing state.json or stderr log is treated as empty state / zero bytes.
+$lane_stderr_oversize_state = is_file( $lane_stderr_oversize_dir . '/state.json' )
+ ? json_decode( (string) file_get_contents( $lane_stderr_oversize_dir . '/state.json' ), true )
+ : array();
+$lane_stderr_oversize_size = is_file( $lane_stderr_oversize_dir . '/lane-0-stderr.log' ) ? filesize( $lane_stderr_oversize_dir . '/lane-0-stderr.log' ) : 0;
+check(
+ 'runner truncates oversized reused lane stderr logs',
+ 0 === $lane_stderr_oversize_runner['code'] &&
+ $lane_stderr_oversize_size <= 128 &&
+ array() !== ( $lane_stderr_oversize_state['worker_stderr_startup_truncated'] ?? array() ),
+ $lane_stderr_oversize_runner['stdout'] . $lane_stderr_oversize_runner['stderr'] . ' stderr_size=' . $lane_stderr_oversize_size . ' state=' . json_encode( $lane_stderr_oversize_state['worker_stderr_startup_truncated'] ?? null )
+);
+remove_tree( $lane_stderr_oversize_dir );
+
+// Seed a 512-byte lane-1-stderr.log, then run with `--lanes 1`, so that log is
+// for a lane this run does not use (the other scenarios use lane-0-stderr.log
+// with one lane). Afterwards the file must be absent or within 128 bytes.
+$lane_stderr_stale_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-stale-' . getmypid();
+remove_tree( $lane_stderr_stale_dir );
+mkdir( $lane_stderr_stale_dir, 0777, true );
+file_put_contents( $lane_stderr_stale_dir . '/lane-1-stderr.log', str_repeat( 'X', 512 ) );
+$lane_stderr_stale_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--max-stderr-bytes',
+ '128',
+ '--output-dir',
+ $lane_stderr_stale_dir,
+ )
+);
+// A removed log counts as size 0 and therefore satisfies the cap.
+$lane_stderr_stale_size = is_file( $lane_stderr_stale_dir . '/lane-1-stderr.log' ) ? filesize( $lane_stderr_stale_dir . '/lane-1-stderr.log' ) : 0;
+check(
+ 'runner truncates stale stderr logs from inactive lanes',
+ 0 === $lane_stderr_stale_runner['code'] && $lane_stderr_stale_size <= 128,
+ $lane_stderr_stale_runner['stdout'] . $lane_stderr_stale_runner['stderr'] . ' stderr_size=' . $lane_stderr_stale_size
+);
+remove_tree( $lane_stderr_stale_dir );
+
+// With `--summary-mode none` the run must exit 0 and must not leave a
+// summary.ndjson in the output directory.
+$no_summary_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-no-summary-' . getmypid();
+remove_tree( $no_summary_dir );
+$no_summary_command = array(
+	PHP_BINARY,
+	__DIR__ . '/../runner.php',
+	'--lanes',
+	'1',
+	'--duration-seconds',
+	'0',
+	'--max-cases',
+	'1',
+	'--cases-per-batch',
+	'1',
+	'--summary-mode',
+	'none',
+	'--output-dir',
+	$no_summary_dir,
+);
+$no_summary_runner  = run_process( $no_summary_command );
+$no_summary_passed  = 0 === $no_summary_runner['code'] && ! file_exists( $no_summary_dir . '/summary.ndjson' );
+check(
+	'runner can disable summary output',
+	$no_summary_passed,
+	$no_summary_runner['stdout'] . $no_summary_runner['stderr']
+);
+remove_tree( $no_summary_dir );
+
+// Seed a failure-orphan/ directory that holds only payload.txt (no
+// failure.json), then run with `--artifact-retention none`. The run must exit
+// 0, the directory must be gone, and state.json must report a positive
+// `artifact_retention.startup_pruned_partial` count.
+$partial_artifact_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-partial-artifact-' . getmypid();
+remove_tree( $partial_artifact_dir );
+mkdir( $partial_artifact_dir . '/failure-orphan', 0777, true );
+file_put_contents( $partial_artifact_dir . '/failure-orphan/payload.txt', 'orphaned payload' );
+$partial_artifact_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--artifact-retention',
+ 'none',
+ '--output-dir',
+ $partial_artifact_dir,
+ )
+);
+// A missing state.json is treated as empty state.
+$partial_artifact_state = is_file( $partial_artifact_dir . '/state.json' )
+ ? json_decode( (string) file_get_contents( $partial_artifact_dir . '/state.json' ), true )
+ : array();
+check(
+ 'runner prunes partial failure artifacts on startup',
+ 0 === $partial_artifact_runner['code'] &&
+ ! is_dir( $partial_artifact_dir . '/failure-orphan' ) &&
+ ( $partial_artifact_state['artifact_retention']['startup_pruned_partial'] ?? 0 ) > 0,
+ $partial_artifact_runner['stdout'] . $partial_artifact_runner['stderr'] . json_encode( $partial_artifact_state['artifact_retention'] ?? null )
+);
+remove_tree( $partial_artifact_dir );
+
+// Seed a failure-link symlink that points at keepdir/ inside the output
+// directory, then run with `--artifact-retention none`. The run must exit 0,
+// the link must be gone, and keepdir/important.txt must still exist.
+$symlink_artifact_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-symlink-artifact-' . getmypid();
+remove_tree( $symlink_artifact_dir );
+mkdir( $symlink_artifact_dir . '/keepdir', 0777, true );
+file_put_contents( $symlink_artifact_dir . '/keepdir/important.txt', 'keep me' );
+$symlink_created = @symlink( $symlink_artifact_dir . '/keepdir', $symlink_artifact_dir . '/failure-link' );
+if ( $symlink_created ) {
+ $symlink_artifact_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--artifact-retention',
+ 'none',
+ '--output-dir',
+ $symlink_artifact_dir,
+ )
+ );
+ check(
+ 'runner prunes artifact symlinks without deleting targets',
+ 0 === $symlink_artifact_runner['code'] &&
+ ! file_exists( $symlink_artifact_dir . '/failure-link' ) &&
+ is_file( $symlink_artifact_dir . '/keepdir/important.txt' ),
+ $symlink_artifact_runner['stdout'] . $symlink_artifact_runner['stderr']
+ );
+} else {
+ // The symlink could not be created, so the scenario is recorded as passing.
+ check( 'runner prunes artifact symlinks without deleting targets', true, 'symlink unavailable' );
+}
+remove_tree( $symlink_artifact_dir );
+
+// Use an output directory literally named `run-*` next to a sibling
+// `run-victim/` that contains a failure-link symlink. If the runner expanded
+// the `*` as a glob pattern it could reach the sibling; the check requires the
+// sibling's link (when it could be created) and its target file to survive.
+$glob_meta_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-glob-meta-' . getmypid();
+remove_tree( $glob_meta_dir );
+mkdir( $glob_meta_dir . '/run-*', 0777, true );
+mkdir( $glob_meta_dir . '/run-victim/keepdir', 0777, true );
+file_put_contents( $glob_meta_dir . '/run-victim/keepdir/important.txt', 'keep me' );
+$glob_meta_symlink_created = @symlink( $glob_meta_dir . '/run-victim/keepdir', $glob_meta_dir . '/run-victim/failure-link' );
+$glob_meta_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--artifact-retention',
+ 'none',
+ '--output-dir',
+ $glob_meta_dir . '/run-*',
+ )
+);
+check(
+ 'runner treats output dir metacharacters literally during startup pruning',
+ 0 === $glob_meta_runner['code'] &&
+ ( ! $glob_meta_symlink_created || file_exists( $glob_meta_dir . '/run-victim/failure-link' ) ) &&
+ is_file( $glob_meta_dir . '/run-victim/keepdir/important.txt' ),
+ $glob_meta_runner['stdout'] . $glob_meta_runner['stderr']
+);
+remove_tree( $glob_meta_dir );
+
+// Pre-create the failure-seed…-case… path as a symlink to keepdir/, which
+// already holds a failure.json. The faulted worker must exit 1, must not
+// create keepdir/payload.txt, and must leave a failure.json in a sibling
+// directory whose name carries a `-sig*` suffix.
+// NOTE(review): $skip_c1_fault_seed and $skip_c1_fault_case are set earlier in
+// the script, outside this excerpt; presumably they identify the case that
+// fails under the skip-c1-remap fault.
+$symlink_write_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-symlink-write-' . getmypid();
+remove_tree( $symlink_write_dir );
+mkdir( $symlink_write_dir . '/keepdir', 0777, true );
+file_put_contents(
+ $symlink_write_dir . '/keepdir/failure.json',
+ json_encode(
+ array(
+ 'signatures' => array( 'decode-mismatch:text', 'reader-decode-mismatch:text' ),
+ )
+ )
+);
+$symlink_write_created = @symlink( $symlink_write_dir . '/keepdir', $symlink_write_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}" );
+if ( $symlink_write_created ) {
+ $symlink_write_worker = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--seed',
+ (string) $skip_c1_fault_seed,
+ '--cases',
+ '200',
+ '--output-dir',
+ $symlink_write_dir,
+ '--progress-every',
+ '200',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+ );
+ $symlink_write_suffixed = glob( $symlink_write_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}-sig*/failure.json" );
+ check(
+ 'worker does not write through symlinked failure artifact dirs',
+ 1 === $symlink_write_worker['code'] &&
+ ! is_file( $symlink_write_dir . '/keepdir/payload.txt' ) &&
+ is_array( $symlink_write_suffixed ) &&
+ array() !== $symlink_write_suffixed,
+ $symlink_write_worker['stdout'] . $symlink_write_worker['stderr']
+ );
+} else {
+ // The symlink could not be created, so the scenario is recorded as passing.
+ check( 'worker does not write through symlinked failure artifact dirs', true, 'symlink unavailable' );
+}
+remove_tree( $symlink_write_dir );
+
+// Seed failure-bad/failure.json with an empty payload and no `failures` or
+// `input_size` entries, then run one faulted case with
+// `--max-artifacts-per-signature 1`. The run must exit 1, failure-bad/ must be
+// gone, state.json must report a positive `startup_pruned_partial` count, and
+// exactly one manifest with a `payload_base64` key must remain.
+$incomplete_manifest_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-incomplete-manifest-' . getmypid();
+remove_tree( $incomplete_manifest_dir );
+mkdir( $incomplete_manifest_dir . '/failure-bad', 0777, true );
+file_put_contents(
+ $incomplete_manifest_dir . '/failure-bad/failure.json',
+ json_encode(
+ array(
+ 'signatures' => array( 'reader-decode-mismatch:text' ),
+ 'context' => 'text',
+ 'payload_base64' => '',
+ )
+ )
+);
+$incomplete_manifest_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--seed-base',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $incomplete_manifest_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+);
+// A missing state.json is treated as empty state.
+$incomplete_manifest_state = is_file( $incomplete_manifest_dir . '/state.json' )
+ ? json_decode( (string) file_get_contents( $incomplete_manifest_dir . '/state.json' ), true )
+ : array();
+$incomplete_manifest_files = glob( $incomplete_manifest_dir . '/failure-*/failure.json' );
+// Only decode when exactly one manifest remains; any other count yields an
+// empty array, which fails the `payload_base64` condition below.
+$retained_manifest = is_array( $incomplete_manifest_files ) && 1 === count( $incomplete_manifest_files )
+ ? json_decode( (string) file_get_contents( $incomplete_manifest_files[0] ), true )
+ : array();
+check(
+ 'runner ignores incomplete manifests when enforcing retention cap',
+ 1 === $incomplete_manifest_runner['code'] &&
+ ! is_dir( $incomplete_manifest_dir . '/failure-bad' ) &&
+ ( $incomplete_manifest_state['artifact_retention']['startup_pruned_partial'] ?? 0 ) > 0 &&
+ is_array( $retained_manifest ) &&
+ isset( $retained_manifest['payload_base64'] ),
+ $incomplete_manifest_runner['stdout'] . $incomplete_manifest_runner['stderr'] . json_encode( $incomplete_manifest_state['artifact_retention'] ?? null )
+);
+remove_tree( $incomplete_manifest_dir );
+
+// Seed failure-fake/failure.json with every manifest field filled in but with
+// 'plain text' as its payload, then run one faulted case with
+// `--max-artifacts-per-signature 1`. The run must exit 1, failure-fake/ must
+// be gone, and the single remaining manifest must carry a payload other than
+// 'plain text'.
+// NOTE(review): presumably 'plain text' does not trigger the claimed
+// signature when replayed; confirm against the runner's startup verification.
+$nonreproducing_manifest_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-nonreproducing-manifest-' . getmypid();
+remove_tree( $nonreproducing_manifest_dir );
+mkdir( $nonreproducing_manifest_dir . '/failure-fake', 0777, true );
+$fake_payload = 'plain text';
+file_put_contents(
+ $nonreproducing_manifest_dir . '/failure-fake/failure.json',
+ json_encode(
+ array(
+ 'signatures' => array( 'reader-decode-mismatch:text' ),
+ 'context' => 'text',
+ 'input_size' => strlen( $fake_payload ),
+ 'payload_base64' => base64_encode( $fake_payload ),
+ 'failures' => array(
+ array( 'signature' => 'reader-decode-mismatch:text' ),
+ ),
+ )
+ )
+);
+$nonreproducing_manifest_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--seed-base',
+ '1',
+ '--cases-per-batch',
+ '1',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $nonreproducing_manifest_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+);
+// A missing state.json is treated as empty state.
+$nonreproducing_manifest_state = is_file( $nonreproducing_manifest_dir . '/state.json' )
+ ? json_decode( (string) file_get_contents( $nonreproducing_manifest_dir . '/state.json' ), true )
+ : array();
+$nonreproducing_manifest_files = glob( $nonreproducing_manifest_dir . '/failure-*/failure.json' );
+// Only decode when exactly one manifest remains; any other count yields an
+// empty array, which fails the `payload_base64` condition below.
+$nonreproducing_retained = is_array( $nonreproducing_manifest_files ) && 1 === count( $nonreproducing_manifest_files )
+ ? json_decode( (string) file_get_contents( $nonreproducing_manifest_files[0] ), true )
+ : array();
+check(
+ 'runner ignores non-reproducing manifests when enforcing retention cap',
+ 1 === $nonreproducing_manifest_runner['code'] &&
+ ! is_dir( $nonreproducing_manifest_dir . '/failure-fake' ) &&
+ ( $nonreproducing_manifest_state['artifact_retention']['startup_pruned_partial'] ?? 0 ) > 0 &&
+ is_array( $nonreproducing_retained ) &&
+ isset( $nonreproducing_retained['payload_base64'] ) &&
+ 'plain text' !== base64_decode( $nonreproducing_retained['payload_base64'], true ),
+ $nonreproducing_manifest_runner['stdout'] . $nonreproducing_manifest_runner['stderr'] . json_encode( $nonreproducing_manifest_state['artifact_retention'] ?? null )
+);
+remove_tree( $nonreproducing_manifest_dir );
+
+// Step 1: produce retained artifacts with a faulted 200-case run (must exit 1).
+// Step 2: rerun on the same output directory with mb_check_encoding disabled
+// via `-d disable_functions=...` and without a fault (must exit 0).
+// The set of retained failure.json files must be non-empty and identical
+// before and after step 2, and state.json must flag
+// `artifact_retention.startup_verification_unavailable`.
+// NOTE(review): presumably the runner relies on mb_check_encoding to re-verify
+// retained artifacts at startup; confirm against the runner.
+$unverified_manifest_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-unverified-manifest-' . getmypid();
+remove_tree( $unverified_manifest_dir );
+$unverified_seed_runner = run_process(
+	array(
+		PHP_BINARY,
+		__DIR__ . '/../runner.php',
+		'--lanes',
+		'1',
+		'--duration-seconds',
+		'0',
+		'--max-cases',
+		'200',
+		'--seed-base',
+		(string) $skip_c1_fault_seed,
+		'--cases-per-batch',
+		'200',
+		'--max-artifacts-per-signature',
+		'1',
+		'--output-dir',
+		$unverified_manifest_dir,
+	),
+	array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+);
+$unverified_before = glob( $unverified_manifest_dir . '/failure-*/failure.json' );
+$unverified_runner = run_process(
+	array(
+		PHP_BINARY,
+		'-d',
+		'disable_functions=mb_check_encoding',
+		__DIR__ . '/../runner.php',
+		'--lanes',
+		'1',
+		'--duration-seconds',
+		'0',
+		'--max-cases',
+		'1',
+		'--seed-base',
+		'9999',
+		'--cases-per-batch',
+		'1',
+		'--max-artifacts-per-signature',
+		'1',
+		'--output-dir',
+		$unverified_manifest_dir,
+	)
+);
+$unverified_after = glob( $unverified_manifest_dir . '/failure-*/failure.json' );
+// A missing state.json is treated as empty state.
+$unverified_state = is_file( $unverified_manifest_dir . '/state.json' )
+	? json_decode( (string) file_get_contents( $unverified_manifest_dir . '/state.json' ), true )
+	: array();
+check(
+	'runner preserves retained artifacts when startup verification is unavailable',
+	1 === $unverified_seed_runner['code'] &&
+	0 === $unverified_runner['code'] &&
+	is_array( $unverified_before ) &&
+	is_array( $unverified_after ) &&
+	// Comparing counts alone would pass with nothing retained (0 === 0) or
+	// with one artifact swapped for another, so require a non-empty list
+	// holding the same paths. glob() returns them sorted by default.
+	array() !== $unverified_before &&
+	$unverified_before === $unverified_after &&
+	( false !== ( $unverified_state['artifact_retention']['startup_verification_unavailable'] ?? false ) ),
+	$unverified_seed_runner['stdout'] . $unverified_seed_runner['stderr'] . $unverified_runner['stdout'] . $unverified_runner['stderr'] . json_encode( $unverified_state['artifact_retention'] ?? null )
+);
+remove_tree( $unverified_manifest_dir );
+
+// Produce a real retained artifact with a faulted run, add a hand-written
+// failure-000weak/failure.json that lacks `input_size` and `failures`, then
+// rerun with mb_check_encoding disabled. The weak directory must be removed,
+// the real artifact must remain, and state.json must report both a positive
+// `startup_pruned_partial` count and `startup_verification_unavailable`.
+// NOTE(review): $skip_c1_fault_seed and $skip_c1_fault_case are set earlier in
+// the script, outside this excerpt.
+$unverified_weak_manifest_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-unverified-weak-manifest-' . getmypid();
+remove_tree( $unverified_weak_manifest_dir );
+$unverified_weak_seed_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--seed-base',
+ (string) $skip_c1_fault_seed,
+ '--cases-per-batch',
+ '200',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $unverified_weak_manifest_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+);
+mkdir( $unverified_weak_manifest_dir . '/failure-000weak', 0777, true );
+file_put_contents(
+ $unverified_weak_manifest_dir . '/failure-000weak/failure.json',
+ json_encode(
+ array(
+ 'signatures' => array( 'decode-mismatch:text', 'reader-decode-mismatch:text' ),
+ 'context' => 'text',
+ 'payload_base64' => base64_encode( 'x' ),
+ )
+ )
+);
+$unverified_weak_runner = run_process(
+ array(
+ PHP_BINARY,
+ '-d',
+ 'disable_functions=mb_check_encoding',
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--seed-base',
+ '9999',
+ '--cases-per-batch',
+ '1',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $unverified_weak_manifest_dir,
+ )
+);
+// A missing state.json is treated as empty state.
+$unverified_weak_state = is_file( $unverified_weak_manifest_dir . '/state.json' )
+ ? json_decode( (string) file_get_contents( $unverified_weak_manifest_dir . '/state.json' ), true )
+ : array();
+check(
+ 'runner ignores weak manifests when startup verification is unavailable',
+ 1 === $unverified_weak_seed_runner['code'] &&
+ 0 === $unverified_weak_runner['code'] &&
+ ! is_dir( $unverified_weak_manifest_dir . '/failure-000weak' ) &&
+ is_file( $unverified_weak_manifest_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}/failure.json" ) &&
+ ( $unverified_weak_state['artifact_retention']['startup_pruned_partial'] ?? 0 ) > 0 &&
+ ( false !== ( $unverified_weak_state['artifact_retention']['startup_verification_unavailable'] ?? false ) ),
+ $unverified_weak_seed_runner['stdout'] . $unverified_weak_seed_runner['stderr'] . $unverified_weak_runner['stdout'] . $unverified_weak_runner['stderr'] . json_encode( $unverified_weak_state['artifact_retention'] ?? null )
+);
+remove_tree( $unverified_weak_manifest_dir );
+
+// Produce a real retained artifact with a faulted run, add a hand-written
+// failure-000fake/failure.json that has every manifest field filled in, then
+// rerun with mb_check_encoding disabled. Both the fake and the real artifact
+// must remain, the `retained_by_signature` counts must sum to at least 2, and
+// state.json must flag `startup_verification_unavailable`.
+// NOTE(review): $skip_c1_fault_seed and $skip_c1_fault_case are set earlier in
+// the script, outside this excerpt.
+$unverified_fake_manifest_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-unverified-fake-manifest-' . getmypid();
+remove_tree( $unverified_fake_manifest_dir );
+$unverified_fake_seed_runner = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--seed-base',
+ (string) $skip_c1_fault_seed,
+ '--cases-per-batch',
+ '200',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $unverified_fake_manifest_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+);
+$fake_payload = 'x';
+mkdir( $unverified_fake_manifest_dir . '/failure-000fake', 0777, true );
+file_put_contents(
+ $unverified_fake_manifest_dir . '/failure-000fake/failure.json',
+ json_encode(
+ array(
+ 'signatures' => array( 'decode-mismatch:text', 'reader-decode-mismatch:text' ),
+ 'context' => 'text',
+ 'input_size' => strlen( $fake_payload ),
+ 'payload_base64' => base64_encode( $fake_payload ),
+ 'failures' => array(
+ array( 'signature' => 'decode-mismatch:text' ),
+ array( 'signature' => 'reader-decode-mismatch:text' ),
+ ),
+ )
+ )
+);
+$unverified_fake_runner = run_process(
+ array(
+ PHP_BINARY,
+ '-d',
+ 'disable_functions=mb_check_encoding',
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--seed-base',
+ '9999',
+ '--cases-per-batch',
+ '1',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $unverified_fake_manifest_dir,
+ )
+);
+// A missing state.json is treated as empty state.
+$unverified_fake_state = is_file( $unverified_fake_manifest_dir . '/state.json' )
+ ? json_decode( (string) file_get_contents( $unverified_fake_manifest_dir . '/state.json' ), true )
+ : array();
+$unverified_fake_counts = $unverified_fake_state['artifact_retention']['retained_by_signature'] ?? array();
+check(
+ 'runner preserves real artifacts when startup verification cannot reject full-shape fakes',
+ 1 === $unverified_fake_seed_runner['code'] &&
+ 0 === $unverified_fake_runner['code'] &&
+ is_file( $unverified_fake_manifest_dir . '/failure-000fake/failure.json' ) &&
+ is_file( $unverified_fake_manifest_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}/failure.json" ) &&
+ array_sum( is_array( $unverified_fake_counts ) ? $unverified_fake_counts : array() ) >= 2 &&
+ ( false !== ( $unverified_fake_state['artifact_retention']['startup_verification_unavailable'] ?? false ) ),
+ $unverified_fake_seed_runner['stdout'] . $unverified_fake_seed_runner['stderr'] . $unverified_fake_runner['stdout'] . $unverified_fake_runner['stderr'] . json_encode( $unverified_fake_state['artifact_retention'] ?? null )
+);
+remove_tree( $unverified_fake_manifest_dir );
+
+// A non-numeric `--cases` value is expected to make the worker exit with status 2.
+$bad_integer_command = array( PHP_BINARY, __DIR__ . '/../worker.php', '--cases', 'abc' );
+$bad_integer         = run_process( $bad_integer_command );
+check(
+	'worker rejects non-numeric integer options',
+	2 === $bad_integer['code'],
+	$bad_integer['stdout'] . $bad_integer['stderr']
+);
+
+// A 39-digit `--cases` value is expected to make the worker exit with status 2.
+$huge_integer_command = array( PHP_BINARY, __DIR__ . '/../worker.php', '--cases', '999999999999999999999999999999999999999' );
+$huge_integer         = run_process( $huge_integer_command );
+check(
+	'worker rejects out-of-range integer options',
+	2 === $huge_integer['code'],
+	$huge_integer['stdout'] . $huge_integer['stderr']
+);
+
+// End-to-end check of `--mode bytes` under the byte-no-amp-identity fault:
+// the worker must exit 1 and write a failure artifact, the manifest must
+// record mode 'bytes', replay.php must exit 1, and minimize.php must exit 0.
+$byte_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-byte-' . getmypid();
+remove_tree( $byte_pipeline_dir );
+$faulted_byte_worker = run_process(
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'bytes',
+ '--seed',
+ '1',
+ '--cases',
+ '200',
+ '--output-dir',
+ $byte_pipeline_dir,
+ '--progress-every',
+ '200',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'byte-no-amp-identity' )
+);
+check( 'faulted byte-space worker reports findings', 1 === $faulted_byte_worker['code'], $faulted_byte_worker['stdout'] . $faulted_byte_worker['stderr'] );
+
+$byte_failure_files = glob( $byte_pipeline_dir . '/failure-*/failure.json' );
+check( 'faulted byte-space worker writes failure artifact', is_array( $byte_failure_files ) && array() !== $byte_failure_files );
+
+// Replay and minimize use the first manifest found; both are skipped when the
+// worker wrote none (the check above has already recorded that failure).
+$byte_failure_file = is_array( $byte_failure_files ) && array() !== $byte_failure_files ? $byte_failure_files[0] : null;
+if ( null !== $byte_failure_file ) {
+ $byte_manifest = json_decode( (string) file_get_contents( $byte_failure_file ), true );
+ check( 'byte-space failure artifact records mode', 'bytes' === ( $byte_manifest['mode'] ?? null ) );
+
+ $byte_replay = run_process(
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $byte_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'byte-no-amp-identity' )
+ );
+ check( 'faulted byte-space replay reproduces finding', 1 === $byte_replay['code'], $byte_replay['stdout'] . $byte_replay['stderr'] );
+
+ $byte_minimize = run_process(
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $byte_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'byte-no-amp-identity' )
+ );
+ check( 'faulted byte-space minimizer preserves signature', 0 === $byte_minimize['code'], $byte_minimize['stdout'] . $byte_minimize['stderr'] );
+}
+remove_tree( $byte_pipeline_dir );
+
+$raw_c1_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-raw-c1-' . getmypid(); // Scratch directory for the raw-C1 fault checks; the name includes the current PID.
+remove_tree( $raw_c1_pipeline_dir ); // Clear any leftover directory before the run.
+$faulted_raw_c1_worker = run_process( // Byte-mode worker limited to a single case of seed 1.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'bytes',
+ '--seed',
+ '1',
+ '--start-case',
+ '3', // With --cases 1 the worker loop covers exactly case index 3.
+ '--cases',
+ '1',
+ '--output-dir',
+ $raw_c1_pipeline_dir,
+ '--progress-every',
+ '1',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'raw-c1-not-pass-through' )
+);
+check( 'faulted raw-C1 byte worker reports findings', 1 === $faulted_raw_c1_worker['code'], $faulted_raw_c1_worker['stdout'] . $faulted_raw_c1_worker['stderr'] ); // Exit code 1 means the worker counted at least one failure.
+
+$raw_c1_failure_file = $raw_c1_pipeline_dir . '/failure-seed1-case3/failure.json'; // The worker names artifact directories failure-seed<seed>-case<case>.
+check( 'faulted raw-C1 byte worker writes failure artifact', is_file( $raw_c1_failure_file ) );
+
+if ( is_file( $raw_c1_failure_file ) ) { // The remaining checks need the manifest on disk.
+ $raw_c1_manifest = json_decode( (string) file_get_contents( $raw_c1_failure_file ), true );
+ check(
+ 'raw-C1 byte failure artifact records mode and signature',
+ 'bytes' === ( $raw_c1_manifest['mode'] ?? null ) &&
+ in_array( 'raw-c1-not-pass-through:text', $raw_c1_manifest['signatures'] ?? array(), true ), // The text-context signature of the injected fault must be listed.
+ json_encode( $raw_c1_manifest )
+ );
+
+ $raw_c1_replay = run_process( // Replay the stored failure with the same fault setting.
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $raw_c1_failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'raw-c1-not-pass-through' )
+ );
+ check( 'faulted raw-C1 byte replay reproduces finding', 1 === $raw_c1_replay['code'], $raw_c1_replay['stdout'] . $raw_c1_replay['stderr'] ); // Replay is expected to exit 1.
+
+ $raw_c1_minimize = run_process( // Minimize while naming the signature to preserve through --signature.
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $raw_c1_failure_file, '--signature', 'raw-c1-not-pass-through:text' ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'raw-c1-not-pass-through' )
+ );
+ check( 'faulted raw-C1 byte minimizer preserves signature', 0 === $raw_c1_minimize['code'], $raw_c1_minimize['stdout'] . $raw_c1_minimize['stderr'] ); // The minimizer is expected to exit 0.
+}
+remove_tree( $raw_c1_pipeline_dir ); // Remove the scratch directory.
+
+$byte_mode_collision_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-byte-mode-collision-' . getmypid(); // Scratch directory for the artifact-directory collision check.
+remove_tree( $byte_mode_collision_dir ); // Clear any leftover directory before the run.
+mkdir( $byte_mode_collision_dir . '/failure-seed1-case3', 0777, true ); // Pre-create the directory name the worker uses for seed 1, case 3.
+file_put_contents( // Plant a manifest that lists signatures but has no 'mode' key; worker.php treats a missing mode as 'oracle'.
+ $byte_mode_collision_dir . '/failure-seed1-case3/failure.json',
+ json_encode(
+ array(
+ 'signatures' => array( // Presumably the signatures the faulted byte-mode run reports for this case (see the check label below).
+ 'text-without-ampersand-not-identity:text',
+ 'reader-decode-mismatch:text',
+ 'attribute-without-ampersand-not-identity:attribute',
+ 'reader-decode-mismatch:attribute',
+ ),
+ )
+ )
+);
+$byte_mode_collision_worker = run_process( // Byte-mode worker over cases 0-3 of seed 1, writing into the pre-seeded directory.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--mode',
+ 'bytes',
+ '--seed',
+ '1',
+ '--cases',
+ '4',
+ '--output-dir',
+ $byte_mode_collision_dir,
+ '--progress-every',
+ '4',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'byte-no-amp-identity' )
+);
+$byte_mode_collision_suffixed = glob( $byte_mode_collision_dir . '/failure-seed1-case3-sig*/failure.json' ); // Artifacts the worker placed in a "-sig..." suffixed sibling directory.
+check(
+ 'worker separates same-signature artifacts by mode',
+ 1 === $byte_mode_collision_worker['code'] && // The worker reported failures.
+ is_file( $byte_mode_collision_dir . '/failure-seed1-case3/failure.json' ) && // The pre-seeded path still holds a manifest.
+ is_array( $byte_mode_collision_suffixed ) &&
+ array() !== $byte_mode_collision_suffixed, // At least one suffixed artifact directory was written.
+ $byte_mode_collision_worker['stdout'] . $byte_mode_collision_worker['stderr']
+);
+remove_tree( $byte_mode_collision_dir ); // Remove the scratch directory.
+
+$byte_runner_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-byte-runner-fault-' . getmypid(); // Scratch directory for the byte-mode runner check.
+remove_tree( $byte_runner_pipeline_dir ); // Clear any leftover directory before the run.
+$faulted_byte_runner = run_process( // Runner in byte mode: one lane, 200 cases in a single batch, at most one artifact per signature.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--mode',
+ 'bytes',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--cases-per-batch',
+ '200',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $byte_runner_pipeline_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'byte-no-amp-identity' )
+);
+$faulted_byte_runner_state = is_file( $byte_runner_pipeline_dir . '/state.json' ) // Decode the runner's state.json, or use an empty array when the file is missing.
+ ? json_decode( (string) file_get_contents( $byte_runner_pipeline_dir . '/state.json' ), true )
+ : array();
+$faulted_byte_runner_modes = array_unique( array_map( static fn( $seed ): string => $seed['mode'] ?? '', $faulted_byte_runner_state['failure_seeds'] ?? array() ) ); // Distinct 'mode' values across failure_seeds; entries without a mode count as ''.
+check(
+ 'faulted byte-space runner reports findings',
+ 1 === $faulted_byte_runner['code'] &&
+ ( $faulted_byte_runner_state['failures'] ?? 0 ) > 0 &&
+ array( 'bytes' ) === array_values( $faulted_byte_runner_modes ), // Every recorded failure seed must carry mode 'bytes' and nothing else.
+ $faulted_byte_runner['stdout'] . $faulted_byte_runner['stderr'] . json_encode( $faulted_byte_runner_state )
+);
+remove_tree( $byte_runner_pipeline_dir ); // Remove the scratch directory.
+
+$mixed_mode_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-mixed-mode-runner-' . getmypid(); // One output directory shared by the two runner invocations below.
+remove_tree( $mixed_mode_runner_dir ); // Clear any leftover directory before the runs.
+$mixed_mode_oracle_runner = run_process( // First run: no --mode argument, so the runner's default mode applies (presumably oracle, per the variable name).
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--seed-base',
+ '1',
+ '--cases-per-batch',
+ '200',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $mixed_mode_runner_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+);
+$mixed_mode_byte_runner = run_process( // Second run: same seed base, fault and output directory, but with --mode bytes.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--mode',
+ 'bytes',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--seed-base',
+ '1',
+ '--cases-per-batch',
+ '200',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $mixed_mode_runner_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+);
+$mixed_mode_state = is_file( $mixed_mode_runner_dir . '/state.json' ) // State is read once, after both runs have finished.
+ ? json_decode( (string) file_get_contents( $mixed_mode_runner_dir . '/state.json' ), true )
+ : array();
+$mixed_mode_failure_modes = array_unique( array_map( static fn( $seed ): string => $seed['mode'] ?? '', $mixed_mode_state['failure_seeds'] ?? array() ) ); // Distinct 'mode' values across failure_seeds; entries without a mode count as ''.
+check(
+ 'runner separates retained same-signature artifacts by mode',
+ 1 === $mixed_mode_oracle_runner['code'] &&
+ 1 === $mixed_mode_byte_runner['code'] &&
+ in_array( 'bytes', $mixed_mode_failure_modes, true ), // With a cap of one artifact per signature, a byte-mode failure seed must still be recorded.
+ $mixed_mode_oracle_runner['stdout'] . $mixed_mode_oracle_runner['stderr'] . $mixed_mode_byte_runner['stdout'] . $mixed_mode_byte_runner['stderr'] . json_encode( $mixed_mode_state['artifact_retention'] ?? null )
+);
+remove_tree( $mixed_mode_runner_dir ); // Remove the scratch directory.
+
+$pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-' . getmypid(); // Scratch directory for the default-mode worker, replay and minimize checks.
+remove_tree( $pipeline_dir ); // Clear any leftover directory before the run.
+$faulted_worker = run_process( // Worker without --mode: 200 cases of the seed in $skip_c1_fault_seed (set earlier in this script).
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../worker.php',
+ '--seed',
+ (string) $skip_c1_fault_seed,
+ '--cases',
+ '200',
+ '--output-dir',
+ $pipeline_dir,
+ '--progress-every',
+ '200',
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+);
+check( 'faulted worker reports findings', 1 === $faulted_worker['code'], $faulted_worker['stdout'] . $faulted_worker['stderr'] ); // Exit code 1 means the worker counted at least one failure.
+
+$failure_files = glob( $pipeline_dir . '/failure-*/failure.json' ); // Failure manifests written by the worker.
+check( 'faulted worker writes failure artifact', is_array( $failure_files ) && array() !== $failure_files );
+
+$failure_file = is_array( $failure_files ) && array() !== $failure_files ? $failure_files[0] : null; // First manifest found, or null when there is none.
+if ( null !== $failure_file ) { // The remaining checks need a manifest on disk.
+ $manifest = json_decode( (string) file_get_contents( $failure_file ), true );
+ $detail = $manifest['failures'][0]['detail'] ?? array(); // Detail record of the first failure, or an empty array when absent.
+ check( 'failure artifact includes full expected/got', isset( $detail['expected_base64'], $detail['got_base64'] ) ); // Both base64 fields must be present and non-null.
+
+ $replay = run_process( // Replay the stored failure with the same fault setting.
+ array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+ );
+ check( 'faulted replay reproduces finding', 1 === $replay['code'], $replay['stdout'] . $replay['stderr'] ); // Replay is expected to exit 1.
+
+ $minimize = run_process( // Minimize without an explicit output directory.
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $failure_file ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+ );
+ check( 'faulted minimizer preserves signature', 0 === $minimize['code'], $minimize['stdout'] . $minimize['stderr'] ); // The minimizer is expected to exit 0.
+
+ $minimize_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-minimize-' . getmypid(); // Output directory that does not exist yet.
+ remove_tree( $minimize_dir );
+ $minimize_output_dir = run_process( // Minimize again, this time asking for the result under --output-dir.
+ array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $failure_file, '--output-dir', $minimize_dir ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+ );
+ check(
+ 'minimizer creates requested output directory',
+ 0 === $minimize_output_dir['code'] && is_file( $minimize_dir . '/minimized.json' ), // Exit 0 and a minimized.json inside the requested directory.
+ $minimize_output_dir['stdout'] . $minimize_output_dir['stderr']
+ );
+ remove_tree( $minimize_dir );
+}
+
+remove_tree( $pipeline_dir ); // Remove the scratch directory.
+
+$runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-runner-' . getmypid(); // Output directory used by both runner invocations in this section.
+remove_tree( $runner_dir ); // Clear any leftover directory before the runs.
+$faulted_runner = run_process( // Runner: one lane, 1000 cases in a single batch, at most one artifact per signature.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1000',
+ '--seed-base',
+ (string) $skip_c1_fault_seed,
+ '--cases-per-batch',
+ '1000',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $runner_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+);
+$runner_state = is_file( $runner_dir . '/state.json' ) // Decode the runner's state.json, or use an empty array when the file is missing.
+ ? json_decode( (string) file_get_contents( $runner_dir . '/state.json' ), true )
+ : array();
+check(
+ 'faulted runner reports findings',
+ 1 === $faulted_runner['code'] && ( $runner_state['failures'] ?? 0 ) > 0, // Exit code 1 and a positive failure count in the state file.
+ $faulted_runner['stdout'] . $faulted_runner['stderr']
+);
+$retained_counts = $runner_state['artifact_retention']['retained_by_signature'] ?? array(); // Retained-artifact count per signature key, as reported by the runner.
+check(
+ 'faulted runner caps retained artifacts by signature',
+ array() !== $retained_counts && array() === array_filter( $retained_counts, static fn( $count ) => $count > 1 ), // Some signature is retained and none exceeds the cap of 1.
+ json_encode( $retained_counts )
+);
+check(
+ 'faulted runner prunes repeated failure artifacts',
+ ( $runner_state['artifact_retention']['pruned'] ?? 0 ) > 0, // At least one artifact must have been pruned.
+ json_encode( $runner_state['artifact_retention'] ?? null )
+);
+$retained_failure_dirs = glob( $runner_dir . '/failure-*/failure.json' ); // Failure manifests that are actually on disk.
+check(
+ 'faulted runner prunes over-cap failure directories',
+ is_array( $retained_failure_dirs ) && count( $retained_failure_dirs ) === array_sum( $retained_counts ), // The on-disk count must equal the sum of the reported retained counts.
+ 'dirs=' . ( is_array( $retained_failure_dirs ) ? count( $retained_failure_dirs ) : 0 ) . ' counts=' . json_encode( $retained_counts )
+);
+$runner_summary_failures = array(); // Collects the 'failure' records found in summary.ndjson.
+if ( is_file( $runner_dir . '/summary.ndjson' ) ) {
+ foreach ( file( $runner_dir . '/summary.ndjson', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES ) ?: array() as $line ) { // One JSON document per non-empty line; a failed read yields no lines.
+ $summary_record = json_decode( $line, true );
+ if ( is_array( $summary_record ) && 'failure' === ( $summary_record['type'] ?? null ) ) { // Lines that do not decode to an array of type 'failure' are ignored.
+ $runner_summary_failures[] = $summary_record;
+ }
+ }
+}
+check(
+ 'faulted runner writes bounded default failure summary',
+ count( $runner_summary_failures ) === array_sum( $retained_counts ) && // One summary failure record per retained artifact ...
+ ( $runner_state['failures'] ?? 0 ) > count( $runner_summary_failures ), // ... and fewer records than total failures.
+ 'failures=' . ( $runner_state['failures'] ?? 0 ) . ' summary_failures=' . count( $runner_summary_failures )
+);
+$runner_state_failure_seeds = $runner_state['failure_seeds'] ?? array();
+check(
+ 'faulted runner writes bounded failure seed state',
+ is_array( $runner_state_failure_seeds ) &&
+ count( $runner_state_failure_seeds ) === array_sum( $retained_counts ) && // The same bound applies to failure_seeds in state.json.
+ ( $runner_state['failures'] ?? 0 ) > count( $runner_state_failure_seeds ),
+ 'failures=' . ( $runner_state['failures'] ?? 0 ) . ' state_failure_seeds=' . count( $runner_state_failure_seeds )
+);
+
+$reuse_same_runner = run_process( // Run the runner again with identical arguments against the same, already populated directory.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1000',
+ '--seed-base',
+ (string) $skip_c1_fault_seed,
+ '--cases-per-batch',
+ '1000',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $runner_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+);
+$reuse_same_state = is_file( $runner_dir . '/state.json' ) // State as written by the second run.
+ ? json_decode( (string) file_get_contents( $runner_dir . '/state.json' ), true )
+ : array();
+$reuse_same_counts = $reuse_same_state['artifact_retention']['retained_by_signature'] ?? array();
+$reuse_same_dirs = glob( $runner_dir . '/failure-*/failure.json' );
+check(
+ 'runner preserves retained same-seed artifacts on reuse',
+ 1 === $reuse_same_runner['code'] &&
+ is_array( $reuse_same_dirs ) &&
+ count( $reuse_same_dirs ) === array_sum( $reuse_same_counts ) && // On-disk manifests still match the reported retained counts.
+ is_file( $runner_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}/failure.json" ) && // The artifact for the known failing seed/case (both set earlier in this script) is present.
+ array() === array_filter( $reuse_same_counts, static fn( $count ) => $count > 1 ), // The cap of 1 per signature still holds.
+ $reuse_same_runner['stdout'] . $reuse_same_runner['stderr'] . json_encode( $reuse_same_state['artifact_retention'] ?? null )
+);
+remove_tree( $runner_dir ); // Remove the scratch directory.
+
+$different_signature_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-different-signature-reuse-' . getmypid(); // One output directory shared by two runs that inject different faults.
+remove_tree( $different_signature_dir ); // Clear any leftover directory before the runs.
+$different_signature_first = run_process( // First run: 'skip-c1-remap' fault, up to 100 artifacts per signature.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--seed-base',
+ (string) $skip_c1_fault_seed,
+ '--cases-per-batch',
+ '200',
+ '--max-artifacts-per-signature',
+ '100',
+ '--output-dir',
+ $different_signature_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+);
+$different_signature_second = run_process( // Second run: same seed base and directory, 'match-length-off-by-one' fault, --artifact-retention all.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--seed-base',
+ (string) $skip_c1_fault_seed,
+ '--cases-per-batch',
+ '200',
+ '--max-artifacts-per-signature',
+ '100',
+ '--artifact-retention',
+ 'all',
+ '--output-dir',
+ $different_signature_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
+);
+$different_signature_case_files = glob( $different_signature_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}*/failure.json" ); // The trailing * also matches "-sig..." suffixed directories for the same seed/case.
+$different_signature_seen = array(); // Comma-joined signature list of each manifest found.
+foreach ( is_array( $different_signature_case_files ) ? $different_signature_case_files : array() as $failure_file ) { // Reuses the script-level $failure_file and $manifest variables.
+ $manifest = json_decode( (string) file_get_contents( $failure_file ), true );
+ if ( is_array( $manifest ) && isset( $manifest['signatures'] ) && is_array( $manifest['signatures'] ) ) { // Manifests without a signature array are skipped.
+ $different_signature_seen[] = implode( ',', $manifest['signatures'] );
+ }
+}
+check(
+ 'runner preserves same-seed artifacts with different signatures',
+ 1 === $different_signature_first['code'] &&
+ 1 === $different_signature_second['code'] &&
+ in_array( 'decode-mismatch:text,reader-decode-mismatch:text,decode-mismatch:attribute,reader-decode-mismatch:attribute', $different_signature_seen, true ) && // One expected signature set; presumably from the skip-c1-remap run.
+ in_array( 'reader-decode-mismatch:text,reader-decode-mismatch:attribute', $different_signature_seen, true ), // The other expected set; presumably from the match-length-off-by-one run.
+ $different_signature_first['stdout'] . $different_signature_first['stderr'] . $different_signature_second['stdout'] . $different_signature_second['stderr'] . json_encode( $different_signature_seen )
+);
+remove_tree( $different_signature_dir ); // Remove the scratch directory.
+
+$overcap_reuse_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-overcap-reuse-' . getmypid(); // Output directory that is first filled without a cap, then reused with a cap.
+remove_tree( $overcap_reuse_dir ); // Clear any leftover directory before the runs.
+$overcap_seed_run = run_process( // First run: 1000 faulted cases with --artifact-retention all.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1000',
+ '--seed-base',
+ (string) $skip_c1_fault_seed,
+ '--cases-per-batch',
+ '1000',
+ '--artifact-retention',
+ 'all',
+ '--output-dir',
+ $overcap_reuse_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+);
+$overcap_before_dirs = glob( $overcap_reuse_dir . '/failure-*/failure.json' ); // Manifests on disk after the first run.
+$overcap_prune_run = run_process( // Second run: one case of seed base 9999, cap of one artifact per signature, no fault argument passed.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '1',
+ '--seed-base',
+ '9999',
+ '--cases-per-batch',
+ '1',
+ '--max-artifacts-per-signature',
+ '1',
+ '--output-dir',
+ $overcap_reuse_dir,
+ )
+);
+$overcap_state = is_file( $overcap_reuse_dir . '/state.json' ) // State as written by the second run.
+ ? json_decode( (string) file_get_contents( $overcap_reuse_dir . '/state.json' ), true )
+ : array();
+$overcap_counts = $overcap_state['artifact_retention']['retained_by_signature'] ?? array();
+$overcap_after_dirs = glob( $overcap_reuse_dir . '/failure-*/failure.json' ); // Manifests on disk after the second run.
+check(
+ 'runner prunes reused output dirs back under cap',
+ 1 === $overcap_seed_run['code'] && // The faulted run found failures.
+ 0 === $overcap_prune_run['code'] && // The second run is expected to exit 0.
+ is_array( $overcap_before_dirs ) &&
+ is_array( $overcap_after_dirs ) &&
+ count( $overcap_before_dirs ) > count( $overcap_after_dirs ) && // Fewer manifests remain than before the second run.
+ ( $overcap_state['artifact_retention']['startup_pruned'] ?? 0 ) > 0 && // The runner reports a positive 'startup_pruned' count.
+ count( $overcap_after_dirs ) === array_sum( $overcap_counts ) && // On-disk manifests match the reported retained counts.
+ array() === array_filter( $overcap_counts, static fn( $count ) => $count > 1 ), // No signature exceeds the cap of 1.
+ $overcap_seed_run['stdout'] . $overcap_seed_run['stderr'] . $overcap_prune_run['stdout'] . $overcap_prune_run['stderr'] . json_encode( $overcap_state['artifact_retention'] ?? null )
+);
+remove_tree( $overcap_reuse_dir ); // Remove the scratch directory.
+
+$no_artifact_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-no-artifacts-' . getmypid(); // Scratch directory for the --artifact-retention none check.
+remove_tree( $no_artifact_dir ); // Clear any leftover directory before the run.
+$no_artifact_runner = run_process( // Faulted runner over 200 cases with --artifact-retention none.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--seed-base',
+ (string) $skip_c1_fault_seed,
+ '--cases-per-batch',
+ '200',
+ '--artifact-retention',
+ 'none',
+ '--output-dir',
+ $no_artifact_dir,
+ ),
+ array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
+);
+$no_artifact_state = is_file( $no_artifact_dir . '/state.json' ) // Decode the runner's state.json, or use an empty array when the file is missing.
+ ? json_decode( (string) file_get_contents( $no_artifact_dir . '/state.json' ), true )
+ : array();
+$no_artifact_dirs = glob( $no_artifact_dir . '/failure-*/failure.json' ); // Must come back as an empty array.
+check(
+ 'runner can prune all failure artifacts',
+ 1 === $no_artifact_runner['code'] && ( $no_artifact_state['failures'] ?? 0 ) > 0 && ( $no_artifact_state['artifact_retention']['pruned'] ?? 0 ) > 0 && is_array( $no_artifact_dirs ) && 0 === count( $no_artifact_dirs ), // Failures were counted and pruned, and no manifest is left on disk.
+ $no_artifact_runner['stdout'] . $no_artifact_runner['stderr'] . json_encode( $no_artifact_state['artifact_retention'] ?? null )
+);
+remove_tree( $no_artifact_dir ); // Remove the scratch directory.
+
+$corrupt_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-corrupt-runner-' . getmypid(); // Scratch directory for the malformed-event check.
+remove_tree( $corrupt_runner_dir ); // Clear any leftover directory before the run.
+$corrupt_runner = run_process( // Faulted runner over 200 cases with the corrupt-event hook switched on.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--seed-base',
+ (string) $skip_c1_fault_seed,
+ '--cases-per-batch',
+ '200',
+ '--output-dir',
+ $corrupt_runner_dir,
+ ),
+ array(
+ 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap',
+ 'HTML_DECODER_FUZZ_CORRUPT_FAILURE_EVENT' => '1', // worker.php reads this variable and writes a truncated JSON line in place of each failure event.
+ )
+);
+$corrupt_runner_state = is_file( $corrupt_runner_dir . '/state.json' ) // Decode the runner's state.json, or use an empty array when the file is missing.
+ ? json_decode( (string) file_get_contents( $corrupt_runner_dir . '/state.json' ), true )
+ : array();
+check(
+ 'runner treats malformed finding events as harness errors',
+ 2 === $corrupt_runner['code'] && ( $corrupt_runner_state['harness_errors'] ?? 0 ) > 0, // Exit code 2 and a positive 'harness_errors' count are required.
+ $corrupt_runner['stdout'] . $corrupt_runner['stderr']
+);
+remove_tree( $corrupt_runner_dir ); // Remove the scratch directory.
+
+$bogus_mode_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-bogus-mode-' . getmypid(); // Scratch directory for the invalid-mode check.
+remove_tree( $bogus_mode_dir ); // Clear any leftover directory before the run.
+$bogus_mode_runner = run_process( // Faulted runner over 200 cases with the bogus-mode hook switched on.
+ array(
+ PHP_BINARY,
+ __DIR__ . '/../runner.php',
+ '--lanes',
+ '1',
+ '--duration-seconds',
+ '0',
+ '--max-cases',
+ '200',
+ '--seed-base',
+ (string) $skip_c1_fault_seed,
+ '--cases-per-batch',
+ '200',
+ '--output-dir',
+ $bogus_mode_dir,
+ ),
+ array(
+ 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap',
+ 'HTML_DECODER_FUZZ_BOGUS_FAILURE_MODE' => '1', // worker.php reads this variable and sets the emitted failure event's mode to 'bogus'.
+ )
+);
+$bogus_mode_state = is_file( $bogus_mode_dir . '/state.json' ) // Decode the runner's state.json, or use an empty array when the file is missing.
+ ? json_decode( (string) file_get_contents( $bogus_mode_dir . '/state.json' ), true )
+ : array();
+check(
+ 'runner treats bogus failure modes as harness errors',
+ 2 === $bogus_mode_runner['code'] && ( $bogus_mode_state['harness_errors'] ?? 0 ) > 0, // Exit code 2 and a positive 'harness_errors' count are required.
+ $bogus_mode_runner['stdout'] . $bogus_mode_runner['stderr']
+);
+remove_tree( $bogus_mode_dir ); // Remove the scratch directory.
+
+echo $failed > 0 ? "\n{$failed} smoke check(s) FAILED\n" : "\nAll smoke checks passed\n"; // Final verdict line; $failed is maintained outside this section (presumably by check()).
+exit( $failed > 0 ? 1 : 0 ); // Exit status 1 when any check failed, 0 otherwise.
diff --git a/tools/html-decoder-fuzz/worker.php b/tools/html-decoder-fuzz/worker.php
new file mode 100644
index 0000000000000..cbe0fbb1adf35
--- /dev/null
+++ b/tools/html-decoder-fuzz/worker.php
@@ -0,0 +1,318 @@
+ 1,
+ 'cases' => 1000,
+ 'start-case' => 0,
+ 'max-bytes' => 4096,
+ 'mode' => 'oracle',
+ 'output-dir' => '',
+ 'progress-every' => 500,
+ )
+);
+
+Cli::require_int_at_least( $options, 'cases', 1 ); // Validate the numeric options against their minimums (helper not visible here): cases >= 1,
+Cli::require_int_at_least( $options, 'start-case', 0 ); // start-case >= 0,
+Cli::require_int_at_least( $options, 'max-bytes', 1 ); // max-bytes >= 1,
+Cli::require_int_at_least( $options, 'progress-every', 1 ); // progress-every >= 1.
+Cli::require_one_of( $options, 'mode', Cli::valid_modes() ); // --mode must be one of the values returned by Cli::valid_modes().
+
+Bootstrap::load_targets(); // Presumably loads the decoder code under test; the helper is not visible here.
+
+$oracles = Oracles::build();
+foreach ( $oracles->drain_events() as $event ) { // Re-emit every event drained from the oracle set.
+ Cli::emit( array( 'type' => 'oracle-event' ) + $event ); // With array union the left operand wins, so 'type' is always 'oracle-event'.
+}
+
+if ( Cli::mode_uses_oracle( $options['mode'] ) && ! $oracles->has_required() ) { // Abort when the selected mode uses an oracle and the required one is not available.
+ Cli::emit(
+ array(
+ 'type' => 'fatal',
+ 'reason' => 'required oracle unavailable or failed the battery',
+ )
+ );
+ exit( 2 ); // Exit code 2 marks a fatal setup error, as opposed to 1 for findings.
+}
+
+$coverage = null; // Stays null in every mode except 'coverage'.
+if ( 'coverage' === $options['mode'] ) {
+ if ( ! CoverageGuidance::available() ) { // Coverage mode without coverage support is fatal.
+ Cli::emit(
+ array(
+ 'type' => 'fatal',
+ 'reason' => CoverageGuidance::unavailable_reason(),
+ )
+ );
+ exit( 2 );
+ }
+ $coverage = new CoverageGuidance(); // The per-case loop checks this for null before using coverage features.
+}
+
+$output_dir = $options['output-dir']; // An empty string means no artifact directory was requested.
+if ( '' !== $output_dir && ! is_dir( $output_dir ) && ! mkdir( $output_dir, 0777, true ) && ! is_dir( $output_dir ) ) { // Fatal only if the directory is still missing: a failed mkdir() is harmless when another process created the directory between the two checks.
+	Cli::emit(
+		array(
+			'type'   => 'fatal',
+			'reason' => "cannot create output dir {$output_dir}",
+		)
+	);
+	exit( 2 ); // Exit code 2 marks a fatal setup error, as opposed to 1 for findings.
+}
+
+$checks = new Checks( $oracles );
+$reference_names = Bootstrap::named_reference_names(); // Passed to every Generator built in the per-case loop.
+$seed = (string) $options['seed']; // Kept as a string; it is combined with the case index to key each case's PRNG.
+$start = $options['start-case']; // First case index to run.
+$end = $start + $options['cases']; // Exclusive upper bound of the case range.
+$stderr_bytes_per_case = max( 0, (int) getenv( 'HTML_DECODER_FUZZ_STDERR_BYTES_PER_CASE' ) ); // When positive, each case writes this many 'E' bytes plus a newline to STDERR; negative or unset values become 0.
+$stats = array( // Running totals; reported in the final 'done' event.
+ 'cases' => 0,
+ 'failures' => 0,
+ 'bytes' => 0,
+ 'by_strategy' => array(),
+ 'by_context' => array(),
+);
+if ( null !== $coverage ) { // Coverage counters exist only in coverage mode.
+ $stats['coverage_new_edges'] = 0;
+ $stats['coverage_payloads'] = 0;
+}
+$started_at = microtime( true ); // Reference point for elapsed time and throughput figures.
+
+Cli::emit( // Announce the run parameters before the first case.
+ array(
+ 'type' => 'start',
+ 'seed' => $seed,
+ 'start_case' => $start,
+ 'cases' => $options['cases'],
+ 'max_bytes' => $options['max-bytes'],
+ 'mode' => $options['mode'],
+ 'environment' => Cli::environment_metadata( $oracles ),
+ )
+);
+
+for ( $case = $start; $case < $end; $case++ ) { // One iteration per case index in [$start, $end).
+ if ( $stderr_bytes_per_case > 0 ) { // Optional STDERR output, sized by HTML_DECODER_FUZZ_STDERR_BYTES_PER_CASE.
+ fwrite( STDERR, str_repeat( 'E', $stderr_bytes_per_case ) . "\n" );
+ }
+
+ $prng = new Prng( "{$seed}:{$case}" ); // A fresh PRNG for every case, keyed by "seed:case".
+ $generator = new Generator( $prng, $options['max-bytes'], $reference_names );
+ if ( 'bytes' === $options['mode'] ) { // Pick the generator entry point that matches --mode.
+ $generated = $generator->generate_bytes();
+ } elseif ( 'names' === $options['mode'] ) {
+ $generated = $generator->generate_name_sweep( $case );
+ } elseif ( 'legacy-followers' === $options['mode'] ) {
+ $generated = $generator->generate_legacy_follower_sweep( $case );
+ } elseif ( 'prefix-families' === $options['mode'] ) {
+ $generated = $generator->generate_prefix_family_sweep( $case );
+ } elseif ( 'numeric-boundaries' === $options['mode'] ) {
+ $generated = $generator->generate_numeric_boundary_sweep( $case );
+ } elseif ( 'corpus' === $options['mode'] ) {
+ $generated = $generator->generate_corpus_mutation( $case );
+ } elseif ( 'token-map' === $options['mode'] ) {
+ $generated = $generator->generate_token_map_sweep( $case );
+ } elseif ( 'coverage' === $options['mode'] ) { // 'coverage' and the fallback branch below both call generate().
+ $generated = $generator->generate();
+ } else {
+ $generated = $generator->generate();
+ }
+ $payload = $generated['payload'];
+ $context = $generated['context'];
+ $strategy = $generated['strategy'];
+
+ if ( null !== $coverage ) { // Coverage collection brackets the checks for this case.
+ $coverage->begin_case();
+ }
+ $failures = 'bytes' === $options['mode'] // Byte mode calls run_without_oracle(); every other mode calls run().
+ ? $checks->run_without_oracle( $context, $payload )
+ : $checks->run( $context, $payload );
+ $coverage_edges = null === $coverage ? array() : $coverage->finish_case( $payload, $context, $strategy );
+
+ ++$stats['cases'];
+ $stats['bytes'] += strlen( $payload );
+ $stats['by_strategy'][ $strategy ] = ( $stats['by_strategy'][ $strategy ] ?? 0 ) + 1; // Per-strategy and per-context case counters.
+ $stats['by_context'][ $context ] = ( $stats['by_context'][ $context ] ?? 0 ) + 1;
+
+ if ( null !== $coverage ) {
+ $new_edges = $coverage->new_edges( $coverage_edges );
+ if ( array() !== $new_edges ) { // Only cases that contribute new edges are counted, retained and reported.
+ $stats['coverage_new_edges'] += count( $new_edges );
+ ++$stats['coverage_payloads'];
+
+ try {
+ $coverage_artifact = $coverage->retain_payload( $output_dir, $seed, $case, $generated, $payload, $new_edges );
+ } catch ( \RuntimeException $exception ) { // A RuntimeException from retain_payload() is reported as fatal and ends the worker.
+ Cli::emit(
+ array(
+ 'type' => 'fatal',
+ 'reason' => $exception->getMessage(),
+ )
+ );
+ exit( 2 );
+ }
+
+ $coverage_record = array(
+ 'type' => 'coverage',
+ 'seed' => $seed,
+ 'case' => $case,
+ 'mode' => $options['mode'],
+ 'context' => $context,
+ 'strategy' => $strategy,
+ 'input_size' => strlen( $payload ),
+ 'coverage_provider' => $coverage->provider(),
+ 'edge_count' => count( $coverage_edges ),
+ 'seen_edge_count' => $coverage->seen_edge_count(),
+ 'new_edge_count' => count( $new_edges ),
+ 'new_edges' => $new_edges,
+ ) + $coverage_artifact; // Array union: keys already set above take precedence over those from retain_payload().
+ if ( strlen( $payload ) <= 4096 ) { // The payload is inlined in the event only when it is at most 4096 bytes.
+ $coverage_record['payload_base64'] = base64_encode( $payload );
+ }
+ Cli::emit( $coverage_record );
+ }
+ }
+
+ if ( array() !== $failures ) { // Everything below handles a case that produced at least one failure.
+ $stats['failures'] += count( $failures );
+
+ $record = array(
+ 'type' => 'failure',
+ 'seed' => $seed,
+ 'case' => $case,
+ 'mode' => $options['mode'],
+ 'context' => $context,
+ 'strategy' => $strategy,
+ 'input_size' => strlen( $payload ),
+ 'signatures' => array_values( array_unique( array_column( $failures, 'signature' ) ) ), // De-duplicated signature list, re-indexed from 0 in first-seen order.
+ 'failures' => $failures,
+ );
+
+ if ( strlen( $payload ) <= 4096 ) { // Same 4096-byte limit for inlining the payload in the emitted event.
+ $record['payload_base64'] = base64_encode( $payload );
+ }
+
+ if ( '' !== $output_dir ) { // Artifacts are written only when an output directory was given.
+ $signature_key = Cli::failure_signature_key( $record['signatures'], $record['mode'] ); // Key derived from the signatures and the mode.
+ $base_case_dir = "{$output_dir}/failure-seed{$seed}-case{$case}";
+ $case_dir = $base_case_dir;
+ $dir_matches_signature = static function ( string $dir ) use ( $signature_key ): bool { // True when $dir holds a failure.json whose signatures and mode yield this failure's key.
+ if ( is_link( $dir ) ) { // Symlinked directories never match.
+ return false;
+ }
+
+ $manifest = json_decode( (string) @file_get_contents( "{$dir}/failure.json" ), true ); // A missing or unreadable manifest decodes to null and fails the checks below.
+ $manifest_mode = $manifest['mode'] ?? 'oracle'; // Manifests without a 'mode' entry are treated as 'oracle'.
+ return is_array( $manifest ) &&
+ isset( $manifest['signatures'] ) &&
+ is_array( $manifest['signatures'] ) &&
+ is_string( $manifest_mode ) &&
+ in_array( $manifest_mode, Cli::valid_modes(), true ) &&
+ $signature_key === Cli::failure_signature_key( $manifest['signatures'], $manifest_mode );
+ };
+
+ if ( is_link( $case_dir ) || ( is_dir( $case_dir ) && ! $dir_matches_signature( $case_dir ) ) ) { // The plain name is a symlink or holds a different failure: switch to a suffixed name.
+ $suffix = substr( $signature_key, 0, 12 ); // The first 12 characters of the key form the suffix.
+ $case_dir = "{$base_case_dir}-sig{$suffix}";
+ $attempt = 2;
+ while ( is_link( $case_dir ) || ( is_dir( $case_dir ) && ! $dir_matches_signature( $case_dir ) ) ) { // Still taken: append -2, -3, ... until a free or matching directory is found.
+ $case_dir = "{$base_case_dir}-sig{$suffix}-{$attempt}";
+ ++$attempt;
+ }
+ }
+
+ if ( ! is_dir( $case_dir ) && ! mkdir( $case_dir, 0777, true ) ) { // An existing matching directory is reused and its files overwritten.
+ Cli::emit(
+ array(
+ 'type' => 'fatal',
+ 'reason' => "cannot create failure artifact dir {$case_dir}",
+ )
+ );
+ exit( 2 );
+ }
+ if ( ! Cli::write_file( "{$case_dir}/payload.txt", $payload ) ) { // The raw payload bytes are stored next to the manifest.
+ Cli::emit(
+ array(
+ 'type' => 'fatal',
+ 'reason' => "cannot write failure payload under {$case_dir}",
+ )
+ );
+ exit( 2 );
+ }
+
+ $artifact = $record; // The on-disk manifest starts as a copy of the event record.
+ $artifact['payload_base64'] = base64_encode( $payload ); // Unlike the event, the manifest always carries the payload, whatever its size.
+ $artifact['environment'] = Cli::environment_metadata( $oracles );
+ $artifact['git'] = Cli::git_metadata( Bootstrap::repo_root() );
+ $artifact_json = json_encode(
+ $artifact,
+ JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES
+ );
+ if ( false === $artifact_json || ! Cli::write_file( "{$case_dir}/failure.json", $artifact_json ) ) { // Both an encoding failure and a write failure are fatal.
+ Cli::emit(
+ array(
+ 'type' => 'fatal',
+ 'reason' => "cannot write failure manifest under {$case_dir}",
+ )
+ );
+ exit( 2 );
+ }
+ $record['artifact_dir'] = $case_dir; // The emitted event points at the directory that was written.
+ }
+
+ if ( getenv( 'HTML_DECODER_FUZZ_CORRUPT_FAILURE_EVENT' ) ) { // Test hook: write a truncated JSON line instead of the real failure event.
+ if ( ! Cli::write_stream( STDOUT, "{\"type\":\"failure\"\n" ) ) {
+ fwrite( STDERR, "Cannot write corrupted failure event\n" );
+ exit( 2 );
+ }
+ } else {
+ if ( getenv( 'HTML_DECODER_FUZZ_BOGUS_FAILURE_MODE' ) ) { // Test hook: report the failure under the mode 'bogus'; the manifest on disk was already written with the real mode.
+ $record['mode'] = 'bogus';
+ }
+ Cli::emit( $record );
+ }
+ }
+
+ if ( 0 === ( $stats['cases'] % max( 1, $options['progress-every'] ) ) ) { // Emit a progress event every --progress-every completed cases; max() keeps the divisor at 1 or more.
+ $elapsed = microtime( true ) - $started_at;
+ Cli::emit(
+ array(
+ 'type' => 'progress',
+ 'seed' => $seed,
+ 'case' => $case,
+ 'cases_done' => $stats['cases'],
+ 'failures' => $stats['failures'],
+ 'cases_per_sec' => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null, // null when no measurable time has elapsed.
+ )
+ );
+ }
+}
+
+$total_elapsed = microtime( true ) - $started_at; // Wall-clock seconds since the run started.
+$throughput    = $total_elapsed > 0 ? round( $stats['cases'] / $total_elapsed, 1 ) : null; // Cases per second, or null when no measurable time has elapsed.
+$done_event    = array(
+	'type'          => 'done',
+	'seed'          => $seed,
+	'stats'         => $stats,
+	'elapsed_sec'   => round( $total_elapsed, 2 ),
+	'cases_per_sec' => $throughput,
+);
+Cli::emit( $done_event ); // Final summary event for this worker.
+
+exit( 0 === $stats['failures'] ? 0 : 1 ); // Exit status 1 when any failure was counted, 0 otherwise.