diff --git a/progress-handoff-91xXCG.md b/progress-handoff-91xXCG.md new file mode 100644 index 0000000000000..f849d0fbe4e51 --- /dev/null +++ b/progress-handoff-91xXCG.md @@ -0,0 +1,279 @@ +# Progress for handoff-91xXCG + +Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-91xXCG.md` + +## Status + +- [x] Confirmed no active `html-decoder-fuzz` run before editing. +- [x] Tier 1 item 1: run both decoder contexts per generated case. +- [x] Tier 1 item 2: add oracle-free arbitrary byte-space lane. +- [x] Tier 1 item 3: add reference-at-EOF generation strategy. +- [x] Tier 1 item 4: add `attribute_starts_with()` monotonicity invariants. +- [x] Tier 1 item 5: exercise multi-code-point `attribute_starts_with()` prefix paths. +- [x] Tier 1 item 6: add range-based numeric code point generation. +- [x] Tier 2 item 7: add exhaustive deterministic name sweep lane. +- [x] Tier 2 item 8: add edit-distance-1 lookalike generation. +- [x] Tier 2 item 9: add full follower-byte sweep after legacy names. +- [x] Tier 2 item 10: add prefix-family stress generation. +- [x] Tier 2 item 11: add digit-count numeric boundary stress generation. +- [x] Tier 2 item 12: add strategy composition and generalized attribute-prefix encoding. +- [x] Tier 2 item 13: add mutation/corpus mode. +- [x] Tier 2 item 14: add reader compositionality invariant. +- [x] Tier 2 item 15: add case-mangled valid-name near-misses. +- [x] Tier 3 item 16: assert null reader matches leave `match_byte_length` untouched. +- [x] Tier 3 item 17: assert non-ampersand reader offsets never match. +- [x] Tier 3 item 18: assert attribute no-amp identity in oracle mode. +- [x] Tier 3 item 19: add tab, LF, and FF to the oracle-safe generator alphabet. +- [x] Tier 3 item 20: assert reader reconstruction walks input without gaps or overlaps. +- [x] Tier 3 item 21: assert invalid numeric references decode to exactly U+FFFD. 
+- [x] Tier 3 item 22: assert C1 remapping applies only to numeric references while raw C1 bytes pass through unchanged. +- [x] Tier 3 item 23: add `html_entity_decode( ENT_HTML5 | ENT_QUOTES )` as a secondary text-context oracle. +- [x] Tier 3 item 24: add token-map structure-aware deterministic inputs. +- [x] Tier 3 item 25: add pcov-backed coverage-guided lane with new-edge corpus retention. +- [x] Tier 3 item 26: assert documented single-level decoding for nested ampersand references. +- [x] Cross-cutting concerns: sort derived name lists deterministically and document DOM oracle throughput limits. + +## Verification + +- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` passed. +- 2026-06-11: `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 20 --progress-every 20` passed and reported `by_context: {"both":20}`. +- 2026-06-11: `php -l` passed for `Generator.php`, `Checks.php`, `Targets.php`, `worker.php`, `runner.php`, `replay.php`, `minimize.php`, and `tests/harness-smoke.php`. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200` passed. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=byte-no-amp-identity php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200` reported findings as expected. +- 2026-06-11: `php tools/html-decoder-fuzz/runner.php --mode bytes --lanes 1 --duration-seconds 0 --max-cases 200 --cases-per-batch 200 --summary-mode none --output-dir /tmp/html-decoder-fuzz-byte-check` passed. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=byte-no-amp-identity php tools/html-decoder-fuzz/runner.php --mode bytes --lanes 1 --duration-seconds 0 --max-cases 200 --cases-per-batch 200 --max-artifacts-per-signature 1 --output-dir /tmp/html-decoder-fuzz-byte-fault-runner` reported findings as expected. 
+- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode bytes --seed 1 --case 0` passed. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after byte-space lane coverage was added. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding mode-aware artifact separation, oracle-trap, and bogus-mode malformed-record coverage. +- 2026-06-11: `git diff --check` passed. +- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` passed after adding the reference-at-EOF strategy. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed and reported `reference-at-eof: 46`. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding reference-at-EOF coverage. +- 2026-06-11: Documented that adding the new weighted strategy intentionally changes generated-case `--seed --case` payload mapping; failure-manifest replay remains payload-stable. +- 2026-06-11: Verified `reference-at-eof` still ends in a reference for `max-bytes` 1, 2, 3, 4, 5, and 8 after reserving suffix space. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after tightening EOF suffix-shape coverage. +- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Checks.php`, `php -l tools/html-decoder-fuzz/lib/Targets.php`, and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding `attribute_starts_with()` monotonicity checks. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding `attribute_starts_with()` prefix, extension, case monotonicity, and fault-target coverage. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed with no real-target findings after adding `attribute_starts_with()` monotonicity checks. +- 2026-06-11: `git diff --check` passed after adding `attribute_starts_with()` monotonicity checks. 
+- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Checks.php`, `php -l tools/html-decoder-fuzz/lib/Generator.php`, `php -l tools/html-decoder-fuzz/lib/Targets.php`, and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding multi-code-point `attribute_starts_with()` prefix coverage. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding byte-slice search probes, multi-code-point generator cases, and the `attribute-multicodepoint-prefix` fault target. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed with no real-target findings after adding multi-code-point prefix coverage. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-multicodepoint-prefix php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 681 --cases 1 --progress-every 1` reported findings as expected and verified invalid-UTF-8 search details remain JSON-safe. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-multicodepoint-prefix php tools/html-decoder-fuzz/replay.php --seed 1 --case 681` reproduced the multi-code-point prefix finding. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-multicodepoint-prefix php tools/html-decoder-fuzz/minimize.php --failure /tmp/html-decoder-fuzz-multicodepoint-fault-681/failure-seed1-case681/failure.json` minimized the finding from 18 to 6 bytes. +- 2026-06-11: `git diff --check` passed after adding multi-code-point prefix coverage. +- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding range-based numeric code point generation. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting numeric range buckets, all 32 C1 remap rows, and all 16 noncharacter planes. 
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed with no real-target findings after adding range-based numeric code points. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=skip-c1-remap php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 128 --cases 1 --progress-every 1` reported findings as expected after the range generator shifted the deterministic C1 fault case from 170 to 128. +- 2026-06-11: `git diff --check` passed after adding range-based numeric code points. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, and `git diff --check` passed after addressing reviewer feedback on post-surrogate BMP coverage and multi-reference numeric smoke classification. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, and `git diff --check` passed after adding explicit BMP terminal noncharacter coverage for `0xFFFE` and `0xFFFF`. +- 2026-06-11: `php -l` passed for `Cli.php`, `Generator.php`, `worker.php`, `runner.php`, `replay.php`, `minimize.php`, and `tests/harness-smoke.php` after adding the deterministic name-sweep lane. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding full-period name-sweep generator coverage plus worker, runner, and replay smoke checks for `--mode names`. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode names --seed 1 --cases 1000 --progress-every 1000` passed and reported `by_strategy: {"name-sweep":1000}` and `by_context: {"both":1000}`. +- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode names --seed 1 --case 11593` passed for the deterministic `&Aacutex` case. 
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-semicolonless php tools/html-decoder-fuzz/replay.php --mode names --seed 1 --case 11593` reproduced the expected attribute decode mismatch for `&Aacutex`. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-semicolonless php tools/html-decoder-fuzz/minimize.php --failure /tmp/html-decoder-fuzz-name-fault-11593/failure-seed1-case11593/failure.json` minimized the finding from 8 to 7 bytes. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding reviewer-requested checks for distinct `names` runner start-case windows and the faulted name-sweep worker/replay/minimize pipeline. +- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding edit-distance-1 lookalike generation. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting lookalike samples produce edit-distance-1 name misses and a sparse-name corpus exercises delete, insert, substitute, and transpose branches. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed with no real-target findings after adding dynamic lookalikes. +- 2026-06-11: `git diff --check` passed after adding dynamic lookalikes. +- 2026-06-11: `php -l` passed for `Cli.php`, `Generator.php`, `worker.php`, `runner.php`, `replay.php`, `tests/harness-smoke.php`, `class-wp-html-decoder.php`, and `wpHtmlDecoder.php` after adding the legacy-follower sweep and ASCII-only ambiguous follower fix. +- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode legacy-followers --seed 1 --case 124` initially reproduced a real attribute decode mismatch for `&Aacute\xC2\x80`; after replacing locale-sensitive `ctype_alnum()` with ASCII byte checks, the replay passed. 
+- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `vendor/bin/phpunit --group html-api tests/phpunit/tests/html-api/wpHtmlDecoder.php`, `php tools/html-decoder-fuzz/worker.php --mode legacy-followers --seed 1 --cases 300 --progress-every 300`, `php tools/html-decoder-fuzz/runner.php --mode legacy-followers --lanes 2 --duration-seconds 0 --max-cases 200 --cases-per-batch 100 --summary-mode all --output-dir /tmp/html-decoder-fuzz-legacy-followers-check-fixed`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, and `git diff --check` passed. +- 2026-06-11: `php -l` passed for `Cli.php`, `Generator.php`, `worker.php`, `runner.php`, `replay.php`, and `tests/harness-smoke.php` after adding the prefix-family sweep mode. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting prefix-family full-period mapping over the exact expected reference set, reference splits, and ambiguous followers plus worker, runner, replay, seed-replay fault, and failure-manifest fault-pipeline coverage. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode prefix-families --seed 1 --cases 300 --progress-every 300`, `php tools/html-decoder-fuzz/runner.php --mode prefix-families --lanes 2 --duration-seconds 0 --max-cases 200 --cases-per-batch 100 --summary-mode all --output-dir /tmp/html-decoder-fuzz-prefix-families-runner-check`, `php tools/html-decoder-fuzz/replay.php --mode prefix-families --seed 1 --case 37`, `HTML_DECODER_FUZZ_FAULT=attribute-semicolonless php tools/html-decoder-fuzz/worker.php --mode prefix-families --seed 1 --start-case 37 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-prefix-families-fault-check`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, and `git diff --check` passed. 
+- 2026-06-11: `php -l` passed for `Cli.php`, `Generator.php`, `worker.php`, `replay.php`, and `tests/harness-smoke.php` after adding the numeric-boundary sweep mode. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting numeric-boundary full-period mapping over 6/7 hex and 7/8 decimal significant digit counts, leading-zero variants, semicolon variants, mixed-case hex digits, worker, runner, replay, seed-replay fault, and failure-manifest fault-pipeline coverage. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode numeric-boundaries --seed 1 --cases 300 --progress-every 300`, `php tools/html-decoder-fuzz/runner.php --mode numeric-boundaries --lanes 2 --duration-seconds 0 --max-cases 200 --cases-per-batch 100 --summary-mode all --output-dir /tmp/html-decoder-fuzz-numeric-boundaries-runner-check`, `php tools/html-decoder-fuzz/replay.php --mode numeric-boundaries --seed 1 --case 25`, `HTML_DECODER_FUZZ_FAULT=match-length-off-by-one php tools/html-decoder-fuzz/worker.php --mode numeric-boundaries --seed 1 --start-case 0 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-numeric-boundaries-fault-check`, `HTML_DECODER_FUZZ_FAULT=match-length-off-by-one php tools/html-decoder-fuzz/replay.php --failure /tmp/html-decoder-fuzz-numeric-boundaries-fault-check/failure-seed1-case0/failure.json`, `HTML_DECODER_FUZZ_FAULT=match-length-off-by-one php tools/html-decoder-fuzz/minimize.php --failure /tmp/html-decoder-fuzz-numeric-boundaries-fault-check/failure-seed1-case0/failure.json`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, and `git diff --check` passed. 
+- 2026-06-11: After reviewer feedback, exact-max numeric-boundary cases now use in-range payloads (`􏿿` and `􏿮` casing variants) while max-plus-one cases remain invalid; `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/replay.php --mode numeric-boundaries --seed 1 --case 25`, the refreshed fault-manifest replay/minimize, default 500-case worker, and `git diff --check` passed. +- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding the weighted composition strategy and generalized attribute-prefix encoder. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting all 12 weighted strategies appear, composition generates multi-reference splices, generalized attribute-prefix encoding covers every target string and literal/decimal/leading-zero/hex/semicolonless forms, and the skip-C1 fault artifact checks use the new deterministic case 157 after generator weighting shifted seed/case mapping. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` passed after adding strategy composition and reported `by_strategy` including `"composition":30`; `git diff --check` passed. +- 2026-06-11: After reviewer feedback, semicolonless numeric boundary protection now treats `;` as a reference-extending follower, composition inserts explicit fragment separators, and smoke asserts the exact weighted strategy set plus 2-3 separated composition fragments; `php tools/html-decoder-fuzz/tests/harness-smoke.php` and `git diff --check` passed. +- 2026-06-11: After follow-up reviewer feedback, composition now keeps separated fragments nonempty under small public `max-bytes` values; a targeted probe for max bytes 3, 5, 7, and 12, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed. 
+- 2026-06-11: `php -l` passed for `Cli.php`, `Generator.php`, `worker.php`, `replay.php`, and `tests/harness-smoke.php` after adding the corpus mutation mode. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting corpus seed-corpus size, retained html5lib text and attribute vectors, all four mutation strategies, semicolon-toggle/reference-duplication shapes, UTF-8-safe splice/perturb mutations, oracle-safe diversified payloads, worker, runner start windows, seed replay, faulted seed replay, and failure-manifest replay/minimize coverage. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode corpus --seed 1 --cases 300 --progress-every 300` passed and reported all corpus mutation strategies with `by_context: {"both":300}`. +- 2026-06-11: `php tools/html-decoder-fuzz/runner.php --mode corpus --lanes 2 --duration-seconds 0 --max-cases 200 --cases-per-batch 100 --summary-mode all --output-dir /tmp/html-decoder-fuzz-corpus-runner-check-20260611-2` passed. +- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode corpus --seed 1 --case 0` passed for the deterministic `corpus-byte-perturb` case with hex preview `67262335383b`. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=match-length-off-by-one php tools/html-decoder-fuzz/worker.php --mode corpus --seed 1 --start-case 0 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-corpus-fault-check-20260611-2` reported the expected `reader-overran-input` findings; replaying and minimizing the resulting failure manifest with the same fault both succeeded. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500` and `git diff --check` passed after adding corpus mutation mode. +- 2026-06-11: After reviewer feedback, html5lib tree-construction entity rows now normalize simple `<div bar="..."></div>` and `<div>...</div>` fixtures into decoder payloads before oracle-safety filtering, corpus mutations choose splice/edit offsets on UTF-8 boundaries, and smoke asserts retained WPT attribute sentinels plus mutation helper shapes; `php tools/html-decoder-fuzz/tests/harness-smoke.php`, the refreshed corpus worker/runner/replay/fault-manifest checks, default 500-case worker, and `git diff --check` passed. +- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, and `tests/harness-smoke.php` after adding the reader compositionality invariant. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed after asserting empty reader chunks, one-byte matches, and non-compositional local-slice reads are detected by fault targets. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/replay.php --seed 1 --case 31`, and `git diff --check` passed after adding reader compositionality checks. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=reader-substring-composition php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 31 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-reader-composition-fault-20260611-1` reported `reader-composition-mismatch` findings; replaying and minimizing the resulting failure manifest with the same fault both succeeded. +- 2026-06-11: After reviewer feedback, `php tools/html-decoder-fuzz/tests/harness-smoke.php` passed with automated worker, failure-manifest, replay, and minimize coverage for `reader-empty-chunk`, `reader-short-match-length`, and `reader-substring-composition`. +- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding the case-mangled named-reference strategy. 
+- 2026-06-11: A targeted probe over 2,000 seeds produced 115 distinct `case-mangled-name` candidates with zero invalid shape/collision samples, including both lowercase-to-uppercase and uppercase-to-lowercase flips. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=skip-c1-remap php tools/html-decoder-fuzz/worker.php --seed 2 --start-case 36 --cases 1 --progress-every 1`, `HTML_DECODER_FUZZ_FAULT=reader-empty-chunk php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 57 --cases 1 --progress-every 1`, and `HTML_DECODER_FUZZ_FAULT=reader-substring-composition php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 97 --cases 1 --progress-every 1` reported the expected findings after the weighted strategy shifted generated-case mappings. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding case-mangled valid-name near-misses. +- 2026-06-11: After reviewer feedback, case-mangled smoke coverage now directly invokes `case_mangle_name_base()` against lowercase and uppercase source names; `php -l tools/html-decoder-fuzz/lib/Generator.php`, `php -l tools/html-decoder-fuzz/tests/harness-smoke.php`, a direct helper probe reporting `errors=0`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed. +- 2026-06-11: Adding the null-return `match_byte_length` sentinel invariant exposed a real `WP_HTML_Decoder::read_character_reference()` issue for unmatched named references in `data` context; `WP_Token_Map::read_token()` returns `null`, and the decoder now checks for `null` instead of `false`. 
+- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, `tests/harness-smoke.php`, `class-wp-html-decoder.php`, and `wpHtmlDecoder.php` after adding the null-return match-length invariant and decoder regression test. +- 2026-06-11: `vendor/bin/phpunit --group html-api tests/phpunit/tests/html-api/wpHtmlDecoder.php` passed with the unmatched named-reference match-length regression coverage. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=reader-null-mutates-match-length php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 7 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-null-match-fault-check` reported `reader-mutated-match-length-on-null` findings; replaying the failure manifest reproduced the findings and minimizing it preserved the signature. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after fixing the decoder and adding the invariant. +- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, `tests/harness-smoke.php`, and `wpHtmlDecoder.php` after adding non-ampersand reader-offset probes. +- 2026-06-11: `vendor/bin/phpunit --group html-api tests/phpunit/tests/html-api/wpHtmlDecoder.php` passed with non-ampersand offset match-length regression coverage. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=reader-non-amp-match php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 0 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-non-amp-fault-check` reported `reader-non-amp-match` findings; replaying the failure manifest reproduced the findings and minimizing it preserved the signature. 
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding non-ampersand reader-offset probes. +- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, and `tests/harness-smoke.php` after generalizing no-amp identity checks to attribute context. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-no-amp-identity php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 38 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-attr-no-amp-fault-check-item18` reported `attribute-without-ampersand-not-identity` findings; replaying the failure manifest reproduced the findings and minimizing it completed successfully. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding attribute no-amp identity coverage. +- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after adding tab, LF, and FF to the oracle-safe generator alphabet. +- 2026-06-11: A reflection probe confirmed the generator alphabet contains space, tab, LF, and FF and remains `Generator::is_oracle_safe_payload()` safe. +- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode corpus --seed 1 --case 0` passed with the refreshed deterministic corpus byte-perturb preview `64262335383b`, and `php tools/html-decoder-fuzz/worker.php --mode corpus --seed 1 --cases 300 --progress-every 300` passed. 
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after expanding the generator alphabet with tab, LF, and FF. +- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, and `tests/harness-smoke.php` after adding the gapless reader-walk invariant. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=reader-gapless-drop-span php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 0 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-gapless-fault-check` reported `reader-walk-not-gapless` findings; replaying the failure manifest reproduced the findings and minimizing it preserved the signature. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding gapless reader-walk coverage. +- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, and `tests/harness-smoke.php` after adding the invalid numeric replacement invariant. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=numeric-invalid-not-replacement php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 0 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-invalid-numeric-fault-check` reported `numeric-invalid-not-replacement` findings; replaying the failure manifest reproduced the findings and minimizing it preserved the signature. 
+- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding invalid numeric replacement coverage. +- 2026-06-11: `php -l` passed for `Checks.php`, `Targets.php`, and `tests/harness-smoke.php` after adding numeric C1 remap and raw C1 pass-through invariants. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=numeric-c1-not-remapped php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 2 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-c1-fault-check` reported `numeric-c1-not-remapped` findings; replaying the failure manifest reproduced the findings and minimizing it preserved the signature. +- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=raw-c1-not-pass-through php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --start-case 3 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-raw-c1-fault-check` reported `raw-c1-not-pass-through` findings; replaying the failure manifest reproduced the findings and minimizing it with `--signature raw-c1-not-pass-through:text` preserved the raw-C1-specific signature. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding C1 remap-only-for-numeric coverage. +- 2026-06-11: `php -l` passed for `Oracles.php`, `Checks.php`, `Targets.php`, `worker.php`, and `tests/harness-smoke.php` after adding the secondary text oracle. 
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=text-secondary-oracle php tools/html-decoder-fuzz/worker.php --seed 1 --start-case 4 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-secondary-fault-check` reported `text-secondary-oracle-mismatch` findings; replaying the failure manifest reproduced the findings and minimizing it with `--signature text-secondary-oracle-mismatch:text` preserved the secondary-oracle signature. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding the secondary text oracle and tightening it to known semicolon-terminated names. +- 2026-06-11: `php -l` passed for `Bootstrap.php`, `Cli.php`, `Generator.php`, `worker.php`, `replay.php`, and `tests/harness-smoke.php` after adding the token-map structure-aware sweep mode. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --mode token-map --seed 1 --cases 764 --progress-every 764` passed for one full token-map period and reported `by_strategy: {"token-map-structure-sweep":764}` and `by_context: {"both":764}`. +- 2026-06-11: `php tools/html-decoder-fuzz/runner.php --mode token-map --lanes 2 --duration-seconds 0 --max-cases 200 --cases-per-batch 100 --summary-mode all --output-dir /tmp/html-decoder-fuzz-token-map-runner-check` passed with distinct start-case windows. +- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode token-map --seed 1 --case 0` passed for the deterministic `&AEaQQ;` large-prefix divergent case. 
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=attribute-semicolonless php tools/html-decoder-fuzz/worker.php --mode token-map --seed 1 --start-case 631 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-token-map-fault-check` reported the expected `decode-mismatch:attribute` finding; replaying and minimizing the resulting failure manifest with the same fault both succeeded. +- 2026-06-11: `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, `php tools/html-decoder-fuzz/tests/harness-smoke.php`, and `git diff --check` passed after adding the token-map mode, smoke coverage, and docs. +- 2026-06-11: Local PHP did not have the `pcov` extension installed (`php --ri pcov` reported `Extension 'pcov' not present`), so coverage-mode smoke coverage used the explicit `HTML_DECODER_FUZZ_FAKE_COVERAGE=1` provider while the real mode reports a fatal error when pcov is unavailable. +- 2026-06-11: `php -l` passed for `CoverageGuidance.php`, `Cli.php`, `worker.php`, `runner.php`, `replay.php`, and `tests/harness-smoke.php` after adding coverage mode. +- 2026-06-11: `HTML_DECODER_FUZZ_DISABLE_PCOV=1 HTML_DECODER_FUZZ_FAKE_COVERAGE=0 php tools/html-decoder-fuzz/worker.php --mode coverage --seed 1 --cases 1 --progress-every 1` exited `2` with the expected fatal `coverage mode requires pcov`. +- 2026-06-11: `HTML_DECODER_FUZZ_FAKE_COVERAGE=1 php tools/html-decoder-fuzz/worker.php --mode coverage --seed 1 --cases 8 --progress-every 8 --output-dir /tmp/html-decoder-fuzz-coverage-worker-check` passed and retained fake new-edge payloads under `coverage-corpus/`. 
+- 2026-06-11: `HTML_DECODER_FUZZ_FAKE_COVERAGE=1 php tools/html-decoder-fuzz/runner.php --mode coverage --lanes 2 --duration-seconds 0 --max-cases 40 --cases-per-batch 20 --summary-mode failures --output-dir /tmp/html-decoder-fuzz-coverage-runner-check` passed and wrote coverage state with `cases=40`, `edges=76`, `payloads=40`, and `40` coverage corpus manifests. +- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode coverage --seed 1 --case 0` passed for the deterministic coverage-mode generated case. +- 2026-06-11: `HTML_DECODER_FUZZ_FAKE_COVERAGE=1 HTML_DECODER_FUZZ_FAULT=reader-empty-chunk php tools/html-decoder-fuzz/worker.php --mode coverage --seed 1 --start-case 57 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-coverage-fault-check` reported the expected reader findings; replaying and minimizing the resulting coverage-mode failure manifest with `HTML_DECODER_FUZZ_FAULT=reader-empty-chunk` both succeeded. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, and `git diff --check` passed after adding coverage mode, fake-provider smoke coverage, and docs. +- 2026-06-11: `php -l` passed for `Checks.php`, `Oracles.php`, `Targets.php`, and `tests/harness-smoke.php` after adding the single-level decode invariant and `single-level-overdecode` fault target. +- 2026-06-11: A direct real-target probe over `pre&amp;post` returned no failures, and `php tools/html-decoder-fuzz/replay.php --mode corpus --seed 1 --case 11875` passed for the deterministic `&amp;Z` corpus-splice fixture. 
+- 2026-06-11: `HTML_DECODER_FUZZ_FAULT=single-level-overdecode php tools/html-decoder-fuzz/worker.php --mode corpus --seed 1 --start-case 11875 --cases 1 --progress-every 1 --output-dir /tmp/html-decoder-fuzz-single-level-fault-check` reported `single-level-decode-overdecoded` findings in text and attribute contexts; replaying the manifest reproduced the findings and minimizing it with `--signature single-level-decode-overdecoded:text` reduced the payload to `&amp;`. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, and `git diff --check` passed after adding single-level decode checks, smoke coverage, and docs. +- 2026-06-11: `php -l tools/html-decoder-fuzz/lib/Generator.php` and `php -l tools/html-decoder-fuzz/tests/harness-smoke.php` passed after sorting derived named-reference lists. +- 2026-06-11: `php tools/html-decoder-fuzz/replay.php --mode corpus --seed 1 --case 11875` and `php tools/html-decoder-fuzz/replay.php --mode names --seed 1 --case 11593` still passed with the expected deterministic payloads after adding explicit generator list sorting. +- 2026-06-11: `php tools/html-decoder-fuzz/tests/harness-smoke.php`, `php tools/html-decoder-fuzz/worker.php --seed 1 --cases 500 --progress-every 500`, `php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 200 --progress-every 200`, and `git diff --check` passed after addressing the cross-cutting determinism and throughput notes. + +## Review Log + +- Tier 1 item 1: + - Curie: APPROVE, determinism/API behavior. + - Dewey: APPROVE, harness and fault-injection coverage. + - Mencius: APPROVE, runtime/replay compatibility. +- Tier 1 item 2: + - Jason: APPROVE, byte generator/check semantics. + - Rawls: APPROVE, CLI/artifact compatibility after mode-aware artifact keying fix. 
+ - Sartre: APPROVE, tests and documentation after oracle-trap and bogus-mode coverage. +- Tier 1 item 3: + - Hegel: APPROVE, generator semantics after max-bytes suffix reservation fix. + - Lovelace: APPROVE, smoke coverage after strict EOF suffix-shape checks. + - Erdos: APPROVE, docs/replay compatibility after documenting generated-case mapping drift. +- Tier 1 item 4: + - Copernicus: APPROVE, invariant semantics and exception handling. + - Maxwell: APPROVE, fault-target and smoke coverage. + - Poincare: APPROVE, integration/runtime compatibility. +- Tier 1 item 5: + - Banach: APPROVE, generator and fault-target coverage. + - Meitner: APPROVE, byte-slice search semantics and JSON-safe failure details. + - Carver: APPROVE, worker/replay/minimize integration and runtime compatibility. +- Tier 1 item 6: + - Kepler: APPROVE, numeric generator ranges after post-surrogate BMP coverage fix. + - Pascal: APPROVE, numeric smoke coverage after BMP noncharacter and multi-reference fixes. + - Beauvoir: APPROVE, integration/runtime compatibility after explicit `0xFFFE`/`0xFFFF` coverage. +- Tier 2 item 7: + - Mendel: APPROVE, generator semantics and deterministic mapping after smoke additions. + - Pasteur: APPROVE, CLI/worker/replay/minimize/runner integration and mode handling. + - Popper: APPROVE, smoke and fault-pipeline coverage after requested start-window and name-fault checks. +- Tier 2 item 8: + - Hilbert: APPROVE, generator semantics and single-edit mutation filtering after sparse smoke fix. + - Sagan: APPROVE, smoke rigor after branch-specific sparse corpus coverage replaced inferred operation coverage. + - Turing: APPROVE, runtime/integration compatibility and deterministic replay behavior. +- Tier 2 item 9: + - Chandrasekhar: APPROVE, `legacy-followers` generator/mode semantics and deterministic sharding. + - Linnaeus: APPROVE, ASCII-only ambiguous follower decoder fix and PHPUnit coverage. 
+ - Leibniz: APPROVE, smoke/integration coverage for full-period sweep, runner windows, and fault pipeline. +- Tier 2 item 10: + - Gauss: APPROVE, prefix-family generator semantics after exact reference-set and replay smoke tightening. + - Peirce: APPROVE, CLI/worker/replay/runner integration and oracle-backed deterministic sharding. + - Noether: APPROVE, smoke coverage after requested exact reference and seed/case replay checks. +- Tier 2 item 11: + - Plato: APPROVE, numeric-boundary generator semantics after in-range exact-max correction and decode-outcome smoke tightening. + - Socrates: APPROVE, CLI/worker/replay/runner integration and artifact replay after the mixed-case case update. + - Volta: APPROVE, smoke/docs/progress coverage after exact-max and max-plus-one replacement assertions. +- Tier 2 item 12: + - Lagrange: APPROVE, generator semantics after semicolon follower protection and small-`max-bytes` composition fixes. + - Pauli: APPROVE, smoke coverage after exact strategy-set and delimiter-based composition assertions. + - Locke: APPROVE, integration/docs/progress accuracy after weighted composition and generalized encoder changes. +- Tier 2 item 13: + - Russell: APPROVE, corpus generator semantics after WPT attribute retention and UTF-8 boundary fixes. + - Ramanujan: APPROVE, CLI/worker/replay/runner integration and deterministic corpus replay. + - Zeno: APPROVE, smoke/docs/progress coverage after WPT sentinel and mutation-shape assertions. +- Tier 2 item 14: + - Boyle: APPROVE, reader compositionality invariant semantics and deterministic cases after pipeline coverage. + - Kuhn: APPROVE, fault-target and smoke coverage after automated worker/replay/minimize pipelines. + - Bohr: APPROVE, integration/runtime/docs/progress coverage after shared reader-path verification. +- Tier 2 item 15: + - Anscombe: APPROVE, generator semantics after independent generated-candidate and raw-helper probes. 
+ - Cicero: APPROVE, smoke and deterministic fault fixture coverage after direct lowercase/uppercase helper checks replaced ambiguous source inference. + - Parfit: APPROVE, integration/docs/progress scope and generated-case mapping drift notes. +- Tier 3 item 16: + - Singer: APPROVE, production decoder semantics and PHPUnit regression coverage. + - Darwin: APPROVE, fuzzer invariant, fault target, and smoke pipeline coverage. + - Harvey: APPROVE, integration/docs/progress scope including the decoder fix exposed by the invariant. +- Tier 3 item 17: + - Lorentz: APPROVE, non-ampersand reader-offset invariant semantics and byte-safety. + - Arendt: APPROVE, fault target, smoke pipeline, and PHPUnit coverage. + - Gibbs: APPROVE, integration/docs/progress scope and commit boundaries. +- Tier 3 item 18: + - Wegener: APPROVE, no-amp identity invariant semantics after README wording correction. + - Descartes: APPROVE, attribute no-amp fault target, smoke pipeline, and docs after stale Checks doc fix. + - Hypatia: APPROVE, integration/progress scope and commit boundaries after README wording correction. +- Tier 3 item 19: + - Galileo: APPROVE, generator alphabet semantics and oracle-safety after explicit whitespace wording. + - Bacon: APPROVE, smoke coverage and docs after replacing broad HTML-whitespace wording. + - Euclid: APPROVE, integration/progress scope and commit boundaries after explicit tab/LF/FF wording. +- Tier 3 item 20: + - Carson: APPROVE, gapless reader-walk invariant semantics and failure signature stability. + - Herschel: APPROVE, span-drop fault target, smoke pipeline, and docs. + - Bernoulli: APPROVE, integration/progress scope and commit boundaries. +- Tier 3 item 21: + - Einstein: APPROVE, invalid numeric replacement invariant semantics and signature stability. + - Confucius: APPROVE, invalid numeric fault target, smoke pipeline, and docs. + - Aristotle: APPROVE, integration/progress scope and commit boundaries. 
+- Tier 3 item 22: + - McClintock: APPROVE, numeric C1 remap and raw C1 pass-through invariant semantics. + - Averroes: APPROVE, fault targets and smoke coverage after adding raw-C1 byte worker/replay/signature-pinned minimize coverage. + - Heisenberg: APPROVE, docs/progress scope and commit boundaries. +- Tier 3 item 23: + - Boole: APPROVE, secondary text oracle semantics and support gating. + - Ampere: APPROVE, secondary-oracle check, fault target, and smoke pipeline coverage. + - Feynman: APPROVE, docs/progress scope and commit boundaries. +- Tier 3 item 24: + - Hooke: APPROVE, token-map extraction and generator semantics after verifying name extraction, deterministic coverage, oracle-safety, and default mapping stability. + - Nash: APPROVE, CLI/worker/replay/runner integration and mode-aware failure artifact behavior. + - Goodall: APPROVE, smoke/docs/progress coverage and commit scope after full smoke and targeted token-map verification. +- Tier 3 item 25: + - Helmholtz: APPROVE, coverage-guidance and pcov semantics after static pcov-path review plus fake-provider verification on this no-pcov runtime. + - Laplace: APPROVE, worker/runner/replay/minimize integration and coverage-corpus artifact safety after duplicate-pruning verification. + - Nietzsche: APPROVE, smoke/docs/progress scope with explicit no-pcov residual-risk note and fake-provider coverage checks. +- Tier 3 item 26: + - Fermat: APPROVE, single-level decode invariant semantics and oracle-free byte-mode narrowness. + - Newton: APPROVE, fault target and worker/replay/minimize integration after README fault-target docs fix. + - Euler: APPROVE, docs/progress scope after README self-test and fault-target list updates. +- Cross-cutting concerns: + - Ohm: APPROVE, generator derived-list sorting and default mapping stability. + - Archimedes: APPROVE, injected-order smoke coverage and verification scope. + - Faraday: APPROVE, README throughput note, progress accuracy, and commit scope. 
diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index d902f4b7cabc4..f9402b86d33ad 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -60,17 +60,23 @@ public static function attribute_starts_with( $haystack, $search_text, $case_sen continue; } - // If there is a character reference, then the decoded value must exactly match what follows in the search string. - if ( 0 !== substr_compare( $search_text, $next_chunk, $search_at, strlen( $next_chunk ), $loose_case ) ) { + /* + * If there is a character reference, then the decoded value must + * match what follows in the search string. The search string may + * end within a multi-code-point replacement, such as `<⃒` + * decoding to `<⃒`, and still be a prefix match. + */ + $match_length = min( strlen( $next_chunk ), $search_length - $search_at ); + if ( 0 !== substr_compare( $search_text, $next_chunk, $search_at, $match_length, $loose_case ) ) { return false; } // The character reference matched, so continue checking. $haystack_at += $token_length; - $search_at += strlen( $next_chunk ); + $search_at += $match_length; } - return true; + return $search_at === $search_length; } /** @@ -361,7 +367,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat $name_length = 0; $replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length ); - if ( false === $replacement ) { + if ( null === $replacement ) { return null; } @@ -378,12 +384,14 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat * character reference table but the match doesn't end in `;`. * It may be allowed if it's followed by something unambiguous. */ + $follower_byte = $after_name < $length ? 
ord( $text[ $after_name ] ) : null; $ambiguous_follower = ( - $after_name < $length && - $name_at < $length && + null !== $follower_byte && ( - ctype_alnum( $text[ $after_name ] ) || - '=' === $text[ $after_name ] + ( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) || + ( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) || + ( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) || + 0x3D === $follower_byte ) ); diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 97954f4eb3e30..f51b25bc9a88f 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -61,6 +61,126 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { $this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' ); } + /** + * Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes. + */ + public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower() { + $raw_attribute = "Á\xC2\x80"; + + $this->assertSame( + "\xC3\x81\xC2\x80", + WP_HTML_Decoder::decode_attribute( $raw_attribute ), + 'Should have decoded the semicolonless legacy reference before a multibyte follower.' + ); + + $match_byte_length = null; + $this->assertSame( + "\xC3\x81", + WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ), + 'Should have matched the semicolonless legacy reference before a multibyte follower.' + ); + $this->assertSame( strlen( 'Á' ), $match_byte_length ); + } + + /** + * Ensures semicolonless legacy references remain ambiguous before ASCII alnum or equals. + * + * @dataProvider data_ambiguous_ascii_attribute_followers + * + * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower. 
+ */ + public function test_semicolonless_legacy_reference_before_ascii_attribute_follower_is_ambiguous( $raw_attribute ) { + $this->assertSame( + $raw_attribute, + WP_HTML_Decoder::decode_attribute( $raw_attribute ), + 'Should not have decoded an ambiguous semicolonless legacy reference.' + ); + + $match_byte_length = 'sentinel'; + $this->assertNull( + WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ), + 'Should not have matched an ambiguous semicolonless legacy reference.' + ); + $this->assertSame( 'sentinel', $match_byte_length ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public static function data_ambiguous_ascii_attribute_followers() { + return array( + 'ASCII digit' => array( 'Á0' ), + 'ASCII uppercase alpha' => array( 'ÁA' ), + 'ASCII lowercase alpha' => array( 'Áa' ), + 'equals' => array( 'Á=' ), + ); + } + + /** + * Ensures unmatched named character references leave the by-ref match length unchanged. + * + * @dataProvider data_unmatched_named_character_references + * + * @param string $context Decoder context. + * @param string $raw_text_node Raw text containing an unmatched named character reference. + */ + public function test_unmatched_named_character_reference_does_not_set_match_byte_length( $context, $raw_text_node ) { + $match_byte_length = 'sentinel'; + $this->assertNull( + WP_HTML_Decoder::read_character_reference( $context, $raw_text_node, 0, $match_byte_length ), + 'Should not have matched an unmatched named character reference.' + ); + $this->assertSame( 'sentinel', $match_byte_length ); + } + + /** + * Data provider. + * + * @return array[]. 
+ */ + public static function data_unmatched_named_character_references() { + return array( + 'text invalid name' => array( 'data', '&bogus;' ), + 'text invalid short-name candidate' => array( 'data', '&Fv=q' ), + 'attribute invalid name' => array( 'attribute', '&bogus;' ), + 'attribute invalid short-name candidate' => array( 'attribute', '&Fv=q' ), + ); + } + + /** + * Ensures non-ampersand offsets never match character references. + * + * @dataProvider data_non_ampersand_character_reference_offsets + * + * @param string $context Decoder context. + * @param string $raw_text_node Raw text containing a character reference away from offset. + * @param int $offset Offset that does not point at an ampersand. + */ + public function test_non_ampersand_offset_does_not_set_match_byte_length( $context, $raw_text_node, $offset ) { + $match_byte_length = 'sentinel'; + $this->assertNull( + WP_HTML_Decoder::read_character_reference( $context, $raw_text_node, $offset, $match_byte_length ), + 'Should not have matched a character reference away from an ampersand.' + ); + $this->assertSame( 'sentinel', $match_byte_length ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public static function data_non_ampersand_character_reference_offsets() { + return array( + 'text before reference' => array( 'data', 'a&b', 0 ), + 'text inside reference name' => array( 'data', 'a&b', 2 ), + 'attribute before reference' => array( 'attribute', 'a&b', 0 ), + 'attribute inside reference name' => array( 'attribute', 'a&b', 2 ), + ); + } + /** * Ensures proper detection of attribute prefixes ignoring ASCII case. 
* @@ -161,6 +281,11 @@ public static function data_attributes_with_prefix_and_case_sensitive_match() { array( 'http://wordpress.org', 'Http', 'ascii-case-insensitive', true ), array( 'http://wordpress.org', 'https', 'case-sensitive', false ), array( 'http://wordpress.org', 'https', 'ascii-case-insensitive', false ), + array( '', 'http', 'case-sensitive', false ), + array( 'jav', 'javascript:', 'case-sensitive', false ), + array( 'jav', 'javascript:', 'ascii-case-insensitive', false ), + array( '<⃒script', '<', 'case-sensitive', true ), + array( '>⃒script', '>', 'case-sensitive', true ), ); } } diff --git a/tools/html-decoder-fuzz/README.md b/tools/html-decoder-fuzz/README.md new file mode 100644 index 0000000000000..9eb6df2f76a48 --- /dev/null +++ b/tools/html-decoder-fuzz/README.md @@ -0,0 +1,347 @@ +# WP_HTML_Decoder Fuzzer + +Differential fuzzer for `WP_HTML_Decoder`: + +- `decode_text_node()` +- `decode_attribute()` +- `read_character_reference()` +- `attribute_starts_with()` + +The fuzzer runs in a bare PHP process. It loads only `WP_Token_Map`, the +generated HTML5 named-character-reference map, and `WP_HTML_Decoder`; it does +not bootstrap WordPress, a database, browsers, Node, or `wp-env`. + +## Requirements + +- PHP 8.4+ with `Dom\HTMLDocument` +- `mbstring` +- `pcov` for `coverage` mode +- Run from the repository root + +## Oracle + +The primary oracle is PHP's HTML5 parser: + +- Text context: parse `
<div>PAYLOAD</div>
` and read the + div's `textContent`. +- Attribute context: parse `<div title="PAYLOAD"></div>
` and read + `getAttribute( 'title' )`. + +For text context, `html_entity_decode( ENT_HTML5 | ENT_QUOTES, 'UTF-8' )` also +runs as a secondary oracle on payloads whose references it supports: +known semicolon-terminated named references and literal text. Numeric +references, unknown named-looking references, and semicolonless named-looking +references stay with the DOM oracle and fuzzer +invariants, because `html_entity_decode()` does not implement those parser +states. `html_entity_decode()` is deliberately not used as the primary oracle or +as an attribute-context oracle because it does not implement the HTML +attribute-context rule for semicolonless named references followed by `=` or an +alphanumeric byte. + +In the default `oracle` mode, the generator neutralizes parser-vs-decoder +confounders by producing valid UTF-8 payloads with no raw `<`, no raw double +quote, no CR, and no NUL. This keeps the DOM parser focused on +character-reference decoding instead of tag structure, attribute termination, +input-preprocessing newline normalization, or NUL substitution. + +The separate `bytes` mode deliberately generates arbitrary byte payloads, +including invalid UTF-8, NUL, raw `<`, raw double quote, and CR. These payloads +never go to the DOM oracle. They run only oracle-free decoder invariants. + +The separate `names` mode deterministically sweeps every generated named +character reference base name with and without `;`, followed by representative +end, alphanumeric, equals, punctuation, whitespace, and multibyte followers. + +## Checks + +For each generated payload, the fuzzer runs both text and attribute contexts: + +1. Compare `decode_text_node()` or `decode_attribute()` to the DOM oracle, and + compare supported text payloads to the secondary `html_entity_decode()` + oracle. +2. Rebuild the decoded string with repeated `read_character_reference()` calls + plus literal spans, then compare it to the high-level decoder. +3. 
Assert every matched character reference reports a nonempty chunk, a byte + length of at least two, no input overrun, and the same chunk/length when + the matched slice is read again at offset zero. +4. Check `attribute_starts_with()` against the decoded attribute prefix for + ASCII search strings in both case-sensitive and ASCII-case-insensitive modes, + leading byte-slice prefixes that can end inside UTF-8 replacements, and + monotonic prefix, extension, and case-sensitivity invariants. +5. Assert decoded output is valid UTF-8. +6. Assert known nested ampersand fixtures decode exactly one level, so + `&amp;amp;` decodes to `&amp;` rather than `&`. +7. Assert text and attribute payloads without `&` are identity decodes. + +In `bytes` mode, checks 1, 4, and 5 are skipped because they depend on +DOM-safe UTF-8 payloads or a DOM-derived decoded attribute value. The lane keeps +the reader rebuild, advance/overrun, single-level decode, and no-`&` identity +checks for both text and attribute contexts. + +Decoding is not treated as idempotent. The checks and smoke suite explicitly +verify this with nested ampersand-reference fixtures. + +## Generator + +Every case is determined by `(seed, case index)`. Generated cases run in both +text and attribute contexts so the same payload exercises semicolonless and +attribute-disambiguation differences side by side. The generator preserves the +former context PRNG draw, so the earlier both-context lane change did not by +itself shift payload mapping. +Named-reference lists derived from the generated token map, or injected for +tests, are sorted by length and byte value before case-index mapping, so token +map storage order and caller array order do not affect generated payloads. +Adding or reweighting generation strategies intentionally changes future +`--seed --case` payload mapping; failure-manifest replay remains stable because +manifests store `payload_base64`. 
+ +The generator uses the real generated named-reference map, with weighted +strategies for: + +- exact named references +- semicolonless legacy references +- attribute-context ambiguous followers +- numeric decimal and hex references drawn from ranges covering C0 controls, + all C1 controls, surrogates, BMP and per-plane noncharacters, astral values, + above-Unicode values with legal digit counts, digit-count overflow, zero-only + references, and leading zeros +- adjacent references +- truncation sweeps +- references ending at EOF, including bare introducers, partial numeric + references, semicolonless numeric references, and truncated names +- multibyte UTF-8 around references +- `attribute_starts_with()` prefixes generated by encoding target strings + per character as literals, decimal references, hex references, leading-zero + references, and semicolonless references +- `attribute_starts_with()` prefixes that split multi-code-point named-reference + replacements such as `<⃒` +- edit-distance-1 named-reference lookalikes plus ampersand boundaries +- valid named-reference names with letter case mangled into case-sensitive + near-misses +- composed cases that splice two or three generated strategy outputs +- plain no-ampersand text from an oracle-safe alphabet that includes space, tab, + LF, and FF + +`bytes` mode uses separate weighted strategies for uniform random bytes, +no-ampersand byte strings, arbitrary bytes around `&` boundaries, invalid UTF-8 +sequences, and raw HTML delimiters/control bytes. + +`names` mode uses a deterministic sweep instead of weighted random generation. +Case index maps directly to a named-reference base, semicolon variant, and +follower class; the same payload still runs in both text and attribute contexts. + +`legacy-followers` mode deterministically sweeps every semicolonless legacy +name followed by each oracle-safe ASCII byte, plus valid UTF-8 sequences +covering multibyte lead and continuation byte values. 
+ +`prefix-families` mode deterministically sweeps known named-reference prefix +families such as `¬`/`∉`/`∉` and `≫⃒`/`≯`, truncating +each reference at every byte split and appending ambiguous followers. + +`numeric-boundaries` mode deterministically sweeps decimal and hex numeric +references at the decoder's maximum significant-digit count and one digit past +it, with and without leading zeros, semicolons, and mixed-case hex digits. + +`corpus` mode mutates a seed corpus built from retained decoder payloads, the +oracle battery, and html5lib entity vectors. Mutations splice corpus fragments, +perturb bytes within the oracle-safe alphabet, including space, tab, LF, and FF, +add or remove semicolons, and duplicate references to diversify structure beyond +the grammar. + +`token-map` mode deterministically sweeps the generated `WP_Token_Map` layout: +large-word group prefixes with names that diverge immediately after the shared +two-byte prefix, every small-word boundary name, and every large-word name at +the small/large length boundary. + +`coverage` mode runs the normal oracle-safe generator under pcov and treats each +new covered executable line in `WP_HTML_Decoder` or `WP_Token_Map` as a coverage +edge. Workers emit `coverage` events for payloads that discover new edges and +write those payloads under `coverage-corpus/` when an output directory is +provided. The runner deduplicates coverage edges across lanes and prunes +duplicate coverage-corpus artifacts. 
+ +## Common Commands + +Run the smoke test: + +```sh +php tools/html-decoder-fuzz/tests/harness-smoke.php +``` + +Run one worker batch: + +```sh +php tools/html-decoder-fuzz/worker.php --seed 1 --cases 5000 +``` + +Run one oracle-free arbitrary-byte worker batch: + +```sh +php tools/html-decoder-fuzz/worker.php --mode bytes --seed 1 --cases 5000 +``` + +Run one deterministic named-reference sweep batch: + +```sh +php tools/html-decoder-fuzz/worker.php --mode names --seed 1 --cases 5000 +``` + +Run one deterministic legacy-follower sweep batch: + +```sh +php tools/html-decoder-fuzz/worker.php --mode legacy-followers --seed 1 --cases 5000 +``` + +Run one deterministic prefix-family sweep batch: + +```sh +php tools/html-decoder-fuzz/worker.php --mode prefix-families --seed 1 --cases 5000 +``` + +Run one deterministic numeric-boundary sweep batch: + +```sh +php tools/html-decoder-fuzz/worker.php --mode numeric-boundaries --seed 1 --cases 5000 +``` + +Run one corpus mutation batch: + +```sh +php tools/html-decoder-fuzz/worker.php --mode corpus --seed 1 --cases 5000 +``` + +Run one token-map structure sweep batch: + +```sh +php tools/html-decoder-fuzz/worker.php --mode token-map --seed 1 --cases 5000 +``` + +Run one coverage-guided batch: + +```sh +php -d pcov.enabled=1 -d pcov.directory=src/wp-includes tools/html-decoder-fuzz/worker.php --mode coverage --seed 1 --cases 5000 --output-dir /tmp/html-decoder-fuzz-coverage +``` + +Run parallel lanes for one minute: + +```sh +php tools/html-decoder-fuzz/runner.php --lanes 4 --duration-seconds 60 +``` + +Oracle modes spend most of their time in the two DOM parser calls per payload, +not in the PRNG. Scale long oracle runs with more lanes; the oracle-free `bytes` +mode avoids that DOM cost, and future high-throughput oracle work should batch +payloads into fewer documents or cache repeated sub-payloads. 
+ +Run indefinitely: + +```sh +php tools/html-decoder-fuzz/runner.php --lanes 8 --duration-seconds 0 --max-cases 0 +``` + +Long runs keep disk use bounded by default. The runner records aggregate +counters in `state.json`, writes only newly retained failure exemplars plus +oracle/fatal events to `summary.ndjson`, and retains at most five failure +artifact directories for each distinct signature. Per-lane stderr logs are +capped at 64 KiB each, including reused output directories with existing +oversized lane logs. Repeated over-cap failures remain counted in `state.json` +without growing the event log. + +When startup verification is unavailable, the runner preserves complete +existing artifacts instead of pruning them to the cap; without the verifier it +cannot safely distinguish stale or fake full-shape manifests from valuable +findings. + +Useful retention options: + +```sh +# Preserve the previous verbose event log. +php tools/html-decoder-fuzz/runner.php --summary-mode all + +# Keep only one on-disk exemplar per signature. +php tools/html-decoder-fuzz/runner.php --max-artifacts-per-signature 1 + +# Prune every failure artifact and rely on state counters/signatures. +php tools/html-decoder-fuzz/runner.php --artifact-retention none + +# Keep every failure artifact for a short diagnostic run. +php tools/html-decoder-fuzz/runner.php --artifact-retention all + +# Raise or disable per-lane stderr capture. 
+php tools/html-decoder-fuzz/runner.php --max-stderr-bytes 262144 +php tools/html-decoder-fuzz/runner.php --max-stderr-bytes 0 +``` + +Replay a failure, an input file, or a generated case: + +```sh +php tools/html-decoder-fuzz/replay.php --failure artifacts/html-decoder-fuzz/run-.../failure-seedS-caseN/failure.json +php tools/html-decoder-fuzz/replay.php --input payload.txt --context attribute +php tools/html-decoder-fuzz/replay.php --seed 123 --case 45 +php tools/html-decoder-fuzz/replay.php --mode bytes --seed 123 --case 45 +``` + +Minimize a failure while preserving its signature: + +```sh +php tools/html-decoder-fuzz/minimize.php --failure artifacts/html-decoder-fuzz/run-.../failure-seedS-caseN/failure.json +``` + +Exit codes everywhere: `0` clean, `1` findings, `2` harness error. + +## Artifacts + +The runner writes under `artifacts/html-decoder-fuzz/run-*` by default: + +- `summary.ndjson` with retained failure exemplars plus oracle/fatal events by + default (`--summary-mode all` preserves every worker event; + `--summary-mode none` disables the file) +- `state.json` with aggregate counters, stop reason, Git metadata, and failure + seeds for retained exemplars, including retained/pruned artifact counts by + signature +- per-lane stderr logs, capped by `--max-stderr-bytes` +- retained failure directories containing `payload.txt` and a self-contained + `failure.json` with base64 payload, context, signatures, failure details, + full expected/got output as base64 for differential failures, environment + metadata, and Git metadata. By default retention is capped per signature; + use `--artifact-retention all` to keep every directory. 
+ +## Harness Self-Test + +`tests/harness-smoke.php` verifies the DOM oracle battery, real target behavior +on the battery, generator determinism and safety, a short real fuzz run, and +mutation-tested broken targets: + +- C1 numeric references not remapped through the Windows-1252 table, and raw + C1 bytes not passing through unchanged +- supported text payloads disagreeing with the secondary `html_entity_decode()` + oracle +- nested ampersand references being decoded more than one level +- zero, surrogate, and above-Unicode numeric references not decoding to exactly + U+FFFD +- semicolonless named references decoded in attributes despite ambiguous + followers +- off-by-one `read_character_reference()` match lengths +- empty `read_character_reference()` chunks, one-byte matches, + null-return match-length mutations, non-ampersand offset matches, + non-compositional local slice reads, and non-gapless reader walks +- partial-prefix `attribute_starts_with()` matches +- partial multi-code-point `attribute_starts_with()` replacement matches +- non-monotonic `attribute_starts_with()` prefix, extension, and + case-sensitivity results +- safe attribute payloads and raw byte payloads without `&` not decoding to + themselves unchanged + +For end-to-end failure-pipeline checks, set `HTML_DECODER_FUZZ_FAULT` to one of +`skip-c1-remap`, `numeric-c1-not-remapped`, `raw-c1-not-pass-through`, +`text-secondary-oracle`, `numeric-invalid-not-replacement`, +`attribute-semicolonless`, `match-length-off-by-one`, +`reader-empty-chunk`, `reader-short-match-length`, +`reader-substring-composition`, `reader-null-mutates-match-length`, +`reader-non-amp-match`, `reader-gapless-drop-span`, +`attribute-no-amp-identity`, `byte-no-amp-identity`, +`single-level-overdecode`, +`attribute-prefix-monotonicity`, +`attribute-extension-monotonicity`, `attribute-case-monotonicity`, or +`attribute-multicodepoint-prefix` before running `worker.php`, `runner.php`, +`replay.php`, or `minimize.php`. 
diff --git a/tools/html-decoder-fuzz/lib/Bootstrap.php b/tools/html-decoder-fuzz/lib/Bootstrap.php new file mode 100644 index 0000000000000..5096937a8951d --- /dev/null +++ b/tools/html-decoder-fuzz/lib/Bootstrap.php @@ -0,0 +1,166 @@ +, + * small_names: string[] + * } + */ + public static function named_reference_structure(): array { + static $structure = null; + if ( null !== $structure ) { + return $structure; + } + + self::load_targets(); + + global $html5_named_character_references; + $map = $html5_named_character_references; + + $reflection = new \ReflectionObject( $map ); + $get = static function ( string $property ) use ( $reflection, $map ) { + $ref = $reflection->getProperty( $property ); + $ref->setAccessible( true ); + return $ref->getValue( $map ); + }; + + $key_length = (int) $get( 'key_length' ); + $groups = (string) $get( 'groups' ); + $large_words = (array) $get( 'large_words' ); + $small_words = (string) $get( 'small_words' ); + $large_names_by_key = array(); + $large_names_by_prefix = array(); + $small_names_by_key = array(); + $group_stride = $key_length + 1; + $groups_length = strlen( $groups ); + + for ( $group_at = 0, $group_index = 0; $group_at + $key_length <= $groups_length; $group_at += $group_stride, ++$group_index ) { + $prefix = substr( $groups, $group_at, $key_length ); + if ( '' === $prefix || ! isset( $large_words[ $group_index ] ) ) { + continue; + } + + $row = $large_words[ $group_index ]; + $row_at = 0; + while ( $row_at < strlen( $row ) ) { + $token_length = unpack( 'C', $row[ $row_at++ ] )[1]; + $token = substr( $row, $row_at, $token_length ); + $row_at += $token_length; + + $mapping_length = unpack( 'C', $row[ $row_at++ ] )[1]; + $row_at += $mapping_length; + + $name = $prefix . 
$token; + $large_names_by_key[ $name ] = true; + $large_names_by_prefix[ $prefix ][ $name ] = true; + } + } + + for ( $at = 0; $at < strlen( $small_words ); $at += $group_stride ) { + $name = rtrim( substr( $small_words, $at, $group_stride ), "\x00" ); + if ( '' !== $name ) { + $small_names_by_key[ $name ] = true; + } + } + + $group_prefixes = array_keys( $large_names_by_prefix ); + sort( $group_prefixes, SORT_STRING ); + + $large_names = array_keys( $large_names_by_key ); + $small_names = array_keys( $small_names_by_key ); + self::sort_names( $large_names ); + self::sort_names( $small_names ); + foreach ( $large_names_by_prefix as $prefix => $prefix_names_by_key ) { + $prefix_names = array_keys( $prefix_names_by_key ); + self::sort_names( $prefix_names ); + $large_names_by_prefix[ $prefix ] = $prefix_names; + } + ksort( $large_names_by_prefix, SORT_STRING ); + + $structure = array( + 'key_length' => $key_length, + 'group_prefixes' => $group_prefixes, + 'large_names' => $large_names, + 'large_names_by_prefix' => $large_names_by_prefix, + 'small_names' => $small_names, + ); + + return $structure; + } + + /** + * @param string[] $names + */ + private static function sort_names( array &$names ): void { + usort( + $names, + static function ( string $a, string $b ): int { + return strlen( $b ) <=> strlen( $a ) ?: strcmp( $a, $b ); + } + ); + } +} +} diff --git a/tools/html-decoder-fuzz/lib/Checks.php b/tools/html-decoder-fuzz/lib/Checks.php new file mode 100644 index 0000000000000..5c9015121c120 --- /dev/null +++ b/tools/html-decoder-fuzz/lib/Checks.php @@ -0,0 +1,1016 @@ + '&', + '&lt;' => '<', + '&#58;' => ':', + '&#x3a;' => ':', + ); + + private Oracles $oracles; + + /** @var array */ + private array $targets; + + public function __construct( Oracles $oracles, ?array $targets = null ) { + $this->oracles = $oracles; + $this->targets = $targets ?? 
Targets::resolve(); + } + + /** + * @return array + */ + public function run( string $context, string $payload ): array { + $failures = array(); + + if ( ! Generator::is_oracle_safe_payload( $payload ) ) { + return array( + self::failure( + 'unsafe-oracle-payload', + $context, + array( + 'context' => $context, + 'payload' => self::preview( $payload ), + ) + ), + ); + } + + $contexts = 'both' === $context ? array( 'text', 'attribute' ) : array( $context ); + + foreach ( $contexts as $one_context ) { + $failures = array_merge( $failures, $this->check_decode_context( $one_context, $payload ) ); + } + + $failures = array_merge( $failures, $this->check_attribute_starts_with( $payload ) ); + + return $failures; + } + + /** + * @return array + */ + public function run_without_oracle( string $context, string $payload ): array { + $failures = array(); + $contexts = 'both' === $context ? array( 'text', 'attribute' ) : array( $context ); + + foreach ( $contexts as $one_context ) { + $failures = array_merge( $failures, $this->check_decode_context_without_oracle( $one_context, $payload ) ); + } + + return $failures; + } + + /** + * @return array + */ + private function check_decode_context( string $context, string $payload ): array { + $failures = array(); + + try { + $expected = $this->oracles->decode( $context, $payload ); + } catch ( \Throwable $error ) { + return array( + self::failure( + 'oracle-exception', + $context, + array( + 'context' => $context, + 'class' => get_class( $error ), + 'message' => $error->getMessage(), + ) + ), + ); + } + + $target_key = 'text' === $context ? 
'decode_text' : 'decode_attribute'; + try { + $got = ( $this->targets[ $target_key ] )( $payload ); + } catch ( \Throwable $error ) { + return array( + self::failure( + 'target-exception', + "{$context}:decode", + array( + 'context' => $context, + 'target' => $target_key, + 'class' => get_class( $error ), + 'message' => $error->getMessage(), + ) + ), + ); + } + + if ( $got !== $expected ) { + $failures[] = self::failure( + 'decode-mismatch', + $context, + self::diff_detail( $context, $expected, $got ) + ); + } + + $single_level_expected = self::single_level_decode_expected( $payload ); + if ( null !== $single_level_expected && $got !== $single_level_expected ) { + $failures[] = self::failure( + 'single-level-decode-overdecoded', + $context, + array_merge( + self::diff_detail( $context, $single_level_expected, $got ), + self::byte_detail( 'payload', $payload ) + ) + ); + } + + if ( 'text' === $context ) { + try { + $entity_decode_expected = $this->oracles->decode_text_with_entity_decode( $payload ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'oracle-exception', + 'text:entity-decode', + array( + 'context' => $context, + 'oracle' => 'entity-decode', + 'class' => get_class( $error ), + 'message' => $error->getMessage(), + ) + ); + $entity_decode_expected = null; + } + + if ( null !== $entity_decode_expected && $got !== $entity_decode_expected ) { + $failures[] = self::failure( + 'text-secondary-oracle-mismatch', + $context, + array_merge( + self::diff_detail( $context, $entity_decode_expected, $got ), + array( + 'secondary_oracle' => 'html_entity_decode', + 'dom_expected_base64' => base64_encode( $expected ), + ) + ) + ); + } + } + + if ( ! mb_check_encoding( $got, 'UTF-8' ) ) { + $failures[] = self::failure( + 'decoded-not-valid-utf8', + $context, + array( + 'context' => $context, + 'decoded' => self::preview( $got ), + ) + ); + } + + if ( ! 
str_contains( $payload, '&' ) && $got !== $payload ) { + $failures[] = self::failure( + "{$context}-without-ampersand-not-identity", + $context, + self::diff_detail( $context, $payload, $got ) + ); + } + + $reader = $this->decode_with_reader( $context, $payload ); + foreach ( $reader['failures'] as $failure ) { + $failures[] = $failure; + } + + if ( $reader['decoded'] !== $got ) { + $failures[] = self::failure( + 'reader-decode-mismatch', + $context, + self::diff_detail( $context, $got, $reader['decoded'] ) + ); + } + + return $failures; + } + + /** + * @return array + */ + private function check_decode_context_without_oracle( string $context, string $payload ): array { + $failures = array(); + $target_key = 'text' === $context ? 'decode_text' : 'decode_attribute'; + + try { + $got = ( $this->targets[ $target_key ] )( $payload ); + } catch ( \Throwable $error ) { + return array( + self::failure( + 'target-exception', + "{$context}:decode", + array( + 'context' => $context, + 'target' => $target_key, + 'class' => get_class( $error ), + 'message' => $error->getMessage(), + ) + ), + ); + } + + if ( ! str_contains( $payload, '&' ) && $got !== $payload ) { + $failures[] = self::failure( + "{$context}-without-ampersand-not-identity", + $context, + self::diff_detail( $context, $payload, $got ) + ); + } + + if ( ! 
str_contains( $payload, '&' ) && self::contains_raw_c1_byte( $payload ) && $got !== $payload ) { + $failures[] = self::failure( + 'raw-c1-not-pass-through', + $context, + self::diff_detail( $context, $payload, $got ) + ); + } + + $single_level_expected = self::single_level_decode_expected( $payload ); + if ( null !== $single_level_expected && $got !== $single_level_expected ) { + $failures[] = self::failure( + 'single-level-decode-overdecoded', + $context, + array_merge( + self::diff_detail( $context, $single_level_expected, $got ), + self::byte_detail( 'payload', $payload ) + ) + ); + } + + $reader = $this->decode_with_reader( $context, $payload ); + foreach ( $reader['failures'] as $failure ) { + $failures[] = $failure; + } + + if ( $reader['decoded'] !== $got ) { + $failures[] = self::failure( + 'reader-decode-mismatch', + $context, + self::diff_detail( $context, $got, $reader['decoded'] ) + ); + } + + return $failures; + } + + /** + * @return array{decoded: string, failures: array} + */ + private function decode_with_reader( string $context, string $payload ): array { + $decoder_context = 'text' === $context ? 
'data' : 'attribute'; + $decoded = ''; + $failures = array(); + $end = strlen( $payload ); + $at = 0; + $was_at = 0; + $walk_at = 0; + $walk_spans = array(); + + $failures = array_merge( $failures, $this->check_reader_non_amp_offsets( $context, $decoder_context, $payload ) ); + + while ( $at < $end ) { + $amp_at = strpos( $payload, '&', $at ); + if ( false === $amp_at ) { + break; + } + + $match_byte_length = self::MATCH_BYTE_LENGTH_SENTINEL; + try { + $chunk = ( $this->targets['read_character_reference'] )( $decoder_context, $payload, $amp_at, $match_byte_length ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + "{$context}:read-character-reference", + array( + 'context' => $context, + 'class' => get_class( $error ), + 'message' => $error->getMessage(), + ) + ); + break; + } + + if ( null === $chunk ) { + if ( self::MATCH_BYTE_LENGTH_SENTINEL !== $match_byte_length ) { + $failures[] = self::failure( + 'reader-mutated-match-length-on-null', + $context, + array( + 'context' => $context, + 'at' => $amp_at, + 'match_byte_length' => $match_byte_length, + 'match_byte_length_type' => gettype( $match_byte_length ), + ) + ); + break; + } + if ( $walk_at < $amp_at + 1 ) { + $walk_spans[] = array( + 'type' => 'literal', + 'start' => $walk_at, + 'end' => $amp_at + 1, + ); + $walk_at = $amp_at + 1; + } + $at = $amp_at + 1; + continue; + } + + if ( ! 
is_int( $match_byte_length ) || $match_byte_length <= 0 ) { + $failures[] = self::failure( + 'reader-did-not-advance', + $context, + array( + 'context' => $context, + 'at' => $amp_at, + 'match_byte_length' => $match_byte_length, + ) + ); + break; + } + + if ( '' === $chunk ) { + $failures[] = self::failure( + 'reader-returned-empty-chunk', + $context, + array( + 'context' => $context, + 'at' => $amp_at, + 'match_byte_length' => $match_byte_length, + ) + ); + break; + } + + if ( $match_byte_length < 2 ) { + $failures[] = self::failure( + 'reader-match-too-short', + $context, + array( + 'context' => $context, + 'at' => $amp_at, + 'match_byte_length' => $match_byte_length, + ) + ); + break; + } + + if ( $amp_at + $match_byte_length > $end ) { + $failures[] = self::failure( + 'reader-overran-input', + $context, + array( + 'context' => $context, + 'at' => $amp_at, + 'match_byte_length' => $match_byte_length, + 'input_length' => $end, + ) + ); + break; + } + + if ( $walk_at < $amp_at ) { + $walk_spans[] = array( + 'type' => 'literal', + 'start' => $walk_at, + 'end' => $amp_at, + ); + } + $walk_spans[] = array( + 'type' => 'reference', + 'start' => $amp_at, + 'end' => $amp_at + $match_byte_length, + ); + $walk_at = $amp_at + $match_byte_length; + + $reference = substr( $payload, $amp_at, $match_byte_length ); + $numeric_c1_replacement = self::numeric_c1_replacement( $reference ); + if ( null !== $numeric_c1_replacement && $numeric_c1_replacement !== $chunk ) { + $failures[] = self::failure( + 'numeric-c1-not-remapped', + $context, + array_merge( + array( + 'context' => $context, + 'at' => $amp_at, + 'expected_base64' => base64_encode( $numeric_c1_replacement ), + 'got_base64' => base64_encode( $chunk ), + 'match_byte_length' => $match_byte_length, + ), + self::byte_detail( 'reference', $reference ) + ) + ); + } + + $invalid_numeric_reason = self::invalid_numeric_replacement_reason( $reference ); + if ( null !== $invalid_numeric_reason && self::REPLACEMENT_CHARACTER !== 
$chunk ) { + $failures[] = self::failure( + 'numeric-invalid-not-replacement', + $context, + array_merge( + array( + 'context' => $context, + 'at' => $amp_at, + 'reason' => $invalid_numeric_reason, + 'expected_base64' => base64_encode( self::REPLACEMENT_CHARACTER ), + 'got_base64' => base64_encode( $chunk ), + 'match_byte_length' => $match_byte_length, + ), + self::byte_detail( 'reference', $reference ) + ) + ); + } + + $local_match_byte_length = null; + try { + $local_chunk = ( $this->targets['read_character_reference'] )( $decoder_context, $reference, 0, $local_match_byte_length ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + "{$context}:read-character-reference-local", + array( + 'context' => $context, + 'class' => get_class( $error ), + 'message' => $error->getMessage(), + ) + ); + break; + } + + if ( $local_chunk !== $chunk || $local_match_byte_length !== $match_byte_length ) { + $failures[] = self::failure( + 'reader-composition-mismatch', + $context, + array_merge( + array( + 'context' => $context, + 'at' => $amp_at, + 'match_byte_length' => $match_byte_length, + 'local_match_byte_length' => $local_match_byte_length, + 'expected_chunk_base64' => base64_encode( $chunk ), + 'local_chunk_base64' => is_string( $local_chunk ) ? 
base64_encode( $local_chunk ) : null, + 'local_chunk_type' => gettype( $local_chunk ), + ), + self::byte_detail( 'reference', $reference ) + ) + ); + } + + $decoded .= substr( $payload, $was_at, $amp_at - $was_at ); + $decoded .= $chunk; + $at = $amp_at + $match_byte_length; + $was_at = $at; + } + + if ( $was_at < $end ) { + $decoded .= substr( $payload, $was_at ); + } + + if ( array() === $failures ) { + if ( $walk_at < $end ) { + $walk_spans[] = array( + 'type' => 'literal', + 'start' => $walk_at, + 'end' => $end, + ); + } + if ( isset( $this->targets['reader_span_filter'] ) ) { + $walk_spans = ( $this->targets['reader_span_filter'] )( $walk_spans ); + } + $failures = array_merge( $failures, $this->validate_reader_walk( $context, $payload, $walk_spans ) ); + } + + return array( + 'decoded' => $decoded, + 'failures' => $failures, + ); + } + + /** + * @param array $spans + * + * @return array + */ + private function validate_reader_walk( string $context, string $payload, array $spans ): array { + $cursor = 0; + $consumed_bytes = 0; + $input_length = strlen( $payload ); + + foreach ( $spans as $index => $span ) { + if ( $span['start'] !== $cursor ) { + return array( + self::failure( + 'reader-walk-not-gapless', + $context, + array( + 'context' => $context, + 'reason' => $span['start'] < $cursor ? 'overlap' : 'gap', + 'span_index' => $index, + 'expected_start' => $cursor, + 'actual_start' => $span['start'], + 'actual_end' => $span['end'], + 'input_length' => $input_length, + 'spans' => self::preview_reader_spans( $spans ), + ) + ), + ); + } + + if ( $span['end'] < $span['start'] || $span['end'] > $input_length ) { + return array( + self::failure( + 'reader-walk-not-gapless', + $context, + array( + 'context' => $context, + 'reason' => $span['end'] < $span['start'] ? 
'negative-span' : 'overrun', + 'span_index' => $index, + 'expected_start' => $cursor, + 'actual_start' => $span['start'], + 'actual_end' => $span['end'], + 'input_length' => $input_length, + 'spans' => self::preview_reader_spans( $spans ), + ) + ), + ); + } + + $consumed_bytes += $span['end'] - $span['start']; + $cursor = $span['end']; + } + + if ( $cursor !== $input_length || $consumed_bytes !== $input_length ) { + return array( + self::failure( + 'reader-walk-not-gapless', + $context, + array( + 'context' => $context, + 'reason' => 'length-mismatch', + 'covered_until' => $cursor, + 'consumed_bytes' => $consumed_bytes, + 'input_length' => $input_length, + 'spans' => self::preview_reader_spans( $spans ), + ) + ), + ); + } + + return array(); + } + + /** + * @return array + */ + private function check_reader_non_amp_offsets( string $context, string $decoder_context, string $payload ): array { + $failures = array(); + foreach ( $this->reader_non_amp_probe_offsets( $payload ) as $offset ) { + $match_byte_length = self::MATCH_BYTE_LENGTH_SENTINEL; + try { + $chunk = ( $this->targets['read_character_reference'] )( $decoder_context, $payload, $offset, $match_byte_length ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + "{$context}:read-character-reference-non-amp", + array( + 'context' => $context, + 'at' => $offset, + 'class' => get_class( $error ), + 'message' => $error->getMessage(), + ) + ); + break; + } + + if ( null !== $chunk || self::MATCH_BYTE_LENGTH_SENTINEL !== $match_byte_length ) { + $failures[] = self::failure( + 'reader-non-amp-match', + $context, + array( + 'context' => $context, + 'at' => $offset, + 'byte_hex' => bin2hex( $payload[ $offset ] ), + 'chunk_type' => gettype( $chunk ), + 'chunk_base64' => is_string( $chunk ) ? 
base64_encode( $chunk ) : null, + 'match_byte_length' => $match_byte_length, + 'match_byte_length_type' => gettype( $match_byte_length ), + ) + ); + break; + } + } + + return $failures; + } + + /** + * @return int[] + */ + private function reader_non_amp_probe_offsets( string $payload ): array { + $length = strlen( $payload ); + if ( 0 === $length ) { + return array(); + } + + $candidates = array( 0, intdiv( $length, 2 ), $length - 1 ); + $amp_at = strpos( $payload, '&' ); + if ( false !== $amp_at ) { + $candidates[] = $amp_at - 1; + $candidates[] = $amp_at + 1; + } + + $offsets = array(); + foreach ( $candidates as $offset ) { + if ( $offset < 0 || $offset >= $length || '&' === $payload[ $offset ] ) { + continue; + } + $offsets[ $offset ] = true; + } + + return array_keys( $offsets ); + } + + /** + * @return array + */ + private function check_attribute_starts_with( string $payload ): array { + $failures = array(); + + try { + $decoded = $this->oracles->decode( 'attribute', $payload ); + } catch ( \Throwable $error ) { + return array( + self::failure( + 'oracle-exception', + 'attribute:decode-for-prefix', + array( + 'context' => 'attribute', + 'class' => get_class( $error ), + 'message' => $error->getMessage(), + ) + ), + ); + } + + $searches = $this->attribute_searches( $decoded ); + $results = array(); + $get_result = function ( string $search, string $case_sensitivity ) use ( $payload, &$failures, &$results ): ?bool { + $result_key = $case_sensitivity . "\0" . 
$search; + if ( array_key_exists( $result_key, $results ) ) { + return $results[ $result_key ]; + } + + try { + $results[ $result_key ] = ( $this->targets['attribute_starts_with'] )( $payload, $search, $case_sensitivity ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + "attribute-starts-with:{$case_sensitivity}", + array( + 'target' => 'attribute_starts_with', + 'case_sensitivity' => $case_sensitivity, + 'class' => get_class( $error ), + 'message' => $error->getMessage(), + ) + ); + $results[ $result_key ] = null; + } + + return $results[ $result_key ]; + }; + + foreach ( $searches as $search ) { + foreach ( array( 'case-sensitive', 'ascii-case-insensitive' ) as $case_sensitivity ) { + $expected = $this->expected_prefix_match( $decoded, $search, $case_sensitivity ); + $got = $get_result( $search, $case_sensitivity ); + if ( null === $got ) { + continue; + } + + if ( $got !== $expected ) { + $failures[] = self::failure( + 'attribute-starts-with-mismatch', + $case_sensitivity, + array_merge( + array( + 'case_sensitivity' => $case_sensitivity, + 'expected' => $expected, + 'got' => $got, + 'decoded' => self::preview( $decoded ), + ), + self::byte_detail( 'search', $search ) + ) + ); + } + } + } + + $monotonicity_failures = $this->check_attribute_starts_with_monotonicity( $searches, $get_result ); + $failures = array_merge( $failures, $monotonicity_failures ); + + return $failures; + } + + /** + * @return array + */ + private function check_attribute_starts_with_monotonicity( array $searches, callable $get_result ): array { + $failures = array(); + $candidates = array(); + + foreach ( $searches as $search ) { + $candidates[ $search ] = true; + foreach ( self::byte_prefixes( $search ) as $prefix ) { + $candidates[ $prefix ] = true; + } + } + + $case_sensitivities = array( 'case-sensitive', 'ascii-case-insensitive' ); + foreach ( array_keys( $candidates ) as $search ) { + foreach ( $case_sensitivities as $case_sensitivity ) { + 
$got = $get_result( $search, $case_sensitivity ); + if ( true === $got ) { + foreach ( self::byte_prefixes( $search ) as $prefix ) { + $prefix_got = $get_result( $prefix, $case_sensitivity ); + if ( false === $prefix_got ) { + $failures[] = self::failure( + 'attribute-starts-with-prefix-monotonicity', + $case_sensitivity, + array_merge( + array( + 'case_sensitivity' => $case_sensitivity, + ), + self::byte_detail( 'search', $search ), + self::byte_detail( 'prefix', $prefix ) + ) + ); + break; + } + } + } + + if ( false === $got ) { + foreach ( self::attribute_search_extensions() as $suffix ) { + $extension = $search . $suffix; + $extension_got = $get_result( $extension, $case_sensitivity ); + if ( true === $extension_got ) { + $failures[] = self::failure( + 'attribute-starts-with-extension-monotonicity', + $case_sensitivity, + array_merge( + array( + 'case_sensitivity' => $case_sensitivity, + ), + self::byte_detail( 'search', $search ), + self::byte_detail( 'extension', $extension ) + ) + ); + break; + } + } + } + } + + $case_sensitive = $get_result( $search, 'case-sensitive' ); + if ( true === $case_sensitive ) { + $case_insensitive = $get_result( $search, 'ascii-case-insensitive' ); + if ( false === $case_insensitive ) { + $failures[] = self::failure( + 'attribute-starts-with-case-monotonicity', + 'case-sensitive', + self::byte_detail( 'search', $search ) + ); + } + } + } + + return $failures; + } + + /** + * @return string[] + */ + private function attribute_searches( string $decoded ): array { + $searches = array( '', 'a', 'A', 'http', 'https:', 'javascript:', ':', '&' ); + + foreach ( array( 1, 2, 4, 8, 11 ) as $length ) { + $prefix = substr( $decoded, 0, $length ); + if ( '' !== $prefix && self::is_ascii( $prefix ) ) { + $searches[] = $prefix; + $searches[] = $prefix . 
'x'; + $searches[] = strtoupper( $prefix ); + } + } + + $max_prefix_length = min( strlen( $decoded ), self::ATTRIBUTE_SEARCH_PREFIX_BYTES ); + for ( $length = 1; $length <= $max_prefix_length; $length++ ) { + $prefix = substr( $decoded, 0, $length ); + $searches[] = $prefix; + $searches[] = $prefix . 'x'; + } + + return array_values( array_unique( $searches ) ); + } + + private function expected_prefix_match( string $decoded, string $search, string $case_sensitivity ): bool { + if ( '' === $search ) { + return true; + } + + if ( strlen( $decoded ) < strlen( $search ) ) { + return false; + } + + $prefix = substr( $decoded, 0, strlen( $search ) ); + if ( 'ascii-case-insensitive' === $case_sensitivity ) { + return self::ascii_lower( $prefix ) === self::ascii_lower( $search ); + } + + return $prefix === $search; + } + + private static function is_ascii( string $text ): bool { + return ! preg_match( '/[\x80-\xFF]/', $text ); + } + + private static function ascii_lower( string $text ): string { + return strtr( $text, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz' ); + } + + /** + * @return string[] + */ + private static function byte_prefixes( string $text ): array { + $prefixes = array(); + for ( $length = 0; $length < strlen( $text ); $length++ ) { + $prefixes[] = substr( $text, 0, $length ); + } + return $prefixes; + } + + /** + * @return string[] + */ + private static function attribute_search_extensions(): array { + return array( "\x7F", 'x', 'A', '0', ':' ); + } + + private static function numeric_c1_replacement( string $reference ): ?string { + $value = self::numeric_reference_value( $reference ); + if ( null === $value || $value < 0x80 || $value > 0x9F ) { + return null; + } + + $replacement = mb_chr( self::C1_NUMERIC_REMAP[ $value - 0x80 ], 'UTF-8' ); + return false === $replacement ? 
null : $replacement; + } + + private static function invalid_numeric_replacement_reason( string $reference ): ?string { + $value = self::numeric_reference_value( $reference ); + if ( null === $value ) { + return null; + } + + if ( 0 === $value ) { + return 'zero'; + } + + if ( $value >= 0xD800 && $value <= 0xDFFF ) { + return 'surrogate'; + } + + if ( $value > 0x10FFFF ) { + return 'above-unicode'; + } + + return null; + } + + private static function numeric_reference_value( string $reference ): ?int { + if ( 1 !== preg_match( '/^&#(?:([xX])([0-9A-Fa-f]+)|([0-9]+));?$/', $reference, $match ) ) { + return null; + } + + $is_hex = '' !== ( $match[1] ?? '' ); + $digits = $is_hex ? $match[2] : $match[3]; + $base = $is_hex ? 16 : 10; + $max_digits = $is_hex ? 6 : 7; + $significant_digits = substr( $digits, strspn( $digits, '0' ) ); + + if ( '' === $significant_digits ) { + return 0; + } + + if ( strlen( $significant_digits ) > $max_digits ) { + return null; + } + + return intval( $significant_digits, $base ); + } + + private static function contains_raw_c1_byte( string $bytes ): bool { + return 1 === preg_match( '/[\x80-\x9F]/', $bytes ); + } + + private static function single_level_decode_expected( string $payload ): ?string { + $expected = ''; + $offset = 0; + $matched = false; + + while ( false !== ( $amp_at = strpos( $payload, '&', $offset ) ) ) { + $expected .= substr( $payload, $offset, $amp_at - $offset ); + + foreach ( self::SINGLE_LEVEL_DECODE_FIXTURES as $fixture => $decoded ) { + if ( str_starts_with( substr( $payload, $amp_at ), $fixture ) ) { + $expected .= $decoded; + $offset = $amp_at + strlen( $fixture ); + $matched = true; + continue 2; + } + } + + return null; + } + + return $matched ? $expected . 
substr( $payload, $offset ) : null; + } + + private static function byte_detail( string $name, string $bytes ): array { + $detail = array( + "{$name}_length" => strlen( $bytes ), + "{$name}_base64" => base64_encode( $bytes ), + "{$name}_preview" => self::preview( $bytes ), + ); + + if ( mb_check_encoding( $bytes, 'UTF-8' ) ) { + $detail[ "{$name}_text" ] = $bytes; + } + + return $detail; + } + + /** + * @param array $spans + * + * @return array + */ + private static function preview_reader_spans( array $spans ): array { + return array_slice( $spans, 0, 16 ); + } + + private static function failure( string $check, string $party, array $detail ): array { + return array( + 'check' => $check, + 'signature' => "{$check}:{$party}", + 'detail' => $detail, + ); + } + + private static function diff_detail( string $context, string $expected, string $got ): array { + $offset = self::first_difference( $expected, $got ); + + return array( + 'context' => $context, + 'expected_length' => strlen( $expected ), + 'got_length' => strlen( $got ), + 'first_diff_at' => $offset, + 'expected_base64' => base64_encode( $expected ), + 'got_base64' => base64_encode( $got ), + 'expected_window' => self::preview( $expected, $offset ), + 'got_window' => self::preview( $got, $offset ), + ); + } + + private static function first_difference( string $a, string $b ): int { + $max = min( strlen( $a ), strlen( $b ) ); + for ( $i = 0; $i < $max; $i++ ) { + if ( $a[ $i ] !== $b[ $i ] ) { + return $i; + } + } + return $max; + } + + private static function preview( string $bytes, int $center = 0 ): string { + $start = max( 0, $center - intdiv( self::PREVIEW_BYTES, 2 ) ); + return bin2hex( substr( $bytes, $start, self::PREVIEW_BYTES ) ); + } +} diff --git a/tools/html-decoder-fuzz/lib/Cli.php b/tools/html-decoder-fuzz/lib/Cli.php new file mode 100644 index 0000000000000..b3c2935d4ea5a --- /dev/null +++ b/tools/html-decoder-fuzz/lib/Cli.php @@ -0,0 +1,251 @@ + $defaults + * @return array + */ + public static 
function parse_args( array $argv, array $defaults ): array { + $options = $defaults; + $count = count( $argv ); + + for ( $i = 1; $i < $count; $i++ ) { + $arg = $argv[ $i ]; + if ( 0 !== strncmp( $arg, '--', 2 ) ) { + fwrite( STDERR, "Unexpected argument: {$arg}\n" ); + exit( 2 ); + } + + $body = substr( $arg, 2 ); + if ( false !== strpos( $body, '=' ) ) { + list( $name, $value ) = explode( '=', $body, 2 ); + } else { + $name = $body; + if ( $i + 1 >= $count ) { + fwrite( STDERR, "Missing value for --{$name}\n" ); + exit( 2 ); + } + $value = $argv[ ++$i ]; + } + + if ( ! array_key_exists( $name, $defaults ) ) { + fwrite( STDERR, "Unknown option --{$name}\n" ); + exit( 2 ); + } + + if ( is_int( $defaults[ $name ] ) ) { + if ( 1 !== preg_match( '/^-?\d+$/', $value ) ) { + fwrite( STDERR, "--{$name} must be an integer\n" ); + exit( 2 ); + } + $digits = '-' === $value[0] ? substr( $value, 1 ) : $value; + $digits = ltrim( $digits, '0' ); + $digits = '' === $digits ? '0' : $digits; + $max = (string) PHP_INT_MAX; + if ( strlen( $digits ) > strlen( $max ) || ( strlen( $digits ) === strlen( $max ) && strcmp( $digits, $max ) > 0 ) ) { + fwrite( STDERR, "--{$name} is outside the supported integer range\n" ); + exit( 2 ); + } + $options[ $name ] = (int) $value; + } else { + $options[ $name ] = $value; + } + } + + return $options; + } + + public static function emit( array $record ): void { + $json = json_encode( $record, JSON_UNESCAPED_SLASHES ); + if ( false === $json || ! self::write_stream( STDOUT, $json . 
"\n" ) ) { + fwrite( STDERR, "Cannot write worker event\n" ); + exit( 2 ); + } + } + + /** + * @param resource $stream + */ + public static function write_stream( $stream, string $contents ): bool { + $written = fwrite( $stream, $contents ); + return is_int( $written ) && strlen( $contents ) === $written; + } + + public static function write_file( string $path, string $contents ): bool { + if ( self::is_linked_file( $path ) ) { + return false; + } + + $written = file_put_contents( $path, $contents ); + return is_int( $written ) && strlen( $contents ) === $written; + } + + public static function append_file( string $path, string $contents ): bool { + if ( '' === $contents ) { + return true; + } + if ( self::is_linked_file( $path ) ) { + return false; + } + + $written = file_put_contents( $path, $contents, FILE_APPEND ); + return is_int( $written ) && strlen( $contents ) === $written; + } + + public static function is_linked_file( string $path ): bool { + if ( is_link( $path ) ) { + return true; + } + if ( ! file_exists( $path ) || is_dir( $path ) ) { + return false; + } + if ( ! is_file( $path ) ) { + return true; + } + + $stat = @lstat( $path ); + return is_array( $stat ) && isset( $stat['nlink'] ) && $stat['nlink'] > 1; + } + + public static function failure_signature_key( array $signatures, string $mode = 'oracle' ): string { + $normalized = array_map( 'strval', $signatures ); + sort( $normalized, SORT_STRING ); + return hash( 'sha256', $mode . "\0" . implode( "\0", $normalized ) ); + } + + public static function remove_tree( string $path, string $root ): bool { + if ( is_link( $path ) || is_file( $path ) ) { + $real_root = realpath( $root ); + $real_parent = realpath( dirname( $path ) ); + if ( false === $real_root || false === $real_parent ) { + return false; + } + + $root_prefix = rtrim( $real_root, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR; + if ( $real_parent !== $real_root && 0 !== strncmp( $real_parent . 
DIRECTORY_SEPARATOR, $root_prefix, strlen( $root_prefix ) ) ) { + return false; + } + + return @unlink( $path ); + } + if ( ! is_dir( $path ) ) { + return true; + } + + $real_path = realpath( $path ); + $real_root = realpath( $root ); + if ( false === $real_path || false === $real_root || $real_path === $real_root ) { + return false; + } + + $root_prefix = rtrim( $real_root, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR; + if ( 0 !== strncmp( $real_path . DIRECTORY_SEPARATOR, $root_prefix, strlen( $root_prefix ) ) ) { + return false; + } + + $items = new \RecursiveIteratorIterator( + new \RecursiveDirectoryIterator( $real_path, \FilesystemIterator::SKIP_DOTS ), + \RecursiveIteratorIterator::CHILD_FIRST + ); + + foreach ( $items as $item ) { + $pathname = $item->getPathname(); + if ( $item->isDir() && ! $item->isLink() ) { + if ( ! @rmdir( $pathname ) ) { + return false; + } + } elseif ( ! @unlink( $pathname ) ) { + return false; + } + } + + return @rmdir( $real_path ); + } + + public static function require_int_at_least( array $options, string $name, int $minimum ): void { + if ( ! isset( $options[ $name ] ) || ! is_int( $options[ $name ] ) || $options[ $name ] < $minimum ) { + fwrite( STDERR, "--{$name} must be at least {$minimum}\n" ); + exit( 2 ); + } + } + + /** + * @param string[] $allowed + */ + public static function require_one_of( array $options, string $name, array $allowed ): void { + if ( ! isset( $options[ $name ] ) || ! in_array( $options[ $name ], $allowed, true ) ) { + fwrite( STDERR, "--{$name} must be one of: " . implode( ', ', $allowed ) . "\n" ); + exit( 2 ); + } + } + + public static function git_metadata( string $repo_root ): array { + $run = static function ( array $command ) use ( $repo_root ): ?string { + $process = @proc_open( + $command, + array( + 0 => array( 'file', '/dev/null', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'file', '/dev/null', 'a' ), + ), + $pipes, + $repo_root + ); + if ( ! 
is_resource( $process ) ) { + return null; + } + $out = stream_get_contents( $pipes[1] ); + fclose( $pipes[1] ); + $code = proc_close( $process ); + return 0 === $code ? trim( (string) $out ) : null; + }; + + $commit = $run( array( 'git', 'rev-parse', 'HEAD' ) ); + $branch = $run( array( 'git', 'rev-parse', '--abbrev-ref', 'HEAD' ) ); + $status = $run( array( 'git', 'status', '--porcelain', '--untracked-files=no' ) ); + + return array( + 'commit' => $commit, + 'branch' => $branch, + 'dirty' => null === $status ? null : '' !== $status, + ); + } + + public static function environment_metadata( Oracles $oracles ): array { + return array( + 'php' => PHP_VERSION, + 'os' => PHP_OS_FAMILY, + 'oracles' => $oracles->names(), + ); + } + + public static function payload_preview( string $payload ): array { + return array( + 'bytes' => strlen( $payload ), + 'sha256' => hash( 'sha256', $payload ), + 'hex' => bin2hex( substr( $payload, 0, 80 ) ) . ( strlen( $payload ) > 80 ? '...' : '' ), + ); + } +} diff --git a/tools/html-decoder-fuzz/lib/CoverageGuidance.php b/tools/html-decoder-fuzz/lib/CoverageGuidance.php new file mode 100644 index 0000000000000..239abe787820d --- /dev/null +++ b/tools/html-decoder-fuzz/lib/CoverageGuidance.php @@ -0,0 +1,268 @@ + */ + private array $seen_edges = array(); + + /** @var string[] */ + private array $target_files; + + /** @var array */ + private array $target_file_set; + + private string $provider; + + public function __construct() { + $this->target_files = self::target_files(); + $this->target_file_set = array_fill_keys( $this->target_files, true ); + $this->provider = self::fake_enabled() ? 'fake' : 'pcov'; + } + + public static function available(): bool { + return self::fake_enabled() || self::pcov_available(); + } + + public static function unavailable_reason(): string { + if ( getenv( 'HTML_DECODER_FUZZ_DISABLE_PCOV' ) ) { + return 'coverage mode requires pcov; pcov was disabled by HTML_DECODER_FUZZ_DISABLE_PCOV'; + } + if ( ! 
extension_loaded( 'pcov' ) ) { + return 'coverage mode requires the pcov extension'; + } + if ( '0' === (string) ini_get( 'pcov.enabled' ) ) { + return 'coverage mode requires pcov.enabled=1'; + } + if ( ! function_exists( 'pcov\\start' ) || ! function_exists( 'pcov\\stop' ) || ! function_exists( 'pcov\\collect' ) || ! function_exists( 'pcov\\clear' ) ) { + return 'coverage mode requires the pcov start, stop, collect, and clear functions'; + } + + return 'coverage mode is unavailable'; + } + + public function provider(): string { + return $this->provider; + } + + public function begin_case(): void { + if ( 'pcov' !== $this->provider ) { + return; + } + + \pcov\stop(); + \pcov\clear(); + \pcov\start(); + } + + /** + * @return array + */ + public function finish_case( string $payload, string $context, string $strategy ): array { + if ( 'fake' === $this->provider ) { + return $this->fake_edges( $payload, $context, $strategy ); + } + + \pcov\stop(); + $type = defined( 'pcov\\inclusive' ) ? constant( 'pcov\\inclusive' ) : 1; + $coverage = \pcov\collect( $type, $this->target_files ); + \pcov\clear(); + + return $this->normalize_coverage( $coverage ); + } + + /** + * @param array $edges + * @return array + */ + public function new_edges( array $edges ): array { + $new_edges = array(); + foreach ( $edges as $edge ) { + if ( isset( $this->seen_edges[ $edge['key'] ] ) ) { + continue; + } + $this->seen_edges[ $edge['key'] ] = true; + $new_edges[] = $edge; + } + + return $new_edges; + } + + public function seen_edge_count(): int { + return count( $this->seen_edges ); + } + + /** + * @param array{context: string, strategy: string, payload: string} $generated + * @param array $new_edges + * @return array{artifact_dir: ?string, artifact_retained: bool, artifact_reused: bool} + */ + public function retain_payload( string $output_dir, string $seed, int $case, array $generated, string $payload, array $new_edges ): array { + if ( '' === $output_dir ) { + return array( + 
'artifact_dir' => null, + 'artifact_retained' => false, + 'artifact_reused' => false, + ); + } + + $coverage_dir = rtrim( $output_dir, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR . 'coverage-corpus'; + if ( is_link( $coverage_dir ) || ( file_exists( $coverage_dir ) && ! is_dir( $coverage_dir ) ) ) { + throw new \RuntimeException( "coverage corpus path is not a directory: {$coverage_dir}" ); + } + if ( ! is_dir( $coverage_dir ) && ! mkdir( $coverage_dir, 0777, true ) && ! is_dir( $coverage_dir ) ) { + throw new \RuntimeException( "cannot create coverage corpus dir {$coverage_dir}" ); + } + + $payload_hash = hash( 'sha256', $payload ); + $case_dir = sprintf( + '%s/payload-seed%s-case%d-%s', + $coverage_dir, + preg_replace( '/[^A-Za-z0-9_-]/', '_', $seed ), + $case, + substr( $payload_hash, 0, 16 ) + ); + if ( is_link( $case_dir ) || ( file_exists( $case_dir ) && ! is_dir( $case_dir ) ) ) { + throw new \RuntimeException( "coverage corpus artifact path is not a directory: {$case_dir}" ); + } + + $artifact_reused = is_dir( $case_dir ); + if ( ! $artifact_reused && ! mkdir( $case_dir, 0777, false ) && ! is_dir( $case_dir ) ) { + throw new \RuntimeException( "cannot create coverage corpus artifact {$case_dir}" ); + } + + if ( ! $artifact_reused ) { + $manifest = array( + 'type' => 'coverage', + 'seed' => $seed, + 'case' => $case, + 'mode' => 'coverage', + 'context' => $generated['context'], + 'strategy' => $generated['strategy'], + 'input_size' => strlen( $payload ), + 'payload_base64' => base64_encode( $payload ), + 'payload_preview' => Cli::payload_preview( $payload ), + 'coverage_provider' => $this->provider, + 'new_edge_count' => count( $new_edges ), + 'new_edges' => $new_edges, + 'git' => Cli::git_metadata( Bootstrap::repo_root() ), + ); + $manifest_json = json_encode( $manifest, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES ); + if ( + false === $manifest_json || + ! Cli::write_file( "{$case_dir}/payload.txt", $payload ) || + ! 
Cli::write_file( "{$case_dir}/coverage.json", $manifest_json ) + ) { + throw new \RuntimeException( "cannot write coverage corpus artifact under {$case_dir}" ); + } + } + + return array( + 'artifact_dir' => $case_dir, + 'artifact_retained' => ! $artifact_reused, + 'artifact_reused' => $artifact_reused, + ); + } + + /** + * @return string[] + */ + public static function target_files(): array { + $root = Bootstrap::repo_root(); + $files = array( + $root . '/src/wp-includes/html-api/class-wp-html-decoder.php', + $root . '/src/wp-includes/class-wp-token-map.php', + ); + + return array_values( + array_filter( + array_map( + static function ( string $file ): ?string { + $real = realpath( $file ); + return false === $real ? null : $real; + }, + $files + ) + ) + ); + } + + private static function fake_enabled(): bool { + $value = getenv( 'HTML_DECODER_FUZZ_FAKE_COVERAGE' ); + return false !== $value && '' !== $value && '0' !== $value; + } + + private static function pcov_available(): bool { + return ( + ! getenv( 'HTML_DECODER_FUZZ_DISABLE_PCOV' ) && + extension_loaded( 'pcov' ) && + '0' !== (string) ini_get( 'pcov.enabled' ) && + function_exists( 'pcov\\start' ) && + function_exists( 'pcov\\stop' ) && + function_exists( 'pcov\\collect' ) && + function_exists( 'pcov\\clear' ) + ); + } + + /** + * @param mixed $coverage + * @return array + */ + private function normalize_coverage( $coverage ): array { + if ( ! is_array( $coverage ) ) { + return array(); + } + + $edges = array(); + foreach ( $coverage as $file => $lines ) { + $file = realpath( (string) $file ) ?: (string) $file; + if ( ! isset( $this->target_file_set[ $file ] ) || ! 
is_array( $lines ) ) { + continue; + } + + foreach ( $lines as $line => $hits ) { + $line = (int) $line; + $hits = (int) $hits; + if ( $line <= 0 || $hits <= 0 ) { + continue; + } + $edge = $this->edge( $file, $line, $hits ); + $edges[ $edge['key'] ] = $edge; + } + } + ksort( $edges, SORT_STRING ); + + return array_values( $edges ); + } + + /** + * @return array + */ + private function fake_edges( string $payload, string $context, string $strategy ): array { + $digest = hash( 'sha256', $context . "\0" . $strategy . "\0" . $payload ); + $edges = array(); + + foreach ( $this->target_files as $index => $file ) { + $line_count = count( file( $file, FILE_IGNORE_NEW_LINES ) ?: array() ); + $line_count = max( 1, $line_count ); + $offset = hexdec( substr( $digest, $index * 8, 8 ) ); + $edges[] = $this->edge( $file, 1 + ( $offset % $line_count ), 1 ); + } + + return $edges; + } + + /** + * @return array{key: string, file: string, line: int, hits: int} + */ + private function edge( string $file, int $line, int $hits ): array { + return array( + 'key' => hash( 'sha256', $file . "\0" . $line ), + 'file' => $file, + 'line' => $line, + 'hits' => $hits, + ); + } +} diff --git a/tools/html-decoder-fuzz/lib/Generator.php b/tools/html-decoder-fuzz/lib/Generator.php new file mode 100644 index 0000000000000..85d455c0b526c --- /dev/null +++ b/tools/html-decoder-fuzz/lib/Generator.php @@ -0,0 +1,1560 @@ + */ + private ?array $name_sweep_base_name_set = null; + + public function __construct( Prng $prng, int $max_bytes = 4096, ?array $named_reference_names = null ) { + $this->prng = $prng; + $this->max_bytes = max( 1, $max_bytes ); + + $names = $named_reference_names ?? Bootstrap::named_reference_names(); + + $this->semicolon_names = array_values( + array_filter( + $names, + static fn( string $name ): bool => str_ends_with( $name, ';' ) + ) + ); + $this->legacy_names = array_values( + array_filter( + $names, + static fn( string $name ): bool => ! 
str_ends_with( $name, ';' ) + ) + ); + + if ( array() === $this->semicolon_names ) { + $this->semicolon_names = self::PREFERRED_SEMICOLON; + } else { + self::sort_reference_names( $this->semicolon_names ); + } + if ( array() === $this->legacy_names ) { + $this->legacy_names = self::PREFERRED_LEGACY; + } else { + self::sort_reference_names( $this->legacy_names ); + } + } + + /** + * @return array{context: string, strategy: string, payload: string} + */ + public function generate(): array { + // Preserve seed-to-payload mapping from the former one-context lane. + $this->prng->chance( 50 ); + $strategy = $this->prng->weighted( + array( + 'plain-no-amp' => 8, + 'named-exact' => 16, + 'named-missing-semi' => 15, + 'attribute-discriminator' => 15, + 'numeric' => 22, + 'adjacency' => 10, + 'truncation-sweep' => 9, + 'reference-at-eof' => 12, + 'multibyte-around' => 9, + 'attribute-prefix' => 8, + 'lookalike' => 8, + 'composition' => 9, + 'case-mangled-name' => 8, + ) + ); + + $method = 'gen_' . str_replace( '-', '_', $strategy ); + $payload = $this->$method(); + + return array( + 'context' => 'both', + 'strategy' => $strategy, + 'payload' => self::trim_to_safe_max( $payload, $this->max_bytes ), + ); + } + + /** + * @return array{context: string, strategy: string, payload: string} + */ + public function generate_bytes(): array { + $strategy = $this->prng->weighted( + array( + 'bytes-uniform' => 35, + 'bytes-no-amp' => 20, + 'bytes-with-amp' => 20, + 'bytes-invalid-utf8' => 15, + 'bytes-delimiters' => 10, + ) + ); + + $method = 'gen_' . 
str_replace( '-', '_', $strategy ); + $payload = $this->$method(); + + return array( + 'context' => 'both', + 'strategy' => $strategy, + 'payload' => substr( $payload, 0, $this->max_bytes ), + ); + } + + /** + * @return array{context: string, strategy: string, payload: string} + */ + public function generate_name_sweep( int $case_index ): array { + $base_names = $this->name_sweep_base_names(); + $followers = self::name_sweep_followers(); + $variants = 2 * count( $followers ); + $case_index = max( 0, $case_index ); + $name_index = intdiv( $case_index, $variants ) % count( $base_names ); + $variant = $case_index % $variants; + $with_semicolon = $variant >= count( $followers ); + $follower = $followers[ $variant % count( $followers ) ]; + + $payload = '&' . $base_names[ $name_index ] . ( $with_semicolon ? ';' : '' ) . $follower; + + return array( + 'context' => 'both', + 'strategy' => 'name-sweep', + 'payload' => self::trim_to_safe_max( $payload, $this->max_bytes ), + ); + } + + public function name_sweep_period(): int { + return count( $this->name_sweep_base_names() ) * 2 * count( self::name_sweep_followers() ); + } + + /** + * @return array{context: string, strategy: string, payload: string} + */ + public function generate_legacy_follower_sweep( int $case_index ): array { + $followers = self::legacy_follower_sweep_followers(); + $case_index = max( 0, $case_index ); + $name_index = intdiv( $case_index, count( $followers ) ) % count( $this->legacy_names ); + $follower = $followers[ $case_index % count( $followers ) ]; + $payload = '&' . $this->legacy_names[ $name_index ] . 
$follower; + + return array( + 'context' => 'both', + 'strategy' => 'legacy-follower-sweep', + 'payload' => self::trim_to_safe_max( $payload, $this->max_bytes ), + ); + } + + public function legacy_follower_sweep_period(): int { + return count( $this->legacy_names ) * count( self::legacy_follower_sweep_followers() ); + } + + /** + * @return array{context: string, strategy: string, payload: string} + */ + public function generate_prefix_family_sweep( int $case_index ): array { + $cases = $this->prefix_family_sweep_cases(); + $case_index = max( 0, $case_index ) % count( $cases ); + $case = $cases[ $case_index ]; + $prefix = substr( $case['reference'], 0, $case['split'] ); + + return array( + 'context' => 'both', + 'strategy' => 'prefix-family-sweep', + 'payload' => self::trim_to_safe_max( $prefix . $case['follower'], $this->max_bytes ), + ); + } + + public function prefix_family_sweep_period(): int { + return count( $this->prefix_family_sweep_cases() ); + } + + /** + * @return array{context: string, strategy: string, payload: string} + */ + public function generate_numeric_boundary_sweep( int $case_index ): array { + $cases = self::numeric_boundary_sweep_cases(); + $case_index = max( 0, $case_index ) % count( $cases ); + + return array( + 'context' => 'both', + 'strategy' => 'numeric-boundary-sweep', + 'payload' => self::trim_to_safe_max( $cases[ $case_index ], $this->max_bytes ), + ); + } + + public function numeric_boundary_sweep_period(): int { + return count( self::numeric_boundary_sweep_cases() ); + } + + /** + * @return array{context: string, strategy: string, payload: string} + */ + public function generate_corpus_mutation( int $case_index ): array { + $corpus = self::corpus_payloads(); + $case_index = max( 0, $case_index ); + $operation = $this->prng->weighted( + array( + 'splice' => 25, + 'byte-perturb' => 25, + 'semicolon-toggle' => 25, + 'reference-duplication' => 25, + ) + ); + $payload = $corpus[ $case_index % count( $corpus ) ]; + $payload = 
$this->mutate_corpus_payload( $payload, $operation, $corpus ); + + if ( $this->prng->chance( 35 ) ) { + $payload = $this->mutate_corpus_payload( + $payload, + $this->prng->choice( array( 'splice', 'byte-perturb', 'semicolon-toggle', 'reference-duplication' ) ), + $corpus + ); + } + + return array( + 'context' => 'both', + 'strategy' => 'corpus-' . $operation, + 'payload' => self::trim_to_safe_max( $payload, $this->max_bytes ), + ); + } + + public function corpus_period(): int { + return count( self::corpus_payloads() ); + } + + /** + * @return array{context: string, strategy: string, payload: string} + */ + public function generate_token_map_sweep( int $case_index ): array { + $cases = self::token_map_sweep_cases(); + $case_index = max( 0, $case_index ) % count( $cases ); + + return array( + 'context' => 'both', + 'strategy' => 'token-map-structure-sweep', + 'payload' => self::trim_to_safe_max( $cases[ $case_index ]['payload'], $this->max_bytes ), + ); + } + + public function token_map_period(): int { + return count( self::token_map_sweep_cases() ); + } + + public static function is_oracle_safe_payload( string $payload ): bool { + return ( + mb_check_encoding( $payload, 'UTF-8' ) && + ! str_contains( $payload, '<' ) && + ! str_contains( $payload, '"' ) && + ! str_contains( $payload, "\r" ) && + ! str_contains( $payload, "\x00" ) + ); + } + + private function gen_plain_no_amp(): string { + return $this->plain_text( false ); + } + + private function gen_named_exact(): string { + return $this->plain_text() . $this->named_exact() . 
$this->plain_text(); + } + + private function gen_named_missing_semi(): string { + $name = $this->pick_legacy_name(); + $follower = $this->prng->weighted( + array( + 'end' => 35, + 'punct' => 35, + 'alpha' => 20, + 'eq' => 10, + ) + ); + + $suffix = ''; + if ( 'punct' === $follower ) { + $suffix = $this->prng->choice( array( ' ', '.', '/', ':', ';', '-' ) ); + } elseif ( 'alpha' === $follower ) { + $suffix = $this->ascii_run( $this->prng->int( 1, 5 ) ); + } elseif ( 'eq' === $follower ) { + $suffix = '=' . $this->ascii_run( $this->prng->int( 0, 4 ) ); + } + + return $this->plain_text() . '&' . $name . $suffix . $this->plain_text(); + } + + private function gen_attribute_discriminator(): string { + $name = $this->prng->choice( array_values( array_intersect( self::PREFERRED_LEGACY, $this->legacy_names ) ) ?: $this->legacy_names ); + $follower = $this->prng->choice( array( '=', 'x', 'Z', '0', 'later;' ) ); + + return $this->plain_text() . '&' . $name . $follower . $this->plain_text(); + } + + private function gen_numeric(): string { + return $this->plain_text() . $this->numeric_reference() . $this->plain_text(); + } + + private function gen_adjacency(): string { + $count = $this->prng->int( 2, 8 ); + $out = $this->plain_text(); + + for ( $i = 0; $i < $count; $i++ ) { + $out .= $this->prng->chance( 48 ) ? $this->named_exact() : $this->numeric_reference(); + if ( $this->prng->chance( 20 ) ) { + $out .= $this->plain_text(); + } + } + + return $out . $this->plain_text(); + } + + private function gen_truncation_sweep(): string { + $reference = $this->prng->chance( 50 ) ? $this->named_exact() : $this->numeric_reference( true ); + $length = strlen( $reference ); + $prefix = substr( $reference, 0, $this->prng->int( 1, max( 1, $length - 1 ) ) ); + + return $this->plain_text() . $prefix . 
$this->plain_text(); + } + + private function gen_reference_at_eof(): string { + $kind = $this->prng->weighted( + array( + 'fixed' => 45, + 'named-prefix' => 25, + 'decimal-digits' => 15, + 'hex-digits' => 15, + ) + ); + $suffix = ''; + + if ( 'named-prefix' === $kind ) { + $name = $this->pick_semicolon_name(); + $reference = '&' . $name; + $suffix = substr( $reference, 0, $this->prng->int( 1, strlen( $reference ) - 1 ) ); + } elseif ( 'decimal-digits' === $kind ) { + $digits = $this->ascii_digits( $this->prng->int( 1, 9 ) ); + $suffix = substr( '&#' . $digits, 0, max( 1, min( strlen( '&#' . $digits ), $this->max_bytes ) ) ); + } elseif ( 'hex-digits' === $kind ) { + $prefix = $this->prng->chance( 50 ) ? '&#x' : '&#X'; + $digits = $this->hex_digits( $this->prng->int( 1, 8 ) ); + $suffix = substr( $prefix . $digits, 0, max( 1, min( strlen( $prefix . $digits ), $this->max_bytes ) ) ); + } else { + $suffix = $this->prng->choice( + array( + '&', + '&#', + '&#x', + '&#X', + '&g', + '>', + '¬', + '¬i', + '&', + '{', + '', + ) + ); + $suffix = substr( $suffix, 0, max( 1, min( strlen( $suffix ), $this->max_bytes ) ) ); + } + + return $this->plain_text_up_to( max( 0, $this->max_bytes - strlen( $suffix ) ) ) . $suffix; + } + + private function gen_multibyte_around(): string { + $atoms = array( 'e', "\u{00E9}", "\u{96EA}", "\u{1F642}", "\u{03B2}", "\u{05E2}\u{05D1}", "\u{0928}\u{092E}" ); + $out = ''; + $count = $this->prng->int( 2, 7 ); + for ( $i = 0; $i < $count; $i++ ) { + $out .= $this->prng->choice( $atoms ); + $out .= $this->prng->chance( 55 ) ? $this->named_exact() : $this->numeric_reference(); + } + return $out . 
$this->prng->choice( $atoms ); + } + + private function gen_attribute_prefix(): string { + if ( $this->prng->chance( 72 ) ) { + $encoded = $this->encode_attribute_prefix_target( $this->prng->choice( self::ATTRIBUTE_PREFIX_TARGETS ) ); + $suffix = $this->plain_text_up_to( max( 0, $this->max_bytes - strlen( $encoded['payload'] ) ) ); + if ( null !== $encoded['semicolonless_base'] && '' !== $suffix && self::would_extend_semicolonless_numeric( $encoded['semicolonless_base'], $suffix[0] ) ) { + $suffix = '_' . substr( $suffix, 1 ); + } + + return $encoded['payload'] . $suffix; + } + + $prefix = $this->prng->choice( + array( + '<⃒', + '>⃒', + '≪̸', + '=⃥', + 'jav', + ) + ); + + return $prefix . $this->plain_text_up_to( max( 0, $this->max_bytes - strlen( $prefix ) ) ); + } + + private function gen_lookalike(): string { + $lookalike = $this->prng->chance( 85 ) + ? $this->edit_distance_lookalike() + : $this->legacy_lookalike(); + + return $this->plain_text() . $lookalike . $this->plain_text(); + } + + private function gen_case_mangled_name(): string { + return $this->plain_text() . $this->case_mangled_name() . 
$this->plain_text(); + } + + private function gen_composition(): string { + if ( $this->max_bytes < 3 ) { + return self::trim_to_safe_max( $this->named_exact(), $this->max_bytes ); + } + + $strategies = array( + 'named-exact', + 'named-missing-semi', + 'attribute-discriminator', + 'numeric', + 'adjacency', + 'truncation-sweep', + 'reference-at-eof', + 'multibyte-around', + 'attribute-prefix', + 'lookalike', + 'case-mangled-name', + ); + $max_count = min( 3, intdiv( $this->max_bytes + strlen( self::COMPOSITION_SEPARATOR ), 1 + strlen( self::COMPOSITION_SEPARATOR ) ) ); + $count = $this->prng->int( 2, $max_count ); + $original_max_bytes = $this->max_bytes; + $out = ''; + + for ( $i = 0; $i < $count; $i++ ) { + if ( $i > 0 ) { + $out .= self::COMPOSITION_SEPARATOR; + } + + $remaining_fragments = $count - $i - 1; + $reserved_bytes = $remaining_fragments * ( 1 + strlen( self::COMPOSITION_SEPARATOR ) ); + $fragment_max_bytes = max( 1, $original_max_bytes - strlen( $out ) - $reserved_bytes ); + + $out .= $this->composition_fragment( $strategies, $fragment_max_bytes ); + } + + return $out; + } + + /** + * @param string[] $strategies + */ + private function composition_fragment( array $strategies, int $max_bytes ): string { + for ( $attempt = 0; $attempt < 8; $attempt++ ) { + $strategy = $this->prng->choice( $strategies ); + $method = 'gen_' . str_replace( '-', '_', $strategy ); + $fragment = $this->with_max_bytes( + $max_bytes, + function () use ( $method ): string { + return $this->$method(); + } + ); + $fragment = self::trim_to_safe_max( $fragment, $max_bytes ); + + if ( '' !== $fragment && ! 
str_contains( $fragment, self::COMPOSITION_SEPARATOR ) ) { + return $fragment; + } + } + + return '&'; + } + + /** + * @param string[] $corpus + */ + private function mutate_corpus_payload( string $payload, string $operation, array $corpus ): string { + switch ( $operation ) { + case 'splice': + return $this->mutate_corpus_splice( $payload, $corpus ); + + case 'byte-perturb': + return $this->mutate_corpus_byte_perturb( $payload ); + + case 'semicolon-toggle': + return $this->mutate_corpus_semicolon_toggle( $payload ); + + case 'reference-duplication': + return $this->mutate_corpus_reference_duplication( $payload ); + } + + return $payload; + } + + /** + * @param string[] $corpus + */ + private function mutate_corpus_splice( string $payload, array $corpus ): string { + $other = $this->prng->choice( $corpus ); + + $left_at = $this->utf8_boundary( $payload ); + $right_at = $this->utf8_boundary( $payload ); + if ( $right_at < $left_at ) { + list( $left_at, $right_at ) = array( $right_at, $left_at ); + } + + $other_left = $this->utf8_boundary( $other ); + $other_right = $this->utf8_boundary( $other ); + if ( $other_right < $other_left ) { + list( $other_left, $other_right ) = array( $other_right, $other_left ); + } + $splice = substr( $other, $other_left, $other_right - $other_left ); + if ( '' === $splice ) { + $splice = $this->prng->choice( array( '&', '€', '∉', '>' ) ); + } + + return substr( $payload, 0, $left_at ) . $splice . substr( $payload, $right_at ); + } + + private function mutate_corpus_byte_perturb( string $payload ): string { + $operation = $this->prng->weighted( + array( + 'insert' => 35, + 'replace' => 45, + 'delete' => 20, + ) + ); + + if ( '' === $payload || 'insert' === $operation ) { + $at = $this->utf8_boundary( $payload ); + return substr( $payload, 0, $at ) . $this->safe_corpus_byte() . 
substr( $payload, $at ); + } + + list( $at, $next ) = $this->utf8_character_span( $payload ); + if ( 'delete' === $operation ) { + return substr( $payload, 0, $at ) . substr( $payload, $next ); + } + + return substr( $payload, 0, $at ) . $this->safe_corpus_byte() . substr( $payload, $next ); + } + + private function mutate_corpus_semicolon_toggle( string $payload ): string { + $matches = $this->reference_matches( $payload ); + if ( array() === $matches ) { + return $payload . $this->prng->choice( array( '&', '&', ':', ':' ) ); + } + + $match = $this->prng->choice( $matches ); + $reference = $match['text']; + if ( str_ends_with( $reference, ';' ) ) { + $replacement = substr( $reference, 0, -1 ); + } else { + $replacement = $reference . ';'; + } + + return substr( $payload, 0, $match['offset'] ) . $replacement . substr( $payload, $match['offset'] + strlen( $reference ) ); + } + + private function mutate_corpus_reference_duplication( string $payload ): string { + $matches = $this->reference_matches( $payload ); + if ( array() === $matches ) { + return $payload . $this->prng->choice( array( '>>', '€€', '∉∉' ) ); + } + + $match = $this->prng->choice( $matches ); + return substr( $payload, 0, $match['offset'] + strlen( $match['text'] ) ) . $match['text'] . 
substr( $payload, $match['offset'] + strlen( $match['text'] ) ); + } + + private function safe_corpus_byte(): string { + return self::ASCII_ALPHABET[ $this->prng->int( 0, strlen( self::ASCII_ALPHABET ) - 1 ) ]; + } + + private function utf8_boundary( string $payload ): int { + return $this->prng->choice( self::utf8_boundaries( $payload ) ); + } + + /** + * @return array{0: int, 1: int} + */ + private function utf8_character_span( string $payload ): array { + $boundaries = self::utf8_boundaries( $payload ); + if ( count( $boundaries ) < 2 ) { + return array( 0, 0 ); + } + + $index = $this->prng->int( 0, count( $boundaries ) - 2 ); + return array( $boundaries[ $index ], $boundaries[ $index + 1 ] ); + } + + /** + * @return int[] + */ + private static function utf8_boundaries( string $payload ): array { + $boundaries = array( 0 ); + if ( '' === $payload ) { + return $boundaries; + } + + $match_count = preg_match_all( '/./us', $payload, $matches, PREG_OFFSET_CAPTURE ); + if ( false === $match_count || 0 === $match_count ) { + return array( 0, strlen( $payload ) ); + } + + foreach ( $matches[0] as $match ) { + $boundaries[] = $match[1] + strlen( $match[0] ); + } + + return array_values( array_unique( $boundaries ) ); + } + + /** + * @return array + */ + private function reference_matches( string $payload ): array { + $matches = array(); + $match_count = preg_match_all( '/&(?:#[xX][0-9A-Fa-f]+|#[0-9]+|[A-Za-z][A-Za-z0-9]+);?/', $payload, $raw_matches, PREG_OFFSET_CAPTURE ); + if ( false === $match_count || 0 === $match_count ) { + return $matches; + } + + foreach ( $raw_matches[0] as $match ) { + $matches[] = array( + 'text' => $match[0], + 'offset' => $match[1], + ); + } + + return $matches; + } + + /** + * @return array{payload: string, semicolonless_base: ?string} + */ + private function encode_attribute_prefix_target( string $target ): array { + if ( '' === $target ) { + return array( + 'payload' => '', + 'semicolonless_base' => null, + ); + } + + $length = strlen( 
// Remainder of the method that starts on the previous line; its tokens are reproduced unchanged.
		$target );
	$force_reference_at = $this->prng->int( 0, $length - 1 );
	$out = '';
	$previous_base = null;

	for ( $i = 0; $i < $length; $i++ ) {
		$char = $target[ $i ];
		$allow_literal = $i !== $force_reference_at && self::is_oracle_safe_literal( $char ) && ! self::would_extend_semicolonless_numeric( $previous_base, $char );
		$encoded = $this->encode_attribute_prefix_character( ord( $char ), $allow_literal );
		$out .= $encoded['payload'];
		$previous_base = $encoded['semicolonless_base'];
	}

	return array(
		'payload' => $out,
		'semicolonless_base' => $previous_base,
	);
}

/**
 * Encodes one value either as the literal byte chr( $code_point ) or as
 * a decimal / hexadecimal numeric character reference.
 *
 * The form, optional leading zeros, digit case, trailing semicolon and
 * the `x` / `X` marker are all drawn from the PRNG. `semicolonless_base`
 * is 'hex' or 'decimal' when the reference was written without its
 * semicolon and null in every other case.
 *
 * @return array{payload: string, semicolonless_base: ?string}
 */
private function encode_attribute_prefix_character( int $code_point, bool $allow_literal ): array {
	$weights = array(
		'literal'              => $allow_literal ? 34 : 0,
		'decimal'              => 17,
		'decimal-leading-zero' => 17,
		'hex-lower'            => 14,
		'hex-upper'            => 10,
		'hex-leading-zero'     => 8,
	);
	$form    = $this->prng->weighted( $weights );

	if ( 'literal' === $form ) {
		return array(
			'payload'            => chr( $code_point ),
			'semicolonless_base' => null,
		);
	}

	$hexadecimal = str_starts_with( $form, 'hex' );
	if ( $hexadecimal ) {
		$digits = dechex( $code_point );
	} else {
		$digits = (string) $code_point;
	}

	if ( str_ends_with( $form, 'leading-zero' ) ) {
		$zero_count = $this->prng->int( 1, 4 );
		$digits     = str_repeat( '0', $zero_count ) . $digits;
	}

	// Only the zero-padded hex form rolls for upper case; 'hex-upper' always uses it.
	$uppercase = 'hex-upper' === $form;
	if ( ! $uppercase && 'hex-leading-zero' === $form ) {
		$uppercase = $this->prng->chance( 50 );
	}
	if ( $uppercase ) {
		$digits = strtoupper( $digits );
	}

	$terminator = $this->prng->chance( 68 ) ? ';' : '';

	if ( $hexadecimal ) {
		$opener = $this->prng->chance( 50 ) ? '&#x' : '&#X';
	} else {
		$opener = '&#';
	}

	$open_base = null;
	if ( '' === $terminator ) {
		$open_base = $hexadecimal ? 'hex' : 'decimal';
	}

	return array(
		'payload'            => $opener . $digits . $terminator,
		'semicolonless_base' => $open_base,
	);
}

/**
 * Reports whether the string is none of `<`, `"`, CR, or NUL.
 */
private static function is_oracle_safe_literal( string $char ): bool {
	$is_excluded = '<' === $char || '"' === $char || "\r" === $char || "\x00" === $char;

	return ! $is_excluded;
}

/**
 * Reports whether $char would be absorbed by a preceding numeric
 * reference that was written without its semicolon.
 *
 * With no open reference ($base is null) nothing extends it. Otherwise a
 * semicolon always does, and so does a further digit of the matching base.
 */
private static function would_extend_semicolonless_numeric( ?string $base, string $char ): bool {
	if ( null === $base ) {
		return false;
	}

	return ';' === $char
		|| ( 'decimal' === $base ? self::is_ascii_digit( $char ) : self::is_ascii_hex_digit( $char ) );
}

/**
 * True when the first byte of $char is one of `0`-`9`.
 */
private static function is_ascii_digit( string $char ): bool {
	$byte = ord( $char );

	return 0x30 <= $byte && $byte <= 0x39;
}

/**
 * True when the first byte of $char is one of `0`-`9`, `A`-`F` or `a`-`f`.
 */
private static function is_ascii_hex_digit( string $char ): bool {
	$byte = ord( $char );
	if ( 0x30 <= $byte && $byte <= 0x39 ) {
		return true;
	}

	// Setting bit 0x20 folds `A`-`F` onto `a`-`f`, so one range check covers both cases.
	$folded = $byte | 0x20;

	return 0x61 <= $folded && $folded <= 0x66;
}

/**
 * True when the first byte of $char is one of `A`-`Z` or `a`-`z`.
 */
private static function is_ascii_alpha( string $char ): bool {
	// Setting bit 0x20 folds `A`-`Z` onto `a`-`z`.
	$folded = ord( $char ) | 0x20;

	return 0x61 <= $folded && $folded <= 0x7A;
}

/**
 * Runs the callback with $this->max_bytes temporarily set to
 * max( 1, $max_bytes ) and restores the previous value afterwards,
 * including when the callback throws.
 *
 * @param callable(): string $callback
 */
private function with_max_bytes( int $max_bytes, callable $callback ): string {
	$saved_limit     = $this->max_bytes;
	$this->max_bytes = max( 1, $max_bytes );

	try {
		$result = $callback();
	} finally {
		$this->max_bytes = $saved_limit;
	}

	return $result;
}

// Head of a method that continues on the next line; its tokens are reproduced unchanged.
private function edit_distance_lookalike(): string {
	for ( $attempt = 0; $attempt < 40; $attempt++ ) {
		$base = $this->prng->choice( $this->name_sweep_base_names() );
		$operation = $this->prng->weighted(
			array(
				'delete' => 25,
				'insert' => 25,
				'substitute' => 25,
				'transpose' => 25,
			)
		);
		$mutated = $this->mutate_name_base( $base, $operation );

		if ( '' === $mutated || $mutated === $base || isset( $this->name_sweep_base_name_set()[ $mutated ] ) ) {
			continue;
		}

		return '&' . $mutated . ( $this->prng->chance( 80 ) ? 
';' : '' ); + } + + return $this->legacy_lookalike(); + } + + private function case_mangled_name(): string { + $base_set = $this->name_sweep_base_name_set(); + for ( $attempt = 0; $attempt < 60; $attempt++ ) { + $base = $this->prng->choice( $this->name_sweep_base_names() ); + $mutated = $this->case_mangle_name_base( $base ); + if ( '' === $mutated || $mutated === $base || isset( $base_set[ $mutated ] ) ) { + continue; + } + + return '&' . $mutated . ';'; + } + + return $this->legacy_lookalike(); + } + + private function case_mangle_name_base( string $base ): string { + $letter_offsets = array(); + for ( $i = 0; $i < strlen( $base ); $i++ ) { + if ( self::is_ascii_alpha( $base[ $i ] ) ) { + $letter_offsets[] = $i; + } + } + + if ( array() === $letter_offsets ) { + return ''; + } + + $mutated = $base; + $flips = $this->prng->int( 1, min( 3, count( $letter_offsets ) ) ); + for ( $i = 0; $i < $flips; $i++ ) { + $index = $this->prng->int( 0, count( $letter_offsets ) - 1 ); + $offset = $letter_offsets[ $index ]; + array_splice( $letter_offsets, $index, 1 ); + $char = $mutated[ $offset ]; + $mutated[ $offset ] = strtolower( $char ) === $char ? strtoupper( $char ) : strtolower( $char ); + } + + return $mutated; + } + + private function legacy_lookalike(): string { + return $this->prng->choice( + array( + '&bogus;', + '&NoSuchEntity', + '&;', + '& ;', + '¬i;', + '¬it;', + '©right;', + '¢erdo;', + '&ngE', + '÷ontime;', + '&&', + '&>', + '&am', + '&', + ) + ); + } + + private function mutate_name_base( string $base, string $operation ): string { + $length = strlen( $base ); + + switch ( $operation ) { + case 'delete': + if ( $length < 2 ) { + return ''; + } + $offset = $this->prng->int( 0, $length - 1 ); + return substr( $base, 0, $offset ) . substr( $base, $offset + 1 ); + + case 'insert': + $offset = $this->prng->int( 0, $length ); + return substr( $base, 0, $offset ) . $this->random_name_char() . 
substr( $base, $offset ); + + case 'substitute': + if ( 0 === $length ) { + return ''; + } + $offset = $this->prng->int( 0, $length - 1 ); + return substr( $base, 0, $offset ) . $this->random_name_char( $base[ $offset ] ) . substr( $base, $offset + 1 ); + + case 'transpose': + if ( $length < 2 ) { + return ''; + } + $offsets = array(); + for ( $i = 0; $i < $length - 1; $i++ ) { + if ( $base[ $i ] !== $base[ $i + 1 ] ) { + $offsets[] = $i; + } + } + if ( array() === $offsets ) { + return ''; + } + + $offset = $this->prng->choice( $offsets ); + return substr( $base, 0, $offset ) . $base[ $offset + 1 ] . $base[ $offset ] . substr( $base, $offset + 2 ); + } + + return ''; + } + + private function random_name_char( ?string $except = null ): string { + $alphabet = self::NAME_MUTATION_ALPHABET; + $char = $alphabet[ $this->prng->int( 0, strlen( $alphabet ) - 1 ) ]; + if ( null === $except || $char !== $except ) { + return $char; + } + + $offset = strpos( $alphabet, $except ); + if ( false === $offset ) { + return $char; + } + + return $alphabet[ ( $offset + $this->prng->int( 1, strlen( $alphabet ) - 1 ) ) % strlen( $alphabet ) ]; + } + + private function gen_bytes_uniform(): string { + $length = max( 1, $this->prng->biased_length( $this->max_bytes ) ); + return $this->prng->bytes( $length ); + } + + private function gen_bytes_no_amp(): string { + $length = max( 1, $this->prng->biased_length( $this->max_bytes ) ); + $out = ''; + while ( strlen( $out ) < $length ) { + $byte = $this->prng->int( 0, 255 ); + if ( 0x26 === $byte ) { + $byte = 0x00; + } + $out .= chr( $byte ); + } + return $out; + } + + private function gen_bytes_with_amp(): string { + $prefixes = array( '&', '&#', '&#x', '&#X', '&', '¬', '©', '&NoSuchEntity;' ); + $payload = $this->prng->bytes( $this->prng->int( 0, min( 32, $this->max_bytes ) ) ); + $payload .= $this->prng->choice( $prefixes ); + $payload .= $this->prng->bytes( $this->prng->int( 0, min( 64, $this->max_bytes ) ) ); + return $payload; + } + + 
/**
 * Builds a byte string from 1 to 12 rounds of "0-4 PRNG bytes followed by
 * one entry of a fixed list of byte sequences".
 *
 * Judging by the method name the listed sequences are meant to be
 * ill-formed UTF-8; the caller-visible size is not capped by max_bytes.
 */
private function gen_bytes_invalid_utf8(): string {
	$ill_formed = array(
		"\x80",
		"\xBF",
		"\xC0\xAF",
		"\xE0\x80\x80",
		"\xF0\x80\x80\x80",
		"\xF5\x80\x80\x80",
		"\xED\xA0\x80",
		"\xFE",
		"\xFF",
	);

	$pieces = array();
	for ( $remaining = $this->prng->int( 1, 12 ); $remaining > 0; $remaining-- ) {
		$filler_length = $this->prng->int( 0, 4 );
		$pieces[]      = $this->prng->bytes( $filler_length );
		$pieces[]      = $this->prng->choice( $ill_formed );
	}

	return implode( '', $pieces );
}

/**
 * Builds a byte string from 1 to 24 delimiter bytes, each followed with
 * a 35% chance by 1-4 PRNG bytes.
 */
private function gen_bytes_delimiters(): string {
	$delimiters = array( "\x00", "\r", '<', '"', '&', '=', "\n", "\t", "\f" );

	$result = '';
	for ( $remaining = $this->prng->int( 1, 24 ); $remaining > 0; $remaining-- ) {
		$result .= $this->prng->choice( $delimiters );
		if ( ! $this->prng->chance( 35 ) ) {
			continue;
		}

		$noise_length = $this->prng->int( 1, 4 );
		$result      .= $this->prng->bytes( $noise_length );
	}

	return $result;
}

/**
 * Returns `&` followed by a name from pick_semicolon_name().
 */
private function named_exact(): string {
	$name = $this->pick_semicolon_name();

	return '&' . $name;
}

/**
 * Picks a semicolon name, choosing 75% of the time among the entries of
 * self::PREFERRED_SEMICOLON that exist in $this->semicolon_names
 * (when there are any).
 */
private function pick_semicolon_name(): string {
	$pool      = $this->semicolon_names;
	$favorites = array_values( array_intersect( self::PREFERRED_SEMICOLON, $pool ) );

	// The chance roll is only consumed when there is a favorite to pick.
	$use_favorites = array() !== $favorites && $this->prng->chance( 75 );

	return $this->prng->choice( $use_favorites ? $favorites : $pool );
}

/**
 * Picks a legacy name, choosing 80% of the time among the entries of
 * self::PREFERRED_LEGACY that exist in $this->legacy_names
 * (when there are any).
 */
private function pick_legacy_name(): string {
	$pool      = $this->legacy_names;
	$favorites = array_values( array_intersect( self::PREFERRED_LEGACY, $pool ) );

	// The chance roll is only consumed when there is a favorite to pick.
	$use_favorites = array() !== $favorites && $this->prng->chance( 80 );

	return $this->prng->choice( $use_favorites ? $favorites : $pool );
}

/**
 * Sorts the names in place: longest first, ties broken by strcmp().
 *
 * @param string[] $names
 */
private static function sort_reference_names( array &$names ): void {
	$longest_first = static function ( string $left, string $right ): int {
		$by_length = strlen( $right ) <=> strlen( $left );

		return 0 !== $by_length ? $by_length : strcmp( $left, $right );
	};

	usort( $names, $longest_first );
}

/**
 * Returns the distinct names from the semicolon and legacy lists with
 * trailing semicolons removed; computed once and then memoized on
 * the instance.
 *
 * @return string[]
 */
private function name_sweep_base_names(): array {
	if ( null === $this->name_sweep_base_names ) {
		$seen = array();
		foreach ( array_merge( $this->semicolon_names, $this->legacy_names ) as $reference_name ) {
			$stripped = rtrim( $reference_name, ';' );
			if ( '' === $stripped ) {
				continue;
			}
			// Keyed insertion de-duplicates while keeping first-seen order.
			$seen[ $stripped ] = true;
		}

		$this->name_sweep_base_names = array_keys( $seen );
	}

	return $this->name_sweep_base_names;
}

/**
 * Returns the base names as a lookup map (name => true), memoized on
 * the instance.
 *
 * @return array<string, true>
 */
private function name_sweep_base_name_set(): array {
	if ( null !== $this->name_sweep_base_name_set ) {
		return $this->name_sweep_base_name_set;
	}

	$this->name_sweep_base_name_set = array_fill_keys( $this->name_sweep_base_names(), true );

	return $this->name_sweep_base_name_set;
}

/**
 * Returns the fixed follower strings used by the name sweep; the first
 * entry is the empty string.
 *
 * @return string[]
 */
private static function name_sweep_followers(): array {
	$followers = array( '', 'x', 'X', '0', '=', '-', ' ', '/', "\u{00E9}" );

	return $followers;
}

// Head of a method that continues on the next line; its tokens are reproduced unchanged.
/**
 * @return string[]
 */
private static function legacy_follower_sweep_followers(): array {
	static $followers = null;
	if ( null !== $followers ) {
		return $followers;
	}

	$followers = array();

	for ( $byte = 1; $byte <= 0x7F; $byte++ ) {
		if ( in_array( $byte, array( 0x0D, 0x22, 0x3C ), true ) ) {
			continue;
		}
		$followers[] = chr( $byte );
	}

	for ( $lead = 0xC2; $lead <= 0xF4; $lead++ ) {
		if ( $lead < 0xE0 ) {
			$followers[] = chr( $lead ) . "\x80";
		} elseif ( 0xE0 === $lead ) {
			$followers[] = "\xE0\xA0\x80";
		} elseif ( $lead < 0xF0 ) {
			$followers[] = chr( $lead ) . "\x80\x80";
		} elseif ( 0xF0 === $lead ) {
			$followers[] = "\xF0\x90\x80\x80";
		} elseif ( $lead < 0xF4 ) {
			$followers[] = chr( $lead ) . "\x80\x80\x80";
		} else {
			$followers[] = "\xF4\x80\x80\x80";
		}
	}

	for ( $continuation = 0x80; $continuation <= 0xBF; $continuation++ ) {
		$followers[] = "\xC2" . 
// Tail of the method that starts on the previous line; its tokens are reproduced unchanged.
			chr( $continuation );
	}

	$followers = array_values( array_unique( $followers ) );
	return $followers;
}

/**
 * Expands the prefix-family references into sweep cases.
 *
 * A reference is used only when its name (without trailing semicolons)
 * is present in name_sweep_base_name_set(). For each such reference, one
 * case is produced per split offset strictly inside `&` + reference and
 * per follower string.
 *
 * @return array<int, array{reference: string, split: int, follower: string}>
 */
private function prefix_family_sweep_cases(): array {
	$name_set  = $this->name_sweep_base_name_set();
	$followers = self::prefix_family_sweep_followers(); // Loop-invariant, fetched once.
	$cases     = array();

	foreach ( self::prefix_family_sweep_references() as $reference ) {
		$base = rtrim( $reference, ';' );
		if ( ! isset( $name_set[ $base ] ) ) {
			continue;
		}

		$full_reference = '&' . $reference;
		$full_length    = strlen( $full_reference ); // Loop-invariant, measured once.

		for ( $split = 1; $split < $full_length; $split++ ) {
			foreach ( $followers as $follower ) {
				$cases[] = array(
					'reference' => $full_reference,
					'split'     => $split,
					'follower'  => $follower,
				);
			}
		}
	}

	return $cases;
}

/**
 * Returns the fixed reference names (some with, some without a trailing
 * semicolon) that seed prefix_family_sweep_cases().
 *
 * @return string[]
 */
private static function prefix_family_sweep_references(): array {
	return array(
		'not',
		'not;',
		'notin;',
		'notinva;',
		'ngt;',
		'nGt;',
		'nGtv;',
		'nge;',
		'ngeq;',
		'ngeqq;',
	);
}

/**
 * Returns the fixed follower strings used by prefix_family_sweep_cases();
 * the first entry is the empty string.
 *
 * @return string[]
 */
private static function prefix_family_sweep_followers(): array {
	return array( '', 'x', 'X', '0', '=', "\u{00E9}" );
}

/**
 * Returns the numeric-reference boundary cases, built once per process.
 *
 * Every combination of kind (decimal, hex-lower, hex-upper, hex-mixed),
 * digit count (7 and 8 for decimal, 6 and 7 for hex), leading zero and
 * trailing semicolon is rendered by numeric_boundary_reference().
 *
 * The list is assembled in a local variable and the de-duplicated result
 * is what gets memoized, so every call returns the same array and a
 * failure part-way through cannot leave a partial list in the cache.
 *
 * @return string[]
 */
private static function numeric_boundary_sweep_cases(): array {
	static $cases = null;
	if ( null !== $cases ) {
		return $cases;
	}

	$references = array();
	foreach ( array( 'decimal', 'hex-lower', 'hex-upper', 'hex-mixed' ) as $kind ) {
		$max_digits = 'decimal' === $kind ? 7 : 6;
		foreach ( array( $max_digits, $max_digits + 1 ) as $digit_count ) {
			foreach ( array( false, true ) as $leading_zero ) {
				foreach ( array( false, true ) as $semicolon ) {
					$references[] = self::numeric_boundary_reference( $kind, $digit_count, $leading_zero, $semicolon );
				}
			}
		}
	}

	$cases = array_values( array_unique( $references ) );
	return $cases;
}

// Head of a method that continues on the next line; its tokens are reproduced unchanged.
private static function numeric_boundary_reference( string $kind, int $digit_count, bool $leading_zero, bool $semicolon ): string {
	if ( 'decimal' === $kind ) {
		$prefix = '&#';
		$digits = 7 === $digit_count ? '1114111' : substr( str_repeat( '9', $digit_count ), 0, $digit_count );
	} else {
		$prefix = 'hex-upper' === $kind ? '&#X' : '&#x';
		$digits = 6 === $digit_count ? '10ffee' : substr( str_repeat( 'abcdef', (int) ceil( $digit_count / 6 ) ), 0, $digit_count );
		if ( 'hex-upper' === $kind ) {
			$digits = strtoupper( $digits );
		} elseif ( 'hex-mixed' === $kind ) {
			$chars = str_split( $digits );
			foreach ( $chars as $i => $char ) {
				if ( 0 === $i % 2 ) {
					$chars[ $i ] = strtoupper( $char );
				}
			}
			$digits = implode( '', $chars );
		}
	}

	if ( $leading_zero ) {
		$digits = '0' . $digits;
	}

	return $prefix . $digits . ( $semicolon ? 
';' : '' ); + } + + /** + * @return string[] + */ + private static function corpus_payloads(): array { + static $payloads = null; + if ( null !== $payloads ) { + return $payloads; + } + + $payloads = array( + '', + 'plain text', + 'FOO>BAR', + 'FOO>BAR', + 'FOO>;;BAR', + 'FOO&&&>BAR', + "I'm ¬it; I tell you", + "I'm ∉ I tell you", + '&ammmp;', + '&amp;', + '∉∉¬', + 'ZZ>=YY', + 'ZZ>0YY', + 'ZZ> YY', + 'ZZ>', + 'javascript:alert(1)', + 'javascript:alert(1)', + 'javascript:alert(1)', + '€€€', + '���', + '<⃒tail', + '&NoSuchEntity;&', + ); + + foreach ( Oracles::battery() as $vector ) { + $payloads[] = $vector[1]; + } + + foreach ( self::html5lib_entity_payloads() as $payload ) { + $payloads[] = $payload; + } + + $payloads = array_values( + array_unique( + array_filter( + $payloads, + static fn( string $payload ): bool => self::is_oracle_safe_payload( $payload ) + ) + ) + ); + + if ( array() === $payloads ) { + $payloads = array( '&' ); + } + + return $payloads; + } + + /** + * @return array + */ + private static function token_map_sweep_cases(): array { + static $cases = null; + if ( null !== $cases ) { + return $cases; + } + + $structure = Bootstrap::named_reference_structure(); + $key_length = $structure['key_length']; + $cases = array(); + + foreach ( $structure['group_prefixes'] as $prefix ) { + $cases[] = array( + 'shape' => 'large-prefix-divergent', + 'prefix' => $prefix, + 'payload' => '&' . $prefix . self::token_map_divergent_suffix( $prefix, $structure['large_names_by_prefix'][ $prefix ] ?? array() ), + ); + } + + foreach ( $structure['small_names'] as $name ) { + $cases[] = array( + 'shape' => 'small-boundary-exact', + 'name' => $name, + 'payload' => '&' . $name, + ); + $cases[] = array( + 'shape' => 'small-boundary-extended', + 'name' => $name, + 'payload' => '&' . $name . 
// Remainder of the method that starts on the previous line; its tokens are reproduced unchanged.
				'Q;',
		);
	}

	foreach ( $structure['large_names'] as $name ) {
		if ( strlen( $name ) !== $key_length + 1 ) {
			continue;
		}

		$cases[] = array(
			'shape' => 'large-boundary-exact',
			'name' => $name,
			'payload' => '&' . $name,
		);
		$cases[] = array(
			'shape' => 'large-boundary-extended',
			'name' => $name,
			'payload' => '&' . $name . 'Q;',
		);
	}

	$cases = array_values(
		array_filter(
			$cases,
			static fn( array $case ): bool => self::is_oracle_safe_payload( $case['payload'] )
		)
	);

	return array() === $cases
		? array( array( 'shape' => 'fallback', 'payload' => '&NoSuchEntity;' ) )
		: $cases;
}

/**
 * Builds a suffix whose first character differs from the character that
 * follows $prefix in every one of the given names.
 *
 * The first character of self::NAME_MUTATION_ALPHABET that no name uses
 * right after the prefix is returned followed by `QQ;`. When every
 * alphabet character is taken the result is `_QQ;`.
 *
 * @param string[] $names
 */
private static function token_map_divergent_suffix( string $prefix, array $names ): string {
	$skip  = strlen( $prefix );
	$taken = array();

	foreach ( $names as $candidate ) {
		$remainder = substr( $candidate, $skip );
		if ( '' === $remainder ) {
			continue;
		}
		$taken[ $remainder[0] ] = true;
	}

	$alphabet_length = strlen( self::NAME_MUTATION_ALPHABET );
	for ( $position = 0; $position < $alphabet_length; $position++ ) {
		$letter = self::NAME_MUTATION_ALPHABET[ $position ];
		if ( isset( $taken[ $letter ] ) ) {
			continue;
		}

		return $letter . 'QQ;';
	}

	return '_QQ;';
}

// Head of a method that continues on the next line; its tokens are reproduced unchanged.
/**
 * @return string[]
 */
private static function html5lib_entity_payloads(): array {
	$payloads = array();
	foreach ( array( 'entities01.dat', 'entities02.dat' ) as $file ) {
		$path = Bootstrap::repo_root() . '/tests/phpunit/data/html5lib-tests/tree-construction/' . $file;
		if ( ! is_file( $path ) ) {
			continue;
		}

		$lines = file( $path, FILE_IGNORE_NEW_LINES );
		if ( ! 
is_array( $lines ) ) { + continue; + } + + for ( $i = 0; $i + 1 < count( $lines ); $i++ ) { + if ( '#data' !== $lines[ $i ] ) { + continue; + } + + $payload = self::html5lib_entity_payload_from_data_line( $lines[ $i + 1 ] ); + if ( strlen( $payload ) > 512 ) { + $payload = substr( $payload, 0, 512 ); + } + $payloads[] = $payload; + } + } + + return $payloads; + } + + private static function html5lib_entity_payload_from_data_line( string $line ): string { + if ( 1 === preg_match( '/^\s]+))><\/div>$/', $line, $match ) ) { + foreach ( array( 1, 2, 3 ) as $index ) { + if ( isset( $match[ $index ] ) && '' !== $match[ $index ] ) { + return $match[ $index ]; + } + } + } + + if ( 1 === preg_match( '/^
(.*)<\/div>$/', $line, $match ) ) { + return $match[1]; + } + + return $line; + } + + private function numeric_reference( bool $allow_missing_digits = false ): string { + $kind = $this->prng->weighted( + array( + 'decimal' => 45, + 'hex' => 45, + 'missing' => $allow_missing_digits ? 10 : 0, + ) + ); + + if ( 'missing' === $kind ) { + return $this->prng->choice( array( '&#;', '&#x;', '&#X;' ) ); + } + + $value = $this->numeric_code_point( 'hex' === $kind ? 16 : 10 ); + + if ( 'hex' === $kind ) { + $digits = dechex( $value ); + if ( $this->prng->chance( 50 ) ) { + $digits = strtoupper( $digits ); + } + $prefix = $this->prng->chance( 50 ) ? '&#x' : '&#X'; + } else { + $digits = (string) $value; + $prefix = '&#'; + } + + if ( $this->prng->chance( 35 ) ) { + $digits = str_repeat( '0', $this->prng->int( 1, 10 ) ) . $digits; + } + + return $prefix . $digits . ( $this->prng->chance( 82 ) ? ';' : '' ); + } + + private function numeric_code_point( int $numeric_base ): int { + $bucket = $this->prng->weighted( + array( + 'zero' => 5, + 'c0-control' => 8, + 'ascii' => 10, + 'c1-control' => 14, + 'bmp' => 12, + 'surrogate' => 12, + 'bmp-noncharacter' => 8, + 'plane-noncharacter' => 10, + 'astral' => 10, + 'above-unicode-legal-digits' => 8, + 'digit-count-overflow' => 5, + ) + ); + + switch ( $bucket ) { + case 'zero': + return 0; + + case 'c0-control': + return $this->prng->int( 1, 0x1F ); + + case 'ascii': + return $this->prng->int( 0x20, 0x7F ); + + case 'c1-control': + return $this->prng->int( 0x80, 0x9F ); + + case 'bmp': + if ( $this->prng->chance( 50 ) ) { + return $this->prng->int( 0xA0, 0xD7FF ); + } + if ( $this->prng->chance( 50 ) ) { + return $this->prng->int( 0xE000, 0xFDCF ); + } + return $this->prng->int( 0xFDF0, 0xFFFD ); + + case 'surrogate': + return $this->prng->int( 0xD800, 0xDFFF ); + + case 'bmp-noncharacter': + if ( $this->prng->chance( 75 ) ) { + return $this->prng->int( 0xFDD0, 0xFDEF ); + } + return $this->prng->choice( array( 0xFFFE, 0xFFFF ) ); + + 
// Remainder of the method that starts on the previous line; its tokens are reproduced unchanged.
		case 'plane-noncharacter':
			return ( $this->prng->int( 1, 16 ) << 16 ) + $this->prng->choice( array( 0xFFFE, 0xFFFF ) );

		case 'astral':
			return ( $this->prng->int( 1, 16 ) << 16 ) + $this->prng->int( 0, 0xFFFD );

		case 'above-unicode-legal-digits':
			return $this->prng->int( 0x110000, 16 === $numeric_base ? 0xFFFFFF : 9999999 );

		case 'digit-count-overflow':
			return $this->prng->int( 16 === $numeric_base ? 0x1000000 : 10000000, 16 === $numeric_base ? 0xFFFFFFF : 99999999 );
	}

	return 0x41;
}

/**
 * Generates ASCII text of at most min( 128, $this->max_bytes ) bytes.
 *
 * @param bool $allow_amp Forwarded to plain_text_up_to().
 */
private function plain_text( bool $allow_amp = false ): string {
	$limit = min( 128, $this->max_bytes );

	return $this->plain_text_up_to( $limit, $allow_amp );
}

/**
 * Generates text drawn from self::ASCII_ALPHABET whose length comes from
 * the PRNG's biased_length() for the given (non-negative) maximum.
 *
 * @param bool $allow_amp When true, each position has a 3% chance of
 *                        being `&` instead of an alphabet byte.
 */
private function plain_text_up_to( int $max_bytes, bool $allow_amp = false ): string {
	$budget       = $this->prng->biased_length( max( 0, $max_bytes ) );
	$alphabet_end = strlen( self::ASCII_ALPHABET ) - 1;
	$text         = '';

	for ( $written = 0; $written < $budget; $written++ ) {
		// The chance roll is only consumed when ampersands are allowed.
		if ( $allow_amp && $this->prng->chance( 3 ) ) {
			$text .= '&';
		} else {
			$text .= self::ASCII_ALPHABET[ $this->prng->int( 0, $alphabet_end ) ];
		}
	}

	return $text;
}

/**
 * Returns $length pseudo-random bytes of self::ASCII_ALPHABET
 * (an empty string when $length is not positive).
 */
private function ascii_run( int $length ): string {
	$alphabet_end = strlen( self::ASCII_ALPHABET ) - 1;
	$run          = '';

	// Each pick appends exactly one byte, so the byte length counts the picks.
	while ( strlen( $run ) < $length ) {
		$run .= self::ASCII_ALPHABET[ $this->prng->int( 0, $alphabet_end ) ];
	}

	return $run;
}

/**
 * Returns $length pseudo-random decimal digits
 * (an empty string when $length is not positive).
 */
private function ascii_digits( int $length ): string {
	$digits = '';

	// Each pick appends exactly one digit, so the byte length counts the picks.
	while ( strlen( $digits ) < $length ) {
		$digits .= (string) $this->prng->int( 0, 9 );
	}

	return $digits;
}

/**
 * Returns $length pseudo-random hexadecimal digits in mixed case
 * (an empty string when $length is not positive).
 */
private function hex_digits( int $length ): string {
	$symbols    = '0123456789abcdefABCDEF';
	$symbol_end = strlen( $symbols ) - 1;
	$run        = '';

	// Each pick appends exactly one symbol, so the byte length counts the picks.
	while ( strlen( $run ) < $length ) {
		$run .= $symbols[ $this->prng->int( 0, $symbol_end ) ];
	}

	return $run;
}

// Head of a method that continues on the next line; its tokens are reproduced unchanged.
private static function trim_to_safe_max( string $payload, int $max_bytes ): string {
	$payload = str_replace( array( '<', '"', "\r", "\x00" ), array( '', "'", "\n", '' ), $payload );

	while ( '' !== $payload && ! 
mb_check_encoding( $payload, 'UTF-8' ) ) { + $payload = substr( $payload, 0, -1 ); + } + + if ( strlen( $payload ) <= $max_bytes ) { + return $payload; + } + + $trimmed = substr( $payload, 0, $max_bytes ); + while ( '' !== $trimmed && ! mb_check_encoding( $trimmed, 'UTF-8' ) ) { + $trimmed = substr( $trimmed, 0, -1 ); + } + + return $trimmed; + } +} diff --git a/tools/html-decoder-fuzz/lib/Oracles.php b/tools/html-decoder-fuzz/lib/Oracles.php new file mode 100644 index 0000000000000..77b4b61f7cc7c --- /dev/null +++ b/tools/html-decoder-fuzz/lib/Oracles.php @@ -0,0 +1,304 @@ + */ + private array $events = array(); + + private bool $dom_available = false; + private bool $entity_decode_available = true; + private bool $mb_available = false; + + public static function build(): self { + $oracles = new self(); + + $oracles->dom_available = class_exists( \Dom\HTMLDocument::class ); + $oracles->entity_decode_available = function_exists( 'html_entity_decode' ) && defined( 'ENT_HTML5' ) && defined( 'ENT_QUOTES' ); + $oracles->mb_available = function_exists( 'mb_check_encoding' ); + + if ( ! $oracles->dom_available ) { + $oracles->events[] = array( + 'type' => 'oracle-unavailable', + 'oracle' => 'dom', + 'detail' => 'PHP 8.4 Dom\\HTMLDocument is required', + ); + } + + if ( ! $oracles->entity_decode_available ) { + $oracles->events[] = array( + 'type' => 'oracle-unavailable', + 'oracle' => 'entity-decode', + 'detail' => 'html_entity_decode with ENT_HTML5 and ENT_QUOTES is required', + ); + } + + if ( ! 
$oracles->mb_available ) { + $oracles->events[] = array( + 'type' => 'oracle-unavailable', + 'oracle' => 'mb', + 'detail' => 'mb_check_encoding is required for UTF-8 output checks', + ); + } + + if ( $oracles->dom_available ) { + $oracles->verify_battery(); + } + + if ( $oracles->entity_decode_available ) { + $oracles->verify_entity_decode_battery(); + } + + return $oracles; + } + + /** + * @return array [context, payload, expected] + */ + public static function battery(): array { + return array( + array( 'text', '', '' ), + array( 'attribute', '', '' ), + array( 'text', 'plain text', 'plain text' ), + array( 'attribute', 'plain text', 'plain text' ), + array( 'text', '&', '&' ), + array( 'attribute', '&', '&' ), + array( 'text', '&amp;', '&' ), + array( 'attribute', '&amp;', '&' ), + array( 'text', '&', '&' ), + array( 'attribute', '&', '&' ), + array( 'text', '&x', '&x' ), + array( 'attribute', '&x', '&x' ), + array( 'text', '∉', "\u{2209}" ), + array( 'attribute', '∉', "\u{2209}" ), + array( 'text', '¬in', "\u{00AC}" . 
'in' ), + array( 'attribute', '¬in', '¬in' ), + array( 'text', '&NoSuchEntity;', '&NoSuchEntity;' ), + array( 'attribute', '&NoSuchEntity;', '&NoSuchEntity;' ), + array( 'text', '€', "\u{20AC}" ), + array( 'attribute', '€', "\u{20AC}" ), + array( 'text', '€', "\u{20AC}" ), + array( 'attribute', '€', "\u{20AC}" ), + array( 'text', '�', "\u{FFFD}" ), + array( 'attribute', '�', "\u{FFFD}" ), + array( 'text', '�', "\u{FFFD}" ), + array( 'attribute', '�', "\u{FFFD}" ), + array( 'text', '�', "\u{FFFD}" ), + array( 'attribute', '�', "\u{FFFD}" ), + array( 'text', '&#;', '&#;' ), + array( 'attribute', '&#;', '&#;' ), + array( 'text', '&#x;', '&#x;' ), + array( 'attribute', '&#x;', '&#x;' ), + array( 'text', 'a:b', 'a:b' ), + array( 'attribute', 'a:b', 'a:b' ), + ); + } + + public function has_required(): bool { + return $this->dom_available && $this->entity_decode_available && $this->mb_available; + } + + public function names(): array { + $names = array(); + if ( $this->dom_available ) { + $names[] = 'dom'; + } + if ( $this->entity_decode_available ) { + $names[] = 'entity-decode'; + } + if ( $this->mb_available ) { + $names[] = 'mb'; + } + return $names; + } + + /** @return array */ + public function drain_events(): array { + $events = $this->events; + $this->events = array(); + return $events; + } + + public function decode( string $context, string $payload ): string { + if ( 'text' === $context ) { + return $this->decode_text( $payload ); + } + + if ( 'attribute' === $context ) { + return $this->decode_attribute( $payload ); + } + + throw new \InvalidArgumentException( "Unknown context {$context}" ); + } + + public function decode_text_with_entity_decode( string $payload ): ?string { + if ( ! $this->entity_decode_available || ! 
self::supports_entity_decode_text_payload( $payload ) ) { + return null; + } + + return html_entity_decode( $payload, ENT_HTML5 | ENT_QUOTES, 'UTF-8' ); + } + + private function decode_text( string $payload ): string { + $document = $this->parse( '
' . $payload . '
' ); + $div = $document->getElementById( 'fuzz' ); + if ( null === $div ) { + throw new \RuntimeException( 'DOM oracle could not find text wrapper element.' ); + } + + return $div->textContent; + } + + private function decode_attribute( string $payload ): string { + $document = $this->parse( '
' ); + $div = $document->getElementById( 'fuzz' ); + if ( null === $div ) { + throw new \RuntimeException( 'DOM oracle could not find attribute wrapper element.' ); + } + + return $div->getAttribute( 'title' ); + } + + private function parse( string $html ): \Dom\HTMLDocument { + $document = @\Dom\HTMLDocument::createFromString( $html ); + if ( ! $document instanceof \Dom\HTMLDocument ) { + throw new \RuntimeException( 'DOM oracle parse failed.' ); + } + + return $document; + } + + private static function supports_entity_decode_text_payload( string $payload ): bool { + $length = strlen( $payload ); + $offset = 0; + + while ( false !== ( $amp_at = strpos( $payload, '&', $offset ) ) ) { + $name_at = $amp_at + 1; + if ( $name_at >= $length ) { + return true; + } + + if ( '#' === $payload[ $name_at ] ) { + return false; + } + + $name_length = strspn( $payload, '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', $name_at ); + if ( 0 === $name_length ) { + $offset = $name_at; + continue; + } + + $after_name = $name_at + $name_length; + if ( $after_name >= $length || ';' !== $payload[ $after_name ] ) { + return false; + } + + $reference_name = substr( $payload, $name_at, $name_length + 1 ); + if ( ! 
isset( self::entity_decode_named_reference_set()[ $reference_name ] ) ) { + return false; + } + + $offset = $after_name + 1; + } + + return true; + } + + /** + * @return array + */ + private static function entity_decode_named_reference_set(): array { + static $names = null; + if ( null !== $names ) { + return $names; + } + + $names = array(); + foreach ( Bootstrap::named_reference_names() as $name ) { + if ( str_ends_with( $name, ';' ) ) { + $names[ $name ] = true; + } + } + + return $names; + } + + private function verify_battery(): void { + foreach ( self::battery() as $i => $vector ) { + list( $context, $payload, $expected ) = $vector; + try { + $got = $this->decode( $context, $payload ); + } catch ( \Throwable $error ) { + $this->dom_available = false; + $this->events[] = array( + 'type' => 'oracle-disabled', + 'oracle' => 'dom', + 'detail' => "battery vector {$i} threw " . get_class( $error ) . ': ' . $error->getMessage(), + ); + return; + } + + if ( $got !== $expected ) { + $this->dom_available = false; + $this->events[] = array( + 'type' => 'oracle-disabled', + 'oracle' => 'dom', + 'detail' => sprintf( + 'battery vector %d (%s, %s): expected %s, got %s', + $i, + $context, + bin2hex( $payload ), + bin2hex( $expected ), + bin2hex( $got ) + ), + ); + return; + } + } + } + + private function verify_entity_decode_battery(): void { + $battery = array( + array( '', '' ), + array( 'plain text', 'plain text' ), + array( 'a&b', 'a&b' ), + array( '"'', "\"'" ), + array( '∉', "\u{2209}" ), + array( '<⃒', "<\u{20D2}" ), + array( ' ', "\n" ), + ); + + foreach ( $battery as $i => $vector ) { + list( $payload, $expected ) = $vector; + try { + $got = $this->decode_text_with_entity_decode( $payload ); + } catch ( \Throwable $error ) { + $this->entity_decode_available = false; + $this->events[] = array( + 'type' => 'oracle-disabled', + 'oracle' => 'entity-decode', + 'detail' => "battery vector {$i} threw " . get_class( $error ) . ': ' . 
$error->getMessage(), + ); + return; + } + + if ( $got !== $expected ) { + $this->entity_decode_available = false; + $this->events[] = array( + 'type' => 'oracle-disabled', + 'oracle' => 'entity-decode', + 'detail' => sprintf( + 'battery vector %d (%s): expected %s, got %s', + $i, + bin2hex( $payload ), + bin2hex( $expected ), + bin2hex( is_string( $got ) ? $got : '' ) + ), + ); + return; + } + } + } +} diff --git a/tools/html-decoder-fuzz/lib/Prng.php b/tools/html-decoder-fuzz/lib/Prng.php new file mode 100644 index 0000000000000..09622543344a4 --- /dev/null +++ b/tools/html-decoder-fuzz/lib/Prng.php @@ -0,0 +1,84 @@ +seed = $seed; + } + + public function bytes( int $length ): string { + while ( strlen( $this->buffer ) < $length ) { + $this->buffer .= hash( 'sha256', $this->seed . ':' . $this->counter++, true ); + } + + $out = substr( $this->buffer, 0, $length ); + $this->buffer = (string) substr( $this->buffer, $length ); + return $out; + } + + public function uint32(): int { + $parts = unpack( 'Nvalue', $this->bytes( 4 ) ); + return (int) $parts['value']; + } + + public function int( int $min, int $max ): int { + if ( $max <= $min ) { + return $min; + } + + return $min + ( $this->uint32() % ( $max - $min + 1 ) ); + } + + public function chance( int $numerator, int $denominator = 100 ): bool { + return $this->int( 1, $denominator ) <= $numerator; + } + + public function choice( array $values ) { + return $values[ $this->int( 0, count( $values ) - 1 ) ]; + } + + /** + * @param array $weights Map of value to integer weight. 
+ */ + public function weighted( array $weights ) { + $total = (int) array_sum( $weights ); + $pick = $this->int( 1, max( 1, $total ) ); + foreach ( $weights as $value => $weight ) { + $pick -= $weight; + if ( $pick <= 0 ) { + return $value; + } + } + + return array_key_first( $weights ); + } + + public function biased_length( int $max ): int { + $bucket = $this->weighted( + array( + 'tiny' => 38, + 'short' => 38, + 'mid' => 20, + 'large' => 4, + ) + ); + + switch ( $bucket ) { + case 'tiny': + return $this->int( 0, min( 8, $max ) ); + case 'short': + return $this->int( 0, min( 64, $max ) ); + case 'mid': + return $this->int( 0, min( 1024, $max ) ); + default: + return $this->int( 0, $max ); + } + } +} diff --git a/tools/html-decoder-fuzz/lib/Targets.php b/tools/html-decoder-fuzz/lib/Targets.php new file mode 100644 index 0000000000000..519f951f37db0 --- /dev/null +++ b/tools/html-decoder-fuzz/lib/Targets.php @@ -0,0 +1,256 @@ + + */ + public static function resolve(): array { + $targets = self::real(); + + switch ( getenv( 'HTML_DECODER_FUZZ_FAULT' ) ) { + case 'skip-c1-remap': + $targets['decode_text'] = static fn( string $text ): string => self::undo_c1_remap( \WP_HTML_Decoder::decode_text_node( $text ) ); + $targets['decode_attribute'] = static fn( string $text ): string => self::undo_c1_remap( \WP_HTML_Decoder::decode_attribute( $text ) ); + break; + + case 'attribute-semicolonless': + $targets['decode_attribute'] = static fn( string $text ): string => \WP_HTML_Decoder::decode_text_node( $text ); + $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string { + return \WP_HTML_Decoder::read_character_reference( 'attribute' === $context ? 
'data' : $context, $text, $at, $match_byte_length ); + }; + break; + + case 'match-length-off-by-one': + $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string { + $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length ); + if ( null !== $result ) { + ++$match_byte_length; + } + return $result; + }; + break; + + case 'reader-empty-chunk': + $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string { + $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length ); + if ( null !== $result && str_starts_with( substr( $text, $at ), '&' ) ) { + return ''; + } + return $result; + }; + break; + + case 'reader-short-match-length': + $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string { + $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length ); + if ( null !== $result && str_starts_with( substr( $text, $at ), '&' ) ) { + $match_byte_length = 1; + } + return $result; + }; + break; + + case 'reader-substring-composition': + $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string { + $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length ); + if ( null !== $result && 0 === $at && ':' === $text ) { + return '.'; + } + return $result; + }; + break; + + case 'reader-null-mutates-match-length': + $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string { + $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length ); + if ( null === $result && str_starts_with( substr( $text, $at ), '&' ) ) { + 
$match_byte_length = 0; + } + return $result; + }; + break; + + case 'reader-non-amp-match': + $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string { + $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length ); + if ( isset( $text[ $at ] ) && '&' !== $text[ $at ] ) { + $match_byte_length = 1; + return $text[ $at ]; + } + return $result; + }; + break; + + case 'reader-gapless-drop-span': + $targets['reader_span_filter'] = static function ( array $spans ): array { + foreach ( $spans as $index => $span ) { + if ( ( $span['end'] ?? 0 ) > ( $span['start'] ?? 0 ) ) { + unset( $spans[ $index ] ); + return array_values( $spans ); + } + } + return $spans; + }; + break; + + case 'numeric-invalid-not-replacement': + $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string { + $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length ); + if ( null !== $result && is_int( $match_byte_length ) && self::is_invalid_numeric_replacement_reference( substr( $text, $at, $match_byte_length ) ) ) { + return '?'; + } + return $result; + }; + break; + + case 'numeric-c1-not-remapped': + $targets['read_character_reference'] = static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string { + $result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length ); + if ( null !== $result && is_int( $match_byte_length ) ) { + $value = self::numeric_c1_reference_value( substr( $text, $at, $match_byte_length ) ); + if ( null !== $value ) { + $replacement = mb_chr( $value, 'UTF-8' ); + return false === $replacement ? 
$result : $replacement; + } + } + return $result; + }; + break; + + case 'raw-c1-not-pass-through': + $targets['decode_text'] = static fn( string $text ): string => self::rewrite_raw_c1_bytes( \WP_HTML_Decoder::decode_text_node( $text ) ); + $targets['decode_attribute'] = static fn( string $text ): string => self::rewrite_raw_c1_bytes( \WP_HTML_Decoder::decode_attribute( $text ) ); + break; + + case 'text-secondary-oracle': + $targets['decode_text'] = static function ( string $text ): string { + $decoded = \WP_HTML_Decoder::decode_text_node( $text ); + return str_contains( $text, '&' ) ? '!' . $decoded : $decoded; + }; + break; + + case 'single-level-overdecode': + $targets['decode_text'] = static function ( string $text ): string { + return \WP_HTML_Decoder::decode_text_node( \WP_HTML_Decoder::decode_text_node( $text ) ); + }; + $targets['decode_attribute'] = static function ( string $text ): string { + return \WP_HTML_Decoder::decode_attribute( \WP_HTML_Decoder::decode_attribute( $text ) ); + }; + break; + + case 'byte-no-amp-identity': + $targets['decode_text'] = static fn( string $text ): string => str_replace( "\x00", '', \WP_HTML_Decoder::decode_text_node( $text ) ); + $targets['decode_attribute'] = static fn( string $text ): string => str_replace( "\x00", '', \WP_HTML_Decoder::decode_attribute( $text ) ); + break; + + case 'attribute-no-amp-identity': + $targets['decode_attribute'] = static function ( string $text ): string { + $decoded = \WP_HTML_Decoder::decode_attribute( $text ); + return str_contains( $text, '&' ) ? $decoded : '!' . 
$decoded; + }; + break; + + case 'attribute-prefix-monotonicity': + $attribute_starts_with = $targets['attribute_starts_with']; + $targets['attribute_starts_with'] = static function ( string $haystack, string $search, string $case_sensitivity ) use ( $attribute_starts_with ): bool { + if ( 'jav' === $search ) { + return false; + } + return $attribute_starts_with( $haystack, $search, $case_sensitivity ); + }; + break; + + case 'attribute-extension-monotonicity': + $attribute_starts_with = $targets['attribute_starts_with']; + $targets['attribute_starts_with'] = static function ( string $haystack, string $search, string $case_sensitivity ) use ( $attribute_starts_with ): bool { + if ( str_ends_with( $search, "\x7F" ) ) { + return true; + } + return $attribute_starts_with( $haystack, $search, $case_sensitivity ); + }; + break; + + case 'attribute-case-monotonicity': + $attribute_starts_with = $targets['attribute_starts_with']; + $targets['attribute_starts_with'] = static function ( string $haystack, string $search, string $case_sensitivity ) use ( $attribute_starts_with ): bool { + if ( 'ascii-case-insensitive' === $case_sensitivity && 'jav' === $search ) { + return false; + } + return $attribute_starts_with( $haystack, $search, $case_sensitivity ); + }; + break; + + case 'attribute-multicodepoint-prefix': + $attribute_starts_with = $targets['attribute_starts_with']; + $targets['attribute_starts_with'] = static function ( string $haystack, string $search, string $case_sensitivity ) use ( $attribute_starts_with ): bool { + if ( str_starts_with( $haystack, '<⃒' ) && "<\xE2" === $search ) { + return false; + } + return $attribute_starts_with( $haystack, $search, $case_sensitivity ); + }; + break; + } + + return $targets; + } + + /** + * @return array + */ + public static function real(): array { + return array( + 'decode_text' => static fn( string $text ): string => \WP_HTML_Decoder::decode_text_node( $text ), + 'decode_attribute' => static fn( string $text ): string => 
\WP_HTML_Decoder::decode_attribute( $text ), + 'read_character_reference' => static fn( string $context, string $text, int $at, &$match_byte_length = null ): ?string => \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length ), + 'attribute_starts_with' => static fn( string $haystack, string $search, string $case_sensitivity ): bool => \WP_HTML_Decoder::attribute_starts_with( $haystack, $search, $case_sensitivity ), + ); + } + + private static function undo_c1_remap( string $decoded ): string { + return str_replace( "\u{20AC}", "\u{0080}", $decoded ); + } + + private static function rewrite_raw_c1_bytes( string $decoded ): string { + return preg_replace( '/[\x80-\x9F]/', '?', $decoded ) ?? $decoded; + } + + private static function numeric_c1_reference_value( string $reference ): ?int { + $value = self::numeric_reference_value( $reference ); + return null !== $value && $value >= 0x80 && $value <= 0x9F ? $value : null; + } + + private static function is_invalid_numeric_replacement_reference( string $reference ): bool { + $value = self::numeric_reference_value( $reference ); + if ( null === $value ) { + return false; + } + + return 0 === $value || ( $value >= 0xD800 && $value <= 0xDFFF ) || $value > 0x10FFFF; + } + + private static function numeric_reference_value( string $reference ): ?int { + if ( 1 !== preg_match( '/^&#(?:([xX])([0-9A-Fa-f]+)|([0-9]+));?$/', $reference, $match ) ) { + return null; + } + + $is_hex = '' !== ( $match[1] ?? '' ); + $digits = $is_hex ? $match[2] : $match[3]; + $base = $is_hex ? 16 : 10; + $max_digits = $is_hex ? 
6 : 7; + $significant_digits = substr( $digits, strspn( $digits, '0' ) ); + + if ( '' === $significant_digits ) { + return 0; + } + + if ( strlen( $significant_digits ) > $max_digits ) { + return null; + } + + return intval( $significant_digits, $base ); + } +} diff --git a/tools/html-decoder-fuzz/lib/autoload.php b/tools/html-decoder-fuzz/lib/autoload.php new file mode 100644 index 0000000000000..9a967a473ba5f --- /dev/null +++ b/tools/html-decoder-fuzz/lib/autoload.php @@ -0,0 +1,17 @@ + '', + 'input' => '', + 'context' => 'both', + 'mode' => 'oracle', + 'signature' => '', + 'output-dir' => '', + ) +); + +Cli::require_one_of( $options, 'context', array( 'text', 'attribute', 'both' ) ); +Cli::require_one_of( $options, 'mode', Cli::valid_modes() ); + +Bootstrap::load_targets(); + +$payload = null; +$context = $options['context']; +$mode = $options['mode']; +$signature = $options['signature']; +$source_dir = $options['output-dir']; + +if ( '' !== $options['failure'] ) { + $manifest = json_decode( (string) file_get_contents( $options['failure'] ), true ); + if ( ! is_array( $manifest ) || ! isset( $manifest['payload_base64'] ) ) { + fwrite( STDERR, "Cannot read failure manifest {$options['failure']}\n" ); + exit( 2 ); + } + $payload = base64_decode( $manifest['payload_base64'], true ); + $context = $manifest['context'] ?? $context; + $mode = $manifest['mode'] ?? 'oracle'; + if ( ! in_array( $context, array( 'text', 'attribute', 'both' ), true ) ) { + fwrite( STDERR, "Invalid context in failure manifest: {$context}\n" ); + exit( 2 ); + } + if ( ! in_array( $mode, Cli::valid_modes(), true ) ) { + fwrite( STDERR, "Invalid mode in failure manifest: {$mode}\n" ); + exit( 2 ); + } + if ( '' === $signature ) { + $signature = $manifest['signatures'][0] ?? 
''; + } + if ( '' === $source_dir ) { + $source_dir = dirname( $options['failure'] ); + } +} elseif ( '' !== $options['input'] ) { + $payload = file_get_contents( $options['input'] ); + if ( false === $payload ) { + fwrite( STDERR, "Cannot read input file {$options['input']}\n" ); + exit( 2 ); + } + if ( '' === $source_dir ) { + $source_dir = dirname( $options['input'] ); + } +} else { + fwrite( STDERR, "Provide --failure or --input.\n" ); + exit( 2 ); +} + +if ( ! is_string( $payload ) ) { + fwrite( STDERR, "Payload could not be loaded.\n" ); + exit( 2 ); +} + +if ( '' === $signature ) { + fwrite( STDERR, "No signature given and none found in the manifest.\n" ); + exit( 2 ); +} + +$oracles = Oracles::build(); +if ( Cli::mode_uses_oracle( $mode ) && ! $oracles->has_required() ) { + fwrite( STDERR, "Required oracle unavailable; cannot minimize.\n" ); + exit( 2 ); +} + +$checks = new Checks( $oracles ); + +$reproduces = static function ( string $candidate ) use ( $checks, $context, $mode, $signature ): bool { + $failures = 'bytes' === $mode ? $checks->run_without_oracle( $context, $candidate ) : $checks->run( $context, $candidate ); + foreach ( $failures as $failure ) { + if ( $failure['signature'] === $signature ) { + return true; + } + } + return false; +}; + +if ( ! $reproduces( $payload ) ) { + fwrite( STDERR, "Signature {$signature} does not reproduce on the given payload.\n" ); + exit( 1 ); +} + +$current = $payload; +$tries = 0; + +$chunk = (int) ceil( max( 1, strlen( $current ) ) / 2 ); +while ( $chunk >= 1 ) { + $progress = false; + + for ( $at = 0; $at < strlen( $current ); ) { + $candidate = substr( $current, 0, $at ) . substr( $current, $at + $chunk ); + ++$tries; + + if ( strlen( $candidate ) < strlen( $current ) && $reproduces( $candidate ) ) { + $current = $candidate; + $progress = true; + } else { + $at += max( 1, intdiv( $chunk, 2 ) ); + } + } + + if ( ! $progress && $chunk > 1 ) { + $chunk = intdiv( $chunk, 2 ); + } elseif ( ! 
$progress ) { + break; + } +} + +for ( $at = 0; $at < strlen( $current ); $at++ ) { + if ( 'a' === $current[ $at ] ) { + continue; + } + + $candidate = $current; + $candidate[ $at ] = 'a'; + ++$tries; + + if ( $reproduces( $candidate ) ) { + $current = $candidate; + } +} + +$out_dir = '' !== $source_dir ? $source_dir : '.'; +if ( ! is_dir( $out_dir ) && ! mkdir( $out_dir, 0777, true ) ) { + fwrite( STDERR, "Cannot create output dir {$out_dir}\n" ); + exit( 2 ); +} + +$payload_path = "{$out_dir}/minimized-payload.txt"; +$manifest_path = "{$out_dir}/minimized.json"; +$manifest = json_encode( + array( + 'mode' => $mode, + 'context' => $context, + 'signature' => $signature, + 'original_size' => strlen( $payload ), + 'minimized_size' => strlen( $current ), + 'tries' => $tries, + 'payload_base64' => base64_encode( $current ), + 'payload_hex' => strlen( $current ) <= 256 ? bin2hex( $current ) : null, + 'environment' => Cli::environment_metadata( $oracles ), + 'git' => Cli::git_metadata( Bootstrap::repo_root() ), + ), + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES +); +if ( false === $manifest || ! Cli::write_file( $payload_path, $current ) || ! Cli::write_file( $manifest_path, $manifest ) ) { + fwrite( STDERR, "Cannot write minimized artifacts under {$out_dir}\n" ); + exit( 2 ); +} + +echo "Minimized {$signature}: " . strlen( $payload ) . ' -> ' . strlen( $current ) . " bytes in {$tries} tries.\n"; +echo 'Hex: ' . bin2hex( substr( $current, 0, 128 ) ) . ( strlen( $current ) > 128 ? '...' : '' ) . 
"\n"; +echo "Artifacts: {$payload_path}, {$manifest_path}\n"; + +exit( 0 ); diff --git a/tools/html-decoder-fuzz/replay.php b/tools/html-decoder-fuzz/replay.php new file mode 100644 index 0000000000000..20bb7e298334f --- /dev/null +++ b/tools/html-decoder-fuzz/replay.php @@ -0,0 +1,132 @@ + '', + 'input' => '', + 'seed' => -1, + 'case' => -1, + 'context' => 'both', + 'mode' => 'oracle', + 'max-bytes' => 4096, + ) +); + +Cli::require_int_at_least( $options, 'max-bytes', 1 ); +Cli::require_one_of( $options, 'context', array( 'text', 'attribute', 'both' ) ); +Cli::require_one_of( $options, 'mode', Cli::valid_modes() ); + +Bootstrap::load_targets(); + +$payload = null; +$context = $options['context']; +$mode = $options['mode']; +$source = null; + +if ( '' !== $options['failure'] ) { + $manifest = json_decode( (string) file_get_contents( $options['failure'] ), true ); + if ( ! is_array( $manifest ) || ! isset( $manifest['payload_base64'] ) ) { + fwrite( STDERR, "Cannot read failure manifest {$options['failure']}\n" ); + exit( 2 ); + } + $payload = base64_decode( $manifest['payload_base64'], true ); + $context = $manifest['context'] ?? $context; + $mode = $manifest['mode'] ?? 'oracle'; + if ( ! in_array( $context, array( 'text', 'attribute', 'both' ), true ) ) { + fwrite( STDERR, "Invalid context in failure manifest: {$context}\n" ); + exit( 2 ); + } + if ( ! 
in_array( $mode, Cli::valid_modes(), true ) ) { + fwrite( STDERR, "Invalid mode in failure manifest: {$mode}\n" ); + exit( 2 ); + } + $source = "failure manifest {$options['failure']}"; +} elseif ( '' !== $options['input'] ) { + $payload = file_get_contents( $options['input'] ); + if ( false === $payload ) { + fwrite( STDERR, "Cannot read input file {$options['input']}\n" ); + exit( 2 ); + } + $source = "input file {$options['input']}"; +} elseif ( $options['seed'] >= 0 && $options['case'] >= 0 ) { + $generator = new Generator( new Prng( "{$options['seed']}:{$options['case']}" ), $options['max-bytes'], Bootstrap::named_reference_names() ); + if ( 'bytes' === $mode ) { + $generated = $generator->generate_bytes(); + } elseif ( 'names' === $mode ) { + $generated = $generator->generate_name_sweep( $options['case'] ); + } elseif ( 'legacy-followers' === $mode ) { + $generated = $generator->generate_legacy_follower_sweep( $options['case'] ); + } elseif ( 'prefix-families' === $mode ) { + $generated = $generator->generate_prefix_family_sweep( $options['case'] ); + } elseif ( 'numeric-boundaries' === $mode ) { + $generated = $generator->generate_numeric_boundary_sweep( $options['case'] ); + } elseif ( 'corpus' === $mode ) { + $generated = $generator->generate_corpus_mutation( $options['case'] ); + } elseif ( 'token-map' === $mode ) { + $generated = $generator->generate_token_map_sweep( $options['case'] ); + } elseif ( 'coverage' === $mode ) { + $generated = $generator->generate(); + } else { + $generated = $generator->generate(); + } + $payload = $generated['payload']; + $context = $generated['context']; + $source = "seed {$options['seed']} case {$options['case']} (mode {$mode}, strategy {$generated['strategy']}, context {$context})"; +} else { + fwrite( STDERR, "Provide --failure, --input, or --seed with --case.\n" ); + exit( 2 ); +} + +if ( ! 
is_string( $payload ) ) { + fwrite( STDERR, "Payload could not be loaded.\n" ); + exit( 2 ); +} + +$oracles = Oracles::build(); +foreach ( $oracles->drain_events() as $event ) { + fwrite( STDERR, "oracle event: {$event['oracle']}: {$event['detail']}\n" ); +} +if ( Cli::mode_uses_oracle( $mode ) && ! $oracles->has_required() ) { + fwrite( STDERR, "Required oracle unavailable; cannot replay.\n" ); + exit( 2 ); +} + +$checks = new Checks( $oracles ); +$failures = 'bytes' === $mode ? $checks->run_without_oracle( $context, $payload ) : $checks->run( $context, $payload ); + +echo "Replaying {$source}\n"; +echo "Mode: {$mode}\n"; +echo "Context: {$context}\n"; +echo 'Payload: ' . strlen( $payload ) . ' bytes, sha256 ' . hash( 'sha256', $payload ) . "\n"; +echo 'Hex preview: ' . bin2hex( substr( $payload, 0, 96 ) ) . ( strlen( $payload ) > 96 ? '...' : '' ) . "\n"; +echo 'Oracles: ' . implode( ', ', $oracles->names() ) . "\n\n"; + +if ( array() === $failures ) { + echo "All checks passed.\n"; + exit( 0 ); +} + +echo count( $failures ) . " failure(s):\n"; +foreach ( $failures as $failure ) { + echo "- {$failure['signature']}\n"; + echo ' ' . json_encode( $failure['detail'], JSON_UNESCAPED_SLASHES ) . 
"\n"; +} + +exit( 1 ); diff --git a/tools/html-decoder-fuzz/runner.php b/tools/html-decoder-fuzz/runner.php new file mode 100644 index 0000000000000..d5bfd3602766a --- /dev/null +++ b/tools/html-decoder-fuzz/runner.php @@ -0,0 +1,945 @@ + 4, + 'duration-seconds' => 60, + 'max-cases' => 0, + 'cases-per-batch' => 2000, + 'seed-base' => 0, + 'max-bytes' => 4096, + 'mode' => 'oracle', + 'output-dir' => '', + 'stall-timeout' => 120, + 'artifact-retention' => 'bounded', + 'max-artifacts-per-signature' => 5, + 'summary-mode' => 'failures', + 'max-stderr-bytes' => 65536, + ) +); + +Cli::require_int_at_least( $options, 'lanes', 1 ); +Cli::require_int_at_least( $options, 'duration-seconds', 0 ); +Cli::require_int_at_least( $options, 'max-cases', 0 ); +Cli::require_int_at_least( $options, 'cases-per-batch', 1 ); +Cli::require_int_at_least( $options, 'seed-base', 0 ); +Cli::require_int_at_least( $options, 'max-bytes', 1 ); +Cli::require_int_at_least( $options, 'stall-timeout', 1 ); +Cli::require_int_at_least( $options, 'max-artifacts-per-signature', 0 ); +Cli::require_int_at_least( $options, 'max-stderr-bytes', 0 ); +Cli::require_one_of( $options, 'mode', Cli::valid_modes() ); +Cli::require_one_of( $options, 'artifact-retention', array( 'bounded', 'all', 'none' ) ); +Cli::require_one_of( $options, 'summary-mode', array( 'all', 'failures', 'none' ) ); + +$repo_root = Bootstrap::repo_root(); +$output_dir = $options['output-dir']; +if ( '' === $output_dir ) { + $now = microtime( true ); + $output_dir = sprintf( + '%s/artifacts/html-decoder-fuzz/run-%s-%06d-p%d', + $repo_root, + gmdate( 'Ymd-His', (int) $now ), + (int) ( ( $now - floor( $now ) ) * 1000000 ), + getmypid() + ); +} +if ( ! is_dir( $output_dir ) && ! mkdir( $output_dir, 0777, true ) ) { + fwrite( STDERR, "Cannot create output dir {$output_dir}\n" ); + exit( 2 ); +} +if ( ! is_writable( $output_dir ) ) { + fwrite( STDERR, "Output dir is not writable: {$output_dir}\n" ); + exit( 2 ); +} +if ( ! 
is_readable( $output_dir ) ) { + fwrite( STDERR, "Output dir is not readable: {$output_dir}\n" ); + exit( 2 ); +} + +$seed_base = $options['seed-base']; +if ( 0 === $seed_base ) { + $seed_base = (int) ( microtime( true ) * 1000 ) % 1000000000; +} + +$stderr_bytes_by_lane = array(); +$stderr_truncated_lanes = array(); +$startup_truncated_stderr_logs = array(); +$stderr_lane_ids = array(); +for ( $lane_id = 0; $lane_id < max( 1, $options['lanes'] ); $lane_id++ ) { + $stderr_lane_ids[ $lane_id ] = true; +} +$output_items = new \FilesystemIterator( $output_dir, \FilesystemIterator::SKIP_DOTS ); +foreach ( $output_items as $output_item ) { + if ( 1 === preg_match( '/^lane-(\d+)-stderr\.log$/', $output_item->getBasename(), $match ) ) { + $stderr_lane_ids[ (int) $match[1] ] = true; + } +} +ksort( $stderr_lane_ids, SORT_NUMERIC ); + +foreach ( array_keys( $stderr_lane_ids ) as $lane_id ) { + $stderr_path = "{$output_dir}/lane-{$lane_id}-stderr.log"; + if ( Cli::is_linked_file( $stderr_path ) ) { + fwrite( STDERR, "Lane stderr log is a linked file: {$stderr_path}\n" ); + exit( 2 ); + } + if ( ! is_file( $stderr_path ) ) { + continue; + } + + $stderr_size = filesize( $stderr_path ); + if ( ! is_int( $stderr_size ) ) { + fwrite( STDERR, "Cannot stat lane stderr log {$stderr_path}\n" ); + exit( 2 ); + } + if ( $stderr_size > $options['max-stderr-bytes'] ) { + $truncated = $options['max-stderr-bytes'] > 0 + ? file_get_contents( $stderr_path, false, null, 0, $options['max-stderr-bytes'] ) + : ''; + if ( ! is_string( $truncated ) || ! 
Cli::write_file( $stderr_path, $truncated ) ) { + fwrite( STDERR, "Cannot truncate lane stderr log {$stderr_path}\n" ); + exit( 2 ); + } + $startup_truncated_stderr_logs[] = array( + 'lane' => $lane_id, + 'bytes' => $options['max-stderr-bytes'], + 'was_bytes' => $stderr_size, + ); + $stderr_size = $options['max-stderr-bytes']; + } + + $stderr_bytes_by_lane[ $lane_id ] = $stderr_size; +} + +$summary_path = "{$output_dir}/summary.ndjson"; +if ( 'none' !== $options['summary-mode'] && Cli::is_linked_file( $summary_path ) ) { + fwrite( STDERR, "Summary file is a linked file: {$summary_path}\n" ); + exit( 2 ); +} +$summary = 'none' === $options['summary-mode'] ? null : fopen( $summary_path, 'ab' ); +if ( false === $summary ) { + fwrite( STDERR, "Cannot open summary file {$summary_path}\n" ); + exit( 2 ); +} +$started_at = microtime( true ); +$deadline = $options['duration-seconds'] > 0 ? $started_at + $options['duration-seconds'] : null; + +$retained_artifacts_by_signature = array(); +$retained_artifact_dirs = array(); +$startup_pruned_artifacts = 0; +$startup_pruned_partial_artifacts = 0; +$startup_verification_unavailable = false; +$existing_artifacts_by_signature = array(); +$unverified_artifact_signatures = array(); +$partial_artifact_dirs = array(); +$startup_checks = array(); +$startup_checks_available = array(); +$is_replayable_failure_manifest = static function ( $manifest ): bool { + if ( ! is_array( $manifest ) || ! isset( $manifest['signatures'], $manifest['payload_base64'], $manifest['context'], $manifest['failures'], $manifest['input_size'] ) ) { + return false; + } + if ( ! is_array( $manifest['signatures'] ) || array() === $manifest['signatures'] || ! is_string( $manifest['payload_base64'] ) ) { + return false; + } + if ( ! in_array( $manifest['context'], array( 'text', 'attribute', 'both' ), true ) ) { + return false; + } + if ( isset( $manifest['mode'] ) && ! 
in_array( $manifest['mode'], Cli::valid_modes(), true ) ) { + return false; + } + $payload = base64_decode( $manifest['payload_base64'], true ); + if ( ! is_string( $payload ) || '' === $payload || ! is_int( $manifest['input_size'] ) || strlen( $payload ) !== $manifest['input_size'] ) { + return false; + } + if ( ! is_array( $manifest['failures'] ) || array() === $manifest['failures'] ) { + return false; + } + $failure_signatures = array(); + foreach ( $manifest['failures'] as $failure ) { + if ( ! is_array( $failure ) || ! isset( $failure['signature'] ) || ! is_string( $failure['signature'] ) ) { + return false; + } + $failure_signatures[] = $failure['signature']; + } + $expected = array_values( array_unique( array_map( 'strval', $manifest['signatures'] ) ) ); + $actual = array_values( array_unique( $failure_signatures ) ); + sort( $expected, SORT_STRING ); + sort( $actual, SORT_STRING ); + return $expected === $actual; +}; +$startup_verifier_available = static function ( string $mode ) use ( &$startup_checks, &$startup_checks_available ): bool { + if ( ! isset( $startup_checks_available[ $mode ] ) ) { + Bootstrap::load_targets(); + $oracles = Oracles::build(); + $startup_checks_available[ $mode ] = ! Cli::mode_uses_oracle( $mode ) || $oracles->has_required(); + $startup_checks[ $mode ] = $startup_checks_available[ $mode ] ? new Checks( $oracles ) : null; + } + + return $startup_checks_available[ $mode ] && null !== $startup_checks[ $mode ]; +}; +$failure_manifest_reproduces = static function ( array $manifest ) use ( &$startup_checks, $startup_verifier_available ): ?bool { + $mode = $manifest['mode'] ?? 'oracle'; + if ( ! $startup_verifier_available( $mode ) ) { + return null; + } + + $payload = base64_decode( $manifest['payload_base64'], true ); + if ( ! is_string( $payload ) ) { + return null; + } + + $actual = array_values( + array_unique( + array_map( + static fn( array $failure ): string => $failure['signature'], + 'bytes' === $mode + ? 
$startup_checks[ $mode ]->run_without_oracle( $manifest['context'], $payload ) + : $startup_checks[ $mode ]->run( $manifest['context'], $payload ) + ) + ) + ); + $expected = array_values( array_unique( array_map( 'strval', $manifest['signatures'] ) ) ); + sort( $actual, SORT_STRING ); + sort( $expected, SORT_STRING ); + + return $expected === $actual; +}; +$startup_artifact_dirs = array(); +$output_items = new \FilesystemIterator( $output_dir, \FilesystemIterator::SKIP_DOTS ); +foreach ( $output_items as $output_item ) { + if ( 0 !== strncmp( $output_item->getBasename(), 'failure-', 8 ) ) { + continue; + } + + if ( $output_item->isLink() ) { + $partial_artifact_dirs[] = $output_item->getPathname(); + continue; + } + + if ( $output_item->isDir() ) { + $startup_artifact_dirs[] = $output_item->getPathname(); + } +} +sort( $startup_artifact_dirs, SORT_STRING ); +foreach ( $startup_artifact_dirs as $artifact_dir ) { + $failure_file = "{$artifact_dir}/failure.json"; + if ( ! is_file( $failure_file ) ) { + $partial_artifact_dirs[] = $artifact_dir; + continue; + } + + $manifest = json_decode( (string) file_get_contents( $failure_file ), true ); + if ( $is_replayable_failure_manifest( $manifest ) ) { + $reproduces = $failure_manifest_reproduces( $manifest ); + $signature_key = Cli::failure_signature_key( $manifest['signatures'], $manifest['mode'] ?? 'oracle' ); + if ( null === $reproduces ) { + $startup_verification_unavailable = true; + $unverified_artifact_signatures[ $signature_key ] = true; + } + if ( false !== $reproduces ) { + $existing_artifacts_by_signature[ $signature_key ][] = $artifact_dir; + continue; + } + } + + $partial_artifact_dirs[] = $artifact_dir; +} + +if ( 'all' !== $options['artifact-retention'] ) { + foreach ( $partial_artifact_dirs as $artifact_dir ) { + if ( ! 
Cli::remove_tree( $artifact_dir, $output_dir ) ) { + fwrite( STDERR, "Cannot prune partial failure artifact {$artifact_dir}\n" ); + exit( 2 ); + } + ++$startup_pruned_artifacts; + ++$startup_pruned_partial_artifacts; + } +} + +foreach ( $existing_artifacts_by_signature as $signature_key => $artifact_dirs ) { + sort( $artifact_dirs, SORT_STRING ); + $keep = count( $artifact_dirs ); + if ( 'none' === $options['artifact-retention'] ) { + $keep = 0; + } elseif ( 'bounded' === $options['artifact-retention'] && isset( $unverified_artifact_signatures[ $signature_key ] ) ) { + $keep = count( $artifact_dirs ); + } elseif ( 'bounded' === $options['artifact-retention'] ) { + $keep = min( $keep, $options['max-artifacts-per-signature'] ); + } + + foreach ( $artifact_dirs as $index => $artifact_dir ) { + if ( $index < $keep ) { + $retained_artifacts_by_signature[ $signature_key ] = ( $retained_artifacts_by_signature[ $signature_key ] ?? 0 ) + 1; + $artifact_key = realpath( $artifact_dir ); + $retained_artifact_dirs[ false === $artifact_key ? $artifact_dir : $artifact_key ] = $signature_key; + continue; + } + + if ( ! 
Cli::remove_tree( $artifact_dir, $output_dir ) ) { + fwrite( STDERR, "Cannot prune existing failure artifact {$artifact_dir}\n" ); + exit( 2 ); + } + ++$startup_pruned_artifacts; + } +} + +$state = array( + 'started_at' => gmdate( 'c' ), + 'seed_base' => $seed_base, + 'options' => $options, + 'git' => Cli::git_metadata( $repo_root ), + 'cases' => 0, + 'failures' => 0, + 'bytes' => 0, + 'by_strategy' => array(), + 'by_context' => array(), + 'failure_seeds' => array(), + 'stalled_seeds' => array(), + 'worker_errors' => array(), + 'worker_stderr_truncated' => array(), + 'worker_stderr_startup_truncated' => $startup_truncated_stderr_logs, + 'harness_errors' => 0, + 'oracle_events' => array(), + 'batches' => 0, + 'coverage' => array( + 'edges' => 0, + 'payloads' => 0, + 'pruned_duplicate_payloads' => 0, + 'by_file' => array(), + 'edge_keys' => array(), + 'corpus' => array(), + ), + 'artifact_retention' => array( + 'mode' => $options['artifact-retention'], + 'max_per_signature' => $options['max-artifacts-per-signature'], + 'retained_by_signature' => $retained_artifacts_by_signature, + 'pruned' => $startup_pruned_artifacts, + 'startup_pruned' => $startup_pruned_artifacts, + 'startup_pruned_partial' => $startup_pruned_partial_artifacts, + 'startup_verification_unavailable' => $startup_verification_unavailable, + ), + 'stop_reason' => null, +); + +$next_seed = $seed_base; +$next_start_case = 0; +$lanes = array(); + +$spawn_lane = static function ( int $lane_id ) use ( &$next_seed, &$next_start_case, &$stderr_bytes_by_lane, &$stderr_truncated_lanes, $seed_base, $options, $output_dir ): array { + if ( Cli::mode_uses_start_case_windows( $options['mode'] ) ) { + $seed = $seed_base; + $start_case = $next_start_case; + $next_start_case += $options['cases-per-batch']; + } else { + $seed = $next_seed++; + $start_case = 0; + } + + $command = array( + PHP_BINARY, + __DIR__ . 
'/worker.php', + '--seed', + (string) $seed, + '--start-case', + (string) $start_case, + '--cases', + (string) $options['cases-per-batch'], + '--max-bytes', + (string) $options['max-bytes'], + '--mode', + $options['mode'], + '--output-dir', + $output_dir, + '--progress-every', + '500', + ); + + $stderr_path = "{$output_dir}/lane-{$lane_id}-stderr.log"; + if ( Cli::is_linked_file( $stderr_path ) ) { + fwrite( STDERR, "Lane stderr log is a linked file: {$stderr_path}\n" ); + exit( 2 ); + } + + $process = proc_open( + $command, + array( + 0 => array( 'file', '/dev/null', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'pipe', 'w' ), + ), + $pipes + ); + if ( ! is_resource( $process ) || ! isset( $pipes[1], $pipes[2] ) || ! is_resource( $pipes[1] ) || ! is_resource( $pipes[2] ) ) { + fwrite( STDERR, "Cannot spawn worker lane {$lane_id}\n" ); + exit( 2 ); + } + + stream_set_blocking( $pipes[1], false ); + stream_set_blocking( $pipes[2], false ); + + return array( + 'id' => $lane_id, + 'seed' => $seed, + 'start_case' => $start_case, + 'process' => $process, + 'stdout' => $pipes[1], + 'stderr' => $pipes[2], + 'stderr_path' => $stderr_path, + 'stderr_bytes' => $stderr_bytes_by_lane[ $lane_id ] ?? 
0, + 'stderr_truncated' => isset( $stderr_truncated_lanes[ $lane_id ] ), + 'buffer' => '', + 'last_output' => microtime( true ), + 'reported_failures' => 0, + ); +}; + +$write_state = static function () use ( &$state, $output_dir, $started_at ): bool { + $state['elapsed_sec'] = round( microtime( true ) - $started_at, 1 ); + $state_json = json_encode( $state, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES ); + if ( false === $state_json ) { + return false; + } + + return Cli::write_file( "{$output_dir}/state.json", $state_json ); +}; + +$stop_requested = false; +$summary_write_failed = false; + +$write_summary_record = static function ( array $record ) use ( &$state, &$stop_requested, &$summary_write_failed, $summary, $summary_path ): bool { + if ( null === $summary ) { + return true; + } + + $summary_line = json_encode( $record, JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE ); + if ( false === $summary_line || ! Cli::write_stream( $summary, $summary_line . "\n" ) ) { + if ( $summary_write_failed ) { + return false; + } + $summary_write_failed = true; + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "Cannot write summary file {$summary_path}\n" ); + return false; + } + + return true; +}; + +$summarize_record = static function ( array $record ) use ( $options, $write_summary_record ): bool { + if ( 'none' === $options['summary-mode'] ) { + return true; + } + if ( 'all' === $options['summary-mode'] ) { + return $write_summary_record( $record ); + } + + $type = $record['type'] ?? ''; + if ( 'failure' === $type ) { + if ( ! empty( $record['artifact_retained'] ) && empty( $record['artifact_reused'] ) ) { + return $write_summary_record( $record ); + } + return true; + } + if ( 'coverage' === $type ) { + if ( ! 
empty( $record['coverage_retained'] ) ) { + return $write_summary_record( $record ); + } + return true; + } + if ( in_array( $type, array( 'fatal', 'oracle-event', 'invalid-worker-output', 'malformed-worker-record', 'unknown-worker-record' ), true ) ) { + return $write_summary_record( $record ); + } + + return true; +}; + +$drain_lane_stderr = static function ( array &$lane ) use ( &$state, &$stop_requested, &$stderr_bytes_by_lane, &$stderr_truncated_lanes, $options ): void { + $chunk = stream_get_contents( $lane['stderr'] ); + if ( false === $chunk || '' === $chunk ) { + return; + } + + $remaining = $options['max-stderr-bytes'] - $lane['stderr_bytes']; + if ( $remaining <= 0 ) { + $lane['stderr_truncated'] = true; + $stderr_truncated_lanes[ $lane['id'] ] = true; + return; + } + + $write = substr( $chunk, 0, $remaining ); + if ( strlen( $chunk ) > strlen( $write ) ) { + $lane['stderr_truncated'] = true; + $stderr_truncated_lanes[ $lane['id'] ] = true; + } + if ( ! Cli::append_file( $lane['stderr_path'], $write ) ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "Cannot write lane stderr log {$lane['stderr_path']}\n" ); + return; + } + + $lane['stderr_bytes'] += strlen( $write ); + $stderr_bytes_by_lane[ $lane['id'] ] = $lane['stderr_bytes']; +}; + +$apply_artifact_retention = static function ( array &$record ) use ( &$state, &$retained_artifact_dirs, $options ): ?string { + $signature_key = Cli::failure_signature_key( $record['signatures'], $record['mode'] ?? 'oracle' ); + $record['signature_key'] = $signature_key; + + $artifact_dir = $record['artifact_dir'] ?? null; + if ( ! is_string( $artifact_dir ) || '' === $artifact_dir || ! is_dir( $artifact_dir ) ) { + $record['artifact_retained'] = false; + $record['artifact_pruned'] = false; + return null; + } + + $artifact_key = realpath( $artifact_dir ); + $artifact_key = false === $artifact_key ? 
$artifact_dir : $artifact_key; + if ( isset( $retained_artifact_dirs[ $artifact_key ] ) ) { + if ( $signature_key === $retained_artifact_dirs[ $artifact_key ] ) { + $record['artifact_retained'] = true; + $record['artifact_pruned'] = false; + $record['artifact_reused'] = true; + return null; + } + + $previous_signature_key = $retained_artifact_dirs[ $artifact_key ]; + $state['artifact_retention']['retained_by_signature'][ $previous_signature_key ] = + max( 0, ( $state['artifact_retention']['retained_by_signature'][ $previous_signature_key ] ?? 1 ) - 1 ); + if ( 0 === $state['artifact_retention']['retained_by_signature'][ $previous_signature_key ] ) { + unset( $state['artifact_retention']['retained_by_signature'][ $previous_signature_key ] ); + } + unset( $retained_artifact_dirs[ $artifact_key ] ); + $record['artifact_replaced_signature_key'] = $previous_signature_key; + } + + $retain = 'all' === $options['artifact-retention']; + if ( 'bounded' === $options['artifact-retention'] ) { + $retained = $state['artifact_retention']['retained_by_signature'][ $signature_key ] ?? 0; + $retain = $retained < $options['max-artifacts-per-signature']; + } + + if ( $retain ) { + $record['artifact_retained'] = true; + $record['artifact_pruned'] = false; + $state['artifact_retention']['retained_by_signature'][ $signature_key ] = + ( $state['artifact_retention']['retained_by_signature'][ $signature_key ] ?? 0 ) + 1; + $retained_artifact_dirs[ $artifact_key ] = $signature_key; + return null; + } + + $record['artifact_dir'] = null; + $record['artifact_retained'] = false; + $record['artifact_pruned'] = true; + return $artifact_dir; +}; + +$handle_line = static function ( string $line, int $lane_id ) use ( &$state, &$stop_requested, $summarize_record, $apply_artifact_retention, $output_dir ): ?string { + $record = json_decode( $line, true ); + if ( ! 
is_array( $record ) ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "invalid worker output on lane {$lane_id}\n" ); + $summarize_record( + array( + 'type' => 'invalid-worker-output', + 'lane' => $lane_id, + 'raw_base64' => base64_encode( $line ), + ) + ); + return 'invalid'; + } + + $record['lane'] = $lane_id; + + switch ( $record['type'] ?? '' ) { + case 'failure': + if ( ! isset( $record['seed'], $record['case'], $record['context'], $record['signatures'] ) || ! is_array( $record['signatures'] ) ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "malformed failure record on lane {$lane_id}\n" ); + $record['type'] = 'malformed-worker-record'; + $summarize_record( $record ); + return 'invalid'; + } + $record['mode'] = $record['mode'] ?? 'oracle'; + if ( ! in_array( $record['mode'], Cli::valid_modes(), true ) ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "malformed failure mode on lane {$lane_id}\n" ); + $record['type'] = 'malformed-worker-record'; + $summarize_record( $record ); + return 'invalid'; + } + $prune_artifact_dir = $apply_artifact_retention( $record ); + ++$state['failures']; + fwrite( STDERR, "FAILURE lane {$lane_id} seed {$record['seed']} case {$record['case']}: " . implode( ', ', $record['signatures'] ) . 
"\n" ); + $summary_written = $summarize_record( $record ); + if ( null !== $prune_artifact_dir ) { + if ( $summary_written && Cli::remove_tree( $prune_artifact_dir, $output_dir ) ) { + ++$state['artifact_retention']['pruned']; + } else { + if ( $summary_written ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "Cannot prune failure artifact {$prune_artifact_dir}\n" ); + } + $record['artifact_dir'] = $prune_artifact_dir; + $record['artifact_retained'] = true; + $record['artifact_pruned'] = false; + $state['artifact_retention']['retained_by_signature'][ $record['signature_key'] ] = + ( $state['artifact_retention']['retained_by_signature'][ $record['signature_key'] ] ?? 0 ) + 1; + } + } + if ( ! empty( $record['artifact_retained'] ) && empty( $record['artifact_reused'] ) ) { + $state['failure_seeds'][] = array( + 'seed' => $record['seed'], + 'case' => $record['case'], + 'mode' => $record['mode'], + 'context' => $record['context'], + 'signatures' => $record['signatures'], + 'signature_key' => $record['signature_key'], + 'artifact' => $record['artifact_dir'] ?? null, + 'artifact_retained' => $record['artifact_retained'], + 'artifact_pruned' => $record['artifact_pruned'], + ); + } + return 'failure'; + + case 'coverage': + if ( ! isset( $record['seed'], $record['case'], $record['context'], $record['strategy'], $record['new_edges'] ) || ! is_array( $record['new_edges'] ) ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "malformed coverage record on lane {$lane_id}\n" ); + $record['type'] = 'malformed-worker-record'; + $summarize_record( $record ); + return 'invalid'; + } + $record['mode'] = $record['mode'] ?? 
'coverage'; + if ( 'coverage' !== $record['mode'] ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "malformed coverage mode on lane {$lane_id}\n" ); + $record['type'] = 'malformed-worker-record'; + $summarize_record( $record ); + return 'invalid'; + } + + $global_new_edges = array(); + foreach ( $record['new_edges'] as $edge ) { + if ( ! is_array( $edge ) || ! isset( $edge['key'], $edge['file'], $edge['line'] ) || ! is_string( $edge['key'] ) || ! is_string( $edge['file'] ) || ! is_int( $edge['line'] ) ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "malformed coverage edge on lane {$lane_id}\n" ); + $record['type'] = 'malformed-worker-record'; + $summarize_record( $record ); + return 'invalid'; + } + if ( isset( $state['coverage']['edge_keys'][ $edge['key'] ] ) ) { + continue; + } + + $state['coverage']['edge_keys'][ $edge['key'] ] = true; + $state['coverage']['by_file'][ $edge['file'] ] = ( $state['coverage']['by_file'][ $edge['file'] ] ?? 0 ) + 1; + $global_new_edges[] = $edge; + } + + $artifact_dir = $record['artifact_dir'] ?? 
null; + if ( array() === $global_new_edges ) { + $record['new_edges'] = array(); + $record['new_edge_count'] = 0; + $record['coverage_retained'] = false; + $record['coverage_duplicate'] = true; + $record['coverage_pruned'] = false; + if ( is_string( $artifact_dir ) && '' !== $artifact_dir && is_dir( $artifact_dir ) ) { + if ( Cli::remove_tree( $artifact_dir, $output_dir ) ) { + $record['artifact_dir'] = null; + $record['artifact_pruned'] = true; + $record['coverage_pruned'] = true; + ++$state['coverage']['pruned_duplicate_payloads']; + } else { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "Cannot prune duplicate coverage artifact {$artifact_dir}\n" ); + } + } + $summarize_record( $record ); + return 'coverage'; + } + + $record['new_edges'] = $global_new_edges; + $record['new_edge_count'] = count( $global_new_edges ); + $state['coverage']['edges'] += count( $global_new_edges ); + $record['coverage_duplicate'] = false; + $record['coverage_pruned'] = false; + $record['coverage_retained'] = is_string( $artifact_dir ) && '' !== $artifact_dir && is_dir( $artifact_dir ) && ! is_link( $artifact_dir ); + if ( $record['coverage_retained'] ) { + $payload = isset( $record['payload_base64'] ) && is_string( $record['payload_base64'] ) + ? base64_decode( $record['payload_base64'], true ) + : null; + ++$state['coverage']['payloads']; + $state['coverage']['corpus'][] = array( + 'seed' => $record['seed'], + 'case' => $record['case'], + 'context' => $record['context'], + 'strategy' => $record['strategy'], + 'edges' => count( $global_new_edges ), + 'artifact' => $artifact_dir, + 'sha256' => is_string( $payload ) ? hash( 'sha256', $payload ) : null, + ); + } + $summarize_record( $record ); + return 'coverage'; + + case 'oracle-event': + $state['oracle_events'][] = $record; + $oracle = $record['oracle'] ?? 'unknown'; + $detail = $record['detail'] ?? 
'no detail'; + fwrite( STDERR, "oracle event: {$oracle}: {$detail}\n" ); + $summarize_record( $record ); + return 'oracle-event'; + + case 'fatal': + ++$state['harness_errors']; + $state['oracle_events'][] = $record; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + $reason = $record['reason'] ?? 'unknown'; + fwrite( STDERR, "worker fatal: {$reason}\n" ); + $summarize_record( $record ); + return 'fatal'; + + case 'done': + if ( ! isset( $record['stats'] ) || ! is_array( $record['stats'] ) ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "malformed done record on lane {$lane_id}\n" ); + $record['type'] = 'malformed-worker-record'; + $summarize_record( $record ); + return 'invalid'; + } + $stats = $record['stats']; + if ( ! isset( $stats['cases'], $stats['bytes'], $stats['by_strategy'], $stats['by_context'] ) || ! is_array( $stats['by_strategy'] ) || ! is_array( $stats['by_context'] ) ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "malformed done stats on lane {$lane_id}\n" ); + $record['type'] = 'malformed-worker-record'; + $summarize_record( $record ); + return 'invalid'; + } + $state['cases'] += $stats['cases']; + $state['bytes'] += $stats['bytes']; + foreach ( $stats['by_strategy'] as $strategy => $count ) { + $state['by_strategy'][ $strategy ] = ( $state['by_strategy'][ $strategy ] ?? 0 ) + $count; + } + foreach ( $stats['by_context'] as $context => $count ) { + $state['by_context'][ $context ] = ( $state['by_context'][ $context ] ?? 
0 ) + $count; + } + $summarize_record( $record ); + return 'done'; + + case 'progress': + case 'start': + $summarize_record( $record ); + return $record['type']; + } + + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "unknown worker record type on lane {$lane_id}\n" ); + $record['type'] = 'unknown-worker-record'; + $summarize_record( $record ); + return 'invalid'; +}; + +for ( $i = 0; $i < max( 1, $options['lanes'] ); $i++ ) { + $lanes[ $i ] = $spawn_lane( $i ); + ++$state['batches']; +} + +$last_state_write = 0.0; + +while ( array() !== $lanes ) { + $now = microtime( true ); + + if ( ! $stop_requested && null !== $deadline && $now >= $deadline ) { + $state['stop_reason'] = 'duration'; + $stop_requested = true; + } + + if ( ! $stop_requested && $options['max-cases'] > 0 && $state['cases'] >= $options['max-cases'] ) { + $state['stop_reason'] = 'max-cases'; + $stop_requested = true; + } + + $streams = array(); + foreach ( $lanes as $lane_id => $lane ) { + $streams[ "{$lane_id}:stdout" ] = $lane['stdout']; + $streams[ "{$lane_id}:stderr" ] = $lane['stderr']; + } + + $read = array_values( $streams ); + $write = null; + $except = null; + if ( stream_select( $read, $write, $except, 0, 250000 ) > 0 ) { + foreach ( $lanes as $lane_id => &$lane ) { + $chunk = stream_get_contents( $lane['stdout'] ); + if ( false === $chunk || '' === $chunk ) { + continue; + } + + $lane['last_output'] = microtime( true ); + $lane['buffer'] .= $chunk; + + while ( false !== ( $newline = strpos( $lane['buffer'], "\n" ) ) ) { + $line = substr( $lane['buffer'], 0, $newline ); + $lane['buffer'] = substr( $lane['buffer'], $newline + 1 ); + if ( '' !== $line && 'failure' === $handle_line( $line, $lane_id ) ) { + ++$lane['reported_failures']; + } + } + } + unset( $lane ); + } + + foreach ( $lanes as &$lane ) { + $drain_lane_stderr( $lane ); + } + unset( $lane ); + + foreach ( $lanes as $lane_id => $lane ) { + $status = 
proc_get_status( $lane['process'] ); + $stalled = ( microtime( true ) - $lane['last_output'] ) > $options['stall-timeout']; + + if ( $status['running'] && $stalled ) { + proc_terminate( $lane['process'], 9 ); + $state['stalled_seeds'][] = $lane['seed']; + fwrite( STDERR, "STALL lane {$lane_id} seed {$lane['seed']}: no output for {$options['stall-timeout']}s, killed\n" ); + } elseif ( $status['running'] ) { + continue; + } + + $rest = stream_get_contents( $lane['stdout'] ); + if ( false === $rest ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "cannot read remaining worker output on lane {$lane_id}\n" ); + $rest = ''; + } + $tail = $lane['buffer'] . $rest; + if ( '' !== $tail ) { + foreach ( explode( "\n", $tail ) as $line ) { + if ( '' !== $line && 'failure' === $handle_line( $line, $lane_id ) ) { + ++$lane['reported_failures']; + } + } + } + $drain_lane_stderr( $lane ); + fclose( $lane['stdout'] ); + fclose( $lane['stderr'] ); + $close_code = proc_close( $lane['process'] ); + $exit_code = $status['exitcode'] ?? $close_code; + if ( -1 === $exit_code ) { + $exit_code = $close_code; + } + $accepted_failure_exit = 1 === $exit_code && $lane['reported_failures'] > 0; + if ( 0 !== $exit_code && ! $accepted_failure_exit && ! in_array( $lane['seed'], $state['stalled_seeds'], true ) ) { + $state['worker_errors'][] = array( + 'lane' => $lane_id, + 'seed' => $lane['seed'], + 'exit_code' => $exit_code, + ); + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "worker error lane {$lane_id} seed {$lane['seed']}: exit {$exit_code}\n" ); + } + if ( $lane['stderr_truncated'] ) { + $state['worker_stderr_truncated'][ $lane_id ] = array( + 'lane' => $lane_id, + 'seed' => $lane['seed'], + 'bytes' => $lane['stderr_bytes'], + ); + } + unset( $lanes[ $lane_id ] ); + + if ( ! 
$stop_requested ) { + $lanes[ $lane_id ] = $spawn_lane( $lane_id ); + ++$state['batches']; + } + } + + if ( microtime( true ) - $last_state_write > 5 ) { + if ( ! $write_state() ) { + ++$state['harness_errors']; + $state['stop_reason'] = 'harness-error'; + $stop_requested = true; + fwrite( STDERR, "Cannot write state file {$output_dir}/state.json\n" ); + } + $last_state_write = microtime( true ); + } +} + +if ( null === $state['stop_reason'] ) { + $state['stop_reason'] = 'lanes-exited'; +} +$state['finished_at'] = gmdate( 'c' ); +if ( ! $write_state() ) { + fwrite( STDERR, "Cannot write state file {$output_dir}/state.json\n" ); + if ( is_resource( $summary ) ) { + fclose( $summary ); + } + exit( 2 ); +} +if ( is_resource( $summary ) ) { + fclose( $summary ); +} + +$elapsed = round( microtime( true ) - $started_at, 1 ); +fwrite( + STDERR, + sprintf( + "Done: %d cases, %d failures, %d stalled, %s bytes in %ss. Artifacts: %s\n", + $state['cases'], + $state['failures'], + count( $state['stalled_seeds'] ), + number_format( $state['bytes'] ), + $elapsed, + $output_dir + ) +); + +if ( $state['harness_errors'] > 0 || array() !== $state['worker_errors'] ) { + exit( 2 ); +} + +exit( ( $state['failures'] > 0 || array() !== $state['stalled_seeds'] ) ? 1 : 0 ); diff --git a/tools/html-decoder-fuzz/tests/harness-smoke.php b/tools/html-decoder-fuzz/tests/harness-smoke.php new file mode 100644 index 0000000000000..515a9639b357d --- /dev/null +++ b/tools/html-decoder-fuzz/tests/harness-smoke.php @@ -0,0 +1,4315 @@ + array( 'file', '/dev/null', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'pipe', 'w' ), + ), + $pipes, + Bootstrap::repo_root(), + array_merge( getenv() ?: array(), $env ) + ); + + if ( ! 
is_resource( $process ) ) { + return array( + 'code' => 127, + 'stdout' => '', + 'stderr' => 'proc_open failed', + ); + } + + $stdout = stream_get_contents( $pipes[1] ); + $stderr = stream_get_contents( $pipes[2] ); + fclose( $pipes[1] ); + fclose( $pipes[2] ); + + return array( + 'code' => proc_close( $process ), + 'stdout' => (string) $stdout, + 'stderr' => (string) $stderr, + ); +} + +function remove_tree( string $path ): void { + if ( ! is_dir( $path ) ) { + return; + } + + $items = new \RecursiveIteratorIterator( + new \RecursiveDirectoryIterator( $path, \FilesystemIterator::SKIP_DOTS ), + \RecursiveIteratorIterator::CHILD_FIRST + ); + + foreach ( $items as $item ) { + $item->isDir() && ! $item->isLink() ? rmdir( $item->getPathname() ) : unlink( $item->getPathname() ); + } + rmdir( $path ); +} + +/** + * @return array + */ +function summary_start_windows( string $dir, string $mode ): array { + $summary = is_file( $dir . '/summary.ndjson' ) + ? file( $dir . '/summary.ndjson', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES ) + : array(); + $windows = array(); + + if ( is_array( $summary ) ) { + foreach ( $summary as $line ) { + $record = json_decode( $line, true ); + if ( is_array( $record ) && 'start' === ( $record['type'] ?? null ) && $mode === ( $record['mode'] ?? null ) ) { + $windows[] = array( + 'start' => $record['start_case'] ?? null, + 'cases' => $record['cases'] ?? null, + ); + } + } + } + + usort( + $windows, + static fn( array $a, array $b ): int => ( $a['start'] ?? -1 ) <=> ( $b['start'] ?? -1 ) + ); + + return $windows; +} + +function start_windows_are_distinct( array $windows, int $cases_per_batch ): bool { + if ( count( $windows ) < 2 ) { + return false; + } + + $previous_window_end = null; + foreach ( $windows as $window ) { + if ( ! 
is_int( $window['start'] ) || $cases_per_batch !== $window['cases'] || 0 !== $window['start'] % $cases_per_batch || ( null !== $previous_window_end && $window['start'] < $previous_window_end ) ) { + return false; + } + $previous_window_end = $window['start'] + $window['cases']; + } + + return true; +} + +$oracles = Oracles::build(); +$events = $oracles->drain_events(); +$skip_c1_fault_seed = 2; +$skip_c1_fault_case = 36; + +check( 'required oracles available', $oracles->has_required(), json_encode( $events ) ); +check( 'secondary entity-decode oracle available', in_array( 'entity-decode', $oracles->names(), true ), implode( ',', $oracles->names() ) ); +check( + 'no oracle disabled by battery', + array() === array_filter( $events, static fn( $e ) => 'oracle-disabled' === $e['type'] ), + json_encode( $events ) +); + +$checks = new Checks( $oracles ); +$battery_fails = array(); +foreach ( Oracles::battery() as $i => $vector ) { + list( $context, $payload ) = $vector; + foreach ( $checks->run( $context, $payload ) as $failure ) { + $battery_fails[] = "vector {$i}: {$failure['signature']}"; + } +} +check( 'real targets clean on oracle battery', array() === $battery_fails, implode( '; ', $battery_fails ) ); + +$real_targets = Targets::real(); + +/** + * @return string[] Distinct check names observed. + */ +function broken_run( Oracles $oracles, array $real_targets, array $overrides ): array { + $checks = new Checks( $oracles, array_merge( $real_targets, $overrides ) ); + $seen = array(); + + $cases = array_merge( + Oracles::battery(), + array( + array( 'text', 'a&b' ), + array( 'attribute', '¬x' ), + array( 'attribute', 'jav' ), + array( 'attribute', 'javascript:alert(1)' ), + array( 'attribute', '<⃒tail' ), + ) + ); + + foreach ( $cases as $case ) { + foreach ( $checks->run( $case[0], $case[1] ) as $failure ) { + $seen[ $failure['check'] ] = true; + } + } + + return array_keys( $seen ); +} + +/** + * @return string[] Distinct check names observed. 
+ */ +function fault_run( Oracles $oracles, string $fault, string $payload = 'javascript:alert(1)', string $context = 'attribute' ): array { + $old_fault = getenv( 'HTML_DECODER_FUZZ_FAULT' ); + putenv( "HTML_DECODER_FUZZ_FAULT={$fault}" ); + + try { + $checks = new Checks( $oracles, Targets::resolve() ); + $seen = array(); + + foreach ( $checks->run( $context, $payload ) as $failure ) { + $seen[ $failure['check'] ] = true; + } + + return array_keys( $seen ); + } finally { + if ( false === $old_fault ) { + putenv( 'HTML_DECODER_FUZZ_FAULT' ); + } else { + putenv( "HTML_DECODER_FUZZ_FAULT={$old_fault}" ); + } + } +} + +/** + * @return string[] Distinct check names observed. + */ +function fault_run_without_oracle( Oracles $oracles, string $fault, string $payload ): array { + $old_fault = getenv( 'HTML_DECODER_FUZZ_FAULT' ); + putenv( "HTML_DECODER_FUZZ_FAULT={$fault}" ); + + try { + $checks = new Checks( $oracles, Targets::resolve() ); + $seen = array(); + + foreach ( $checks->run_without_oracle( 'both', $payload ) as $failure ) { + $seen[ $failure['check'] ] = true; + } + + return array_keys( $seen ); + } finally { + if ( false === $old_fault ) { + putenv( 'HTML_DECODER_FUZZ_FAULT' ); + } else { + putenv( "HTML_DECODER_FUZZ_FAULT={$old_fault}" ); + } + } +} + +/** + * @return string[] Distinct check names observed. 
+ */ +function broken_oracle_free_run( Oracles $oracles, array $real_targets, array $overrides ): array { + $checks = new Checks( $oracles, array_merge( $real_targets, $overrides ) ); + $seen = array(); + $cases = array( + array( 'both', "raw\x00bytes" ), + array( 'both', "\xFF\xFE<\"\r" ), + array( 'both', "a¬x\x00z" ), + ); + + foreach ( $cases as $case ) { + foreach ( $checks->run_without_oracle( $case[0], $case[1] ) as $failure ) { + $seen[ $failure['check'] ] = true; + } + } + + return array_keys( $seen ); +} + +function reference_at_eof_shape( string $payload ): ?string { + if ( 1 === preg_match( '/&\z/', $payload ) ) { + return 'bare-introducer'; + } + if ( 1 === preg_match( '/&#(?:[xX])?\z/', $payload ) ) { + return 'partial-numeric-introducer'; + } + if ( 1 === preg_match( '/&#[0-9]+\z/', $payload ) ) { + return 'decimal-digits'; + } + if ( 1 === preg_match( '/&#[xX][0-9A-Fa-f]+\z/', $payload ) ) { + return 'hex-digits'; + } + if ( 1 === preg_match( '/&[A-Za-z][A-Za-z0-9]*\z/', $payload ) ) { + return 'named-prefix'; + } + return null; +} + +/** + * @return array + */ +function numeric_reference_ranges( string $payload ): array { + $ranges = array(); + $match_count = preg_match_all( '/&#(?:([xX])([0-9A-Fa-f]+)|([0-9]+));?/', $payload, $matches, PREG_SET_ORDER ); + if ( false === $match_count || 0 === $match_count ) { + return $ranges; + } + + foreach ( $matches as $match ) { + $is_hex = '' !== ( $match[1] ?? '' ); + $digits = $is_hex ? $match[2] : $match[3]; + $base = $is_hex ? 16 : 10; + $max_digits = $is_hex ? 
6 : 7; + $zero_count = strspn( $digits, '0' ); + $significant_digits = substr( $digits, $zero_count ); + + if ( '' === $significant_digits ) { + $ranges['zero-only'] = true; + continue; + } + + if ( strlen( $significant_digits ) > $max_digits ) { + $ranges['digit-count-overflow'] = true; + continue; + } + + $value = intval( $significant_digits, $base ); + if ( $value <= 0x1F ) { + $ranges['c0-control'] = true; + } elseif ( $value >= 0x80 && $value <= 0x9F ) { + $ranges['c1-control'] = true; + } elseif ( $value >= 0xA0 && $value <= 0xD7FF ) { + $ranges['bmp-pre-surrogate'] = true; + } elseif ( $value >= 0xD800 && $value <= 0xDFFF ) { + $ranges['surrogate'] = true; + } elseif ( ( $value >= 0xFDD0 && $value <= 0xFDEF ) || 0xFFFE === $value || 0xFFFF === $value ) { + $ranges['bmp-noncharacter'] = true; + } elseif ( $value >= 0xE000 && $value <= 0xFFFD ) { + $ranges['bmp-post-surrogate'] = true; + } elseif ( $value >= 0x1FFFE && $value <= 0x10FFFF && ( $value & 0xFFFF ) >= 0xFFFE ) { + $ranges['plane-noncharacter'] = true; + } elseif ( $value > 0x10FFFF ) { + $ranges['above-unicode-legal-digits'] = true; + } elseif ( $value >= 0x10000 ) { + $ranges['astral'] = true; + } + } + + return $ranges; +} + +/** + * @param string[] $names + * @return string[] + */ +function name_sweep_base_names( array $names ): array { + $base_names = array(); + foreach ( $names as $name ) { + $base = rtrim( $name, ';' ); + if ( '' !== $base ) { + $base_names[ $base ] = true; + } + } + return array_keys( $base_names ); +} + +/** + * @return string[] + */ +function legacy_follower_sweep_followers(): array { + $followers = array(); + + for ( $byte = 1; $byte <= 0x7F; $byte++ ) { + if ( in_array( $byte, array( 0x0D, 0x22, 0x3C ), true ) ) { + continue; + } + $followers[] = chr( $byte ); + } + + for ( $lead = 0xC2; $lead <= 0xF4; $lead++ ) { + if ( $lead < 0xE0 ) { + $followers[] = chr( $lead ) . 
"\x80"; + } elseif ( 0xE0 === $lead ) { + $followers[] = "\xE0\xA0\x80"; + } elseif ( $lead < 0xF0 ) { + $followers[] = chr( $lead ) . "\x80\x80"; + } elseif ( 0xF0 === $lead ) { + $followers[] = "\xF0\x90\x80\x80"; + } elseif ( $lead < 0xF4 ) { + $followers[] = chr( $lead ) . "\x80\x80\x80"; + } else { + $followers[] = "\xF4\x80\x80\x80"; + } + } + + for ( $continuation = 0x80; $continuation <= 0xBF; $continuation++ ) { + $followers[] = "\xC2" . chr( $continuation ); + } + + return array_values( array_unique( $followers ) ); +} + +/** + * @return string[] + */ +function prefix_family_sweep_references(): array { + return array( + 'not', + 'not;', + 'notin;', + 'notinva;', + 'ngt;', + 'nGt;', + 'nGtv;', + 'nge;', + 'ngeq;', + 'ngeqq;', + ); +} + +/** + * @return string[] + */ +function prefix_family_sweep_followers(): array { + return array( '', 'x', 'X', '0', '=', "\u{00E9}" ); +} + +/** + * @param string[] $base_names + * @return array + */ +function prefix_family_sweep_cases( array $base_names ): array { + $base_set = array_fill_keys( $base_names, true ); + $cases = array(); + + foreach ( prefix_family_sweep_references() as $reference ) { + if ( ! isset( $base_set[ rtrim( $reference, ';' ) ] ) ) { + continue; + } + + $full_reference = '&' . 
$reference; + for ( $split = 1; $split < strlen( $full_reference ); $split++ ) { + foreach ( prefix_family_sweep_followers() as $follower ) { + $cases[] = array( + 'reference' => $full_reference, + 'split' => $split, + 'follower' => $follower, + ); + } + } + } + + return $cases; +} + +/** + * @return array + */ +function token_map_sweep_cases(): array { + $method = new \ReflectionMethod( Generator::class, 'token_map_sweep_cases' ); + $method->setAccessible( true ); + return $method->invoke( null ); +} + +/** + * @return string[] + */ +function numeric_boundary_sweep_cases(): array { + $cases = array(); + foreach ( array( 'decimal', 'hex-lower', 'hex-upper', 'hex-mixed' ) as $kind ) { + $is_decimal = 'decimal' === $kind; + $max_digits = $is_decimal ? 7 : 6; + foreach ( array( $max_digits, $max_digits + 1 ) as $digit_count ) { + foreach ( array( false, true ) as $leading_zero ) { + foreach ( array( false, true ) as $semicolon ) { + $cases[] = numeric_boundary_reference( $kind, $digit_count, $leading_zero, $semicolon ); + } + } + } + } + + return array_values( array_unique( $cases ) ); +} + +function numeric_boundary_reference( string $kind, int $digit_count, bool $leading_zero, bool $semicolon ): string { + if ( 'decimal' === $kind ) { + $prefix = '&#'; + $digits = 7 === $digit_count ? '1114111' : substr( str_repeat( '9', $digit_count ), 0, $digit_count ); + } else { + $prefix = 'hex-upper' === $kind ? '&#X' : '&#x'; + $digits = 6 === $digit_count ? '10ffee' : substr( str_repeat( 'abcdef', (int) ceil( $digit_count / 6 ) ), 0, $digit_count ); + if ( 'hex-upper' === $kind ) { + $digits = strtoupper( $digits ); + } elseif ( 'hex-mixed' === $kind ) { + $chars = str_split( $digits ); + foreach ( $chars as $i => $char ) { + if ( 0 === $i % 2 ) { + $chars[ $i ] = strtoupper( $char ); + } + } + $digits = implode( '', $chars ); + } + } + + if ( $leading_zero ) { + $digits = '0' . $digits; + } + + return $prefix . $digits . ( $semicolon ? 
';' : '' ); +} + +/** + * @return array{base: string, significant_digits: int, leading_zero: bool, semicolon: bool, mixed_hex: bool} + */ +function numeric_boundary_shape( string $payload ): array { + if ( 1 !== preg_match( '/^&#(?:(x|X)([0-9A-Fa-f]+)|([0-9]+))(;?)$/', $payload, $match ) ) { + return array( + 'base' => 'invalid', + 'significant_digits' => 0, + 'leading_zero' => false, + 'semicolon' => false, + 'mixed_hex' => false, + ); + } + + $is_hex = '' !== ( $match[1] ?? '' ); + $digits = $is_hex ? $match[2] : $match[3]; + $significant = substr( $digits, strspn( $digits, '0' ) ); + $letters = preg_replace( '/[^A-Fa-f]/', '', $digits ); + + return array( + 'base' => $is_hex ? 'hex' : 'decimal', + 'significant_digits' => strlen( $significant ), + 'leading_zero' => strlen( $digits ) > strlen( $significant ), + 'semicolon' => ';' === ( $match[4] ?? '' ), + 'mixed_hex' => $is_hex && '' !== $letters && strtolower( $letters ) !== $letters && strtoupper( $letters ) !== $letters, + ); +} + +/** + * @return string[] + */ +function attribute_prefix_smoke_targets(): array { + return array( + 'javascript:', + 'JaVaScRiPt:', + 'http://', + 'https://', + 'mailto:user@example.com', + 'data:text/plain,', + 'urn:wp:html5:', + 'ftp://', + ); +} + +/** + * @return string[] + */ +function attribute_prefix_encoding_forms( string $payload ): array { + $forms = array(); + + if ( '' !== $payload && '&' !== $payload[0] ) { + $forms['literal'] = true; + } + if ( 1 === preg_match( '/&#[1-9][0-9]*;?/', $payload ) ) { + $forms['decimal'] = true; + } + if ( 1 === preg_match( '/�+[0-9]+;?/', $payload ) ) { + $forms['leading-zero'] = true; + } + if ( 1 === preg_match( '/&#[xX][0-9A-Fa-f]+;?/', $payload ) ) { + $forms['hex'] = true; + } + if ( 1 === preg_match( '/(?:&#[0-9]+(?:$|[^0-9;])|&#[xX][0-9A-Fa-f]+(?:$|[^0-9A-Fa-f;]))/', $payload ) ) { + $forms['semicolonless'] = true; + } + + return array_keys( $forms ); +} + +/** + * @return string[] + */ +function expected_weighted_strategies(): 
array {
	return array(
		'adjacency',
		'attribute-discriminator',
		'attribute-prefix',
		'case-mangled-name',
		'composition',
		'lookalike',
		'multibyte-around',
		'named-exact',
		'named-missing-semi',
		'numeric',
		'plain-no-amp',
		'reference-at-eof',
		'truncation-sweep',
	);
}

/**
 * Lists the strategy labels the corpus-mutation lane is expected to emit.
 *
 * @return string[]
 */
function expected_corpus_strategies(): array {
	return array(
		'corpus-byte-perturb',
		'corpus-reference-duplication',
		'corpus-semicolon-toggle',
		'corpus-splice',
	);
}

/**
 * Returns the generator's corpus seed payloads.
 *
 * Calls the static `Generator::corpus_payloads()` through reflection
 * (presumably because it is not public).
 *
 * @return string[]
 */
function corpus_seed_payloads(): array {
	$method = new \ReflectionMethod( Generator::class, 'corpus_payloads' );
	$method->setAccessible( true );
	return $method->invoke( null );
}

/**
 * Precomputes lookup tables for classifying edit-distance-1 lookalike names.
 *
 * - base_set: every base name.
 * - delete: names formed by removing one byte from a base name, excluding
 *   results that are empty or are themselves base names.
 * - substitution: per name length, each base name with one byte replaced by
 *   "\0" as a wildcard.
 * - transpose: names formed by swapping two adjacent, different bytes of a
 *   base name, excluding results that are themselves base names.
 *
 * @param string[] $base_names
 * @return array{base_set: array<string, true>, delete: array<string, true>, substitution: array<int, array<string, true>>, transpose: array<string, true>}
 */
function lookalike_mutation_indexes( array $base_names ): array {
	$base_set              = array_fill_keys( $base_names, true );
	$delete_mutants        = array();
	$substitution_patterns = array();
	$transpose_mutants     = array();

	foreach ( $base_names as $base ) {
		$length = strlen( $base );
		for ( $i = 0; $i < $length; $i++ ) {
			$delete = substr( $base, 0, $i ) . substr( $base, $i + 1 );
			if ( '' !== $delete && ! isset( $base_set[ $delete ] ) ) {
				$delete_mutants[ $delete ] = true;
			}

			// "\0" marks the wildcard position; patterns are bucketed by name length.
			$substitution_patterns[ $length ][ substr( $base, 0, $i ) . "\0" . substr( $base, $i + 1 ) ] = true;
		}

		for ( $i = 0; $i < $length - 1; $i++ ) {
			// Swapping two equal bytes would reproduce the base name.
			if ( $base[ $i ] === $base[ $i + 1 ] ) {
				continue;
			}
			$transpose = substr( $base, 0, $i ) . $base[ $i + 1 ] . $base[ $i ] . substr( $base, $i + 2 );
			if ( ! isset( $base_set[ $transpose ] ) ) {
				$transpose_mutants[ $transpose ] = true;
			}
		}
	}

	return array(
		'base_set'     => $base_set,
		'delete'       => $delete_mutants,
		'substitution' => $substitution_patterns,
		'transpose'    => $transpose_mutants,
	);
}

/**
 * Lists every edit-distance-1 class a candidate name falls into.
 *
 * Returns an empty list for the empty string and for exact base names.
 * Otherwise the result may contain 'delete', 'insert', 'substitute' and
 * 'transpose', in that order.
 *
 * @param array{base_set: array<string, true>, delete: array<string, true>, substitution: array<int, array<string, true>>, transpose: array<string, true>} $indexes
 * @return string[]
 */
function lookalike_candidate_classes( string $candidate, array $indexes ): array {
	if ( '' === $candidate || isset( $indexes['base_set'][ $candidate ] ) ) {
		return array();
	}

	$classes = array();
	if ( isset( $indexes['delete'][ $candidate ] ) ) {
		$classes['delete'] = true;
	}

	// The candidate is an insertion if removing any one byte yields a base name.
	$length = strlen( $candidate );
	for ( $i = 0; $i < $length; $i++ ) {
		$shorter = substr( $candidate, 0, $i ) . substr( $candidate, $i + 1 );
		if ( isset( $indexes['base_set'][ $shorter ] ) ) {
			$classes['insert'] = true;
			break;
		}
	}

	// The candidate is a substitution if wildcarding any one byte matches a base pattern.
	$substitution_patterns = $indexes['substitution'][ $length ] ?? array();
	for ( $i = 0; $i < $length; $i++ ) {
		$pattern = substr( $candidate, 0, $i ) . "\0" . substr( $candidate, $i + 1 );
		if ( isset( $substitution_patterns[ $pattern ] ) ) {
			$classes['substitute'] = true;
			break;
		}
	}

	if ( isset( $indexes['transpose'][ $candidate ] ) ) {
		$classes['transpose'] = true;
	}

	return array_keys( $classes );
}

/**
 * Names the single edit that turns `$base` into `$candidate`, if there is one.
 *
 * @param string $candidate Possible lookalike.
 * @param string $base      Name it is compared against.
 * @return string|null 'delete', 'insert', 'substitute' or 'transpose'; null
 *                     when the strings are equal or differ by more than one
 *                     such edit.
 */
function sparse_lookalike_operation( string $candidate, string $base ): ?string {
	$candidate_length = strlen( $candidate );
	$base_length      = strlen( $base );

	if ( $candidate_length === $base_length - 1 ) {
		for ( $i = 0; $i < $base_length; $i++ ) {
			if ( substr( $base, 0, $i ) . substr( $base, $i + 1 ) === $candidate ) {
				return 'delete';
			}
		}
	}

	if ( $candidate_length === $base_length + 1 ) {
		for ( $i = 0; $i < $candidate_length; $i++ ) {
			if ( substr( $candidate, 0, $i ) . substr( $candidate, $i + 1 ) === $base ) {
				return 'insert';
			}
		}
	}

	if ( $candidate_length !== $base_length ) {
		return null;
	}

	// Same length: collect the byte offsets at which the two strings differ.
	$diffs = array();
	for ( $i = 0; $i < $base_length; $i++ ) {
		if ( $candidate[ $i ] !== $base[ $i ] ) {
			$diffs[] = $i;
		}
	}

	if ( 1 === count( $diffs ) ) {
		return 'substitute';
	}

	// Two adjacent differing offsets whose bytes are swapped form a transposition.
	if (
		2 === count( $diffs ) &&
		$diffs[1] === $diffs[0] + 1 &&
		$candidate[ $diffs[0] ] === $base[ $diffs[1] ] &&
		$candidate[ $diffs[1] ] === $base[ $diffs[0] ]
	) {
		return 'transpose';
	}

	return null;
}

// Broken target: rewrites U+20AC back to U+0080 in decoder output, which should surface as a decode mismatch.
$seen = broken_run(
	$oracles,
	$real_targets,
	array(
		'decode_text'      => static fn( string $text ): string => str_replace( "\u{20AC}", "\u{0080}", \WP_HTML_Decoder::decode_text_node( $text ) ),
		'decode_attribute' => static fn( string $text ): string => str_replace( "\u{20AC}", "\u{0080}", \WP_HTML_Decoder::decode_attribute( $text ) ),
	)
);
check( 'catches decoder skipping C1 remap', in_array( 'decode-mismatch', $seen, true ), implode( ',', $seen ) );

// Broken target: attribute decoding and reading are routed through the text/data rules.
$seen = broken_run(
	$oracles,
	$real_targets,
	array(
		'decode_attribute'         => static fn( string $text ): string => \WP_HTML_Decoder::decode_text_node( $text ),
		'read_character_reference' => static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
			return \WP_HTML_Decoder::read_character_reference( 'attribute' === $context ?
'data' : $context, $text, $at, $match_byte_length );
		},
	)
);
check( 'catches semicolonless refs decoded in attributes', in_array( 'decode-mismatch', $seen, true ), implode( ',', $seen ) );

// Broken target: reports a match length one byte longer than the real reader does.
$seen = broken_run(
	$oracles,
	$real_targets,
	array(
		'read_character_reference' => static function ( string $context, string $text, int $at, &$match_byte_length = null ): ?string {
			$result = \WP_HTML_Decoder::read_character_reference( $context, $text, $at, $match_byte_length );
			if ( null !== $result ) {
				++$match_byte_length;
			}
			return $result;
		},
	)
);
check(
	'catches off-by-one match length',
	in_array( 'reader-decode-mismatch', $seen, true ) || in_array( 'reader-overran-input', $seen, true ),
	implode( ',', $seen )
);

/*
 * Named fault targets.
 *
 * NOTE(review): the payload literals below were found with their character
 * references already decoded ('a&b', a raw U+FFFD, a raw euro sign, ...),
 * which leaves nothing for the named faults to act on. They are restored to
 * encoded references here; confirm the exact spellings against the upstream
 * file.
 */
$seen = fault_run( $oracles, 'reader-empty-chunk', 'a&amp;b' );
check( 'fault target reader-empty-chunk exposes empty chunks', in_array( 'reader-returned-empty-chunk', $seen, true ), implode( ',', $seen ) );

$seen = fault_run( $oracles, 'reader-short-match-length', 'a&amp;b' );
check( 'fault target reader-short-match-length exposes one-byte matches', in_array( 'reader-match-too-short', $seen, true ), implode( ',', $seen ) );

$seen = fault_run( $oracles, 'reader-substring-composition' );
check( 'fault target reader-substring-composition exposes local-reader mismatches', in_array( 'reader-composition-mismatch', $seen, true ), implode( ',', $seen ) );

// "&bogus;" is deliberately not a known reference, so the reader returns null here.
$seen = fault_run( $oracles, 'reader-null-mutates-match-length', 'a&bogus;b' );
check( 'fault target reader-null-mutates-match-length exposes null match-length mutation', in_array( 'reader-mutated-match-length-on-null', $seen, true ), implode( ',', $seen ) );

$seen = fault_run( $oracles, 'reader-non-amp-match', 'a&amp;b' );
check( 'fault target reader-non-amp-match exposes non-amp reader matches', in_array( 'reader-non-amp-match', $seen, true ), implode( ',', $seen ) );

$seen = fault_run( $oracles, 'reader-gapless-drop-span', 'a&amp;b' );
check( 'fault target reader-gapless-drop-span exposes non-gapless reader walks', in_array( 'reader-walk-not-gapless', $seen, true ), implode( ',', $seen ) );

// U+110000 is above the Unicode range, so this reference must decode to U+FFFD.
$seen = fault_run( $oracles, 'numeric-invalid-not-replacement', 'a&#x110000;b' );
check( 'fault target numeric-invalid-not-replacement exposes invalid numeric replacements', in_array( 'numeric-invalid-not-replacement', $seen, true ), implode( ',', $seen ) );

// "&#x80;" is a C1 code point that HTML remaps to U+20AC.
$seen = fault_run( $oracles, 'numeric-c1-not-remapped', 'a&#x80;b' );
check( 'fault target numeric-c1-not-remapped exposes skipped numeric C1 remaps', in_array( 'numeric-c1-not-remapped', $seen, true ), implode( ',', $seen ) );

$seen = fault_run_without_oracle( $oracles, 'raw-c1-not-pass-through', "\x80\x9F" );
check( 'fault target raw-c1-not-pass-through exposes raw C1 byte rewrites', in_array( 'raw-c1-not-pass-through', $seen, true ), implode( ',', $seen ) );

$seen = fault_run( $oracles, 'text-secondary-oracle', 'a&amp;b', 'text' );
check( 'fault target text-secondary-oracle exposes secondary text-oracle mismatches', in_array( 'text-secondary-oracle-mismatch', $seen, true ), implode( ',', $seen ) );

$seen = fault_run( $oracles, 'text-secondary-oracle', 'a&#x80;b', 'text' );
check( 'secondary text oracle skips numeric references unsupported by html_entity_decode', ! in_array( 'text-secondary-oracle-mismatch', $seen, true ), implode( ',', $seen ) );

// "&AElig" is a legacy (semicolon-optional) name followed by extra letters.
$seen = fault_run( $oracles, 'text-secondary-oracle', 'a&AEliglater;b', 'text' );
check( 'secondary text oracle skips unknown names with legacy prefixes', ! in_array( 'text-secondary-oracle-mismatch', $seen, true ), implode( ',', $seen ) );

// "&amp;amp;" must decode once to "&amp;" and stop; a second pass would yield "&".
$single_level_failures = $checks->run( 'both', '&amp;amp;' );
check( 'single-level decode keeps nested ampersand reference literal', array() === $single_level_failures, json_encode( $single_level_failures ) );

$seen = fault_run( $oracles, 'single-level-overdecode', '&amp;amp;', 'text' );
check( 'fault target single-level-overdecode exposes text double decodes', in_array( 'single-level-decode-overdecoded', $seen, true ), implode( ',', $seen ) );

$seen = fault_run( $oracles, 'single-level-overdecode', '&amp;amp;', 'attribute' );
check( 'fault target single-level-overdecode exposes attribute double decodes', in_array( 'single-level-decode-overdecoded', $seen, true ), implode( ',', $seen ) );

// Primary oracle and target agree on the same wrong text, so only the secondary oracle can object.
$wrong_text            = '!a&b';
$wrong_primary_oracles = new class( $wrong_text ) extends Oracles {
	private string $wrong_text;

	public function __construct( string $wrong_text ) {
		$this->wrong_text = $wrong_text;
	}

	public function decode( string $context, string $payload ): string {
		if ( 'text' === $context ) {
			return $this->wrong_text;
		}

		return parent::decode( $context, $payload );
	}
};
$wrong_agreement_checks = new Checks(
	$wrong_primary_oracles,
	array_merge(
		$real_targets,
		array(
			'decode_text' => static fn( string $text ): string => $wrong_text,
		)
	)
);
$wrong_agreement_seen = array();
foreach ( $wrong_agreement_checks->run( 'text', 'a&amp;b' ) as $failure ) {
	$wrong_agreement_seen[ $failure['check'] ] = true;
}
check(
	'secondary text oracle catches primary and target agreement on wrong text',
	isset( $wrong_agreement_seen['text-secondary-oracle-mismatch'] ) &&
	!
isset( $wrong_agreement_seen['decode-mismatch'] ),
	implode( ',', array_keys( $wrong_agreement_seen ) )
);

// Broken target: prepends "!" whenever the attribute value contains no ampersand.
$seen = broken_run(
	$oracles,
	$real_targets,
	array(
		'decode_attribute' => static function ( string $text ): string {
			$decoded = \WP_HTML_Decoder::decode_attribute( $text );
			return str_contains( $text, '&' ) ? $decoded : '!' . $decoded;
		},
	)
);
check( 'catches attribute no-amp identity violations in oracle mode', in_array( 'attribute-without-ampersand-not-identity', $seen, true ), implode( ',', $seen ) );

$seen = fault_run( $oracles, 'attribute-no-amp-identity', 'plain' );
check( 'fault target attribute-no-amp-identity exposes attribute no-amp identity violations', in_array( 'attribute-without-ampersand-not-identity', $seen, true ), implode( ',', $seen ) );

// Broken target: claims a match whenever the raw haystack is shorter than the search string.
$seen = broken_run(
	$oracles,
	$real_targets,
	array(
		'attribute_starts_with' => static function ( string $haystack, string $search, string $case_sensitivity ): bool {
			unset( $case_sensitivity );
			return '' === $search || strlen( $haystack ) < strlen( $search ) || str_starts_with( \WP_HTML_Decoder::decode_attribute( $haystack ), $search );
		},
	)
);
check( 'catches partial-prefix attribute matcher', in_array( 'attribute-starts-with-mismatch', $seen, true ), implode( ',', $seen ) );

/*
 * Broken target: rejects a search that ends inside the second code point of
 * the two-code-point replacement for "&nvlt;" (U+003C U+20D2, whose UTF-8
 * form begins "<\xE2").
 *
 * NOTE(review): the haystack literal was found already decoded; it is
 * restored to the encoded reference the raw haystack would carry. Confirm
 * against the upstream file.
 */
$seen = broken_run(
	$oracles,
	$real_targets,
	array(
		'attribute_starts_with' => static function ( string $haystack, string $search, string $case_sensitivity ) use ( $real_targets ): bool {
			if ( str_starts_with( $haystack, '&nvlt;' ) && "<\xE2" === $search ) {
				return false;
			}
			return $real_targets['attribute_starts_with']( $haystack, $search, $case_sensitivity );
		},
	)
);
check( 'catches partial multi-code-point attribute matcher', in_array( 'attribute-starts-with-mismatch', $seen, true ), implode( ',', $seen ) );

// Broken target: rejects the prefix "jav" even when longer searches match.
$seen = broken_run(
	$oracles,
	$real_targets,
	array(
		'attribute_starts_with' => static function ( string $haystack, string $search, string $case_sensitivity ) use ( $real_targets ): bool {
			if ( 'jav' === $search ) {
				return false;
			}
			return $real_targets['attribute_starts_with']( $haystack, $search, $case_sensitivity );
		},
	)
);
check( 'catches attribute_starts_with prefix monotonicity violations', in_array( 'attribute-starts-with-prefix-monotonicity', $seen, true ), implode( ',', $seen ) );

// Broken target: accepts any search that ends in the byte 0x7F.
$seen = broken_run(
	$oracles,
	$real_targets,
	array(
		'attribute_starts_with' => static function ( string $haystack, string $search, string $case_sensitivity ) use ( $real_targets ): bool {
			if ( str_ends_with( $search, "\x7F" ) ) {
				return true;
			}
			return $real_targets['attribute_starts_with']( $haystack, $search, $case_sensitivity );
		},
	)
);
check( 'catches attribute_starts_with extension monotonicity violations', in_array( 'attribute-starts-with-extension-monotonicity', $seen, true ), implode( ',', $seen ) );

// Broken target: rejects "jav" only in ASCII-case-insensitive mode.
$seen = broken_run(
	$oracles,
	$real_targets,
	array(
		'attribute_starts_with' => static function ( string $haystack, string $search, string $case_sensitivity ) use ( $real_targets ): bool {
			if ( 'ascii-case-insensitive' === $case_sensitivity && 'jav' === $search ) {
				return false;
			}
			return $real_targets['attribute_starts_with']( $haystack, $search, $case_sensitivity );
		},
	)
);
check( 'catches attribute_starts_with case monotonicity violations', in_array( 'attribute-starts-with-case-monotonicity', $seen, true ), implode( ',', $seen ) );

// Each named fault must surface as its matching monotonicity check.
$attribute_faults = array(
	'attribute-prefix-monotonicity'    => 'attribute-starts-with-prefix-monotonicity',
	'attribute-extension-monotonicity' => 'attribute-starts-with-extension-monotonicity',
	'attribute-case-monotonicity'      => 'attribute-starts-with-case-monotonicity',
);
foreach ( $attribute_faults as $fault => $expected_check ) {
	$seen = fault_run( $oracles, $fault );
	check( "fault target {$fault} exposes {$expected_check}", in_array( $expected_check, $seen, true ), implode( ',', $seen ) );
}

// NOTE(review): payload restored from its decoded form to the "&nvlt;" reference; confirm upstream.
$seen = fault_run( $oracles, 'attribute-multicodepoint-prefix', '&nvlt;tail' );
check( 'fault target attribute-multicodepoint-prefix exposes partial replacement prefixes', in_array( 'attribute-starts-with-mismatch', $seen, true ), implode( ',', $seen ) );

// Broken targets: strip NUL bytes from decoder output, which breaks the no-ampersand identity.
$seen = broken_oracle_free_run(
	$oracles,
	$real_targets,
	array(
		'decode_text'      => static fn( string $text ): string => str_replace( "\x00", '', \WP_HTML_Decoder::decode_text_node( $text ) ),
		'decode_attribute' => static fn( string $text ): string => str_replace( "\x00", '', \WP_HTML_Decoder::decode_attribute( $text ) ),
	)
);
check(
	'catches oracle-free no-amp byte identity violations',
	in_array( 'text-without-ampersand-not-identity', $seen, true ) &&
	in_array( 'attribute-without-ampersand-not-identity', $seen, true ),
	implode( ',', $seen )
);

$names = Bootstrap::named_reference_names();
check( 'uses generated named-reference map', count( $names ) > 2000, (string) count( $names ) );

// Two generators built from the same seed string must produce the same case.
$a = ( new Generator( new Prng( '7:3' ), 4096, $names ) )->generate();
$b = ( new Generator( new Prng( '7:3' ), 4096, $names ) )->generate();
check( 'generator deterministic for (seed, case)', $a === $b );

// The order of the injected name list must not change what a given seed generates.
$custom_names          = array( 'zz;', 'amp;', 'LongName;', 'abc', 'copy', 'z' );
$reversed_custom_names = array_reverse( $custom_names );
$order_stable_error    = '';
for ( $i = 0; $i < 80; $i++ ) {
	$ordered_generator  = new Generator( new Prng( "order-stable:{$i}" ), 4096, $custom_names );
	$reversed_generator = new Generator( new Prng( "order-stable:{$i}" ), 4096, $reversed_custom_names );
	if ( $ordered_generator->generate() !== $reversed_generator->generate() ) {
		$order_stable_error = "weighted case {$i}";
		break;
	}

	$ordered_sweep  = new Generator( new Prng( "order-stable-name:{$i}" ), 4096, $custom_names );
	$reversed_sweep = new Generator( new Prng( "order-stable-name:{$i}" ), 4096, $reversed_custom_names );
	if ( $ordered_sweep->generate_name_sweep( $i ) !== $reversed_sweep->generate_name_sweep(
$i ) ) { + $order_stable_error = "name sweep case {$i}"; + break; + } + + $ordered_legacy = new Generator( new Prng( "order-stable-legacy:{$i}" ), 4096, $custom_names ); + $reversed_legacy = new Generator( new Prng( "order-stable-legacy:{$i}" ), 4096, $reversed_custom_names ); + if ( $ordered_legacy->generate_legacy_follower_sweep( $i ) !== $reversed_legacy->generate_legacy_follower_sweep( $i ) ) { + $order_stable_error = "legacy follower case {$i}"; + break; + } +} +check( 'generator sorts injected named-reference lists deterministically', '' === $order_stable_error, $order_stable_error ); + +$name_sweep_generator = new Generator( new Prng( 'name-sweep' ), 4096, $names ); +$name_sweep_base_names = name_sweep_base_names( $names ); +$name_sweep_followers = array( '', 'x', 'X', '0', '=', '-', ' ', '/', "\u{00E9}" ); +$name_sweep_period = count( $name_sweep_base_names ) * 2 * count( $name_sweep_followers ); +$name_sweep_mismatch = ''; +$name_sweep_contexts = array(); +$name_sweep_strategies = array(); +$name_sweep_unsafe = 0; +for ( $i = 0; $i < $name_sweep_period; $i++ ) { + $generated = $name_sweep_generator->generate_name_sweep( $i ); + $name_sweep_contexts[ $generated['context'] ] = true; + $name_sweep_strategies[ $generated['strategy'] ] = true; + if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) { + ++$name_sweep_unsafe; + } + + if ( '' === $name_sweep_mismatch ) { + $variant = $i % ( 2 * count( $name_sweep_followers ) ); + $expected = '&' . $name_sweep_base_names[ intdiv( $i, 2 * count( $name_sweep_followers ) ) ] . + ( $variant >= count( $name_sweep_followers ) ? ';' : '' ) . + $name_sweep_followers[ $variant % count( $name_sweep_followers ) ]; + if ( $generated['payload'] !== $expected ) { + $name_sweep_mismatch = "case {$i}: expected " . bin2hex( $expected ) . ' got ' . 
bin2hex( $generated['payload'] ); + } + } +} +check( 'name-sweep period covers every base/semicolon/follower case', $name_sweep_generator->name_sweep_period() === $name_sweep_period && $name_sweep_period > count( $names ), (string) $name_sweep_period ); +check( 'name-sweep generator maps cases deterministically', '' === $name_sweep_mismatch, $name_sweep_mismatch ); +check( 'name-sweep cases run both contexts', array( 'both' ) === array_keys( $name_sweep_contexts ), implode( ',', array_keys( $name_sweep_contexts ) ) ); +check( 'name-sweep uses one strategy label', array( 'name-sweep' ) === array_keys( $name_sweep_strategies ), implode( ',', array_keys( $name_sweep_strategies ) ) ); +check( 'name-sweep payloads are oracle-safe', 0 === $name_sweep_unsafe, (string) $name_sweep_unsafe ); + +$legacy_follower_generator = new Generator( new Prng( 'legacy-follower-sweep' ), 4096, $names ); +$legacy_names = array_values( array_filter( $names, static fn( string $name ): bool => ! str_ends_with( $name, ';' ) ) ); +$legacy_followers = legacy_follower_sweep_followers(); +$legacy_period = count( $legacy_names ) * count( $legacy_followers ); +$legacy_mismatch = ''; +$legacy_contexts = array(); +$legacy_strategies = array(); +$legacy_unsafe = 0; +$legacy_seen_names = array(); +$legacy_seen_followers = array(); +$legacy_ascii_followers = array(); +$legacy_utf8_leads = array(); +$legacy_utf8_continuations = array(); +for ( $i = 0; $i < $legacy_period; $i++ ) { + $generated = $legacy_follower_generator->generate_legacy_follower_sweep( $i ); + $name = $legacy_names[ intdiv( $i, count( $legacy_followers ) ) ]; + $follower = $legacy_followers[ $i % count( $legacy_followers ) ]; + $expected = '&' . $name . $follower; + + $legacy_contexts[ $generated['context'] ] = true; + $legacy_strategies[ $generated['strategy'] ] = true; + $legacy_seen_names[ $name ] = true; + $legacy_seen_followers[ $follower ] = true; + if ( ! 
Generator::is_oracle_safe_payload( $generated['payload'] ) ) { + ++$legacy_unsafe; + } + if ( '' === $legacy_mismatch && $expected !== $generated['payload'] ) { + $legacy_mismatch = "case {$i}: expected " . bin2hex( $expected ) . ' got ' . bin2hex( $generated['payload'] ); + } + + if ( 1 === strlen( $follower ) ) { + $legacy_ascii_followers[ ord( $follower ) ] = true; + } else { + $legacy_utf8_leads[ ord( $follower[0] ) ] = true; + for ( $j = 1; $j < strlen( $follower ); $j++ ) { + $legacy_utf8_continuations[ ord( $follower[ $j ] ) ] = true; + } + } +} +$expected_ascii_followers = array_values( + array_filter( + range( 1, 0x7F ), + static fn( int $byte ): bool => ! in_array( $byte, array( 0x0D, 0x22, 0x3C ), true ) + ) +); +check( 'legacy-follower period covers every legacy name and follower', $legacy_follower_generator->legacy_follower_sweep_period() === $legacy_period && count( $legacy_seen_names ) === count( $legacy_names ) && count( $legacy_seen_followers ) === count( $legacy_followers ), (string) $legacy_period ); +check( 'legacy-follower generator maps cases deterministically', '' === $legacy_mismatch, $legacy_mismatch ); +check( 'legacy-follower cases run both contexts', array( 'both' ) === array_keys( $legacy_contexts ), implode( ',', array_keys( $legacy_contexts ) ) ); +check( 'legacy-follower uses one strategy label', array( 'legacy-follower-sweep' ) === array_keys( $legacy_strategies ), implode( ',', array_keys( $legacy_strategies ) ) ); +check( 'legacy-follower payloads are oracle-safe', 0 === $legacy_unsafe, (string) $legacy_unsafe ); +check( 'legacy-follower covers every oracle-safe ASCII follower byte', array() === array_diff( $expected_ascii_followers, array_keys( $legacy_ascii_followers ) ), implode( ',', array_keys( $legacy_ascii_followers ) ) ); +check( 'legacy-follower covers valid UTF-8 lead bytes', array() === array_diff( range( 0xC2, 0xF4 ), array_keys( $legacy_utf8_leads ) ), implode( ',', array_map( static fn( int $byte ): string => dechex( 
$byte ), array_keys( $legacy_utf8_leads ) ) ) ); +check( 'legacy-follower covers UTF-8 continuation bytes', array() === array_diff( range( 0x80, 0xBF ), array_keys( $legacy_utf8_continuations ) ), implode( ',', array_map( static fn( int $byte ): string => dechex( $byte ), array_keys( $legacy_utf8_continuations ) ) ) ); + +$prefix_family_generator = new Generator( new Prng( 'prefix-family-sweep' ), 4096, $names ); +$prefix_family_cases = prefix_family_sweep_cases( $name_sweep_base_names ); +$prefix_family_mismatch = ''; +$prefix_family_contexts = array(); +$prefix_family_strategies = array(); +$prefix_family_unsafe = 0; +$prefix_family_references = array(); +$prefix_family_split_keys = array(); +$prefix_family_followers = array(); +for ( $i = 0; $i < count( $prefix_family_cases ); $i++ ) { + $generated = $prefix_family_generator->generate_prefix_family_sweep( $i ); + $case = $prefix_family_cases[ $i ]; + $expected = substr( $case['reference'], 0, $case['split'] ) . $case['follower']; + + $prefix_family_contexts[ $generated['context'] ] = true; + $prefix_family_strategies[ $generated['strategy'] ] = true; + $prefix_family_references[ $case['reference'] ] = true; + $prefix_family_split_keys[ $case['reference'] . ':' . $case['split'] ] = true; + $prefix_family_followers[ $case['follower'] ] = true; + if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) { + ++$prefix_family_unsafe; + } + if ( '' === $prefix_family_mismatch && $expected !== $generated['payload'] ) { + $prefix_family_mismatch = "case {$i}: expected " . bin2hex( $expected ) . ' got ' . 
bin2hex( $generated['payload'] ); + } +} +$expected_prefix_split_count = 0; +foreach ( array_keys( $prefix_family_references ) as $reference ) { + $expected_prefix_split_count += strlen( $reference ) - 1; +} +check( + 'prefix-family period covers every reference split and follower', + $prefix_family_generator->prefix_family_sweep_period() === count( $prefix_family_cases ) && + array() === array_diff( + array_map( static fn( string $reference ): string => '&' . $reference, prefix_family_sweep_references() ), + array_keys( $prefix_family_references ) + ) && + count( $prefix_family_references ) === count( prefix_family_sweep_references() ) && + count( $prefix_family_split_keys ) === $expected_prefix_split_count && + count( $prefix_family_followers ) === count( prefix_family_sweep_followers() ), + (string) count( $prefix_family_cases ) . ' ' . implode( ',', array_keys( $prefix_family_references ) ) +); +check( 'prefix-family generator maps cases deterministically', '' === $prefix_family_mismatch, $prefix_family_mismatch ); +check( 'prefix-family cases run both contexts', array( 'both' ) === array_keys( $prefix_family_contexts ), implode( ',', array_keys( $prefix_family_contexts ) ) ); +check( 'prefix-family uses one strategy label', array( 'prefix-family-sweep' ) === array_keys( $prefix_family_strategies ), implode( ',', array_keys( $prefix_family_strategies ) ) ); +check( 'prefix-family payloads are oracle-safe', 0 === $prefix_family_unsafe, (string) $prefix_family_unsafe ); +check( 'prefix-family covers expected ambiguous followers', array() === array_diff( prefix_family_sweep_followers(), array_keys( $prefix_family_followers ) ), implode( ',', array_keys( $prefix_family_followers ) ) ); + +$token_map_generator = new Generator( new Prng( 'token-map-sweep' ), 4096, $names ); +$token_map_cases = token_map_sweep_cases(); +$token_map_structure = Bootstrap::named_reference_structure(); +$token_map_minimal_large_names = array_values( + array_filter( + 
$token_map_structure['large_names'], + static fn( string $name ): bool => strlen( $name ) === $token_map_structure['key_length'] + 1 + ) +); +$token_map_large_name_set = array_fill_keys( $token_map_structure['large_names'], true ); +$token_map_small_name_set = array_fill_keys( $token_map_structure['small_names'], true ); +$token_map_mismatch = ''; +$token_map_contexts = array(); +$token_map_strategies = array(); +$token_map_shapes = array(); +$token_map_prefixes = array(); +$token_map_small_exact_names = array(); +$token_map_small_extended_names = array(); +$token_map_large_exact_names = array(); +$token_map_large_extended_names = array(); +$token_map_divergence_errors = array(); +$token_map_unsafe = 0; +$token_map_fault_case_index = null; +for ( $i = 0; $i < count( $token_map_cases ); $i++ ) { + $case = $token_map_cases[ $i ]; + $generated = $token_map_generator->generate_token_map_sweep( $i ); + $shape = $case['shape']; + + $token_map_contexts[ $generated['context'] ] = true; + $token_map_strategies[ $generated['strategy'] ] = true; + $token_map_shapes[ $shape ] = true; + if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) { + ++$token_map_unsafe; + } + if ( '' === $token_map_mismatch && $case['payload'] !== $generated['payload'] ) { + $token_map_mismatch = "case {$i}: expected " . bin2hex( $case['payload'] ) . ' got ' . bin2hex( $generated['payload'] ); + } + + if ( 'large-prefix-divergent' === $shape ) { + $prefix = $case['prefix'] ?? ''; + $token_map_prefixes[ $prefix ] = true; + $payload_name = substr( $case['payload'], 1 ); + $rest = substr( $payload_name, strlen( $prefix ) ); + $first_rest = '' === $rest ? '' : $rest[0]; + $used_first_rest_chars = array(); + foreach ( $token_map_structure['large_names_by_prefix'][ $prefix ] ?? 
array() as $name ) { + $name_rest = substr( $name, strlen( $prefix ) ); + if ( '' !== $name_rest ) { + $used_first_rest_chars[ $name_rest[0] ] = true; + } + } + + if ( + strlen( $prefix ) !== $token_map_structure['key_length'] || + ! str_starts_with( $case['payload'], '&' . $prefix ) || + ! str_ends_with( $case['payload'], ';' ) || + isset( $token_map_large_name_set[ $payload_name ] ) || + isset( $token_map_small_name_set[ $payload_name ] ) || + '' === $first_rest || + isset( $used_first_rest_chars[ $first_rest ] ) + ) { + $token_map_divergence_errors[] = "{$i}:" . bin2hex( $case['payload'] ); + } + } elseif ( 'small-boundary-exact' === $shape ) { + $token_map_small_exact_names[ $case['name'] ?? '' ] = true; + } elseif ( 'small-boundary-extended' === $shape ) { + $token_map_small_extended_names[ $case['name'] ?? '' ] = true; + if ( null === $token_map_fault_case_index ) { + $token_map_fault_case_index = $i; + } + } elseif ( 'large-boundary-exact' === $shape ) { + $token_map_large_exact_names[ $case['name'] ?? '' ] = true; + } elseif ( 'large-boundary-extended' === $shape ) { + $token_map_large_extended_names[ $case['name'] ?? 
'' ] = true; + } +} +$expected_token_map_shapes = array( + 'large-prefix-divergent', + 'small-boundary-exact', + 'small-boundary-extended', + 'large-boundary-exact', + 'large-boundary-extended', +); +check( + 'token-map structure exposes two-byte large-word prefixes', + 2 === $token_map_structure['key_length'] && + count( $token_map_structure['group_prefixes'] ) > 0 && + count( $token_map_structure['group_prefixes'] ) === count( $token_map_structure['large_names_by_prefix'] ), + json_encode( + array( + 'key_length' => $token_map_structure['key_length'], + 'prefixes' => count( $token_map_structure['group_prefixes'] ), + ) + ) +); +check( + 'token-map period covers prefix divergences and boundary names', + $token_map_generator->token_map_period() === count( $token_map_cases ) && + array() === array_diff( $token_map_structure['group_prefixes'], array_keys( $token_map_prefixes ) ) && + count( $token_map_prefixes ) === count( $token_map_structure['group_prefixes'] ) && + array() === array_diff( $token_map_structure['small_names'], array_keys( $token_map_small_exact_names ) ) && + array() === array_diff( $token_map_structure['small_names'], array_keys( $token_map_small_extended_names ) ) && + array() === array_diff( $token_map_minimal_large_names, array_keys( $token_map_large_exact_names ) ) && + array() === array_diff( $token_map_minimal_large_names, array_keys( $token_map_large_extended_names ) ), + (string) count( $token_map_cases ) +); +check( 'token-map generator maps cases deterministically', '' === $token_map_mismatch, $token_map_mismatch ); +check( 'token-map cases run both contexts', array( 'both' ) === array_keys( $token_map_contexts ), implode( ',', array_keys( $token_map_contexts ) ) ); +check( 'token-map uses one strategy label', array( 'token-map-structure-sweep' ) === array_keys( $token_map_strategies ), implode( ',', array_keys( $token_map_strategies ) ) ); +check( 'token-map payloads are oracle-safe', 0 === $token_map_unsafe, (string) $token_map_unsafe 
); +check( + 'token-map emits expected structure-aware shapes', + array() === array_diff( $expected_token_map_shapes, array_keys( $token_map_shapes ) ) && + array() === array_diff( array_keys( $token_map_shapes ), $expected_token_map_shapes ), + implode( ',', array_keys( $token_map_shapes ) ) +); +check( 'token-map large-prefix probes diverge after the shared map prefix', array() === $token_map_divergence_errors, implode( ',', $token_map_divergence_errors ) ); +check( 'token-map has semicolonless boundary fault case', null !== $token_map_fault_case_index, json_encode( $token_map_cases ) ); + +$numeric_boundary_generator = new Generator( new Prng( 'numeric-boundary-sweep' ), 4096, $names ); +$numeric_boundary_cases = numeric_boundary_sweep_cases(); +$numeric_boundary_mismatch = ''; +$numeric_boundary_contexts = array(); +$numeric_boundary_strategies = array(); +$numeric_boundary_unsafe = 0; +$numeric_boundary_shapes = array(); +$numeric_boundary_mixed_hex = false; +$numeric_boundary_exact_max_replacements = array(); +$numeric_boundary_overflow_non_replacements = array(); +for ( $i = 0; $i < count( $numeric_boundary_cases ); $i++ ) { + $generated = $numeric_boundary_generator->generate_numeric_boundary_sweep( $i ); + $expected = $numeric_boundary_cases[ $i ]; + $shape = numeric_boundary_shape( $generated['payload'] ); + $decoded = \WP_HTML_Decoder::decode_text_node( $generated['payload'] ); + + $numeric_boundary_contexts[ $generated['context'] ] = true; + $numeric_boundary_strategies[ $generated['strategy'] ] = true; + if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) { + ++$numeric_boundary_unsafe; + } + if ( '' === $numeric_boundary_mismatch && $expected !== $generated['payload'] ) { + $numeric_boundary_mismatch = "case {$i}: expected {$expected} got {$generated['payload']}"; + } + $numeric_boundary_shapes[] = $shape['base'] . ':' . $shape['significant_digits'] . ':' . ( $shape['leading_zero'] ? 'zero' : 'plain' ) . ':' . ( $shape['semicolon'] ? 
'semi' : 'nosemi' ); + $numeric_boundary_mixed_hex = $numeric_boundary_mixed_hex || $shape['mixed_hex']; + if ( ( 'decimal' === $shape['base'] && 7 === $shape['significant_digits'] ) || ( 'hex' === $shape['base'] && 6 === $shape['significant_digits'] ) ) { + if ( "\u{FFFD}" === $decoded ) { + $numeric_boundary_exact_max_replacements[] = $generated['payload']; + } + } elseif ( ( 'decimal' === $shape['base'] && 8 === $shape['significant_digits'] ) || ( 'hex' === $shape['base'] && 7 === $shape['significant_digits'] ) ) { + if ( "\u{FFFD}" !== $decoded ) { + $numeric_boundary_overflow_non_replacements[] = $generated['payload'] . ':' . bin2hex( $decoded ); + } + } +} +$expected_numeric_boundary_shapes = array(); +foreach ( array( 'decimal' => 7, 'hex' => 6 ) as $base => $max_digits ) { + foreach ( array( $max_digits, $max_digits + 1 ) as $digit_count ) { + foreach ( array( 'plain', 'zero' ) as $zero ) { + foreach ( array( 'nosemi', 'semi' ) as $semicolon ) { + $expected_numeric_boundary_shapes[] = "{$base}:{$digit_count}:{$zero}:{$semicolon}"; + } + } + } +} +check( 'numeric-boundary period covers digit count, leading zero, and semicolon variants', $numeric_boundary_generator->numeric_boundary_sweep_period() === count( $numeric_boundary_cases ) && array() === array_diff( $expected_numeric_boundary_shapes, array_unique( $numeric_boundary_shapes ) ), implode( ',', array_unique( $numeric_boundary_shapes ) ) ); +check( 'numeric-boundary period keeps decimal and hex casing variants distinct', 32 === count( $numeric_boundary_cases ), (string) count( $numeric_boundary_cases ) ); +check( 'numeric-boundary exact-max digit cases stay in Unicode range', array() === $numeric_boundary_exact_max_replacements, implode( ',', $numeric_boundary_exact_max_replacements ) ); +check( 'numeric-boundary max-plus-one digit cases decode as invalid', array() === $numeric_boundary_overflow_non_replacements, implode( ',', $numeric_boundary_overflow_non_replacements ) ); +check( 'numeric-boundary 
generator maps cases deterministically', '' === $numeric_boundary_mismatch, $numeric_boundary_mismatch ); +check( 'numeric-boundary cases run both contexts', array( 'both' ) === array_keys( $numeric_boundary_contexts ), implode( ',', array_keys( $numeric_boundary_contexts ) ) ); +check( 'numeric-boundary uses one strategy label', array( 'numeric-boundary-sweep' ) === array_keys( $numeric_boundary_strategies ), implode( ',', array_keys( $numeric_boundary_strategies ) ) ); +check( 'numeric-boundary payloads are oracle-safe', 0 === $numeric_boundary_unsafe, (string) $numeric_boundary_unsafe ); +check( 'numeric-boundary emits mixed-case hex digits', $numeric_boundary_mixed_hex, implode( ',', $numeric_boundary_cases ) ); + +$corpus_period_generator = new Generator( new Prng( 'corpus-period' ), 4096, $names ); +$corpus_strategies = array(); +$corpus_contexts = array(); +$corpus_payloads = array(); +$corpus_unsafe = 0; +for ( $i = 0; $i < 600; $i++ ) { + $generated = ( new Generator( new Prng( "1:{$i}" ), 4096, $names ) )->generate_corpus_mutation( $i ); + $corpus_strategies[ $generated['strategy'] ] = true; + $corpus_contexts[ $generated['context'] ] = true; + $corpus_payloads[ $generated['payload'] ] = true; + if ( ! 
Generator::is_oracle_safe_payload( $generated['payload'] ) ) { + ++$corpus_unsafe; + } +} +$seen_corpus_strategies = array_keys( $corpus_strategies ); +sort( $seen_corpus_strategies ); +$corpus_seed_payloads = corpus_seed_payloads(); +$required_corpus_payloads = array( + 'FOO>BAR', + 'ZZ>9YY', + 'ZZ>aYY', + 'ZZ£_id=23', + 'ZZ∏_id=23', + 'ZZÆ=', +); +check( 'corpus mutation seed corpus includes retained and external vectors', $corpus_period_generator->corpus_period() >= 40, (string) $corpus_period_generator->corpus_period() ); +check( 'corpus seed retains html5lib text and attribute entity vectors', array() === array_diff( $required_corpus_payloads, $corpus_seed_payloads ), implode( ',', array_diff( $required_corpus_payloads, $corpus_seed_payloads ) ) ); +check( 'corpus mutation generator emits every mutation strategy', expected_corpus_strategies() === $seen_corpus_strategies, implode( ',', $seen_corpus_strategies ) ); +check( 'corpus mutation cases run both contexts', array( 'both' ) === array_keys( $corpus_contexts ), implode( ',', array_keys( $corpus_contexts ) ) ); +check( 'corpus mutation payloads are oracle-safe', 0 === $corpus_unsafe, (string) $corpus_unsafe ); +check( 'corpus mutation diversifies retained payload shapes', count( $corpus_payloads ) > 300, (string) count( $corpus_payloads ) ); + +$semicolon_toggle_method = new \ReflectionMethod( Generator::class, 'mutate_corpus_semicolon_toggle' ); +$semicolon_toggle_method->setAccessible( true ); +$duplication_method = new \ReflectionMethod( Generator::class, 'mutate_corpus_reference_duplication' ); +$duplication_method->setAccessible( true ); +$byte_perturb_method = new \ReflectionMethod( Generator::class, 'mutate_corpus_byte_perturb' ); +$byte_perturb_method->setAccessible( true ); +$splice_method = new \ReflectionMethod( Generator::class, 'mutate_corpus_splice' ); +$splice_method->setAccessible( true ); + +check( + 'corpus semicolon toggle adds and removes semicolons', + '&' === 
$semicolon_toggle_method->invoke( new Generator( new Prng( 'corpus-toggle-remove' ), 4096, $names ), '&' ) && + '&' === $semicolon_toggle_method->invoke( new Generator( new Prng( 'corpus-toggle-add' ), 4096, $names ), '&' ) +); +check( + 'corpus reference duplication duplicates matched reference text', + 'x∉∉y' === $duplication_method->invoke( new Generator( new Prng( 'corpus-duplication' ), 4096, $names ), 'x∉y' ) +); +$corpus_utf8_mutation_errors = array(); +for ( $i = 0; $i < 100; $i++ ) { + $byte_payload = $byte_perturb_method->invoke( new Generator( new Prng( "corpus-utf8-byte:{$i}" ), 4096, $names ), "\u{00E9}&\u{2603}" ); + $splice_payload = $splice_method->invoke( + new Generator( new Prng( "corpus-utf8-splice:{$i}" ), 4096, $names ), + "A\u{00E9}B", + array( "\u{2603}&\u{00E9}", 'plain >' ) + ); + if ( ! mb_check_encoding( $byte_payload, 'UTF-8' ) ) { + $corpus_utf8_mutation_errors[] = 'byte:' . $i . ':' . bin2hex( $byte_payload ); + } + if ( ! mb_check_encoding( $splice_payload, 'UTF-8' ) ) { + $corpus_utf8_mutation_errors[] = 'splice:' . $i . ':' . 
bin2hex( $splice_payload ); + } +} +check( 'corpus byte perturb and splice preserve UTF-8 boundaries', array() === $corpus_utf8_mutation_errors, implode( ',', $corpus_utf8_mutation_errors ) ); + +$lookalike_indexes = lookalike_mutation_indexes( $name_sweep_base_names ); +$lookalike_candidates = array(); +for ( $i = 0; $i < 6000; $i++ ) { + $generated = ( new Generator( new Prng( "lookalike-smoke:{$i}" ), 4096, $names ) )->generate(); + if ( 'lookalike' !== $generated['strategy'] ) { + continue; + } + if ( 1 !== preg_match( '/&([A-Za-z0-9]+);?/', $generated['payload'], $match ) ) { + continue; + } + + $candidate = $match[1]; + $classes = lookalike_candidate_classes( $candidate, $lookalike_indexes ); + if ( array() === $classes ) { + continue; + } + + $lookalike_candidates[ $candidate ] = true; +} +check( 'lookalike generator emits edit-distance-1 name misses', count( $lookalike_candidates ) >= 100, (string) count( $lookalike_candidates ) ); + +$sparse_lookalike_names = array( 'abcde;', 'vwxyz' ); +$sparse_lookalike_bases = name_sweep_base_names( $sparse_lookalike_names ); +$sparse_lookalike_classes = array(); +for ( $i = 0; $i < 6000; $i++ ) { + $generated = ( new Generator( new Prng( "lookalike-sparse-smoke:{$i}" ), 4096, $sparse_lookalike_names ) )->generate(); + if ( 'lookalike' !== $generated['strategy'] || 1 !== preg_match( '/&([A-Za-z0-9]+);?/', $generated['payload'], $match ) ) { + continue; + } + + foreach ( $sparse_lookalike_bases as $base ) { + $operation = sparse_lookalike_operation( $match[1], $base ); + if ( null !== $operation ) { + $sparse_lookalike_classes[ $operation ] = true; + break; + } + } +} +check( + 'lookalike generator exercises every edit operation branch', + array() === array_diff( array( 'delete', 'insert', 'substitute', 'transpose' ), array_keys( $sparse_lookalike_classes ) ), + implode( ',', array_keys( $sparse_lookalike_classes ) ) +); + +$case_mangled_candidates = array(); +$case_mangled_invalid = array(); +$base_names_by_lowercase = 
array(); +foreach ( $name_sweep_base_names as $base ) { + $base_names_by_lowercase[ strtolower( $base ) ][] = $base; +} +for ( $i = 0; $i < 8000; $i++ ) { + $generated = ( new Generator( new Prng( "case-mangled-smoke:{$i}" ), 4096, $names ) )->generate(); + if ( 'case-mangled-name' !== $generated['strategy'] ) { + continue; + } + if ( 1 !== preg_match( '/&([A-Za-z0-9]+);/', $generated['payload'], $match ) ) { + continue; + } + + $candidate = $match[1]; + $case_mangled_candidates[ $candidate ] = true; + if ( isset( $lookalike_indexes['base_set'][ $candidate ] ) || ! isset( $base_names_by_lowercase[ strtolower( $candidate ) ] ) ) { + $case_mangled_invalid[] = $candidate; + } +} +check( 'case-mangled generator emits case-only name misses', count( $case_mangled_candidates ) >= 100 && array() === $case_mangled_invalid, implode( ',', array_slice( $case_mangled_invalid, 0, 20 ) ) . ':' . count( $case_mangled_candidates ) ); + +$case_mangle_method = new \ReflectionMethod( Generator::class, 'case_mangle_name_base' ); +$case_mangle_method->setAccessible( true ); +$case_mangle_direct_errors = array(); +for ( $i = 0; $i < 50; $i++ ) { + $lower_mutated = $case_mangle_method->invoke( new Generator( new Prng( "case-mangle-lower:{$i}" ), 4096, $names ), 'amp' ); + $upper_mutated = $case_mangle_method->invoke( new Generator( new Prng( "case-mangle-upper:{$i}" ), 4096, $names ), 'AMP' ); + if ( 'amp' === $lower_mutated || 'amp' !== strtolower( $lower_mutated ) ) { + $case_mangle_direct_errors[] = 'lower:' . $lower_mutated; + } + if ( 'AMP' === $upper_mutated || 'AMP' !== strtoupper( $upper_mutated ) ) { + $case_mangle_direct_errors[] = 'upper:' . 
$upper_mutated; + } +} +check( + 'case-mangle helper flips lowercase and uppercase source letters directly', + array() === $case_mangle_direct_errors, + implode( ',', array_slice( $case_mangle_direct_errors, 0, 20 ) ) +); + +$generator_reflection = new \ReflectionClass( Generator::class ); +$alphabet_constant = $generator_reflection->getReflectionConstant( 'ASCII_ALPHABET' ); +$ascii_alphabet = null === $alphabet_constant ? '' : (string) $alphabet_constant->getValue(); +check( + 'oracle-safe generator alphabet includes space, tab, LF, and FF followers', + str_contains( $ascii_alphabet, ' ' ) && + str_contains( $ascii_alphabet, "\t" ) && + str_contains( $ascii_alphabet, "\n" ) && + str_contains( $ascii_alphabet, "\f" ) && + Generator::is_oracle_safe_payload( $ascii_alphabet ), + bin2hex( $ascii_alphabet ) +); + +$strategies = array(); +$contexts = array(); +$unsafe = 0; +$reference_at_eof = 0; +$reference_at_eof_bad = 0; +$reference_at_eof_shapes = array(); +$attribute_multicodepoint_prefix = 0; +$composition = 0; +$composition_bad_shape = 0; +$composition_multi_reference_fragments = 0; +$total = 1200; +for ( $i = 0; $i < $total; $i++ ) { + $generated = ( new Generator( new Prng( "smoke:{$i}" ), 4096, $names ) )->generate(); + $strategies[ $generated['strategy'] ] = true; + $contexts[ $generated['context'] ] = true; + if ( ! 
Generator::is_oracle_safe_payload( $generated['payload'] ) ) { + ++$unsafe; + } + if ( 'reference-at-eof' === $generated['strategy'] ) { + ++$reference_at_eof; + $shape = reference_at_eof_shape( $generated['payload'] ); + if ( null === $shape ) { + ++$reference_at_eof_bad; + } else { + $reference_at_eof_shapes[ $shape ] = true; + } + } + if ( + 'attribute-prefix' === $generated['strategy'] && + ( + str_starts_with( $generated['payload'], '<⃒' ) || + str_starts_with( $generated['payload'], '>⃒' ) || + str_starts_with( $generated['payload'], '≪̸' ) || + str_starts_with( $generated['payload'], '=⃥' ) + ) + ) { + ++$attribute_multicodepoint_prefix; + } + if ( 'composition' === $generated['strategy'] ) { + ++$composition; + $fragments = explode( '|', $generated['payload'] ); + if ( count( $fragments ) < 2 || count( $fragments ) > 3 || in_array( '', $fragments, true ) ) { + ++$composition_bad_shape; + } + + $reference_fragments = 0; + foreach ( $fragments as $fragment ) { + if ( str_contains( $fragment, '&' ) ) { + ++$reference_fragments; + } + } + if ( $reference_fragments >= 2 ) { + ++$composition_multi_reference_fragments; + } + } +} +$seen_strategies = array_keys( $strategies ); +sort( $seen_strategies ); +check( 'all weighted strategies appear', expected_weighted_strategies() === $seen_strategies, implode( ',', $seen_strategies ) ); +check( 'generated cases run both contexts', array( 'both' ) === array_keys( $contexts ), implode( ',', array_keys( $contexts ) ) ); +check( 'generated payloads are oracle-safe', 0 === $unsafe, (string) $unsafe ); +check( 'attribute-prefix generator emits multi-code-point references', $attribute_multicodepoint_prefix > 0, (string) $attribute_multicodepoint_prefix ); +check( 'composition generator emits 2-3 separated fragments', $composition > 0 && 0 === $composition_bad_shape, "{$composition_bad_shape}/{$composition}" ); +check( 'composition generator splices multiple reference-bearing fragments', $composition_multi_reference_fragments > 
0, "{$composition_multi_reference_fragments}/{$composition}" ); +check( 'reference-at-EOF cases end inside a reference', $reference_at_eof > 0 && 0 === $reference_at_eof_bad, "{$reference_at_eof_bad}/{$reference_at_eof}" ); +check( + 'reference-at-EOF covers expected suffix shapes', + array() === array_diff( + array( 'bare-introducer', 'partial-numeric-introducer', 'decimal-digits', 'hex-digits', 'named-prefix' ), + array_keys( $reference_at_eof_shapes ) + ), + implode( ',', array_keys( $reference_at_eof_shapes ) ) +); + +$small_compositions = 0; +$small_composition_bad = array(); +foreach ( array( 3, 5, 7, 12 ) as $max_bytes ) { + for ( $i = 0; $i < 1200; $i++ ) { + $generated = ( new Generator( new Prng( "composition-small:{$max_bytes}:{$i}" ), $max_bytes, $names ) )->generate(); + if ( 'composition' !== $generated['strategy'] ) { + continue; + } + + ++$small_compositions; + $fragments = explode( '|', $generated['payload'] ); + if ( + strlen( $generated['payload'] ) > $max_bytes || + count( $fragments ) < 2 || + count( $fragments ) > 3 || + in_array( '', $fragments, true ) + ) { + $small_composition_bad[] = "{$max_bytes}:{$i}:" . bin2hex( $generated['payload'] ); + } + } +} +check( 'composition generator keeps small max-bytes fragments nonempty', $small_compositions > 0 && array() === $small_composition_bad, implode( ',', $small_composition_bad ) ); + +$attribute_prefix_targets = array(); +$attribute_prefix_forms = array(); +$attribute_prefix_bad_targets = array(); +for ( $i = 0; $i < 8000; $i++ ) { + $generated = ( new Generator( new Prng( "attribute-prefix-smoke:{$i}" ), 4096, $names ) )->generate(); + if ( 'attribute-prefix' !== $generated['strategy'] ) { + continue; + } + + $decoded = $oracles->decode( 'attribute', $generated['payload'] ); + foreach ( attribute_prefix_smoke_targets() as $target ) { + if ( ! 
str_starts_with( $decoded, $target ) ) { + continue; + } + + $attribute_prefix_targets[ $target ] = true; + foreach ( attribute_prefix_encoding_forms( $generated['payload'] ) as $form ) { + $attribute_prefix_forms[ $form ] = true; + } + if ( ! \WP_HTML_Decoder::attribute_starts_with( $generated['payload'], $target, 'case-sensitive' ) ) { + $attribute_prefix_bad_targets[] = $target . ':' . bin2hex( substr( $generated['payload'], 0, 64 ) ); + } + break; + } +} +check( + 'attribute-prefix encoder covers every target string', + array() === array_diff( attribute_prefix_smoke_targets(), array_keys( $attribute_prefix_targets ) ), + implode( ',', array_keys( $attribute_prefix_targets ) ) +); +check( + 'attribute-prefix encoder covers literal, numeric, zero, hex, and semicolonless forms', + array() === array_diff( array( 'literal', 'decimal', 'leading-zero', 'hex', 'semicolonless' ), array_keys( $attribute_prefix_forms ) ), + implode( ',', array_keys( $attribute_prefix_forms ) ) +); +check( 'attribute-prefix encoded targets satisfy attribute_starts_with', array() === $attribute_prefix_bad_targets, implode( ',', $attribute_prefix_bad_targets ) ); + +$semicolonless_guard = new \ReflectionMethod( Generator::class, 'would_extend_semicolonless_numeric' ); +check( + 'attribute-prefix semicolonless numeric guard protects terminators and digits', + true === $semicolonless_guard->invoke( null, 'decimal', ';' ) && + true === $semicolonless_guard->invoke( null, 'hex', ';' ) && + true === $semicolonless_guard->invoke( null, 'decimal', '7' ) && + true === $semicolonless_guard->invoke( null, 'hex', 'A' ) && + false === $semicolonless_guard->invoke( null, 'decimal', 'A' ) && + false === $semicolonless_guard->invoke( null, null, ';' ) +); + +$numeric_ranges = array(); +$numeric_c1_values = array(); +$numeric_bmp_terminal_noncharacters = array(); +$numeric_noncharacter_planes = array(); +for ( $i = 0; $i < 6000; $i++ ) { + $generated = ( new Generator( new Prng( "numeric-range-smoke:{$i}" 
), 4096, $names ) )->generate(); + foreach ( numeric_reference_ranges( $generated['payload'] ) as $range => $_ ) { + $numeric_ranges[ $range ] = true; + } + $match_count = preg_match_all( '/&#(?:([xX])([0-9A-Fa-f]+)|([0-9]+));?/', $generated['payload'], $matches, PREG_SET_ORDER ); + if ( false !== $match_count && $match_count > 0 ) { + foreach ( $matches as $match ) { + $is_hex = '' !== ( $match[1] ?? '' ); + $digits = $is_hex ? $match[2] : $match[3]; + $significant_digits = substr( $digits, strspn( $digits, '0' ) ); + if ( '' === $significant_digits || strlen( $significant_digits ) > ( $is_hex ? 6 : 7 ) ) { + continue; + } + + $value = intval( $significant_digits, $is_hex ? 16 : 10 ); + if ( $value >= 0x80 && $value <= 0x9F ) { + $numeric_c1_values[ $value ] = true; + } + if ( 0xFFFE === $value || 0xFFFF === $value ) { + $numeric_bmp_terminal_noncharacters[ $value ] = true; + } + if ( $value >= 0x1FFFE && $value <= 0x10FFFF && ( $value & 0xFFFF ) >= 0xFFFE ) { + $numeric_noncharacter_planes[ $value >> 16 ] = true; + } + } + } + if ( + array() === array_diff( + array( + 'zero-only', + 'c0-control', + 'c1-control', + 'bmp-pre-surrogate', + 'bmp-post-surrogate', + 'surrogate', + 'bmp-noncharacter', + 'plane-noncharacter', + 'astral', + 'above-unicode-legal-digits', + 'digit-count-overflow', + ), + array_keys( $numeric_ranges ) + ) && + 32 === count( $numeric_c1_values ) && + 2 === count( $numeric_bmp_terminal_noncharacters ) && + 16 === count( $numeric_noncharacter_planes ) + ) { + break; + } +} +check( + 'numeric generator covers range buckets', + array() === array_diff( + array( + 'zero-only', + 'c0-control', + 'c1-control', + 'bmp-pre-surrogate', + 'bmp-post-surrogate', + 'surrogate', + 'bmp-noncharacter', + 'plane-noncharacter', + 'astral', + 'above-unicode-legal-digits', + 'digit-count-overflow', + ), + array_keys( $numeric_ranges ) + ), + implode( ',', array_keys( $numeric_ranges ) ) +); +$expected_c1_values = range( 0x80, 0x9F ); +check( + 'numeric generator 
covers all C1 remap rows', + array() === array_diff( $expected_c1_values, array_keys( $numeric_c1_values ) ), + implode( ',', array_map( static fn( int $value ): string => dechex( $value ), array_keys( $numeric_c1_values ) ) ) +); +check( + 'numeric generator covers BMP terminal noncharacters', + array() === array_diff( array( 0xFFFE, 0xFFFF ), array_keys( $numeric_bmp_terminal_noncharacters ) ), + implode( ',', array_map( static fn( int $value ): string => dechex( $value ), array_keys( $numeric_bmp_terminal_noncharacters ) ) ) +); +check( + 'numeric generator covers per-plane noncharacters', + array() === array_diff( range( 1, 16 ), array_keys( $numeric_noncharacter_planes ) ), + implode( ',', array_keys( $numeric_noncharacter_planes ) ) +); + +$byte_strategies = array(); +$byte_contexts = array(); +$byte_unsafe = 0; +$byte_nul = 0; +for ( $i = 0; $i < $total; $i++ ) { + $generated = ( new Generator( new Prng( "byte-smoke:{$i}" ), 4096, $names ) )->generate_bytes(); + $byte_strategies[ $generated['strategy'] ] = true; + $byte_contexts[ $generated['context'] ] = true; + if ( ! Generator::is_oracle_safe_payload( $generated['payload'] ) ) { + ++$byte_unsafe; + } + if ( str_contains( $generated['payload'], "\x00" ) ) { + ++$byte_nul; + } +} +check( 'all 5 byte-space strategies appear', 5 === count( $byte_strategies ), implode( ',', array_keys( $byte_strategies ) ) ); +check( 'byte-space cases run both contexts', array( 'both' ) === array_keys( $byte_contexts ), implode( ',', array_keys( $byte_contexts ) ) ); +check( 'byte-space generator emits unsafe payloads', $byte_unsafe > 0, (string) $byte_unsafe ); +check( 'byte-space generator emits NUL bytes', $byte_nul > 0, (string) $byte_nul ); + +$trap_oracles = new class() extends Oracles { + public function decode( string $context, string $payload ): string { + throw new \RuntimeException( "oracle trap called for {$context} " . 
bin2hex( $payload ) ); + } +}; +$unsafe_byte_failures = ( new Checks( $trap_oracles ) )->run_without_oracle( 'both', "\xFF\x00<\"\r" ); +check( + 'oracle-free byte checks accept unsafe payloads', + array() === $unsafe_byte_failures, + json_encode( $unsafe_byte_failures ) +); +$raw_c1_failures = ( new Checks( $trap_oracles ) )->run_without_oracle( 'both', "\x80\x9F" ); +check( + 'oracle-free byte checks pass raw C1 bytes through unchanged', + array() === $raw_c1_failures, + json_encode( $raw_c1_failures ) +); + +$fuzz_failures = 0; +for ( $i = 0; $i < 300; $i++ ) { + $generated = ( new Generator( new Prng( "smoke-run:{$i}" ), 4096, $names ) )->generate(); + $failures = $checks->run( $generated['context'], $generated['payload'] ); + foreach ( $failures as $failure ) { + ++$fuzz_failures; + echo " finding: {$failure['signature']} on " . bin2hex( substr( $generated['payload'], 0, 48 ) ) . "\n"; + } +} +check( '300-case fuzz run clean', 0 === $fuzz_failures ); + +$byte_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'bytes', '--seed', '1', '--cases', '200', '--progress-every', '200' ) ); +check( '200-case byte-space worker clean', 0 === $byte_worker['code'], $byte_worker['stdout'] . $byte_worker['stderr'] ); + +$name_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'names', '--seed', '1', '--cases', '300', '--progress-every', '300' ) ); +check( + '300-case name-sweep worker clean', + 0 === $name_worker['code'] && str_contains( $name_worker['stdout'], '"name-sweep":300' ), + $name_worker['stdout'] . $name_worker['stderr'] +); + +$legacy_follower_worker = run_process( array( PHP_BINARY, __DIR__ . 
'/../worker.php', '--mode', 'legacy-followers', '--seed', '1', '--cases', '300', '--progress-every', '300' ) ); +check( + '300-case legacy-follower worker clean', + 0 === $legacy_follower_worker['code'] && str_contains( $legacy_follower_worker['stdout'], '"legacy-follower-sweep":300' ), + $legacy_follower_worker['stdout'] . $legacy_follower_worker['stderr'] +); + +$prefix_family_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'prefix-families', '--seed', '1', '--cases', '300', '--progress-every', '300' ) ); +check( + '300-case prefix-family worker clean', + 0 === $prefix_family_worker['code'] && str_contains( $prefix_family_worker['stdout'], '"prefix-family-sweep":300' ), + $prefix_family_worker['stdout'] . $prefix_family_worker['stderr'] +); + +$numeric_boundary_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'numeric-boundaries', '--seed', '1', '--cases', '64', '--progress-every', '64' ) ); +check( + '64-case numeric-boundary worker clean', + 0 === $numeric_boundary_worker['code'] && str_contains( $numeric_boundary_worker['stdout'], '"numeric-boundary-sweep":64' ), + $numeric_boundary_worker['stdout'] . $numeric_boundary_worker['stderr'] +); + +$corpus_worker = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'corpus', '--seed', '1', '--cases', '300', '--progress-every', '300' ) ); +$corpus_worker_has_strategies = true; +foreach ( expected_corpus_strategies() as $strategy ) { + $corpus_worker_has_strategies = $corpus_worker_has_strategies && str_contains( $corpus_worker['stdout'], '"' . $strategy . '"' ); +} +check( + '300-case corpus mutation worker clean', + 0 === $corpus_worker['code'] && $corpus_worker_has_strategies, + $corpus_worker['stdout'] . $corpus_worker['stderr'] +); + +$token_map_worker = run_process( array( PHP_BINARY, __DIR__ . 
'/../worker.php', '--mode', 'token-map', '--seed', '1', '--cases', '300', '--progress-every', '300' ) ); +check( + '300-case token-map worker clean', + 0 === $token_map_worker['code'] && str_contains( $token_map_worker['stdout'], '"token-map-structure-sweep":300' ), + $token_map_worker['stdout'] . $token_map_worker['stderr'] +); + +$coverage_unavailable_worker = run_process( + array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'coverage', '--seed', '1', '--cases', '1', '--progress-every', '1' ), + array( + 'HTML_DECODER_FUZZ_DISABLE_PCOV' => '1', + 'HTML_DECODER_FUZZ_FAKE_COVERAGE' => '0', + ) +); +check( + 'coverage worker reports unavailable pcov', + 2 === $coverage_unavailable_worker['code'] && str_contains( $coverage_unavailable_worker['stdout'], 'coverage mode requires pcov' ), + $coverage_unavailable_worker['stdout'] . $coverage_unavailable_worker['stderr'] +); + +$coverage_worker_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-coverage-worker-' . getmypid(); +remove_tree( $coverage_worker_dir ); +$coverage_worker = run_process( + array( PHP_BINARY, __DIR__ . '/../worker.php', '--mode', 'coverage', '--seed', '1', '--cases', '8', '--progress-every', '8', '--output-dir', $coverage_worker_dir ), + array( 'HTML_DECODER_FUZZ_FAKE_COVERAGE' => '1' ) +); +$coverage_worker_manifests = glob( $coverage_worker_dir . '/coverage-corpus/payload-*/coverage.json' ); +$coverage_worker_manifest = is_array( $coverage_worker_manifests ) && array() !== $coverage_worker_manifests + ? json_decode( (string) file_get_contents( $coverage_worker_manifests[0] ), true ) + : array(); +check( + 'coverage worker retains fake new-edge payloads', + 0 === $coverage_worker['code'] && + str_contains( $coverage_worker['stdout'], '"type":"coverage"' ) && + str_contains( $coverage_worker['stdout'], '"coverage_new_edges"' ) && + is_array( $coverage_worker_manifests ) && + count( $coverage_worker_manifests ) > 0, + $coverage_worker['stdout'] . 
$coverage_worker['stderr'] +); +check( + 'coverage corpus manifest records payload and target edges', + is_array( $coverage_worker_manifest ) && + 'coverage' === ( $coverage_worker_manifest['mode'] ?? null ) && + 'fake' === ( $coverage_worker_manifest['coverage_provider'] ?? null ) && + is_string( $coverage_worker_manifest['payload_base64'] ?? null ) && + ( $coverage_worker_manifest['new_edge_count'] ?? 0 ) > 0 && + is_array( $coverage_worker_manifest['new_edges'] ?? null ), + json_encode( $coverage_worker_manifest ) +); +remove_tree( $coverage_worker_dir ); + +$byte_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-byte-runner-' . getmypid(); +remove_tree( $byte_runner_dir ); +$byte_runner = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--mode', + 'bytes', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '200', + '--cases-per-batch', + '200', + '--summary-mode', + 'none', + '--output-dir', + $byte_runner_dir, + ) +); +$byte_runner_state = is_file( $byte_runner_dir . '/state.json' ) + ? json_decode( (string) file_get_contents( $byte_runner_dir . '/state.json' ), true ) + : array(); +check( + '200-case byte-space runner clean', + 0 === $byte_runner['code'] && + 200 === ( $byte_runner_state['cases'] ?? 0 ) && + 200 === ( $byte_runner_state['by_context']['both'] ?? 0 ), + $byte_runner['stdout'] . $byte_runner['stderr'] . json_encode( $byte_runner_state ) +); +remove_tree( $byte_runner_dir ); + +$name_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-name-runner-' . getmypid(); +remove_tree( $name_runner_dir ); +$name_runner = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--mode', + 'names', + '--lanes', + '2', + '--duration-seconds', + '0', + '--max-cases', + '200', + '--cases-per-batch', + '100', + '--summary-mode', + 'all', + '--output-dir', + $name_runner_dir, + ) +); +$name_runner_state = is_file( $name_runner_dir . '/state.json' ) + ? 
json_decode( (string) file_get_contents( $name_runner_dir . '/state.json' ), true )
	: array();

// Name-sweep runner: every generated case must be attributed to the
// `name-sweep` strategy and to the `both` decoder context.
check(
	'name-sweep runner clean',
	0 === $name_runner['code'] &&
	( $name_runner_state['cases'] ?? 0 ) >= 200 &&
	( $name_runner_state['cases'] ?? null ) === ( $name_runner_state['by_strategy']['name-sweep'] ?? null ) &&
	( $name_runner_state['cases'] ?? null ) === ( $name_runner_state['by_context']['both'] ?? null ),
	$name_runner['stdout'] . $name_runner['stderr'] . json_encode( $name_runner_state )
);
$name_runner_windows = summary_start_windows( $name_runner_dir, 'names' );
check(
	'name-sweep runner uses distinct start-case windows',
	start_windows_are_distinct( $name_runner_windows, 100 ),
	json_encode( $name_runner_windows )
);
remove_tree( $name_runner_dir );

/*
 * Runs runner.php for one deterministic sweep mode (2 lanes, 200 cases in
 * batches of 100, full summaries) and records two checks:
 *
 *  - "<label> runner clean": exit code 0, at least 200 cases, and the case
 *    total equals both the per-strategy and the per-context ("both") tallies
 *    read from state.json.
 *  - "<label> runner uses distinct start-case windows".
 *
 * $strategy names the single strategy the mode is expected to report. Pass
 * null for the corpus lane, which reports several strategies: the total must
 * then equal the sum over all strategies and the sorted strategy names must
 * equal expected_corpus_strategies().
 *
 * The output directory is removed before and after the run.
 */
$smoke_sweep_runner = static function ( string $label, string $mode, string $dir_slug, ?string $strategy ): void {
	$output_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-' . $dir_slug . '-' . getmypid();
	remove_tree( $output_dir );

	$runner = run_process(
		array(
			PHP_BINARY,
			__DIR__ . '/../runner.php',
			'--mode',
			$mode,
			'--lanes',
			'2',
			'--duration-seconds',
			'0',
			'--max-cases',
			'200',
			'--cases-per-batch',
			'100',
			'--summary-mode',
			'all',
			'--output-dir',
			$output_dir,
		)
	);

	// A missing state file is treated as an empty state so the check fails
	// on the counts instead of on a read error.
	$state = is_file( $output_dir . '/state.json' )
		? json_decode( (string) file_get_contents( $output_dir . '/state.json' ), true )
		: array();

	if ( null === $strategy ) {
		$seen_strategies = array_keys( $state['by_strategy'] ?? array() );
		sort( $seen_strategies );

		$is_clean = 0 === $runner['code'] &&
			( $state['cases'] ?? 0 ) >= 200 &&
			( $state['cases'] ?? null ) === ( $state['by_context']['both'] ?? null ) &&
			( $state['cases'] ?? null ) === array_sum( $state['by_strategy'] ?? array() ) &&
			expected_corpus_strategies() === $seen_strategies;
	} else {
		$is_clean = 0 === $runner['code'] &&
			( $state['cases'] ?? 0 ) >= 200 &&
			( $state['cases'] ?? null ) === ( $state['by_strategy'][ $strategy ] ?? null ) &&
			( $state['cases'] ?? null ) === ( $state['by_context']['both'] ?? null );
	}

	check(
		$label . ' runner clean',
		$is_clean,
		$runner['stdout'] . $runner['stderr'] . json_encode( $state )
	);

	$windows = summary_start_windows( $output_dir, $mode );
	check(
		$label . ' runner uses distinct start-case windows',
		start_windows_are_distinct( $windows, 100 ),
		json_encode( $windows )
	);

	remove_tree( $output_dir );
};

$smoke_sweep_runner( 'legacy-follower', 'legacy-followers', 'legacy-follower-runner', 'legacy-follower-sweep' );
$smoke_sweep_runner( 'prefix-family', 'prefix-families', 'prefix-family-runner', 'prefix-family-sweep' );
$smoke_sweep_runner( 'numeric-boundary', 'numeric-boundaries', 'numeric-boundary-runner', 'numeric-boundary-sweep' );
$smoke_sweep_runner( 'corpus mutation', 'corpus', 'corpus-runner', null );
$smoke_sweep_runner( 'token-map', 'token-map', 'token-map-runner', 'token-map-structure-sweep' );

// Coverage runner: uses the fake-coverage switch, a smaller case budget and
// failure-only summaries, and additionally cross-checks the retained corpus
// against the on-disk manifests and the summary stream.
$coverage_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-coverage-runner-' . getmypid();
remove_tree( $coverage_runner_dir );
$coverage_runner = run_process(
	array(
		PHP_BINARY,
		__DIR__ . '/../runner.php',
		'--mode',
		'coverage',
		'--lanes',
		'2',
		'--duration-seconds',
		'0',
		'--max-cases',
		'40',
		'--cases-per-batch',
		'20',
		'--summary-mode',
		'failures',
		'--output-dir',
		$coverage_runner_dir,
	),
	array( 'HTML_DECODER_FUZZ_FAKE_COVERAGE' => '1' )
);
$coverage_runner_state = is_file( $coverage_runner_dir . '/state.json' )
	? json_decode( (string) file_get_contents( $coverage_runner_dir . '/state.json' ), true )
	: array();
$coverage_runner_manifests = glob( $coverage_runner_dir . '/coverage-corpus/payload-*/coverage.json' );
$coverage_summary          = is_file( $coverage_runner_dir . '/summary.ndjson' )
	? file( $coverage_runner_dir . '/summary.ndjson', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES )
	: array();

// Count summary records of type "coverage" that flag a retained payload.
$coverage_summary_retained = 0;
if ( is_array( $coverage_summary ) ) {
	foreach ( $coverage_summary as $line ) {
		$record = json_decode( $line, true );
		if ( is_array( $record ) && 'coverage' === ( $record['type'] ?? null ) && ! empty( $record['coverage_retained'] ) ) {
			++$coverage_summary_retained;
		}
	}
}
check(
	'coverage runner aggregates fake new-edge corpus',
	0 === $coverage_runner['code'] &&
	( $coverage_runner_state['cases'] ?? 0 ) >= 40 &&
	( $coverage_runner_state['cases'] ?? null ) === ( $coverage_runner_state['by_context']['both'] ?? null ) &&
	( $coverage_runner_state['cases'] ?? null ) === array_sum( $coverage_runner_state['by_strategy'] ?? array() ) &&
	( $coverage_runner_state['coverage']['edges'] ?? 0 ) > 0 &&
	( $coverage_runner_state['coverage']['payloads'] ?? 0 ) > 0 &&
	is_array( $coverage_runner_manifests ) &&
	count( $coverage_runner_manifests ) === ( $coverage_runner_state['coverage']['payloads'] ?? -1 ) &&
	$coverage_summary_retained === ( $coverage_runner_state['coverage']['payloads'] ?? -1 ),
	$coverage_runner['stdout'] . $coverage_runner['stderr'] . json_encode( $coverage_runner_state )
);
remove_tree( $coverage_runner_dir );

/*
 * Replays one generated case through replay.php (seed 1) and records a check
 * named $check_name that passes when the exit code equals $expected_code and
 * stdout contains every string in $stdout_needles.
 *
 * $env is forwarded to run_process() only when non-empty, so un-faulted
 * replays are invoked exactly as a bare run_process( $command ) call.
 */
$smoke_seed_replay = static function ( string $check_name, string $mode, string $case, int $expected_code, array $stdout_needles = array(), array $env = array() ): void {
	$command = array( PHP_BINARY, __DIR__ . '/../replay.php', '--mode', $mode, '--seed', '1', '--case', $case );
	$replay  = array() === $env ? run_process( $command ) : run_process( $command, $env );

	$as_expected = $expected_code === $replay['code'];
	foreach ( $stdout_needles as $needle ) {
		$as_expected = $as_expected && str_contains( $replay['stdout'], $needle );
	}

	check( $check_name, $as_expected, $replay['stdout'] . $replay['stderr'] );
};

$smoke_seed_replay( 'name-sweep replay regenerates clean case', 'names', '0', 0 );
$smoke_seed_replay( 'legacy-follower replay regenerates clean case', 'legacy-followers', '0', 0 );
$smoke_seed_replay(
	'prefix-family replay regenerates clean case',
	'prefix-families',
	'37',
	0,
	array(
		'mode prefix-families, strategy prefix-family-sweep',
		'Hex preview: 266e6f7478',
	)
);
$smoke_seed_replay(
	'faulted prefix-family seed replay reproduces generated case',
	'prefix-families',
	'37',
	1,
	array(),
	array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
);
$smoke_seed_replay(
	'numeric-boundary replay regenerates mixed-case hex case',
	'numeric-boundaries',
	'25',
	0,
	array(
		'mode numeric-boundaries, strategy numeric-boundary-sweep',
		'Hex preview: 2623783130466645653b',
	)
);
$smoke_seed_replay(
	'faulted numeric-boundary seed replay reproduces generated case',
	'numeric-boundaries',
	'0',
	1,
	array(),
	array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
);
$smoke_seed_replay(
	'corpus mutation replay regenerates clean case',
	'corpus',
	'0',
	0,
	array(
		'mode corpus, strategy corpus-byte-perturb',
		'Hex preview: 64262335383b',
	)
);
$smoke_seed_replay(
	'corpus replay regenerates single-level decode fixture',
	'corpus',
	'11875',
	0,
	array(
		'mode corpus, strategy corpus-splice',
		'Hex preview: 26616d703b616d703b5a',
	)
);
$smoke_seed_replay(
	'faulted corpus mutation seed replay reproduces generated case',
	'corpus',
	'0',
	1,
	array(),
	array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' )
);
$smoke_seed_replay(
	'token-map replay regenerates clean case',
	'token-map',
	'0',
	0,
	array( 'mode token-map, strategy token-map-structure-sweep' )
);

// The faulted token-map replay only runs when an earlier part of the script
// found a case index for it.
if ( null !== $token_map_fault_case_index ) {
	$smoke_seed_replay(
		'faulted token-map seed replay reproduces generated case',
		'token-map',
		(string) $token_map_fault_case_index,
		1,
		array(),
		array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' )
	);
}

$smoke_seed_replay(
	'coverage replay regenerates clean generated case',
	'coverage',
	'0',
	0,
	array( 'mode coverage, strategy numeric' )
);

$name_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-name-fault-' . getmypid();
remove_tree( $name_pipeline_dir );
$faulted_name_worker = run_process(
	array(
		PHP_BINARY,
		__DIR__ . 
'/../worker.php', + '--mode', + 'names', + '--seed', + '1', + '--start-case', + '11593', + '--cases', + '1', + '--output-dir', + $name_pipeline_dir, + '--progress-every', + '1', + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) +); +check( 'faulted name-sweep worker reports findings', 1 === $faulted_name_worker['code'], $faulted_name_worker['stdout'] . $faulted_name_worker['stderr'] ); + +$name_failure_files = glob( $name_pipeline_dir . '/failure-*/failure.json' ); +check( 'faulted name-sweep worker writes failure artifact', is_array( $name_failure_files ) && array() !== $name_failure_files ); + +$name_failure_file = is_array( $name_failure_files ) && array() !== $name_failure_files ? $name_failure_files[0] : null; +if ( null !== $name_failure_file ) { + $name_manifest = json_decode( (string) file_get_contents( $name_failure_file ), true ); + check( + 'name-sweep failure artifact records mode and signature', + 'names' === ( $name_manifest['mode'] ?? null ) && + 'name-sweep' === ( $name_manifest['strategy'] ?? null ) && + 11593 === ( $name_manifest['case'] ?? null ) && + in_array( 'decode-mismatch:attribute', $name_manifest['signatures'] ?? array(), true ), + json_encode( $name_manifest ) + ); + + $name_fault_replay = run_process( + array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $name_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) + ); + check( 'faulted name-sweep replay reproduces finding', 1 === $name_fault_replay['code'], $name_fault_replay['stdout'] . $name_fault_replay['stderr'] ); + + $name_fault_minimize = run_process( + array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $name_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) + ); + check( 'faulted name-sweep minimizer preserves signature', 0 === $name_fault_minimize['code'], $name_fault_minimize['stdout'] . 
$name_fault_minimize['stderr'] ); +} +remove_tree( $name_pipeline_dir ); + +$legacy_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-legacy-follower-fault-' . getmypid(); +remove_tree( $legacy_pipeline_dir ); +$faulted_legacy_worker = run_process( + array( + PHP_BINARY, + __DIR__ . '/../worker.php', + '--mode', + 'legacy-followers', + '--seed', + '1', + '--start-case', + '0', + '--cases', + '80', + '--output-dir', + $legacy_pipeline_dir, + '--progress-every', + '80', + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) +); +check( 'faulted legacy-follower worker reports findings', 1 === $faulted_legacy_worker['code'], $faulted_legacy_worker['stdout'] . $faulted_legacy_worker['stderr'] ); + +$legacy_failure_files = glob( $legacy_pipeline_dir . '/failure-*/failure.json' ); +check( 'faulted legacy-follower worker writes failure artifact', is_array( $legacy_failure_files ) && array() !== $legacy_failure_files ); + +$legacy_failure_file = is_array( $legacy_failure_files ) && array() !== $legacy_failure_files ? $legacy_failure_files[0] : null; +if ( null !== $legacy_failure_file ) { + $legacy_manifest = json_decode( (string) file_get_contents( $legacy_failure_file ), true ); + check( + 'legacy-follower failure artifact records mode and signature', + 'legacy-followers' === ( $legacy_manifest['mode'] ?? null ) && + 'legacy-follower-sweep' === ( $legacy_manifest['strategy'] ?? null ) && + in_array( 'decode-mismatch:attribute', $legacy_manifest['signatures'] ?? array(), true ), + json_encode( $legacy_manifest ) + ); + + $legacy_fault_replay = run_process( + array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $legacy_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) + ); + check( 'faulted legacy-follower replay reproduces finding', 1 === $legacy_fault_replay['code'], $legacy_fault_replay['stdout'] . $legacy_fault_replay['stderr'] ); + + $legacy_fault_minimize = run_process( + array( PHP_BINARY, __DIR__ . 
'/../minimize.php', '--failure', $legacy_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) + ); + check( 'faulted legacy-follower minimizer preserves signature', 0 === $legacy_fault_minimize['code'], $legacy_fault_minimize['stdout'] . $legacy_fault_minimize['stderr'] ); +} +remove_tree( $legacy_pipeline_dir ); + +$prefix_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-prefix-family-fault-' . getmypid(); +remove_tree( $prefix_pipeline_dir ); +$faulted_prefix_worker = run_process( + array( + PHP_BINARY, + __DIR__ . '/../worker.php', + '--mode', + 'prefix-families', + '--seed', + '1', + '--start-case', + '37', + '--cases', + '1', + '--output-dir', + $prefix_pipeline_dir, + '--progress-every', + '1', + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) +); +check( 'faulted prefix-family worker reports findings', 1 === $faulted_prefix_worker['code'], $faulted_prefix_worker['stdout'] . $faulted_prefix_worker['stderr'] ); + +$prefix_failure_files = glob( $prefix_pipeline_dir . '/failure-*/failure.json' ); +check( 'faulted prefix-family worker writes failure artifact', is_array( $prefix_failure_files ) && array() !== $prefix_failure_files ); + +$prefix_failure_file = is_array( $prefix_failure_files ) && array() !== $prefix_failure_files ? $prefix_failure_files[0] : null; +if ( null !== $prefix_failure_file ) { + $prefix_manifest = json_decode( (string) file_get_contents( $prefix_failure_file ), true ); + check( + 'prefix-family failure artifact records mode and signature', + 'prefix-families' === ( $prefix_manifest['mode'] ?? null ) && + 'prefix-family-sweep' === ( $prefix_manifest['strategy'] ?? null ) && + 37 === ( $prefix_manifest['case'] ?? null ) && + in_array( 'decode-mismatch:attribute', $prefix_manifest['signatures'] ?? array(), true ), + json_encode( $prefix_manifest ) + ); + + $prefix_fault_replay = run_process( + array( PHP_BINARY, __DIR__ . 
'/../replay.php', '--failure', $prefix_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) + ); + check( 'faulted prefix-family replay reproduces finding', 1 === $prefix_fault_replay['code'], $prefix_fault_replay['stdout'] . $prefix_fault_replay['stderr'] ); + + $prefix_fault_minimize = run_process( + array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $prefix_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) + ); + check( 'faulted prefix-family minimizer preserves signature', 0 === $prefix_fault_minimize['code'], $prefix_fault_minimize['stdout'] . $prefix_fault_minimize['stderr'] ); +} +remove_tree( $prefix_pipeline_dir ); + +$numeric_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-numeric-boundary-fault-' . getmypid(); +remove_tree( $numeric_pipeline_dir ); +$faulted_numeric_worker = run_process( + array( + PHP_BINARY, + __DIR__ . '/../worker.php', + '--mode', + 'numeric-boundaries', + '--seed', + '1', + '--start-case', + '0', + '--cases', + '1', + '--output-dir', + $numeric_pipeline_dir, + '--progress-every', + '1', + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' ) +); +check( 'faulted numeric-boundary worker reports findings', 1 === $faulted_numeric_worker['code'], $faulted_numeric_worker['stdout'] . $faulted_numeric_worker['stderr'] ); + +$numeric_failure_files = glob( $numeric_pipeline_dir . '/failure-*/failure.json' ); +check( 'faulted numeric-boundary worker writes failure artifact', is_array( $numeric_failure_files ) && array() !== $numeric_failure_files ); + +$numeric_failure_file = is_array( $numeric_failure_files ) && array() !== $numeric_failure_files ? 
$numeric_failure_files[0] : null; +if ( null !== $numeric_failure_file ) { + $numeric_manifest = json_decode( (string) file_get_contents( $numeric_failure_file ), true ); + check( + 'numeric-boundary failure artifact records mode and signature', + 'numeric-boundaries' === ( $numeric_manifest['mode'] ?? null ) && + 'numeric-boundary-sweep' === ( $numeric_manifest['strategy'] ?? null ) && + 0 === ( $numeric_manifest['case'] ?? null ) && + in_array( 'reader-overran-input:text', $numeric_manifest['signatures'] ?? array(), true ), + json_encode( $numeric_manifest ) + ); + + $numeric_fault_replay = run_process( + array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $numeric_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' ) + ); + check( 'faulted numeric-boundary replay reproduces finding', 1 === $numeric_fault_replay['code'], $numeric_fault_replay['stdout'] . $numeric_fault_replay['stderr'] ); + + $numeric_fault_minimize = run_process( + array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $numeric_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' ) + ); + check( 'faulted numeric-boundary minimizer preserves signature', 0 === $numeric_fault_minimize['code'], $numeric_fault_minimize['stdout'] . $numeric_fault_minimize['stderr'] ); +} +remove_tree( $numeric_pipeline_dir ); + +$corpus_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-corpus-fault-' . getmypid(); +remove_tree( $corpus_pipeline_dir ); +$faulted_corpus_worker = run_process( + array( + PHP_BINARY, + __DIR__ . '/../worker.php', + '--mode', + 'corpus', + '--seed', + '1', + '--start-case', + '0', + '--cases', + '1', + '--output-dir', + $corpus_pipeline_dir, + '--progress-every', + '1', + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' ) +); +check( 'faulted corpus mutation worker reports findings', 1 === $faulted_corpus_worker['code'], $faulted_corpus_worker['stdout'] . 
$faulted_corpus_worker['stderr'] ); + +$corpus_failure_files = glob( $corpus_pipeline_dir . '/failure-*/failure.json' ); +check( 'faulted corpus mutation worker writes failure artifact', is_array( $corpus_failure_files ) && array() !== $corpus_failure_files ); + +$corpus_failure_file = is_array( $corpus_failure_files ) && array() !== $corpus_failure_files ? $corpus_failure_files[0] : null; +if ( null !== $corpus_failure_file ) { + $corpus_manifest = json_decode( (string) file_get_contents( $corpus_failure_file ), true ); + check( + 'corpus mutation failure artifact records mode and signature', + 'corpus' === ( $corpus_manifest['mode'] ?? null ) && + 'corpus-byte-perturb' === ( $corpus_manifest['strategy'] ?? null ) && + 0 === ( $corpus_manifest['case'] ?? null ) && + in_array( 'reader-overran-input:text', $corpus_manifest['signatures'] ?? array(), true ), + json_encode( $corpus_manifest ) + ); + + $corpus_fault_replay = run_process( + array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $corpus_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' ) + ); + check( 'faulted corpus mutation replay reproduces finding', 1 === $corpus_fault_replay['code'], $corpus_fault_replay['stdout'] . $corpus_fault_replay['stderr'] ); + + $corpus_fault_minimize = run_process( + array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $corpus_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' ) + ); + check( 'faulted corpus mutation minimizer preserves signature', 0 === $corpus_fault_minimize['code'], $corpus_fault_minimize['stdout'] . $corpus_fault_minimize['stderr'] ); +} +remove_tree( $corpus_pipeline_dir ); + +$single_level_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-single-level-fault-' . getmypid(); +remove_tree( $single_level_pipeline_dir ); +$faulted_single_level_worker = run_process( + array( + PHP_BINARY, + __DIR__ . 
'/../worker.php', + '--mode', + 'corpus', + '--seed', + '1', + '--start-case', + '11875', + '--cases', + '1', + '--output-dir', + $single_level_pipeline_dir, + '--progress-every', + '1', + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'single-level-overdecode' ) +); +check( 'faulted single-level corpus worker reports findings', 1 === $faulted_single_level_worker['code'], $faulted_single_level_worker['stdout'] . $faulted_single_level_worker['stderr'] ); + +$single_level_failure_files = glob( $single_level_pipeline_dir . '/failure-*/failure.json' ); +check( 'faulted single-level corpus worker writes failure artifact', is_array( $single_level_failure_files ) && array() !== $single_level_failure_files ); + +$single_level_failure_file = is_array( $single_level_failure_files ) && array() !== $single_level_failure_files ? $single_level_failure_files[0] : null; +if ( null !== $single_level_failure_file ) { + $single_level_manifest = json_decode( (string) file_get_contents( $single_level_failure_file ), true ); + check( + 'single-level corpus failure artifact records mode and signature', + 'corpus' === ( $single_level_manifest['mode'] ?? null ) && + 'corpus-splice' === ( $single_level_manifest['strategy'] ?? null ) && + 11875 === ( $single_level_manifest['case'] ?? null ) && + in_array( 'single-level-decode-overdecoded:text', $single_level_manifest['signatures'] ?? array(), true ) && + in_array( 'single-level-decode-overdecoded:attribute', $single_level_manifest['signatures'] ?? array(), true ), + json_encode( $single_level_manifest ) + ); + + $single_level_fault_replay = run_process( + array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $single_level_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'single-level-overdecode' ) + ); + check( 'faulted single-level corpus replay reproduces finding', 1 === $single_level_fault_replay['code'], $single_level_fault_replay['stdout'] . 
$single_level_fault_replay['stderr'] ); + + $single_level_fault_minimize = run_process( + array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $single_level_failure_file, '--signature', 'single-level-decode-overdecoded:text' ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'single-level-overdecode' ) + ); + check( 'faulted single-level corpus minimizer preserves signature', 0 === $single_level_fault_minimize['code'], $single_level_fault_minimize['stdout'] . $single_level_fault_minimize['stderr'] ); +} +remove_tree( $single_level_pipeline_dir ); + +if ( null !== $token_map_fault_case_index ) { + $token_map_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-token-map-fault-' . getmypid(); + remove_tree( $token_map_pipeline_dir ); + $faulted_token_map_worker = run_process( + array( + PHP_BINARY, + __DIR__ . '/../worker.php', + '--mode', + 'token-map', + '--seed', + '1', + '--start-case', + (string) $token_map_fault_case_index, + '--cases', + '1', + '--output-dir', + $token_map_pipeline_dir, + '--progress-every', + '1', + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) + ); + check( 'faulted token-map worker reports findings', 1 === $faulted_token_map_worker['code'], $faulted_token_map_worker['stdout'] . $faulted_token_map_worker['stderr'] ); + + $token_map_failure_files = glob( $token_map_pipeline_dir . '/failure-*/failure.json' ); + check( 'faulted token-map worker writes failure artifact', is_array( $token_map_failure_files ) && array() !== $token_map_failure_files ); + + $token_map_failure_file = is_array( $token_map_failure_files ) && array() !== $token_map_failure_files ? $token_map_failure_files[0] : null; + if ( null !== $token_map_failure_file ) { + $token_map_manifest = json_decode( (string) file_get_contents( $token_map_failure_file ), true ); + check( + 'token-map failure artifact records mode and signature', + 'token-map' === ( $token_map_manifest['mode'] ?? 
null ) && + 'token-map-structure-sweep' === ( $token_map_manifest['strategy'] ?? null ) && + $token_map_fault_case_index === ( $token_map_manifest['case'] ?? null ) && + in_array( 'decode-mismatch:attribute', $token_map_manifest['signatures'] ?? array(), true ), + json_encode( $token_map_manifest ) + ); + + $token_map_fault_replay = run_process( + array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $token_map_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) + ); + check( 'faulted token-map replay reproduces finding', 1 === $token_map_fault_replay['code'], $token_map_fault_replay['stdout'] . $token_map_fault_replay['stderr'] ); + + $token_map_fault_minimize = run_process( + array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $token_map_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'attribute-semicolonless' ) + ); + check( 'faulted token-map minimizer preserves signature', 0 === $token_map_fault_minimize['code'], $token_map_fault_minimize['stdout'] . $token_map_fault_minimize['stderr'] ); + } + remove_tree( $token_map_pipeline_dir ); +} + +$coverage_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-coverage-fault-' . getmypid(); +remove_tree( $coverage_pipeline_dir ); +$faulted_coverage_worker = run_process( + array( + PHP_BINARY, + __DIR__ . '/../worker.php', + '--mode', + 'coverage', + '--seed', + '1', + '--start-case', + '57', + '--cases', + '1', + '--output-dir', + $coverage_pipeline_dir, + '--progress-every', + '1', + ), + array( + 'HTML_DECODER_FUZZ_FAKE_COVERAGE' => '1', + 'HTML_DECODER_FUZZ_FAULT' => 'reader-empty-chunk', + ) +); +check( 'faulted coverage worker reports findings', 1 === $faulted_coverage_worker['code'], $faulted_coverage_worker['stdout'] . $faulted_coverage_worker['stderr'] ); + +$coverage_failure_files = glob( $coverage_pipeline_dir . 
'/failure-*/failure.json' ); +check( 'faulted coverage worker writes failure artifact', is_array( $coverage_failure_files ) && array() !== $coverage_failure_files ); + +$coverage_failure_file = is_array( $coverage_failure_files ) && array() !== $coverage_failure_files ? $coverage_failure_files[0] : null; +if ( null !== $coverage_failure_file ) { + $coverage_manifest = json_decode( (string) file_get_contents( $coverage_failure_file ), true ); + check( + 'coverage failure artifact records mode and signature', + 'coverage' === ( $coverage_manifest['mode'] ?? null ) && + 57 === ( $coverage_manifest['case'] ?? null ) && + in_array( 'reader-returned-empty-chunk:text', $coverage_manifest['signatures'] ?? array(), true ), + json_encode( $coverage_manifest ) + ); + + $coverage_fault_replay = run_process( + array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $coverage_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'reader-empty-chunk' ) + ); + check( 'faulted coverage replay reproduces finding', 1 === $coverage_fault_replay['code'], $coverage_fault_replay['stdout'] . $coverage_fault_replay['stderr'] ); + + $coverage_fault_minimize = run_process( + array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $coverage_failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'reader-empty-chunk' ) + ); + check( 'faulted coverage minimizer preserves signature', 0 === $coverage_fault_minimize['code'], $coverage_fault_minimize['stdout'] . 
$coverage_fault_minimize['stderr'] );
}
remove_tree( $coverage_pipeline_dir );

/*
 * Fault-injection pipelines.
 *
 * Each entry names the fault handed to the child processes through the
 * HTML_DECODER_FUZZ_FAULT environment variable, the case index passed to the
 * worker as --start-case, and the signature the failure artifact must list.
 * An optional `minimize_signature` is forwarded to the minimizer as --signature.
 */
$reader_fault_pipelines = array(
	array(
		'fault'     => 'reader-empty-chunk',
		'case'      => 57,
		'signature' => 'reader-returned-empty-chunk:text',
	),
	array(
		'fault'     => 'reader-short-match-length',
		'case'      => 57,
		'signature' => 'reader-match-too-short:text',
	),
	array(
		'fault'     => 'reader-substring-composition',
		'case'      => 97,
		'signature' => 'reader-composition-mismatch:text',
	),
	array(
		'fault'     => 'reader-null-mutates-match-length',
		'case'      => 7,
		'signature' => 'reader-mutated-match-length-on-null:text',
	),
	array(
		'fault'     => 'reader-non-amp-match',
		'case'      => 0,
		'signature' => 'reader-non-amp-match:text',
	),
	array(
		'fault'     => 'reader-gapless-drop-span',
		'case'      => 0,
		'signature' => 'reader-walk-not-gapless:text',
	),
	array(
		'fault'     => 'numeric-invalid-not-replacement',
		'case'      => 0,
		'signature' => 'numeric-invalid-not-replacement:text',
	),
	array(
		'fault'     => 'numeric-c1-not-remapped',
		'case'      => 2,
		'signature' => 'numeric-c1-not-remapped:text',
	),
	array(
		'fault'              => 'text-secondary-oracle',
		'case'               => 4,
		'signature'          => 'text-secondary-oracle-mismatch:text',
		'minimize_signature' => 'text-secondary-oracle-mismatch:text',
	),
	array(
		'fault'     => 'attribute-no-amp-identity',
		'case'      => 38,
		'signature' => 'attribute-without-ampersand-not-identity:attribute',
	),
);
foreach ( $reader_fault_pipelines as $reader_pipeline ) {
	// One scratch directory per fault, keyed by fault name and this process id.
	$reader_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-' . $reader_pipeline['fault'] . '-' . getmypid();
	remove_tree( $reader_pipeline_dir );

	// Run exactly one case with the fault enabled; the check expects exit code 1.
	$faulted_reader_worker = run_process(
		array(
			PHP_BINARY,
			__DIR__ . '/../worker.php',
			'--seed',
			'1',
			'--start-case',
			(string) $reader_pipeline['case'],
			'--cases',
			'1',
			'--output-dir',
			$reader_pipeline_dir,
			'--progress-every',
			'1',
		),
		array( 'HTML_DECODER_FUZZ_FAULT' => $reader_pipeline['fault'] )
	);
	check( "faulted {$reader_pipeline['fault']} worker reports findings", 1 === $faulted_reader_worker['code'], $faulted_reader_worker['stdout'] . $faulted_reader_worker['stderr'] );

	$reader_failure_files = glob( $reader_pipeline_dir . '/failure-*/failure.json' );
	check( "faulted {$reader_pipeline['fault']} worker writes failure artifact", is_array( $reader_failure_files ) && array() !== $reader_failure_files );

	// The remaining steps need an artifact; without one they are skipped (the check above already failed).
	$reader_failure_file = is_array( $reader_failure_files ) && array() !== $reader_failure_files ? $reader_failure_files[0] : null;
	if ( null !== $reader_failure_file ) {
		$reader_manifest = json_decode( (string) file_get_contents( $reader_failure_file ), true );
		check(
			"{$reader_pipeline['fault']} failure artifact records mode and signature",
			'oracle' === ( $reader_manifest['mode'] ?? null ) &&
				$reader_pipeline['case'] === ( $reader_manifest['case'] ?? null ) &&
				in_array( $reader_pipeline['signature'], $reader_manifest['signatures'] ?? array(), true ),
			json_encode( $reader_manifest )
		);

		// Replaying the stored artifact under the same fault must fail again (exit code 1).
		$reader_fault_replay = run_process(
			array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $reader_failure_file ),
			array( 'HTML_DECODER_FUZZ_FAULT' => $reader_pipeline['fault'] )
		);
		check( "faulted {$reader_pipeline['fault']} replay reproduces finding", 1 === $reader_fault_replay['code'], $reader_fault_replay['stdout'] . $reader_fault_replay['stderr'] );

		// The minimizer is expected to exit 0; pin the signature only when the pipeline asks for it.
		$reader_fault_minimize_command = array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $reader_failure_file );
		if ( isset( $reader_pipeline['minimize_signature'] ) ) {
			$reader_fault_minimize_command[] = '--signature';
			$reader_fault_minimize_command[] = $reader_pipeline['minimize_signature'];
		}
		$reader_fault_minimize = run_process(
			$reader_fault_minimize_command,
			array( 'HTML_DECODER_FUZZ_FAULT' => $reader_pipeline['fault'] )
		);
		check( "faulted {$reader_pipeline['fault']} minimizer preserves signature", 0 === $reader_fault_minimize['code'], $reader_fault_minimize['stdout'] . $reader_fault_minimize['stderr'] );
	}
	remove_tree( $reader_pipeline_dir );
}

// Option validation: the checks below expect exit code 2 for rejected input.
$zero_cases = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--cases', '0' ) );
check( 'worker rejects zero cases', 2 === $zero_cases['code'], $zero_cases['stdout'] . $zero_cases['stderr'] );

$zero_batch = run_process( array( PHP_BINARY, __DIR__ . '/../runner.php', '--cases-per-batch', '0', '--duration-seconds', '1', '--output-dir', sys_get_temp_dir() . '/html-decoder-fuzz-smoke-bad-runner-' . getmypid() ) );
check( 'runner rejects zero cases per batch', 2 === $zero_batch['code'], $zero_batch['stdout'] . $zero_batch['stderr'] );

$unwritable_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-unwritable-' . getmypid();
remove_tree( $unwritable_dir );
mkdir( $unwritable_dir, 0555, true );
chmod( $unwritable_dir, 0555 );
clearstatcache( true, $unwritable_dir );
/*
 * Mode bits are not enforced for every caller (for example when running as
 * root) or on every filesystem. If the directory is still writable the
 * precondition cannot be established, so the check is recorded as skipped
 * instead of failing spuriously, like the "unavailable" link checks elsewhere.
 */
$unwritable_runner = is_writable( $unwritable_dir )
	? null
	: run_process( array( PHP_BINARY, __DIR__ . '/../runner.php', '--duration-seconds', '1', '--output-dir', $unwritable_dir ) );
// Restore permissions before cleanup so the directory can be removed.
chmod( $unwritable_dir, 0755 );
clearstatcache( true, $unwritable_dir );
remove_tree( $unwritable_dir );
if ( null !== $unwritable_runner ) {
	check( 'runner rejects unwritable output dir', 2 === $unwritable_runner['code'], $unwritable_runner['stdout'] . $unwritable_runner['stderr'] );
} else {
	check( 'runner rejects unwritable output dir', true, 'directory permissions not enforced' );
}

$unreadable_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-unreadable-' .
getmypid();
remove_tree( $unreadable_dir );
mkdir( $unreadable_dir, 0333, true );
chmod( $unreadable_dir, 0333 );
clearstatcache( true, $unreadable_dir );
/*
 * Mode bits are not enforced for every caller (for example when running as
 * root) or on every filesystem. If the directory is still readable the
 * precondition cannot be established, so the check is recorded as skipped
 * instead of failing spuriously, like the "unavailable" link checks below.
 */
$unreadable_runner = is_readable( $unreadable_dir )
	? null
	: run_process( array( PHP_BINARY, __DIR__ . '/../runner.php', '--duration-seconds', '1', '--output-dir', $unreadable_dir ) );
// Restore permissions before cleanup so the directory can be removed.
chmod( $unreadable_dir, 0755 );
clearstatcache( true, $unreadable_dir );
remove_tree( $unreadable_dir );
if ( null !== $unreadable_runner ) {
	check( 'runner rejects unreadable output dir', 2 === $unreadable_runner['code'], $unreadable_runner['stdout'] . $unreadable_runner['stderr'] );
} else {
	check( 'runner rejects unreadable output dir', true, 'directory permissions not enforced' );
}

// Leading argv shared by every single-lane, single-case runner invocation in this section.
$single_case_runner_args = array(
	PHP_BINARY,
	__DIR__ . '/../runner.php',
	'--lanes',
	'1',
	'--duration-seconds',
	'0',
	'--max-cases',
	'1',
	'--cases-per-batch',
	'1',
);

// A directory occupying the state.json path: the check expects the runner to exit 2.
$bad_state_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-bad-state-' . getmypid();
remove_tree( $bad_state_dir );
mkdir( $bad_state_dir, 0777, true );
mkdir( $bad_state_dir . '/state.json' );
$bad_state_runner = run_process(
	array_merge( $single_case_runner_args, array( '--output-dir', $bad_state_dir ) )
);
remove_tree( $bad_state_dir );
check( 'runner reports state write failures', 2 === $bad_state_runner['code'], $bad_state_runner['stdout'] . $bad_state_runner['stderr'] );

// state.json hardlinked to a sentinel file outside the output dir: the sentinel must stay untouched.
$state_hardlink_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-state-hardlink-' . getmypid();
remove_tree( $state_hardlink_dir );
mkdir( $state_hardlink_dir, 0777, true );
$state_hardlink_target = $state_hardlink_dir . '-target';
file_put_contents( $state_hardlink_target, "sentinel\n" );
$state_hardlink_created = @link( $state_hardlink_target, $state_hardlink_dir . '/state.json' );
if ( $state_hardlink_created ) {
	$state_hardlink_runner = run_process(
		array_merge( $single_case_runner_args, array( '--summary-mode', 'none', '--output-dir', $state_hardlink_dir ) )
	);
	check(
		'runner rejects hardlinked state output',
		2 === $state_hardlink_runner['code'] && "sentinel\n" === file_get_contents( $state_hardlink_target ),
		$state_hardlink_runner['stdout'] . $state_hardlink_runner['stderr']
	);
} else {
	check( 'runner rejects hardlinked state output', true, 'hardlink unavailable' );
}
remove_tree( $state_hardlink_dir );
@unlink( $state_hardlink_target );

// A directory occupying the summary.ndjson path: the check expects the runner to exit 2.
$bad_summary_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-bad-summary-' . getmypid();
remove_tree( $bad_summary_dir );
mkdir( $bad_summary_dir, 0777, true );
mkdir( $bad_summary_dir . '/summary.ndjson' );
$bad_summary_runner = run_process(
	array_merge( $single_case_runner_args, array( '--output-dir', $bad_summary_dir ) )
);
remove_tree( $bad_summary_dir );
check( 'runner reports summary open failures', 2 === $bad_summary_runner['code'], $bad_summary_runner['stdout'] . $bad_summary_runner['stderr'] );

// summary.ndjson as a symlink to a sentinel file: the sentinel must stay untouched.
$summary_symlink_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-summary-symlink-' . getmypid();
remove_tree( $summary_symlink_dir );
mkdir( $summary_symlink_dir, 0777, true );
$summary_symlink_target = $summary_symlink_dir . '-target';
file_put_contents( $summary_symlink_target, "sentinel\n" );
$summary_symlink_created = @symlink( $summary_symlink_target, $summary_symlink_dir . '/summary.ndjson' );
if ( $summary_symlink_created ) {
	$summary_symlink_runner = run_process(
		array_merge( $single_case_runner_args, array( '--output-dir', $summary_symlink_dir ) )
	);
	check(
		'runner rejects symlinked summary output',
		2 === $summary_symlink_runner['code'] && "sentinel\n" === file_get_contents( $summary_symlink_target ),
		$summary_symlink_runner['stdout'] . $summary_symlink_runner['stderr']
	);
} else {
	check( 'runner rejects symlinked summary output', true, 'symlink unavailable' );
}
remove_tree( $summary_symlink_dir );
@unlink( $summary_symlink_target );

// Same scenario with a hardlink instead of a symlink.
$summary_hardlink_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-summary-hardlink-' . getmypid();
remove_tree( $summary_hardlink_dir );
mkdir( $summary_hardlink_dir, 0777, true );
$summary_hardlink_target = $summary_hardlink_dir . '-target';
file_put_contents( $summary_hardlink_target, "sentinel\n" );
$summary_hardlink_created = @link( $summary_hardlink_target, $summary_hardlink_dir . '/summary.ndjson' );
if ( $summary_hardlink_created ) {
	$summary_hardlink_runner = run_process(
		array_merge( $single_case_runner_args, array( '--output-dir', $summary_hardlink_dir ) )
	);
	check(
		'runner rejects hardlinked summary output',
		2 === $summary_hardlink_runner['code'] && "sentinel\n" === file_get_contents( $summary_hardlink_target ),
		$summary_hardlink_runner['stdout'] . $summary_hardlink_runner['stderr']
	);
} else {
	check( 'runner rejects hardlinked summary output', true, 'hardlink unavailable' );
}
remove_tree( $summary_hardlink_dir );
@unlink( $summary_hardlink_target );

$lane_stderr_symlink_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-symlink-' . getmypid();
remove_tree( $lane_stderr_symlink_dir );
mkdir( $lane_stderr_symlink_dir, 0777, true );
$lane_stderr_symlink_target = $lane_stderr_symlink_dir .
'-target';
file_put_contents( $lane_stderr_symlink_target, "sentinel\n" );

// Leading argv shared by the single-lane, single-case runner invocations in this section.
$single_case_runner_args = array(
	PHP_BINARY,
	__DIR__ . '/../runner.php',
	'--lanes',
	'1',
	'--duration-seconds',
	'0',
	'--max-cases',
	'1',
	'--cases-per-batch',
	'1',
);

// lane-0-stderr.log as a symlink to a sentinel file: the sentinel must stay untouched.
$lane_stderr_symlink_created = @symlink( $lane_stderr_symlink_target, $lane_stderr_symlink_dir . '/lane-0-stderr.log' );
if ( $lane_stderr_symlink_created ) {
	$lane_stderr_symlink_runner = run_process(
		array_merge( $single_case_runner_args, array( '--output-dir', $lane_stderr_symlink_dir ) )
	);
	check(
		'runner rejects symlinked lane stderr output',
		2 === $lane_stderr_symlink_runner['code'] && "sentinel\n" === file_get_contents( $lane_stderr_symlink_target ),
		$lane_stderr_symlink_runner['stdout'] . $lane_stderr_symlink_runner['stderr']
	);
} else {
	check( 'runner rejects symlinked lane stderr output', true, 'symlink unavailable' );
}
remove_tree( $lane_stderr_symlink_dir );
@unlink( $lane_stderr_symlink_target );

// Same scenario with a hardlink instead of a symlink.
$lane_stderr_hardlink_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-hardlink-' . getmypid();
remove_tree( $lane_stderr_hardlink_dir );
mkdir( $lane_stderr_hardlink_dir, 0777, true );
$lane_stderr_hardlink_target = $lane_stderr_hardlink_dir . '-target';
file_put_contents( $lane_stderr_hardlink_target, "sentinel\n" );
$lane_stderr_hardlink_created = @link( $lane_stderr_hardlink_target, $lane_stderr_hardlink_dir . '/lane-0-stderr.log' );
if ( $lane_stderr_hardlink_created ) {
	$lane_stderr_hardlink_runner = run_process(
		array_merge( $single_case_runner_args, array( '--output-dir', $lane_stderr_hardlink_dir ) )
	);
	check(
		'runner rejects hardlinked lane stderr output',
		2 === $lane_stderr_hardlink_runner['code'] && "sentinel\n" === file_get_contents( $lane_stderr_hardlink_target ),
		$lane_stderr_hardlink_runner['stdout'] . $lane_stderr_hardlink_runner['stderr']
	);
} else {
	check( 'runner rejects hardlinked lane stderr output', true, 'hardlink unavailable' );
}
remove_tree( $lane_stderr_hardlink_dir );
@unlink( $lane_stderr_hardlink_target );

// lane-0-stderr.log as a FIFO; skipped when posix_mkfifo() is missing or fails.
$lane_stderr_fifo_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-fifo-' . getmypid();
remove_tree( $lane_stderr_fifo_dir );
mkdir( $lane_stderr_fifo_dir, 0777, true );
$lane_stderr_fifo_created = function_exists( 'posix_mkfifo' ) && @posix_mkfifo( $lane_stderr_fifo_dir . '/lane-0-stderr.log', 0600 );
if ( $lane_stderr_fifo_created ) {
	$lane_stderr_fifo_runner = run_process(
		array_merge( $single_case_runner_args, array( '--output-dir', $lane_stderr_fifo_dir ) )
	);
	check(
		'runner rejects non-regular lane stderr output',
		2 === $lane_stderr_fifo_runner['code'],
		$lane_stderr_fifo_runner['stdout'] . $lane_stderr_fifo_runner['stderr']
	);
} else {
	check( 'runner rejects non-regular lane stderr output', true, 'fifo unavailable' );
}
remove_tree( $lane_stderr_fifo_dir );

// Stderr cap: the same command runs twice against one output dir with a 128-byte cap.
$lane_stderr_cap_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-cap-' . getmypid();
remove_tree( $lane_stderr_cap_dir );
$lane_stderr_cap_runner = run_process(
	array(
		PHP_BINARY,
		__DIR__ . '/../runner.php',
		'--lanes',
		'1',
		'--duration-seconds',
		'0',
		'--max-cases',
		'40',
		'--cases-per-batch',
		'20',
		'--max-stderr-bytes',
		'128',
		'--output-dir',
		$lane_stderr_cap_dir,
	),
	array( 'HTML_DECODER_FUZZ_STDERR_BYTES_PER_CASE' => '100' )
);
$lane_stderr_cap_state = is_file( $lane_stderr_cap_dir . '/state.json' )
	? json_decode( (string) file_get_contents( $lane_stderr_cap_dir . '/state.json' ), true )
	: array();
// Drop any cached stat entry so the size reflects what the child process left on disk.
clearstatcache( true, $lane_stderr_cap_dir . '/lane-0-stderr.log' );
$lane_stderr_cap_size = is_file( $lane_stderr_cap_dir . '/lane-0-stderr.log' ) ? filesize( $lane_stderr_cap_dir . '/lane-0-stderr.log' ) : 0;
check(
	'runner caps per-lane stderr logs',
	0 === $lane_stderr_cap_runner['code'] &&
		$lane_stderr_cap_size <= 128 &&
		1 === count( $lane_stderr_cap_state['worker_stderr_truncated'] ?? array() ),
	$lane_stderr_cap_runner['stdout'] . $lane_stderr_cap_runner['stderr'] . ' stderr_size=' . $lane_stderr_cap_size . ' state=' . json_encode( $lane_stderr_cap_state['worker_stderr_truncated'] ?? null )
);
$lane_stderr_cap_reuse_runner = run_process(
	array(
		PHP_BINARY,
		__DIR__ . '/../runner.php',
		'--lanes',
		'1',
		'--duration-seconds',
		'0',
		'--max-cases',
		'40',
		'--cases-per-batch',
		'20',
		'--max-stderr-bytes',
		'128',
		'--output-dir',
		$lane_stderr_cap_dir,
	),
	array( 'HTML_DECODER_FUZZ_STDERR_BYTES_PER_CASE' => '100' )
);
/*
 * The same path was stat'ed for the first run above. Without clearing the
 * stat cache, filesize() may return that earlier size and the reuse
 * assertion would never observe growth caused by the second run.
 */
clearstatcache( true, $lane_stderr_cap_dir . '/lane-0-stderr.log' );
$lane_stderr_cap_reuse_size = is_file( $lane_stderr_cap_dir . '/lane-0-stderr.log' ) ? filesize( $lane_stderr_cap_dir . '/lane-0-stderr.log' ) : 0;
check(
	'runner preserves per-lane stderr cap on reused output dirs',
	0 === $lane_stderr_cap_reuse_runner['code'] && $lane_stderr_cap_reuse_size <= 128,
	$lane_stderr_cap_reuse_runner['stdout'] . $lane_stderr_cap_reuse_runner['stderr'] . ' stderr_size=' . $lane_stderr_cap_reuse_size
);
remove_tree( $lane_stderr_cap_dir );

// A pre-existing 512-byte lane-0 log with a 128-byte cap.
$lane_stderr_oversize_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-oversize-' . getmypid();
remove_tree( $lane_stderr_oversize_dir );
mkdir( $lane_stderr_oversize_dir, 0777, true );
file_put_contents( $lane_stderr_oversize_dir . '/lane-0-stderr.log', str_repeat( 'X', 512 ) );
$lane_stderr_oversize_runner = run_process(
	array_merge( $single_case_runner_args, array( '--max-stderr-bytes', '128', '--output-dir', $lane_stderr_oversize_dir ) )
);
$lane_stderr_oversize_state = is_file( $lane_stderr_oversize_dir . '/state.json' )
	? json_decode( (string) file_get_contents( $lane_stderr_oversize_dir . '/state.json' ), true )
	: array();
clearstatcache( true, $lane_stderr_oversize_dir . '/lane-0-stderr.log' );
$lane_stderr_oversize_size = is_file( $lane_stderr_oversize_dir . '/lane-0-stderr.log' ) ? filesize( $lane_stderr_oversize_dir . '/lane-0-stderr.log' ) : 0;
check(
	'runner truncates oversized reused lane stderr logs',
	0 === $lane_stderr_oversize_runner['code'] &&
		$lane_stderr_oversize_size <= 128 &&
		array() !== ( $lane_stderr_oversize_state['worker_stderr_startup_truncated'] ?? array() ),
	$lane_stderr_oversize_runner['stdout'] . $lane_stderr_oversize_runner['stderr'] . ' stderr_size=' . $lane_stderr_oversize_size . ' state=' . json_encode( $lane_stderr_oversize_state['worker_stderr_startup_truncated'] ?? null )
);
remove_tree( $lane_stderr_oversize_dir );

// A pre-existing 512-byte log for lane 1 while only one lane is requested.
$lane_stderr_stale_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-lane-stderr-stale-' . getmypid();
remove_tree( $lane_stderr_stale_dir );
mkdir( $lane_stderr_stale_dir, 0777, true );
file_put_contents( $lane_stderr_stale_dir . '/lane-1-stderr.log', str_repeat( 'X', 512 ) );
$lane_stderr_stale_runner = run_process(
	array_merge( $single_case_runner_args, array( '--max-stderr-bytes', '128', '--output-dir', $lane_stderr_stale_dir ) )
);
clearstatcache( true, $lane_stderr_stale_dir . '/lane-1-stderr.log' );
$lane_stderr_stale_size = is_file( $lane_stderr_stale_dir . '/lane-1-stderr.log' ) ? filesize( $lane_stderr_stale_dir . '/lane-1-stderr.log' ) : 0;
check(
	'runner truncates stale stderr logs from inactive lanes',
	0 === $lane_stderr_stale_runner['code'] && $lane_stderr_stale_size <= 128,
	$lane_stderr_stale_runner['stdout'] . $lane_stderr_stale_runner['stderr'] . ' stderr_size=' . $lane_stderr_stale_size
);
remove_tree( $lane_stderr_stale_dir );

$no_summary_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-no-summary-' .
getmypid();
remove_tree( $no_summary_dir );

// Leading argv shared by the single-lane, single-case runner invocations in this section.
$single_case_runner_args = array(
	PHP_BINARY,
	__DIR__ . '/../runner.php',
	'--lanes',
	'1',
	'--duration-seconds',
	'0',
	'--max-cases',
	'1',
	'--cases-per-batch',
	'1',
);
// Trailing options for runs that disable artifact retention; the output dir is appended per test.
$no_retention_args = array( '--artifact-retention', 'none', '--output-dir' );

// --summary-mode none: the check expects exit 0 and no summary.ndjson.
$no_summary_runner = run_process(
	array_merge( $single_case_runner_args, array( '--summary-mode', 'none', '--output-dir', $no_summary_dir ) )
);
check(
	'runner can disable summary output',
	0 === $no_summary_runner['code'] && ! file_exists( $no_summary_dir . '/summary.ndjson' ),
	$no_summary_runner['stdout'] . $no_summary_runner['stderr']
);
remove_tree( $no_summary_dir );

// A failure-* directory holding a payload but no manifest.
$partial_artifact_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-partial-artifact-' . getmypid();
remove_tree( $partial_artifact_dir );
mkdir( $partial_artifact_dir . '/failure-orphan', 0777, true );
file_put_contents( $partial_artifact_dir . '/failure-orphan/payload.txt', 'orphaned payload' );
$partial_artifact_runner = run_process(
	array_merge( $single_case_runner_args, $no_retention_args, array( $partial_artifact_dir ) )
);
if ( is_file( $partial_artifact_dir . '/state.json' ) ) {
	$partial_artifact_state = json_decode( (string) file_get_contents( $partial_artifact_dir . '/state.json' ), true );
} else {
	$partial_artifact_state = array();
}
check(
	'runner prunes partial failure artifacts on startup',
	0 === $partial_artifact_runner['code'] &&
		! is_dir( $partial_artifact_dir . '/failure-orphan' ) &&
		( $partial_artifact_state['artifact_retention']['startup_pruned_partial'] ?? 0 ) > 0,
	$partial_artifact_runner['stdout'] . $partial_artifact_runner['stderr'] . json_encode( $partial_artifact_state['artifact_retention'] ?? null )
);
remove_tree( $partial_artifact_dir );

// A failure-* symlink pointing at a directory whose contents must survive.
$symlink_artifact_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-symlink-artifact-' . getmypid();
remove_tree( $symlink_artifact_dir );
mkdir( $symlink_artifact_dir . '/keepdir', 0777, true );
file_put_contents( $symlink_artifact_dir . '/keepdir/important.txt', 'keep me' );
$symlink_created = @symlink( $symlink_artifact_dir . '/keepdir', $symlink_artifact_dir . '/failure-link' );
if ( ! $symlink_created ) {
	check( 'runner prunes artifact symlinks without deleting targets', true, 'symlink unavailable' );
} else {
	$symlink_artifact_runner = run_process(
		array_merge( $single_case_runner_args, $no_retention_args, array( $symlink_artifact_dir ) )
	);
	check(
		'runner prunes artifact symlinks without deleting targets',
		0 === $symlink_artifact_runner['code'] &&
			! file_exists( $symlink_artifact_dir . '/failure-link' ) &&
			is_file( $symlink_artifact_dir . '/keepdir/important.txt' ),
		$symlink_artifact_runner['stdout'] . $symlink_artifact_runner['stderr']
	);
}
remove_tree( $symlink_artifact_dir );

// The output dir is literally named "run-*"; the sibling "run-victim" must be left alone.
$glob_meta_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-glob-meta-' . getmypid();
remove_tree( $glob_meta_dir );
mkdir( $glob_meta_dir . '/run-*', 0777, true );
mkdir( $glob_meta_dir . '/run-victim/keepdir', 0777, true );
file_put_contents( $glob_meta_dir . '/run-victim/keepdir/important.txt', 'keep me' );
$glob_meta_symlink_created = @symlink( $glob_meta_dir . '/run-victim/keepdir', $glob_meta_dir . '/run-victim/failure-link' );
$glob_meta_runner          = run_process(
	array_merge( $single_case_runner_args, $no_retention_args, array( $glob_meta_dir . '/run-*' ) )
);
check(
	'runner treats output dir metacharacters literally during startup pruning',
	0 === $glob_meta_runner['code'] &&
		( ! $glob_meta_symlink_created || file_exists( $glob_meta_dir . '/run-victim/failure-link' ) ) &&
		is_file( $glob_meta_dir . '/run-victim/keepdir/important.txt' ),
	$glob_meta_runner['stdout'] . $glob_meta_runner['stderr']
);
remove_tree( $glob_meta_dir );

// The artifact path the faulted worker would use is pre-created as a symlink to keepdir.
$symlink_write_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-symlink-write-' . getmypid();
remove_tree( $symlink_write_dir );
mkdir( $symlink_write_dir . '/keepdir', 0777, true );
$symlink_write_manifest = array(
	'signatures' => array( 'decode-mismatch:text', 'reader-decode-mismatch:text' ),
);
file_put_contents( $symlink_write_dir . '/keepdir/failure.json', json_encode( $symlink_write_manifest ) );
$symlink_write_artifact = $symlink_write_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}";
$symlink_write_created  = @symlink( $symlink_write_dir . '/keepdir', $symlink_write_artifact );
if ( ! $symlink_write_created ) {
	check( 'worker does not write through symlinked failure artifact dirs', true, 'symlink unavailable' );
} else {
	$symlink_write_worker = run_process(
		array(
			PHP_BINARY,
			__DIR__ . '/../worker.php',
			'--seed',
			(string) $skip_c1_fault_seed,
			'--cases',
			'200',
			'--output-dir',
			$symlink_write_dir,
			'--progress-every',
			'200',
		),
		array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
	);
	// The check expects the artifact under a "-sig*" suffixed directory name instead.
	$symlink_write_suffixed = glob( $symlink_write_artifact . '-sig*/failure.json' );
	check(
		'worker does not write through symlinked failure artifact dirs',
		1 === $symlink_write_worker['code'] &&
			! is_file( $symlink_write_dir . '/keepdir/payload.txt' ) &&
			is_array( $symlink_write_suffixed ) &&
			array() !== $symlink_write_suffixed,
		$symlink_write_worker['stdout'] . $symlink_write_worker['stderr']
	);
}
remove_tree( $symlink_write_dir );

// Leading argv and environment shared by the two retention-cap runs below; the output dir is appended.
$retention_cap_runner_args = array(
	PHP_BINARY,
	__DIR__ . '/../runner.php',
	'--lanes',
	'1',
	'--duration-seconds',
	'0',
	'--max-cases',
	'1',
	'--seed-base',
	'1',
	'--cases-per-batch',
	'1',
	'--max-artifacts-per-signature',
	'1',
	'--output-dir',
);
$retention_cap_fault_env   = array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' );

// A pre-seeded manifest with an empty payload and no input_size/failures keys.
$incomplete_manifest_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-incomplete-manifest-' . getmypid();
remove_tree( $incomplete_manifest_dir );
mkdir( $incomplete_manifest_dir . '/failure-bad', 0777, true );
$incomplete_manifest_body = array(
	'signatures'     => array( 'reader-decode-mismatch:text' ),
	'context'        => 'text',
	'payload_base64' => '',
);
file_put_contents( $incomplete_manifest_dir . '/failure-bad/failure.json', json_encode( $incomplete_manifest_body ) );
$incomplete_manifest_runner = run_process(
	array_merge( $retention_cap_runner_args, array( $incomplete_manifest_dir ) ),
	$retention_cap_fault_env
);
if ( is_file( $incomplete_manifest_dir . '/state.json' ) ) {
	$incomplete_manifest_state = json_decode( (string) file_get_contents( $incomplete_manifest_dir . '/state.json' ), true );
} else {
	$incomplete_manifest_state = array();
}
$incomplete_manifest_files = glob( $incomplete_manifest_dir . '/failure-*/failure.json' );
if ( is_array( $incomplete_manifest_files ) && 1 === count( $incomplete_manifest_files ) ) {
	$retained_manifest = json_decode( (string) file_get_contents( $incomplete_manifest_files[0] ), true );
} else {
	$retained_manifest = array();
}
check(
	'runner ignores incomplete manifests when enforcing retention cap',
	1 === $incomplete_manifest_runner['code'] &&
		! is_dir( $incomplete_manifest_dir . '/failure-bad' ) &&
		( $incomplete_manifest_state['artifact_retention']['startup_pruned_partial'] ?? 0 ) > 0 &&
		is_array( $retained_manifest ) &&
		isset( $retained_manifest['payload_base64'] ),
	$incomplete_manifest_runner['stdout'] . $incomplete_manifest_runner['stderr'] . json_encode( $incomplete_manifest_state['artifact_retention'] ?? null )
);
remove_tree( $incomplete_manifest_dir );

// A fully-shaped pre-seeded manifest whose payload is the plain string "plain text".
$nonreproducing_manifest_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-nonreproducing-manifest-' . getmypid();
remove_tree( $nonreproducing_manifest_dir );
mkdir( $nonreproducing_manifest_dir . '/failure-fake', 0777, true );
$fake_payload                 = 'plain text';
$nonreproducing_manifest_body = array(
	'signatures'     => array( 'reader-decode-mismatch:text' ),
	'context'        => 'text',
	'input_size'     => strlen( $fake_payload ),
	'payload_base64' => base64_encode( $fake_payload ),
	'failures'       => array(
		array( 'signature' => 'reader-decode-mismatch:text' ),
	),
);
file_put_contents( $nonreproducing_manifest_dir . '/failure-fake/failure.json', json_encode( $nonreproducing_manifest_body ) );
$nonreproducing_manifest_runner = run_process(
	array_merge( $retention_cap_runner_args, array( $nonreproducing_manifest_dir ) ),
	$retention_cap_fault_env
);
if ( is_file( $nonreproducing_manifest_dir . '/state.json' ) ) {
	$nonreproducing_manifest_state = json_decode( (string) file_get_contents( $nonreproducing_manifest_dir . '/state.json' ), true );
} else {
	$nonreproducing_manifest_state = array();
}
$nonreproducing_manifest_files = glob( $nonreproducing_manifest_dir . '/failure-*/failure.json' );
if ( is_array( $nonreproducing_manifest_files ) && 1 === count( $nonreproducing_manifest_files ) ) {
	$nonreproducing_retained = json_decode( (string) file_get_contents( $nonreproducing_manifest_files[0] ), true );
} else {
	$nonreproducing_retained = array();
}
check(
	'runner ignores non-reproducing manifests when enforcing retention cap',
	1 === $nonreproducing_manifest_runner['code'] &&
		! is_dir( $nonreproducing_manifest_dir . '/failure-fake' ) &&
		( $nonreproducing_manifest_state['artifact_retention']['startup_pruned_partial'] ?? 0 ) > 0 &&
		is_array( $nonreproducing_retained ) &&
		isset( $nonreproducing_retained['payload_base64'] ) &&
		'plain text' !== base64_decode( $nonreproducing_retained['payload_base64'], true ),
	$nonreproducing_manifest_runner['stdout'] . $nonreproducing_manifest_runner['stderr'] . json_encode( $nonreproducing_manifest_state['artifact_retention'] ?? null )
);
remove_tree( $nonreproducing_manifest_dir );

$unverified_manifest_dir = sys_get_temp_dir() .
'/html-decoder-fuzz-smoke-unverified-manifest-' . getmypid();
remove_tree( $unverified_manifest_dir );

// First run: 200 cases under the skip-c1-remap fault populate the output dir with artifacts.
$unverified_seed_command = array(
	PHP_BINARY,
	__DIR__ . '/../runner.php',
	'--lanes',
	'1',
	'--duration-seconds',
	'0',
	'--max-cases',
	'200',
	'--seed-base',
	(string) $skip_c1_fault_seed,
	'--cases-per-batch',
	'200',
	'--max-artifacts-per-signature',
	'1',
	'--output-dir',
	$unverified_manifest_dir,
);
$unverified_seed_runner  = run_process(
	$unverified_seed_command,
	array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' )
);
$unverified_before       = glob( $unverified_manifest_dir . '/failure-*/failure.json' );

// Second run: same output dir, no fault, with mb_check_encoding disabled through -d.
$unverified_command = array(
	PHP_BINARY,
	'-d',
	'disable_functions=mb_check_encoding',
	__DIR__ . '/../runner.php',
	'--lanes',
	'1',
	'--duration-seconds',
	'0',
	'--max-cases',
	'1',
	'--seed-base',
	'9999',
	'--cases-per-batch',
	'1',
	'--max-artifacts-per-signature',
	'1',
	'--output-dir',
	$unverified_manifest_dir,
);
$unverified_runner  = run_process( $unverified_command );
$unverified_after   = glob( $unverified_manifest_dir . '/failure-*/failure.json' );

if ( is_file( $unverified_manifest_dir . '/state.json' ) ) {
	$unverified_state = json_decode( (string) file_get_contents( $unverified_manifest_dir . '/state.json' ), true );
} else {
	$unverified_state = array();
}

// The artifact count must be unchanged and the state must flag verification as unavailable.
check(
	'runner preserves retained artifacts when startup verification is unavailable',
	1 === $unverified_seed_runner['code'] &&
		0 === $unverified_runner['code'] &&
		is_array( $unverified_before ) &&
		is_array( $unverified_after ) &&
		count( $unverified_before ) === count( $unverified_after ) &&
		( false !== ( $unverified_state['artifact_retention']['startup_verification_unavailable'] ?? false ) ),
	$unverified_seed_runner['stdout'] . $unverified_seed_runner['stderr'] . $unverified_runner['stdout'] . $unverified_runner['stderr'] . json_encode( $unverified_state['artifact_retention'] ?? null )
);
remove_tree( $unverified_manifest_dir );

$unverified_weak_manifest_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-unverified-weak-manifest-' .
getmypid();
remove_tree( $unverified_weak_manifest_dir );

// Leading argv for the seeding runs (200 cases under skip-c1-remap); the output dir is appended.
$skip_c1_seed_runner_args = array(
	PHP_BINARY,
	__DIR__ . '/../runner.php',
	'--lanes',
	'1',
	'--duration-seconds',
	'0',
	'--max-cases',
	'200',
	'--seed-base',
	(string) $skip_c1_fault_seed,
	'--cases-per-batch',
	'200',
	'--max-artifacts-per-signature',
	'1',
	'--output-dir',
);
// Leading argv for the follow-up runs with mb_check_encoding disabled; the output dir is appended.
$no_mb_check_runner_args = array(
	PHP_BINARY,
	'-d',
	'disable_functions=mb_check_encoding',
	__DIR__ . '/../runner.php',
	'--lanes',
	'1',
	'--duration-seconds',
	'0',
	'--max-cases',
	'1',
	'--seed-base',
	'9999',
	'--cases-per-batch',
	'1',
	'--max-artifacts-per-signature',
	'1',
	'--output-dir',
);
$skip_c1_fault_env       = array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' );

// Weak manifest: signatures, context and payload only (no input_size or failures keys).
$unverified_weak_seed_runner = run_process(
	array_merge( $skip_c1_seed_runner_args, array( $unverified_weak_manifest_dir ) ),
	$skip_c1_fault_env
);
mkdir( $unverified_weak_manifest_dir . '/failure-000weak', 0777, true );
$unverified_weak_manifest = array(
	'signatures'     => array( 'decode-mismatch:text', 'reader-decode-mismatch:text' ),
	'context'        => 'text',
	'payload_base64' => base64_encode( 'x' ),
);
file_put_contents( $unverified_weak_manifest_dir . '/failure-000weak/failure.json', json_encode( $unverified_weak_manifest ) );
$unverified_weak_runner = run_process(
	array_merge( $no_mb_check_runner_args, array( $unverified_weak_manifest_dir ) )
);
if ( is_file( $unverified_weak_manifest_dir . '/state.json' ) ) {
	$unverified_weak_state = json_decode( (string) file_get_contents( $unverified_weak_manifest_dir . '/state.json' ), true );
} else {
	$unverified_weak_state = array();
}
check(
	'runner ignores weak manifests when startup verification is unavailable',
	1 === $unverified_weak_seed_runner['code'] &&
		0 === $unverified_weak_runner['code'] &&
		! is_dir( $unverified_weak_manifest_dir . '/failure-000weak' ) &&
		is_file( $unverified_weak_manifest_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}/failure.json" ) &&
		( $unverified_weak_state['artifact_retention']['startup_pruned_partial'] ?? 0 ) > 0 &&
		( false !== ( $unverified_weak_state['artifact_retention']['startup_verification_unavailable'] ?? false ) ),
	$unverified_weak_seed_runner['stdout'] . $unverified_weak_seed_runner['stderr'] . $unverified_weak_runner['stdout'] . $unverified_weak_runner['stderr'] . json_encode( $unverified_weak_state['artifact_retention'] ?? null )
);
remove_tree( $unverified_weak_manifest_dir );

// Full-shape fake manifest: every key present, payload is the single byte "x".
$unverified_fake_manifest_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-unverified-fake-manifest-' . getmypid();
remove_tree( $unverified_fake_manifest_dir );
$unverified_fake_seed_runner = run_process(
	array_merge( $skip_c1_seed_runner_args, array( $unverified_fake_manifest_dir ) ),
	$skip_c1_fault_env
);
$fake_payload                = 'x';
mkdir( $unverified_fake_manifest_dir . '/failure-000fake', 0777, true );
$unverified_fake_manifest = array(
	'signatures'     => array( 'decode-mismatch:text', 'reader-decode-mismatch:text' ),
	'context'        => 'text',
	'input_size'     => strlen( $fake_payload ),
	'payload_base64' => base64_encode( $fake_payload ),
	'failures'       => array(
		array( 'signature' => 'decode-mismatch:text' ),
		array( 'signature' => 'reader-decode-mismatch:text' ),
	),
);
file_put_contents( $unverified_fake_manifest_dir . '/failure-000fake/failure.json', json_encode( $unverified_fake_manifest ) );
$unverified_fake_runner = run_process(
	array_merge( $no_mb_check_runner_args, array( $unverified_fake_manifest_dir ) )
);
if ( is_file( $unverified_fake_manifest_dir . '/state.json' ) ) {
	$unverified_fake_state = json_decode( (string) file_get_contents( $unverified_fake_manifest_dir . '/state.json' ), true );
} else {
	$unverified_fake_state = array();
}
$unverified_fake_counts = $unverified_fake_state['artifact_retention']['retained_by_signature'] ?? array();
check(
	'runner preserves real artifacts when startup verification cannot reject full-shape fakes',
	1 === $unverified_fake_seed_runner['code'] &&
		0 === $unverified_fake_runner['code'] &&
		is_file( $unverified_fake_manifest_dir . '/failure-000fake/failure.json' ) &&
		is_file( $unverified_fake_manifest_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}/failure.json" ) &&
		array_sum( is_array( $unverified_fake_counts ) ? $unverified_fake_counts : array() ) >= 2 &&
		( false !== ( $unverified_fake_state['artifact_retention']['startup_verification_unavailable'] ?? false ) ),
	$unverified_fake_seed_runner['stdout'] . $unverified_fake_seed_runner['stderr'] . $unverified_fake_runner['stdout'] . $unverified_fake_runner['stderr'] . json_encode( $unverified_fake_state['artifact_retention'] ?? null )
);
remove_tree( $unverified_fake_manifest_dir );

// Integer option validation: both checks expect exit code 2.
$bad_integer = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--cases', 'abc' ) );
check( 'worker rejects non-numeric integer options', 2 === $bad_integer['code'], $bad_integer['stdout'] . $bad_integer['stderr'] );

$huge_integer = run_process( array( PHP_BINARY, __DIR__ . '/../worker.php', '--cases', '999999999999999999999999999999999999999' ) );
check( 'worker rejects out-of-range integer options', 2 === $huge_integer['code'], $huge_integer['stdout'] . $huge_integer['stderr'] );

// Byte-mode pipeline under the byte-no-amp-identity fault: worker, artifact, replay, minimize.
$byte_fault_env    = array( 'HTML_DECODER_FUZZ_FAULT' => 'byte-no-amp-identity' );
$byte_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-byte-' . getmypid();
remove_tree( $byte_pipeline_dir );
$faulted_byte_worker = run_process(
	array(
		PHP_BINARY,
		__DIR__ . '/../worker.php',
		'--mode',
		'bytes',
		'--seed',
		'1',
		'--cases',
		'200',
		'--output-dir',
		$byte_pipeline_dir,
		'--progress-every',
		'200',
	),
	$byte_fault_env
);
check( 'faulted byte-space worker reports findings', 1 === $faulted_byte_worker['code'], $faulted_byte_worker['stdout'] . $faulted_byte_worker['stderr'] );

$byte_failure_files = glob( $byte_pipeline_dir . '/failure-*/failure.json' );
check( 'faulted byte-space worker writes failure artifact', is_array( $byte_failure_files ) && array() !== $byte_failure_files );

$byte_failure_file = null;
if ( is_array( $byte_failure_files ) && array() !== $byte_failure_files ) {
	$byte_failure_file = $byte_failure_files[0];
}
if ( null !== $byte_failure_file ) {
	$byte_manifest = json_decode( (string) file_get_contents( $byte_failure_file ), true );
	check( 'byte-space failure artifact records mode', 'bytes' === ( $byte_manifest['mode'] ?? null ) );

	$byte_replay = run_process(
		array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $byte_failure_file ),
		$byte_fault_env
	);
	check( 'faulted byte-space replay reproduces finding', 1 === $byte_replay['code'], $byte_replay['stdout'] . $byte_replay['stderr'] );

	$byte_minimize = run_process(
		array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $byte_failure_file ),
		$byte_fault_env
	);
	check( 'faulted byte-space minimizer preserves signature', 0 === $byte_minimize['code'], $byte_minimize['stdout'] . $byte_minimize['stderr'] );
}
remove_tree( $byte_pipeline_dir );

// Raw-C1 pipeline: a single byte-mode case (seed 1, case 3) under the raw-c1-not-pass-through fault.
$raw_c1_fault_env    = array( 'HTML_DECODER_FUZZ_FAULT' => 'raw-c1-not-pass-through' );
$raw_c1_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-raw-c1-' . getmypid();
remove_tree( $raw_c1_pipeline_dir );
$faulted_raw_c1_worker = run_process(
	array(
		PHP_BINARY,
		__DIR__ . '/../worker.php',
		'--mode',
		'bytes',
		'--seed',
		'1',
		'--start-case',
		'3',
		'--cases',
		'1',
		'--output-dir',
		$raw_c1_pipeline_dir,
		'--progress-every',
		'1',
	),
	$raw_c1_fault_env
);
check( 'faulted raw-C1 byte worker reports findings', 1 === $faulted_raw_c1_worker['code'], $faulted_raw_c1_worker['stdout'] . $faulted_raw_c1_worker['stderr'] );

$raw_c1_failure_file = $raw_c1_pipeline_dir . '/failure-seed1-case3/failure.json';
check( 'faulted raw-C1 byte worker writes failure artifact', is_file( $raw_c1_failure_file ) );

if ( is_file( $raw_c1_failure_file ) ) {
	$raw_c1_manifest = json_decode( (string) file_get_contents( $raw_c1_failure_file ), true );
	check(
		'raw-C1 byte failure artifact records mode and signature',
		'bytes' === ( $raw_c1_manifest['mode'] ?? null ) &&
			in_array( 'raw-c1-not-pass-through:text', $raw_c1_manifest['signatures'] ?? array(), true ),
		json_encode( $raw_c1_manifest )
	);

	$raw_c1_replay = run_process(
		array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $raw_c1_failure_file ),
		$raw_c1_fault_env
	);
	check( 'faulted raw-C1 byte replay reproduces finding', 1 === $raw_c1_replay['code'], $raw_c1_replay['stdout'] . $raw_c1_replay['stderr'] );

	$raw_c1_minimize = run_process(
		array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $raw_c1_failure_file, '--signature', 'raw-c1-not-pass-through:text' ),
		$raw_c1_fault_env
	);
	check( 'faulted raw-C1 byte minimizer preserves signature', 0 === $raw_c1_minimize['code'], $raw_c1_minimize['stdout'] . $raw_c1_minimize['stderr'] );
}
remove_tree( $raw_c1_pipeline_dir );

$byte_mode_collision_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-byte-mode-collision-' . getmypid();
remove_tree( $byte_mode_collision_dir );
mkdir( $byte_mode_collision_dir .
'/failure-seed1-case3', 0777, true ); +file_put_contents( + $byte_mode_collision_dir . '/failure-seed1-case3/failure.json', + json_encode( + array( + 'signatures' => array( + 'text-without-ampersand-not-identity:text', + 'reader-decode-mismatch:text', + 'attribute-without-ampersand-not-identity:attribute', + 'reader-decode-mismatch:attribute', + ), + ) + ) +); +$byte_mode_collision_worker = run_process( + array( + PHP_BINARY, + __DIR__ . '/../worker.php', + '--mode', + 'bytes', + '--seed', + '1', + '--cases', + '4', + '--output-dir', + $byte_mode_collision_dir, + '--progress-every', + '4', + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'byte-no-amp-identity' ) +); +$byte_mode_collision_suffixed = glob( $byte_mode_collision_dir . '/failure-seed1-case3-sig*/failure.json' ); +check( + 'worker separates same-signature artifacts by mode', + 1 === $byte_mode_collision_worker['code'] && + is_file( $byte_mode_collision_dir . '/failure-seed1-case3/failure.json' ) && + is_array( $byte_mode_collision_suffixed ) && + array() !== $byte_mode_collision_suffixed, + $byte_mode_collision_worker['stdout'] . $byte_mode_collision_worker['stderr'] +); +remove_tree( $byte_mode_collision_dir ); + +$byte_runner_pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-byte-runner-fault-' . getmypid(); +remove_tree( $byte_runner_pipeline_dir ); +$faulted_byte_runner = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--mode', + 'bytes', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '200', + '--cases-per-batch', + '200', + '--max-artifacts-per-signature', + '1', + '--output-dir', + $byte_runner_pipeline_dir, + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'byte-no-amp-identity' ) +); +$faulted_byte_runner_state = is_file( $byte_runner_pipeline_dir . '/state.json' ) + ? json_decode( (string) file_get_contents( $byte_runner_pipeline_dir . 
'/state.json' ), true ) + : array(); +$faulted_byte_runner_modes = array_unique( array_map( static fn( $seed ): string => $seed['mode'] ?? '', $faulted_byte_runner_state['failure_seeds'] ?? array() ) ); +check( + 'faulted byte-space runner reports findings', + 1 === $faulted_byte_runner['code'] && + ( $faulted_byte_runner_state['failures'] ?? 0 ) > 0 && + array( 'bytes' ) === array_values( $faulted_byte_runner_modes ), + $faulted_byte_runner['stdout'] . $faulted_byte_runner['stderr'] . json_encode( $faulted_byte_runner_state ) +); +remove_tree( $byte_runner_pipeline_dir ); + +$mixed_mode_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-mixed-mode-runner-' . getmypid(); +remove_tree( $mixed_mode_runner_dir ); +$mixed_mode_oracle_runner = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '200', + '--seed-base', + '1', + '--cases-per-batch', + '200', + '--max-artifacts-per-signature', + '1', + '--output-dir', + $mixed_mode_runner_dir, + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' ) +); +$mixed_mode_byte_runner = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--mode', + 'bytes', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '200', + '--seed-base', + '1', + '--cases-per-batch', + '200', + '--max-artifacts-per-signature', + '1', + '--output-dir', + $mixed_mode_runner_dir, + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' ) +); +$mixed_mode_state = is_file( $mixed_mode_runner_dir . '/state.json' ) + ? json_decode( (string) file_get_contents( $mixed_mode_runner_dir . '/state.json' ), true ) + : array(); +$mixed_mode_failure_modes = array_unique( array_map( static fn( $seed ): string => $seed['mode'] ?? '', $mixed_mode_state['failure_seeds'] ?? 
array() ) ); +check( + 'runner separates retained same-signature artifacts by mode', + 1 === $mixed_mode_oracle_runner['code'] && + 1 === $mixed_mode_byte_runner['code'] && + in_array( 'bytes', $mixed_mode_failure_modes, true ), + $mixed_mode_oracle_runner['stdout'] . $mixed_mode_oracle_runner['stderr'] . $mixed_mode_byte_runner['stdout'] . $mixed_mode_byte_runner['stderr'] . json_encode( $mixed_mode_state['artifact_retention'] ?? null ) +); +remove_tree( $mixed_mode_runner_dir ); + +$pipeline_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-' . getmypid(); +remove_tree( $pipeline_dir ); +$faulted_worker = run_process( + array( + PHP_BINARY, + __DIR__ . '/../worker.php', + '--seed', + (string) $skip_c1_fault_seed, + '--cases', + '200', + '--output-dir', + $pipeline_dir, + '--progress-every', + '200', + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' ) +); +check( 'faulted worker reports findings', 1 === $faulted_worker['code'], $faulted_worker['stdout'] . $faulted_worker['stderr'] ); + +$failure_files = glob( $pipeline_dir . '/failure-*/failure.json' ); +check( 'faulted worker writes failure artifact', is_array( $failure_files ) && array() !== $failure_files ); + +$failure_file = is_array( $failure_files ) && array() !== $failure_files ? $failure_files[0] : null; +if ( null !== $failure_file ) { + $manifest = json_decode( (string) file_get_contents( $failure_file ), true ); + $detail = $manifest['failures'][0]['detail'] ?? array(); + check( 'failure artifact includes full expected/got', isset( $detail['expected_base64'], $detail['got_base64'] ) ); + + $replay = run_process( + array( PHP_BINARY, __DIR__ . '/../replay.php', '--failure', $failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' ) + ); + check( 'faulted replay reproduces finding', 1 === $replay['code'], $replay['stdout'] . $replay['stderr'] ); + + $minimize = run_process( + array( PHP_BINARY, __DIR__ . 
'/../minimize.php', '--failure', $failure_file ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' ) + ); + check( 'faulted minimizer preserves signature', 0 === $minimize['code'], $minimize['stdout'] . $minimize['stderr'] ); + + $minimize_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-minimize-' . getmypid(); + remove_tree( $minimize_dir ); + $minimize_output_dir = run_process( + array( PHP_BINARY, __DIR__ . '/../minimize.php', '--failure', $failure_file, '--output-dir', $minimize_dir ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' ) + ); + check( + 'minimizer creates requested output directory', + 0 === $minimize_output_dir['code'] && is_file( $minimize_dir . '/minimized.json' ), + $minimize_output_dir['stdout'] . $minimize_output_dir['stderr'] + ); + remove_tree( $minimize_dir ); +} + +remove_tree( $pipeline_dir ); + +$runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-runner-' . getmypid(); +remove_tree( $runner_dir ); +$faulted_runner = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '1000', + '--seed-base', + (string) $skip_c1_fault_seed, + '--cases-per-batch', + '1000', + '--max-artifacts-per-signature', + '1', + '--output-dir', + $runner_dir, + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' ) +); +$runner_state = is_file( $runner_dir . '/state.json' ) + ? json_decode( (string) file_get_contents( $runner_dir . '/state.json' ), true ) + : array(); +check( + 'faulted runner reports findings', + 1 === $faulted_runner['code'] && ( $runner_state['failures'] ?? 0 ) > 0, + $faulted_runner['stdout'] . $faulted_runner['stderr'] +); +$retained_counts = $runner_state['artifact_retention']['retained_by_signature'] ?? 
array(); +check( + 'faulted runner caps retained artifacts by signature', + array() !== $retained_counts && array() === array_filter( $retained_counts, static fn( $count ) => $count > 1 ), + json_encode( $retained_counts ) +); +check( + 'faulted runner prunes repeated failure artifacts', + ( $runner_state['artifact_retention']['pruned'] ?? 0 ) > 0, + json_encode( $runner_state['artifact_retention'] ?? null ) +); +$retained_failure_dirs = glob( $runner_dir . '/failure-*/failure.json' ); +check( + 'faulted runner prunes over-cap failure directories', + is_array( $retained_failure_dirs ) && count( $retained_failure_dirs ) === array_sum( $retained_counts ), + 'dirs=' . ( is_array( $retained_failure_dirs ) ? count( $retained_failure_dirs ) : 0 ) . ' counts=' . json_encode( $retained_counts ) +); +$runner_summary_failures = array(); +if ( is_file( $runner_dir . '/summary.ndjson' ) ) { + foreach ( file( $runner_dir . '/summary.ndjson', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES ) ?: array() as $line ) { + $summary_record = json_decode( $line, true ); + if ( is_array( $summary_record ) && 'failure' === ( $summary_record['type'] ?? null ) ) { + $runner_summary_failures[] = $summary_record; + } + } +} +check( + 'faulted runner writes bounded default failure summary', + count( $runner_summary_failures ) === array_sum( $retained_counts ) && + ( $runner_state['failures'] ?? 0 ) > count( $runner_summary_failures ), + 'failures=' . ( $runner_state['failures'] ?? 0 ) . ' summary_failures=' . count( $runner_summary_failures ) +); +$runner_state_failure_seeds = $runner_state['failure_seeds'] ?? array(); +check( + 'faulted runner writes bounded failure seed state', + is_array( $runner_state_failure_seeds ) && + count( $runner_state_failure_seeds ) === array_sum( $retained_counts ) && + ( $runner_state['failures'] ?? 0 ) > count( $runner_state_failure_seeds ), + 'failures=' . ( $runner_state['failures'] ?? 0 ) . ' state_failure_seeds=' . 
count( $runner_state_failure_seeds ) +); + +$reuse_same_runner = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '1000', + '--seed-base', + (string) $skip_c1_fault_seed, + '--cases-per-batch', + '1000', + '--max-artifacts-per-signature', + '1', + '--output-dir', + $runner_dir, + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' ) +); +$reuse_same_state = is_file( $runner_dir . '/state.json' ) + ? json_decode( (string) file_get_contents( $runner_dir . '/state.json' ), true ) + : array(); +$reuse_same_counts = $reuse_same_state['artifact_retention']['retained_by_signature'] ?? array(); +$reuse_same_dirs = glob( $runner_dir . '/failure-*/failure.json' ); +check( + 'runner preserves retained same-seed artifacts on reuse', + 1 === $reuse_same_runner['code'] && + is_array( $reuse_same_dirs ) && + count( $reuse_same_dirs ) === array_sum( $reuse_same_counts ) && + is_file( $runner_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}/failure.json" ) && + array() === array_filter( $reuse_same_counts, static fn( $count ) => $count > 1 ), + $reuse_same_runner['stdout'] . $reuse_same_runner['stderr'] . json_encode( $reuse_same_state['artifact_retention'] ?? null ) +); +remove_tree( $runner_dir ); + +$different_signature_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-different-signature-reuse-' . getmypid(); +remove_tree( $different_signature_dir ); +$different_signature_first = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '200', + '--seed-base', + (string) $skip_c1_fault_seed, + '--cases-per-batch', + '200', + '--max-artifacts-per-signature', + '100', + '--output-dir', + $different_signature_dir, + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' ) +); +$different_signature_second = run_process( + array( + PHP_BINARY, + __DIR__ . 
'/../runner.php', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '200', + '--seed-base', + (string) $skip_c1_fault_seed, + '--cases-per-batch', + '200', + '--max-artifacts-per-signature', + '100', + '--artifact-retention', + 'all', + '--output-dir', + $different_signature_dir, + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'match-length-off-by-one' ) +); +$different_signature_case_files = glob( $different_signature_dir . "/failure-seed{$skip_c1_fault_seed}-case{$skip_c1_fault_case}*/failure.json" ); +$different_signature_seen = array(); +foreach ( is_array( $different_signature_case_files ) ? $different_signature_case_files : array() as $failure_file ) { + $manifest = json_decode( (string) file_get_contents( $failure_file ), true ); + if ( is_array( $manifest ) && isset( $manifest['signatures'] ) && is_array( $manifest['signatures'] ) ) { + $different_signature_seen[] = implode( ',', $manifest['signatures'] ); + } +} +check( + 'runner preserves same-seed artifacts with different signatures', + 1 === $different_signature_first['code'] && + 1 === $different_signature_second['code'] && + in_array( 'decode-mismatch:text,reader-decode-mismatch:text,decode-mismatch:attribute,reader-decode-mismatch:attribute', $different_signature_seen, true ) && + in_array( 'reader-decode-mismatch:text,reader-decode-mismatch:attribute', $different_signature_seen, true ), + $different_signature_first['stdout'] . $different_signature_first['stderr'] . $different_signature_second['stdout'] . $different_signature_second['stderr'] . json_encode( $different_signature_seen ) +); +remove_tree( $different_signature_dir ); + +$overcap_reuse_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-overcap-reuse-' . getmypid(); +remove_tree( $overcap_reuse_dir ); +$overcap_seed_run = run_process( + array( + PHP_BINARY, + __DIR__ . 
'/../runner.php', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '1000', + '--seed-base', + (string) $skip_c1_fault_seed, + '--cases-per-batch', + '1000', + '--artifact-retention', + 'all', + '--output-dir', + $overcap_reuse_dir, + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' ) +); +$overcap_before_dirs = glob( $overcap_reuse_dir . '/failure-*/failure.json' ); +$overcap_prune_run = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '1', + '--seed-base', + '9999', + '--cases-per-batch', + '1', + '--max-artifacts-per-signature', + '1', + '--output-dir', + $overcap_reuse_dir, + ) +); +$overcap_state = is_file( $overcap_reuse_dir . '/state.json' ) + ? json_decode( (string) file_get_contents( $overcap_reuse_dir . '/state.json' ), true ) + : array(); +$overcap_counts = $overcap_state['artifact_retention']['retained_by_signature'] ?? array(); +$overcap_after_dirs = glob( $overcap_reuse_dir . '/failure-*/failure.json' ); +check( + 'runner prunes reused output dirs back under cap', + 1 === $overcap_seed_run['code'] && + 0 === $overcap_prune_run['code'] && + is_array( $overcap_before_dirs ) && + is_array( $overcap_after_dirs ) && + count( $overcap_before_dirs ) > count( $overcap_after_dirs ) && + ( $overcap_state['artifact_retention']['startup_pruned'] ?? 0 ) > 0 && + count( $overcap_after_dirs ) === array_sum( $overcap_counts ) && + array() === array_filter( $overcap_counts, static fn( $count ) => $count > 1 ), + $overcap_seed_run['stdout'] . $overcap_seed_run['stderr'] . $overcap_prune_run['stdout'] . $overcap_prune_run['stderr'] . json_encode( $overcap_state['artifact_retention'] ?? null ) +); +remove_tree( $overcap_reuse_dir ); + +$no_artifact_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-no-artifacts-' . getmypid(); +remove_tree( $no_artifact_dir ); +$no_artifact_runner = run_process( + array( + PHP_BINARY, + __DIR__ . 
'/../runner.php', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '200', + '--seed-base', + (string) $skip_c1_fault_seed, + '--cases-per-batch', + '200', + '--artifact-retention', + 'none', + '--output-dir', + $no_artifact_dir, + ), + array( 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap' ) +); +$no_artifact_state = is_file( $no_artifact_dir . '/state.json' ) + ? json_decode( (string) file_get_contents( $no_artifact_dir . '/state.json' ), true ) + : array(); +$no_artifact_dirs = glob( $no_artifact_dir . '/failure-*/failure.json' ); +check( + 'runner can prune all failure artifacts', + 1 === $no_artifact_runner['code'] && ( $no_artifact_state['failures'] ?? 0 ) > 0 && ( $no_artifact_state['artifact_retention']['pruned'] ?? 0 ) > 0 && is_array( $no_artifact_dirs ) && 0 === count( $no_artifact_dirs ), + $no_artifact_runner['stdout'] . $no_artifact_runner['stderr'] . json_encode( $no_artifact_state['artifact_retention'] ?? null ) +); +remove_tree( $no_artifact_dir ); + +$corrupt_runner_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-corrupt-runner-' . getmypid(); +remove_tree( $corrupt_runner_dir ); +$corrupt_runner = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '200', + '--seed-base', + (string) $skip_c1_fault_seed, + '--cases-per-batch', + '200', + '--output-dir', + $corrupt_runner_dir, + ), + array( + 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap', + 'HTML_DECODER_FUZZ_CORRUPT_FAILURE_EVENT' => '1', + ) +); +$corrupt_runner_state = is_file( $corrupt_runner_dir . '/state.json' ) + ? json_decode( (string) file_get_contents( $corrupt_runner_dir . '/state.json' ), true ) + : array(); +check( + 'runner treats malformed finding events as harness errors', + 2 === $corrupt_runner['code'] && ( $corrupt_runner_state['harness_errors'] ?? 0 ) > 0, + $corrupt_runner['stdout'] . 
$corrupt_runner['stderr'] +); +remove_tree( $corrupt_runner_dir ); + +$bogus_mode_dir = sys_get_temp_dir() . '/html-decoder-fuzz-smoke-bogus-mode-' . getmypid(); +remove_tree( $bogus_mode_dir ); +$bogus_mode_runner = run_process( + array( + PHP_BINARY, + __DIR__ . '/../runner.php', + '--lanes', + '1', + '--duration-seconds', + '0', + '--max-cases', + '200', + '--seed-base', + (string) $skip_c1_fault_seed, + '--cases-per-batch', + '200', + '--output-dir', + $bogus_mode_dir, + ), + array( + 'HTML_DECODER_FUZZ_FAULT' => 'skip-c1-remap', + 'HTML_DECODER_FUZZ_BOGUS_FAILURE_MODE' => '1', + ) +); +$bogus_mode_state = is_file( $bogus_mode_dir . '/state.json' ) + ? json_decode( (string) file_get_contents( $bogus_mode_dir . '/state.json' ), true ) + : array(); +check( + 'runner treats bogus failure modes as harness errors', + 2 === $bogus_mode_runner['code'] && ( $bogus_mode_state['harness_errors'] ?? 0 ) > 0, + $bogus_mode_runner['stdout'] . $bogus_mode_runner['stderr'] +); +remove_tree( $bogus_mode_dir ); + +echo $failed > 0 ? "\n{$failed} smoke check(s) FAILED\n" : "\nAll smoke checks passed\n"; +exit( $failed > 0 ? 
1 : 0 ); diff --git a/tools/html-decoder-fuzz/worker.php b/tools/html-decoder-fuzz/worker.php new file mode 100644 index 0000000000000..cbe0fbb1adf35 --- /dev/null +++ b/tools/html-decoder-fuzz/worker.php @@ -0,0 +1,318 @@ + 1, + 'cases' => 1000, + 'start-case' => 0, + 'max-bytes' => 4096, + 'mode' => 'oracle', + 'output-dir' => '', + 'progress-every' => 500, + ) +); + +Cli::require_int_at_least( $options, 'cases', 1 ); +Cli::require_int_at_least( $options, 'start-case', 0 ); +Cli::require_int_at_least( $options, 'max-bytes', 1 ); +Cli::require_int_at_least( $options, 'progress-every', 1 ); +Cli::require_one_of( $options, 'mode', Cli::valid_modes() ); + +Bootstrap::load_targets(); + +$oracles = Oracles::build(); +foreach ( $oracles->drain_events() as $event ) { + Cli::emit( array( 'type' => 'oracle-event' ) + $event ); +} + +if ( Cli::mode_uses_oracle( $options['mode'] ) && ! $oracles->has_required() ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => 'required oracle unavailable or failed the battery', + ) + ); + exit( 2 ); +} + +$coverage = null; +if ( 'coverage' === $options['mode'] ) { + if ( ! CoverageGuidance::available() ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => CoverageGuidance::unavailable_reason(), + ) + ); + exit( 2 ); + } + $coverage = new CoverageGuidance(); +} + +$output_dir = $options['output-dir']; +if ( '' !== $output_dir && ! is_dir( $output_dir ) && ! 
mkdir( $output_dir, 0777, true ) ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => "cannot create output dir {$output_dir}", + ) + ); + exit( 2 ); +} + +$checks = new Checks( $oracles ); +$reference_names = Bootstrap::named_reference_names(); +$seed = (string) $options['seed']; +$start = $options['start-case']; +$end = $start + $options['cases']; +$stderr_bytes_per_case = max( 0, (int) getenv( 'HTML_DECODER_FUZZ_STDERR_BYTES_PER_CASE' ) ); +$stats = array( + 'cases' => 0, + 'failures' => 0, + 'bytes' => 0, + 'by_strategy' => array(), + 'by_context' => array(), +); +if ( null !== $coverage ) { + $stats['coverage_new_edges'] = 0; + $stats['coverage_payloads'] = 0; +} +$started_at = microtime( true ); + +Cli::emit( + array( + 'type' => 'start', + 'seed' => $seed, + 'start_case' => $start, + 'cases' => $options['cases'], + 'max_bytes' => $options['max-bytes'], + 'mode' => $options['mode'], + 'environment' => Cli::environment_metadata( $oracles ), + ) +); + +for ( $case = $start; $case < $end; $case++ ) { + if ( $stderr_bytes_per_case > 0 ) { + fwrite( STDERR, str_repeat( 'E', $stderr_bytes_per_case ) . 
"\n" ); + } + + $prng = new Prng( "{$seed}:{$case}" ); + $generator = new Generator( $prng, $options['max-bytes'], $reference_names ); + if ( 'bytes' === $options['mode'] ) { + $generated = $generator->generate_bytes(); + } elseif ( 'names' === $options['mode'] ) { + $generated = $generator->generate_name_sweep( $case ); + } elseif ( 'legacy-followers' === $options['mode'] ) { + $generated = $generator->generate_legacy_follower_sweep( $case ); + } elseif ( 'prefix-families' === $options['mode'] ) { + $generated = $generator->generate_prefix_family_sweep( $case ); + } elseif ( 'numeric-boundaries' === $options['mode'] ) { + $generated = $generator->generate_numeric_boundary_sweep( $case ); + } elseif ( 'corpus' === $options['mode'] ) { + $generated = $generator->generate_corpus_mutation( $case ); + } elseif ( 'token-map' === $options['mode'] ) { + $generated = $generator->generate_token_map_sweep( $case ); + } elseif ( 'coverage' === $options['mode'] ) { + $generated = $generator->generate(); + } else { + $generated = $generator->generate(); + } + $payload = $generated['payload']; + $context = $generated['context']; + $strategy = $generated['strategy']; + + if ( null !== $coverage ) { + $coverage->begin_case(); + } + $failures = 'bytes' === $options['mode'] + ? $checks->run_without_oracle( $context, $payload ) + : $checks->run( $context, $payload ); + $coverage_edges = null === $coverage ? array() : $coverage->finish_case( $payload, $context, $strategy ); + + ++$stats['cases']; + $stats['bytes'] += strlen( $payload ); + $stats['by_strategy'][ $strategy ] = ( $stats['by_strategy'][ $strategy ] ?? 0 ) + 1; + $stats['by_context'][ $context ] = ( $stats['by_context'][ $context ] ?? 
0 ) + 1; + + if ( null !== $coverage ) { + $new_edges = $coverage->new_edges( $coverage_edges ); + if ( array() !== $new_edges ) { + $stats['coverage_new_edges'] += count( $new_edges ); + ++$stats['coverage_payloads']; + + try { + $coverage_artifact = $coverage->retain_payload( $output_dir, $seed, $case, $generated, $payload, $new_edges ); + } catch ( \RuntimeException $exception ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => $exception->getMessage(), + ) + ); + exit( 2 ); + } + + $coverage_record = array( + 'type' => 'coverage', + 'seed' => $seed, + 'case' => $case, + 'mode' => $options['mode'], + 'context' => $context, + 'strategy' => $strategy, + 'input_size' => strlen( $payload ), + 'coverage_provider' => $coverage->provider(), + 'edge_count' => count( $coverage_edges ), + 'seen_edge_count' => $coverage->seen_edge_count(), + 'new_edge_count' => count( $new_edges ), + 'new_edges' => $new_edges, + ) + $coverage_artifact; + if ( strlen( $payload ) <= 4096 ) { + $coverage_record['payload_base64'] = base64_encode( $payload ); + } + Cli::emit( $coverage_record ); + } + } + + if ( array() !== $failures ) { + $stats['failures'] += count( $failures ); + + $record = array( + 'type' => 'failure', + 'seed' => $seed, + 'case' => $case, + 'mode' => $options['mode'], + 'context' => $context, + 'strategy' => $strategy, + 'input_size' => strlen( $payload ), + 'signatures' => array_values( array_unique( array_column( $failures, 'signature' ) ) ), + 'failures' => $failures, + ); + + if ( strlen( $payload ) <= 4096 ) { + $record['payload_base64'] = base64_encode( $payload ); + } + + if ( '' !== $output_dir ) { + $signature_key = Cli::failure_signature_key( $record['signatures'], $record['mode'] ); + $base_case_dir = "{$output_dir}/failure-seed{$seed}-case{$case}"; + $case_dir = $base_case_dir; + $dir_matches_signature = static function ( string $dir ) use ( $signature_key ): bool { + if ( is_link( $dir ) ) { + return false; + } + + $manifest = json_decode( (string) 
@file_get_contents( "{$dir}/failure.json" ), true ); + $manifest_mode = $manifest['mode'] ?? 'oracle'; + return is_array( $manifest ) && + isset( $manifest['signatures'] ) && + is_array( $manifest['signatures'] ) && + is_string( $manifest_mode ) && + in_array( $manifest_mode, Cli::valid_modes(), true ) && + $signature_key === Cli::failure_signature_key( $manifest['signatures'], $manifest_mode ); + }; + + if ( is_link( $case_dir ) || ( is_dir( $case_dir ) && ! $dir_matches_signature( $case_dir ) ) ) { + $suffix = substr( $signature_key, 0, 12 ); + $case_dir = "{$base_case_dir}-sig{$suffix}"; + $attempt = 2; + while ( is_link( $case_dir ) || ( is_dir( $case_dir ) && ! $dir_matches_signature( $case_dir ) ) ) { + $case_dir = "{$base_case_dir}-sig{$suffix}-{$attempt}"; + ++$attempt; + } + } + + if ( ! is_dir( $case_dir ) && ! mkdir( $case_dir, 0777, true ) ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => "cannot create failure artifact dir {$case_dir}", + ) + ); + exit( 2 ); + } + if ( ! Cli::write_file( "{$case_dir}/payload.txt", $payload ) ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => "cannot write failure payload under {$case_dir}", + ) + ); + exit( 2 ); + } + + $artifact = $record; + $artifact['payload_base64'] = base64_encode( $payload ); + $artifact['environment'] = Cli::environment_metadata( $oracles ); + $artifact['git'] = Cli::git_metadata( Bootstrap::repo_root() ); + $artifact_json = json_encode( + $artifact, + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES + ); + if ( false === $artifact_json || ! Cli::write_file( "{$case_dir}/failure.json", $artifact_json ) ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => "cannot write failure manifest under {$case_dir}", + ) + ); + exit( 2 ); + } + $record['artifact_dir'] = $case_dir; + } + + if ( getenv( 'HTML_DECODER_FUZZ_CORRUPT_FAILURE_EVENT' ) ) { + if ( ! 
Cli::write_stream( STDOUT, "{\"type\":\"failure\"\n" ) ) { + fwrite( STDERR, "Cannot write corrupted failure event\n" ); + exit( 2 ); + } + } else { + if ( getenv( 'HTML_DECODER_FUZZ_BOGUS_FAILURE_MODE' ) ) { + $record['mode'] = 'bogus'; + } + Cli::emit( $record ); + } + } + + if ( 0 === ( $stats['cases'] % max( 1, $options['progress-every'] ) ) ) { + $elapsed = microtime( true ) - $started_at; + Cli::emit( + array( + 'type' => 'progress', + 'seed' => $seed, + 'case' => $case, + 'cases_done' => $stats['cases'], + 'failures' => $stats['failures'], + 'cases_per_sec' => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null, + ) + ); + } +} + +$elapsed = microtime( true ) - $started_at; +Cli::emit( + array( + 'type' => 'done', + 'seed' => $seed, + 'stats' => $stats, + 'elapsed_sec' => round( $elapsed, 2 ), + 'cases_per_sec' => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null, + ) +); + +exit( $stats['failures'] > 0 ? 1 : 0 );