From 3cc3e64765aab7410e2f8c9c85dbb679ad511cc7 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 14:54:49 +0200 Subject: [PATCH 01/14] Add differential fuzzer for WordPress UTF-8 encoding functions. Fuzzes wp_is_valid_utf8(), wp_scrub_utf8(), and their pure-PHP fallbacks against five independent known-good oracles: mbstring, PCRE2, ICU (intl), CPython, and the WHATWG TextDecoder (Node), the last two as persistent subprocesses. All oracles must pass a hand-computed known-answer battery before use; iconv is excluded because libiconv accepts code points above U+10FFFF. Beyond the differentials, internal invariants are checked: validity iff scrub identity, scrub output validity, scrub idempotence, code point counts against the scrubbed length, and chunked _wp_scan_utf8() reconstruction with deterministic resumable-scan budgets. Inputs mix nine deterministic strategies (random bytes, random ASCII, boundary-heavy valid UTF-8, mutations, invalid-atom splices, latin1, UTF-16, ASCII fast-path stress, repeated motifs); every case is reproducible from (seed, case index) alone. Includes a multi-lane runner with stall detection, replay and signature-preserving minimization tools, and a harness self-test that mutation-tests detection against seven classes of deliberately broken implementations. 
--- tools/encoding-fuzz/README.md | 141 +++++++ tools/encoding-fuzz/lib/Bootstrap.php | 36 ++ tools/encoding-fuzz/lib/Checks.php | 358 ++++++++++++++++++ tools/encoding-fuzz/lib/Cli.php | 112 ++++++ tools/encoding-fuzz/lib/ExternalOracle.php | 177 +++++++++ tools/encoding-fuzz/lib/Generator.php | 375 +++++++++++++++++++ tools/encoding-fuzz/lib/Oracles.php | 227 +++++++++++ tools/encoding-fuzz/lib/Prng.php | 92 +++++ tools/encoding-fuzz/lib/Targets.php | 44 +++ tools/encoding-fuzz/lib/autoload.php | 16 + tools/encoding-fuzz/lib/wp-stubs.php | 16 + tools/encoding-fuzz/minimize.php | 161 ++++++++ tools/encoding-fuzz/oracles/oracle-node.mjs | 54 +++ tools/encoding-fuzz/oracles/oracle-python.py | 53 +++ tools/encoding-fuzz/replay.php | 93 +++++ tools/encoding-fuzz/runner.php | 280 ++++++++++++++ tools/encoding-fuzz/tests/harness-smoke.php | 184 +++++++++ tools/encoding-fuzz/worker.php | 173 +++++++++ 18 files changed, 2592 insertions(+) create mode 100644 tools/encoding-fuzz/README.md create mode 100644 tools/encoding-fuzz/lib/Bootstrap.php create mode 100644 tools/encoding-fuzz/lib/Checks.php create mode 100644 tools/encoding-fuzz/lib/Cli.php create mode 100644 tools/encoding-fuzz/lib/ExternalOracle.php create mode 100644 tools/encoding-fuzz/lib/Generator.php create mode 100644 tools/encoding-fuzz/lib/Oracles.php create mode 100644 tools/encoding-fuzz/lib/Prng.php create mode 100644 tools/encoding-fuzz/lib/Targets.php create mode 100644 tools/encoding-fuzz/lib/autoload.php create mode 100644 tools/encoding-fuzz/lib/wp-stubs.php create mode 100644 tools/encoding-fuzz/minimize.php create mode 100644 tools/encoding-fuzz/oracles/oracle-node.mjs create mode 100644 tools/encoding-fuzz/oracles/oracle-python.py create mode 100644 tools/encoding-fuzz/replay.php create mode 100644 tools/encoding-fuzz/runner.php create mode 100644 tools/encoding-fuzz/tests/harness-smoke.php create mode 100644 tools/encoding-fuzz/worker.php diff --git a/tools/encoding-fuzz/README.md 
b/tools/encoding-fuzz/README.md new file mode 100644 index 0000000000000..8deee79516156 --- /dev/null +++ b/tools/encoding-fuzz/README.md @@ -0,0 +1,141 @@ +# UTF-8 Encoding Fuzzer + +Differential fuzzer for the WordPress UTF-8 functions: + +- `wp_is_valid_utf8()` / `_wp_is_valid_utf8_fallback()` +- `wp_scrub_utf8()` / `_wp_scrub_utf8_fallback()` +- `_wp_utf8_codepoint_count()` and the resumable `_wp_scan_utf8()` paths (secondary) + +The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main +fuzz surface; the mbstring-backed public functions are checked alongside +them. Only `compat-utf8.php` and `utf8.php` are loaded — no WordPress +bootstrap, database, or `wp-env`. + +## Oracles + +Every result is compared against independent known-good implementations: + +| Oracle | Backing | Validity | Scrub | +|-----------|--------------------------------------|----------|-------| +| `mb` | `mb_check_encoding()` / `mb_scrub()` | ✓ | ✓ (primary) | +| `pcre` | PCRE2 strict UTF validation | ✓ | | +| `intl` | ICU `UConverter::transcode()` | | ✓ | +| `python3` | CPython codec, persistent subprocess | ✓ | ✓ | +| `node` | WHATWG `TextDecoder`, persistent subprocess | ✓ | ✓ | + +All scrub oracles implement the Unicode "maximal subpart" replacement +recommendation (Unicode 16.0 §3.9, Table 3-8), which is the documented +behavior of `wp_scrub_utf8()`. Every oracle must pass a hand-computed +known-answer battery at startup; one that fails (or whose subprocess +dies) is disabled and reported rather than allowed to produce noise. +iconv is deliberately excluded: GNU libiconv accepts code points above +U+10FFFF and fails the battery. + +`mb` (PHP ≥ 8.1.6, for maximal-subpart `mb_scrub()`) is required. +External oracles are auto-detected; control them with +`--external auto|python3|node|python3,node|none`. + +## Checks + +Differentials: both validity targets against every validity oracle, both +scrub targets against every scrub oracle. 
Oracle-vs-oracle disagreements +are reported separately (`oracle-disagreement`) so they don't masquerade +as WordPress bugs. + +Internal invariants: + +- valid ⟺ scrub returns the input unchanged +- scrub output is always valid UTF-8 +- scrub is idempotent +- `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed text + (each maximal subpart counts as one code point) +- scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points` + chunks reconstructs the same scrubbed text and always makes forward + progress (chunk sizes derive from the input hash, so replays are exact) + +## Inputs + +Each case is fully determined by `(seed, case index)`. The generator +mixes nine strategies: uniformly random bytes, random ASCII, +boundary-heavy valid UTF-8 (encoding-length edges, surrogate-gap edges, +noncharacters, BOM, U+10FFFF), mutated valid UTF-8 (bit flips, +truncations, splices), splices of hand-picked valid/invalid atoms +(overlongs, surrogates, truncated sequences, out-of-range leads), +ISO-8859-1-ish text, UTF-16 with/without BOM, long ASCII runs with +broken tails (`strspn()` fast-path stress), and repeated motifs. +Roughly a third of generated inputs are fully valid UTF-8. + +## Common Commands + +Run one worker batch: + +```sh +php tools/encoding-fuzz/worker.php --seed 1 --cases 5000 +``` + +Run parallel lanes for a minute (artifacts under `artifacts/encoding-fuzz/`): + +```sh +php tools/encoding-fuzz/runner.php --lanes 4 --duration-seconds 60 +``` + +Run indefinitely: + +```sh +php tools/encoding-fuzz/runner.php --lanes 8 --duration-seconds 0 --max-cases 0 +``` + +The duration budget stops new batches; in-flight batches finish, so a +run can overshoot by up to one batch (`--cases-per-batch`, default 2000). +A lane silent for `--stall-timeout` seconds (default 120) is killed and +its seed recorded for reproduction. 
+ +Replay a failure (or any input, or a re-derived case): + +```sh +php tools/encoding-fuzz/replay.php --failure artifacts/encoding-fuzz/run-.../failure-seedS-caseN/failure.json +php tools/encoding-fuzz/replay.php --input some-bytes.bin +php tools/encoding-fuzz/replay.php --seed 123 --case 45 +``` + +Minimize a failure while preserving its signature: + +```sh +php tools/encoding-fuzz/minimize.php --failure .../failure.json +``` + +Exit codes everywhere: `0` clean, `1` findings, `2` harness error. + +## Artifacts + +The runner writes `summary.ndjson` (every worker event), `state.json` +(aggregate counters, failure/stall seeds, compact Git metadata, stop +reason), per-lane stderr logs, and one directory per failing case with +`input.bin` and a self-contained `failure.json` (base64 input, signatures, +diff windows with hex previews, environment and Git metadata). + +## Harness Self-Test + +```sh +php tools/encoding-fuzz/tests/harness-smoke.php +``` + +Verifies the oracle battery, runs the real targets over the battery +vectors, and — most importantly — mutation-tests the harness: seven +classes of deliberately broken implementations (validator accepting +0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber, +identity scrubber, byte-dropping scrubber, off-by-one code point count, +throwing target) must all be caught. It also asserts generator +determinism and the valid/invalid input mix. + +For end-to-end pipeline testing while the real implementations are +healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal` injects a broken +target into worker, replay, and minimize alike: + +```sh +ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/runner.php --lanes 2 --duration-seconds 5 +ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/minimize.php --failure .../failure.json +``` + +(The `non-maximal` fault minimizes to the two bytes `E0 F4`: two +adjacent maximal subparts whose replacement characters get collapsed.) 
diff --git a/tools/encoding-fuzz/lib/Bootstrap.php b/tools/encoding-fuzz/lib/Bootstrap.php new file mode 100644 index 0000000000000..e92921dcf272d --- /dev/null +++ b/tools/encoding-fuzz/lib/Bootstrap.php @@ -0,0 +1,36 @@ + */ + private array $targets; + + public function __construct( Oracles $oracles, ?array $targets = null ) { + $this->oracles = $oracles; + $this->targets = $targets ?? Targets::resolve(); + } + + /** + * @return array Failures; empty when all checks pass. + */ + public function run( string $input ): array { + $failures = array(); + + // Reference values from the primary oracle. + $mb_validity = $this->oracles->validity_oracles()['mb'] ?? null; + $mb_scrubber = $this->oracles->scrub_oracles()['mb'] ?? null; + if ( null === $mb_validity || null === $mb_scrubber ) { + return array( self::failure( 'harness-error', 'harness', array( 'reason' => 'mb oracle unavailable' ) ) ); + } + + $ref_valid = $mb_validity( $input ); + $ref_scrub = $mb_scrubber( $input ); + + // Target executions, guarded against exceptions. + $results = array(); + foreach ( array( 'is_valid', 'is_valid_fb', 'scrub', 'scrub_fb' ) as $key ) { + try { + $results[ $key ] = ( $this->targets[ $key ] )( $input ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + $key, + array( + 'target' => $key, + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + $results[ $key ] = null; + } + } + + // 1. Validity differential. 
+ foreach ( array( 'is_valid', 'is_valid_fb' ) as $key ) { + if ( null !== $results[ $key ] && $results[ $key ] !== $ref_valid ) { + $failures[] = self::failure( + 'validity-mismatch', + $key, + array( + 'target' => $key, + 'got' => $results[ $key ], + 'expected' => $ref_valid, + 'oracle' => 'mb', + ) + ); + } + } + + foreach ( $this->oracles->validity_oracles() as $name => $oracle ) { + if ( 'mb' === $name ) { + continue; + } + + $oracle_valid = $oracle( $input ); + if ( null === $oracle_valid ) { + $this->oracles->disable( $name, 'transport failure during case' ); + continue; + } + + if ( $oracle_valid !== $ref_valid ) { + $failures[] = self::failure( + 'oracle-disagreement', + "validity:{$name}", + array( + 'kind' => 'validity', + 'oracle' => $name, + 'got' => $oracle_valid, + 'expected' => $ref_valid, + ) + ); + } + } + + // 2. Scrub differential. + foreach ( array( 'scrub', 'scrub_fb' ) as $key ) { + if ( null !== $results[ $key ] && $results[ $key ] !== $ref_scrub ) { + $failures[] = self::failure( + 'scrub-mismatch', + $key, + self::diff_detail( $key, $ref_scrub, $results[ $key ] ) + ); + } + } + + foreach ( $this->oracles->scrub_oracles() as $name => $oracle ) { + if ( 'mb' === $name ) { + continue; + } + + $oracle_scrub = $oracle( $input ); + if ( null === $oracle_scrub ) { + $this->oracles->disable( $name, 'transport failure during case' ); + continue; + } + + if ( $oracle_scrub !== $ref_scrub ) { + $failures[] = self::failure( + 'oracle-disagreement', + "scrub:{$name}", + self::diff_detail( $name, $ref_scrub, $oracle_scrub ) + ); + } + } + + // 3. valid ⟺ scrub identity. 
+ foreach ( array( 'is_valid' => 'scrub', 'is_valid_fb' => 'scrub_fb' ) as $valid_key => $scrub_key ) { + if ( null === $results[ $valid_key ] || null === $results[ $scrub_key ] ) { + continue; + } + + $identity = $results[ $scrub_key ] === $input; + if ( $results[ $valid_key ] !== $identity ) { + $failures[] = self::failure( + 'valid-iff-scrub-identity', + $valid_key, + array( + 'valid_target' => $valid_key, + 'scrub_target' => $scrub_key, + 'valid' => $results[ $valid_key ], + 'scrub_identity' => $identity, + ) + ); + } + } + + // 4. Scrub output must be valid UTF-8. 5. Scrub must be idempotent. + foreach ( array( 'scrub', 'scrub_fb' ) as $key ) { + if ( null === $results[ $key ] ) { + continue; + } + + $scrubbed = $results[ $key ]; + if ( ! $mb_validity( $scrubbed ) ) { + $failures[] = self::failure( + 'scrubbed-not-valid', + $key, + array( + 'target' => $key, + 'scrub_preview' => self::preview( $scrubbed ), + ) + ); + } + + try { + $twice = ( $this->targets[ $key ] )( $scrubbed ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + "{$key}:idempotence", + array( + 'target' => $key, + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + $twice = $scrubbed; + } + + if ( $twice !== $scrubbed ) { + $failures[] = self::failure( + 'scrub-not-idempotent', + $key, + self::diff_detail( $key, $scrubbed, $twice ) + ); + } + } + + // 6. Code point count agrees with the scrubbed length. 
+ try { + $count = ( $this->targets['codepoint_count'] )( $input ); + $expected = mb_strlen( $ref_scrub, 'UTF-8' ); + if ( $count !== $expected ) { + $failures[] = self::failure( + 'codepoint-count-mismatch', + 'codepoint_count', + array( + 'got' => $count, + 'expected' => $expected, + ) + ); + } + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + 'codepoint_count', + array( + 'target' => 'codepoint_count', + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + } + + // 7. Chunked scan reconstruction. + $chunk_failure = $this->check_chunked_scan( $input, $ref_scrub ); + if ( null !== $chunk_failure ) { + $failures[] = $chunk_failure; + } + + return $failures; + } + + /** + * Rebuilds the scrubbed text by calling `_wp_scan_utf8()` directly + * with pseudo-random `max_code_points` budgets, exercising the + * resumable-scan paths the plain fallbacks never hit. Chunk sizes + * derive from the input hash, so replaying the input replays the + * exact chunking. + */ + private function check_chunked_scan( string $input, string $ref_scrub ): ?array { + if ( ! 
function_exists( '_wp_scan_utf8' ) ) { + return null; + } + + $length = strlen( $input ); + $chunk_bytes = hash( 'sha256', $input, true ); + $chunk_index = 0; + $at = 0; + $out = ''; + $guard = ( 2 * $length ) + 16; + + while ( $at < $length ) { + if ( --$guard < 0 ) { + return self::failure( + 'scan-no-progress', + 'chunked-scan', + array( + 'at' => $at, + 'length' => $length, + ) + ); + } + + $was_at = $at; + $invalid_length = 0; + $max_points = 1 + ( ord( $chunk_bytes[ $chunk_index % 32 ] ) % 7 ); + ++$chunk_index; + + try { + _wp_scan_utf8( $input, $at, $invalid_length, null, $max_points ); + } catch ( \Throwable $error ) { + return self::failure( + 'target-exception', + 'chunked-scan', + array( + 'target' => '_wp_scan_utf8', + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + } + + $out .= substr( $input, $was_at, $at - $was_at ); + + if ( $invalid_length > 0 ) { + $out .= "\u{FFFD}"; + $at += $invalid_length; + } elseif ( $at === $was_at && $at < $length ) { + return self::failure( + 'scan-no-progress', + 'chunked-scan', + array( + 'at' => $at, + 'length' => $length, + 'max_points' => $max_points, + ) + ); + } + } + + if ( $out !== $ref_scrub ) { + return self::failure( + 'chunked-scan-mismatch', + 'chunked-scan', + self::diff_detail( 'chunked-scan', $ref_scrub, $out ) + ); + } + + return null; + } + + private static function failure( string $check, string $party, array $detail ): array { + return array( + 'check' => $check, + 'signature' => "{$check}:{$party}", + 'detail' => $detail, + ); + } + + private static function diff_detail( string $party, string $expected, string $got ): array { + $offset = self::first_difference( $expected, $got ); + + return array( + 'party' => $party, + 'expected_length' => strlen( $expected ), + 'got_length' => strlen( $got ), + 'first_diff_at' => $offset, + 'expected_window' => self::preview( $expected, $offset ), + 'got_window' => self::preview( $got, $offset ), + ); + } + + private static 
function first_difference( string $a, string $b ): int { + $max = min( strlen( $a ), strlen( $b ) ); + for ( $i = 0; $i < $max; $i++ ) { + if ( $a[ $i ] !== $b[ $i ] ) { + return $i; + } + } + return $max; + } + + private static function preview( string $bytes, int $center = 0 ): string { + $start = max( 0, $center - intdiv( self::PREVIEW_BYTES, 2 ) ); + return bin2hex( substr( $bytes, $start, self::PREVIEW_BYTES ) ); + } +} diff --git a/tools/encoding-fuzz/lib/Cli.php b/tools/encoding-fuzz/lib/Cli.php new file mode 100644 index 0000000000000..14c5d4f671324 --- /dev/null +++ b/tools/encoding-fuzz/lib/Cli.php @@ -0,0 +1,112 @@ + $defaults Option name => default value. + * @return array + */ + public static function parse_args( array $argv, array $defaults ): array { + $options = $defaults; + $count = count( $argv ); + + for ( $i = 1; $i < $count; $i++ ) { + $arg = $argv[ $i ]; + if ( 0 !== strncmp( $arg, '--', 2 ) ) { + fwrite( STDERR, "Unexpected argument: {$arg}\n" ); + exit( 2 ); + } + + $body = substr( $arg, 2 ); + if ( false !== strpos( $body, '=' ) ) { + list( $name, $value ) = explode( '=', $body, 2 ); + } else { + $name = $body; + if ( $i + 1 >= $count ) { + fwrite( STDERR, "Missing value for --{$name}\n" ); + exit( 2 ); + } + $value = $argv[ ++$i ]; + } + + if ( ! array_key_exists( $name, $defaults ) ) { + fwrite( STDERR, "Unknown option --{$name}\n" ); + exit( 2 ); + } + + $options[ $name ] = is_int( $defaults[ $name ] ) ? (int) $value : $value; + } + + return $options; + } + + /** + * Resolves an `--external` option value to a list of oracle names. 
+ * + * @return string[] + */ + public static function resolve_externals( string $option ): array { + if ( 'none' === $option ) { + return array(); + } + + if ( 'auto' === $option ) { + return array( 'python3', 'node' ); + } + + return array_values( array_filter( array_map( 'trim', explode( ',', $option ) ) ) ); + } + + public static function emit( array $record ): void { + fwrite( STDOUT, json_encode( $record, JSON_UNESCAPED_SLASHES ) . "\n" ); + } + + /** + * Compact Git metadata, collected once per process. + */ + public static function git_metadata( string $repo_root ): array { + $run = static function ( array $command ) use ( $repo_root ): ?string { + $process = @proc_open( + $command, + array( + 0 => array( 'file', '/dev/null', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'file', '/dev/null', 'a' ), + ), + $pipes, + $repo_root + ); + if ( ! is_resource( $process ) ) { + return null; + } + $out = stream_get_contents( $pipes[1] ); + fclose( $pipes[1] ); + $code = proc_close( $process ); + return 0 === $code ? trim( (string) $out ) : null; + }; + + $commit = $run( array( 'git', 'rev-parse', 'HEAD' ) ); + $branch = $run( array( 'git', 'rev-parse', '--abbrev-ref', 'HEAD' ) ); + $status = $run( array( 'git', 'status', '--porcelain', '--untracked-files=no' ) ); + + return array( + 'commit' => $commit, + 'branch' => $branch, + 'dirty' => null === $status ? null : '' !== $status, + ); + } + + public static function environment_metadata( Oracles $oracles ): array { + return array( + 'php' => PHP_VERSION, + 'os' => PHP_OS_FAMILY, + 'oracles' => $oracles->names(), + ); + } +} diff --git a/tools/encoding-fuzz/lib/ExternalOracle.php b/tools/encoding-fuzz/lib/ExternalOracle.php new file mode 100644 index 0000000000000..8130ad484388b --- /dev/null +++ b/tools/encoding-fuzz/lib/ExternalOracle.php @@ -0,0 +1,177 @@ +name = $name; + $this->command = $command; + } + + /** + * @return array{0: ?self, 1: ?string} Oracle or null, plus error message. 
+ */ + public static function create( string $name ): array { + switch ( $name ) { + case 'python3': + $command = array( 'python3', __DIR__ . '/../oracles/oracle-python.py' ); + break; + case 'node': + $command = array( 'node', __DIR__ . '/../oracles/oracle-node.mjs' ); + break; + default: + return array( null, "unknown external oracle '{$name}'" ); + } + + $oracle = new self( $name, $command ); + $error = $oracle->start(); + if ( null !== $error ) { + return array( null, $error ); + } + + return array( $oracle, null ); + } + + private function start(): ?string { + $descriptors = array( + 0 => array( 'pipe', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'file', '/dev/null', 'a' ), + ); + + $process = @proc_open( $this->command, $descriptors, $pipes ); + if ( ! is_resource( $process ) ) { + return "failed to launch {$this->name} oracle"; + } + + $this->process = $process; + $this->stdin = $pipes[0]; + $this->stdout = $pipes[1]; + + // Probe with a trivial request so launch failures surface immediately. + $probe = $this->check( 'ok' ); + if ( null === $probe || true !== $probe['valid'] || 'ok' !== $probe['scrubbed'] ) { + $detail = $this->last_error ?? 'bad probe response'; + $this->shutdown(); + return "{$this->name} oracle failed startup probe: {$detail}"; + } + + return null; + } + + public function is_alive(): bool { + return null !== $this->process; + } + + public function last_error(): ?string { + return $this->last_error; + } + + /** + * @return array{valid: bool, scrubbed: string}|null Null on transport failure. + */ + public function check( string $bytes ): ?array { + if ( null === $this->process ) { + return null; + } + + // Validity and scrub oracles ask about the same input back to + // back; answer both from one pipe round trip. + if ( $bytes === $this->memo_input ) { + return $this->memo_result; + } + + $request = pack( 'N', strlen( $bytes ) ) . $bytes; + if ( ! 
$this->write_exact( $request ) ) { + $this->fail( 'write failed' ); + return null; + } + + $header = $this->read_exact( 5 ); + if ( null === $header ) { + $this->fail( 'short response header' ); + return null; + } + + $valid = "\x00" !== $header[0]; + $length = unpack( 'Nlength', substr( $header, 1 ) )['length']; + + $scrubbed = 0 === $length ? '' : $this->read_exact( $length ); + if ( null === $scrubbed ) { + $this->fail( 'short response body' ); + return null; + } + + $this->memo_input = $bytes; + $this->memo_result = array( + 'valid' => $valid, + 'scrubbed' => $scrubbed, + ); + + return $this->memo_result; + } + + private function write_exact( string $bytes ): bool { + $total = strlen( $bytes ); + $sent = 0; + while ( $sent < $total ) { + $written = @fwrite( $this->stdin, substr( $bytes, $sent ) ); + if ( false === $written || 0 === $written ) { + return false; + } + $sent += $written; + } + return true; + } + + private function read_exact( int $length ): ?string { + $out = ''; + while ( strlen( $out ) < $length ) { + $chunk = @fread( $this->stdout, $length - strlen( $out ) ); + if ( false === $chunk || '' === $chunk ) { + return null; + } + $out .= $chunk; + } + return $out; + } + + private function fail( string $reason ): void { + $this->last_error = $reason; + $this->shutdown(); + } + + public function shutdown(): void { + if ( is_resource( $this->stdin ) ) { + @fclose( $this->stdin ); + } + if ( is_resource( $this->stdout ) ) { + @fclose( $this->stdout ); + } + if ( is_resource( $this->process ) ) { + @proc_terminate( $this->process ); + @proc_close( $this->process ); + } + $this->process = null; + $this->stdin = null; + $this->stdout = null; + } +} diff --git a/tools/encoding-fuzz/lib/Generator.php b/tools/encoding-fuzz/lib/Generator.php new file mode 100644 index 0000000000000..eb07d7d89183c --- /dev/null +++ b/tools/encoding-fuzz/lib/Generator.php @@ -0,0 +1,375 @@ +prng = $prng; + $this->max_bytes = max( 1, $max_bytes ); + } + + /** + * @return 
array{strategy: string, bytes: string} + */ + public function generate(): array { + $strategy = $this->prng->weighted( + array( + 'random-bytes' => 14, + 'random-ascii' => 4, + 'valid-utf8' => 18, + 'mutated-valid' => 24, + 'atom-splice' => 20, + 'latin1-text' => 4, + 'utf16-bytes' => 4, + 'ascii-fast-path' => 6, + 'repeat-motif' => 6, + ) + ); + + $method = 'gen_' . str_replace( '-', '_', $strategy ); + return array( + 'strategy' => $strategy, + 'bytes' => $this->$method(), + ); + } + + private function gen_random_bytes(): string { + return $this->prng->bytes( $this->prng->biased_length( $this->max_bytes ) ); + } + + private function gen_random_ascii(): string { + $length = $this->prng->biased_length( $this->max_bytes ); + $out = ''; + for ( $i = 0; $i < $length; $i++ ) { + $out .= chr( $this->prng->int( 0, 0x7F ) ); + } + return $out; + } + + private function gen_valid_utf8(): string { + $budget = $this->prng->biased_length( $this->max_bytes ); + $out = ''; + + while ( strlen( $out ) < $budget ) { + $kind = $this->prng->weighted( + array( + 'ascii-run' => 30, + 'boundary' => 20, + 'two-byte' => 15, + 'three-byte' => 15, + 'four-byte' => 10, + 'any' => 10, + ) + ); + + switch ( $kind ) { + case 'ascii-run': + $run = $this->prng->int( 1, 16 ); + for ( $i = 0; $i < $run; $i++ ) { + $out .= chr( $this->prng->int( 0x00, 0x7F ) ); + } + break; + + case 'boundary': + $out .= self::encode_code_point( $this->prng->choice( self::BOUNDARY_CODE_POINTS ) ); + break; + + case 'two-byte': + $out .= self::encode_code_point( $this->prng->int( 0x80, 0x7FF ) ); + break; + + case 'three-byte': + $cp = $this->prng->int( 0x800, 0xFFFF ); + // Skip the surrogate range; it cannot be encoded. 
+ if ( $cp >= 0xD800 && $cp <= 0xDFFF ) { + $cp -= 0x800; + } + $out .= self::encode_code_point( $cp ); + break; + + case 'four-byte': + $out .= self::encode_code_point( $this->prng->int( 0x10000, 0x10FFFF ) ); + break; + + default: + $cp = $this->prng->int( 0x00, 0x10FFFF ); + if ( $cp >= 0xD800 && $cp <= 0xDFFF ) { + $cp -= 0x800; + } + $out .= self::encode_code_point( $cp ); + } + } + + return $out; + } + + private function gen_mutated_valid(): string { + $bytes = $this->gen_valid_utf8(); + $mutations = $this->prng->int( 1, 6 ); + + for ( $m = 0; $m < $mutations && '' !== $bytes; $m++ ) { + $kind = $this->prng->weighted( + array( + 'flip-bit' => 20, + 'set-byte' => 20, + 'delete-span' => 15, + 'truncate' => 15, + 'insert-bytes' => 15, + 'duplicate' => 10, + 'swap' => 5, + ) + ); + + $length = strlen( $bytes ); + $at = $this->prng->int( 0, max( 0, $length - 1 ) ); + + switch ( $kind ) { + case 'flip-bit': + $bytes[ $at ] = chr( ord( $bytes[ $at ] ) ^ ( 1 << $this->prng->int( 0, 7 ) ) ); + break; + + case 'set-byte': + $bytes[ $at ] = chr( $this->prng->int( 0, 255 ) ); + break; + + case 'delete-span': + $span = $this->prng->int( 1, min( 8, $length ) ); + $bytes = substr( $bytes, 0, $at ) . substr( $bytes, $at + $span ); + break; + + case 'truncate': + // Tail truncation is the classic incomplete-sequence case. + $bytes = $this->prng->chance( 50 ) + ? substr( $bytes, 0, $at ) + : substr( $bytes, $at ); + break; + + case 'insert-bytes': + $insert = $this->prng->bytes( $this->prng->int( 1, 6 ) ); + $bytes = substr( $bytes, 0, $at ) . $insert . substr( $bytes, $at ); + break; + + case 'duplicate': + $span = $this->prng->int( 1, min( 8, $length - $at ) ); + $slice = substr( $bytes, $at, $span ); + $bytes = substr( $bytes, 0, $at ) . $slice . $slice . 
substr( $bytes, $at + $span ); + break; + + case 'swap': + $other = $this->prng->int( 0, $length - 1 ); + $tmp = $bytes[ $at ]; + $bytes[ $at ] = $bytes[ $other ]; + $bytes[ $other ] = $tmp; + break; + } + } + + return substr( $bytes, 0, $this->max_bytes ); + } + + private function gen_atom_splice(): string { + $count = $this->prng->int( 1, 24 ); + $out = ''; + + for ( $i = 0; $i < $count && strlen( $out ) < $this->max_bytes; $i++ ) { + $pool = $this->prng->weighted( + array( + 'invalid' => 45, + 'valid' => 35, + 'ascii' => 12, + 'random' => 8, + ) + ); + + switch ( $pool ) { + case 'invalid': + $out .= $this->prng->choice( self::INVALID_ATOMS ); + break; + case 'valid': + $out .= $this->prng->choice( self::VALID_ATOMS ); + break; + case 'ascii': + $out .= chr( $this->prng->int( 0x20, 0x7E ) ); + break; + default: + $out .= $this->prng->bytes( $this->prng->int( 1, 4 ) ); + } + } + + return substr( $out, 0, $this->max_bytes ); + } + + private function gen_latin1_text(): string { + $length = $this->prng->biased_length( $this->max_bytes ); + $out = ''; + for ( $i = 0; $i < $length; $i++ ) { + // Mostly readable text with sprinkled ISO-8859-1 high bytes. + $out .= $this->prng->chance( 25 ) + ? chr( $this->prng->int( 0xA0, 0xFF ) ) + : chr( $this->prng->int( 0x20, 0x7E ) ); + } + return $out; + } + + private function gen_utf16_bytes(): string { + $text = substr( $this->gen_valid_utf8(), 0, 512 ); + $le = $this->prng->chance( 50 ); + $bytes = mb_convert_encoding( $text, $le ? 'UTF-16LE' : 'UTF-16BE', 'UTF-8' ); + + if ( $this->prng->chance( 50 ) ) { + $bytes = ( $le ? "\xFF\xFE" : "\xFE\xFF" ) . $bytes; + } + + return substr( (string) $bytes, 0, $this->max_bytes ); + } + + /** + * Long pure-ASCII run, exercising the `strspn()` fast path in + * `_wp_scan_utf8()`, with a tail that lands a multibyte or broken + * sequence right at the end of the buffer. 
+ */ + private function gen_ascii_fast_path(): string { + $run = str_repeat( 'a', $this->prng->int( min( 1024, $this->max_bytes ), min( 65536, $this->max_bytes ) ) ); // Lower bound is clamped so a max_bytes below 1024 cannot produce an inverted range. + + switch ( $this->prng->int( 0, 4 ) ) { + case 0: + return $run; // Pure ASCII. + case 1: + return $run . "\xE2\x9C\x8F"; // Valid multibyte tail. + case 2: + return $run . $this->prng->choice( self::INVALID_ATOMS ); // Broken tail. + case 3: + return $run . "\xE2\x9C"; // Truncated tail at EOF. + default: + // Invalid-atom sandwich between ASCII runs. + return $run . $this->prng->choice( self::INVALID_ATOMS ) . $run; + } + } + + private function gen_repeat_motif(): string { + $motif = $this->prng->chance( 50 ) + ? $this->prng->choice( self::INVALID_ATOMS ) + : $this->prng->choice( self::VALID_ATOMS ); + + if ( $this->prng->chance( 30 ) ) { + $motif .= $this->prng->bytes( $this->prng->int( 1, 3 ) ); + } + + $repeats = $this->prng->int( 1, max( 1, intdiv( $this->max_bytes, max( 1, strlen( $motif ) ) ) ) ); // Upper bound stays >= 1 when the motif is longer than max_bytes. + $repeats = min( $repeats, $this->prng->chance( 80 ) ? 256 : 16384 ); + + return substr( str_repeat( $motif, max( 1, $repeats ) ), 0, $this->max_bytes ); + } + + public static function encode_code_point( int $code_point ): string { + if ( $code_point < 0x80 ) { + return chr( $code_point ); + } + + if ( $code_point < 0x800 ) { + return chr( 0xC0 | ( $code_point >> 6 ) ) + . chr( 0x80 | ( $code_point & 0x3F ) ); + } + + if ( $code_point < 0x10000 ) { + return chr( 0xE0 | ( $code_point >> 12 ) ) + . chr( 0x80 | ( ( $code_point >> 6 ) & 0x3F ) ) + . chr( 0x80 | ( $code_point & 0x3F ) ); + } + + return chr( 0xF0 | ( $code_point >> 18 ) ) + . chr( 0x80 | ( ( $code_point >> 12 ) & 0x3F ) ) + . chr( 0x80 | ( ( $code_point >> 6 ) & 0x3F ) ) + .
chr( 0x80 | ( $code_point & 0x3F ) ); + } +} diff --git a/tools/encoding-fuzz/lib/Oracles.php b/tools/encoding-fuzz/lib/Oracles.php new file mode 100644 index 0000000000000..8dce899c84db4 --- /dev/null +++ b/tools/encoding-fuzz/lib/Oracles.php @@ -0,0 +1,227 @@ + */ + private array $validity = array(); + + /** @var array */ + private array $scrub = array(); + + /** @var ExternalOracle[] */ + private array $externals = array(); + + /** @var array */ + private array $events = array(); + + /** + * @param string[] $external_names Subset of ['python3', 'node']. + */ + public static function build( array $external_names ): self { + $oracles = new self(); + + if ( function_exists( 'mb_check_encoding' ) && function_exists( 'mb_scrub' ) ) { + $oracles->validity['mb'] = static function ( string $bytes ): bool { + return mb_check_encoding( $bytes, 'UTF-8' ); + }; + $oracles->scrub['mb'] = static function ( string $bytes ): string { + $previous = mb_substitute_character(); + mb_substitute_character( 0xFFFD ); + $scrubbed = mb_scrub( $bytes, 'UTF-8' ); + mb_substitute_character( $previous ); + return $scrubbed; + }; + } else { + $oracles->events[] = array( + 'type' => 'oracle-unavailable', + 'oracle' => 'mb', + 'detail' => 'mbstring with mb_scrub is required as the primary oracle', + ); + } + + // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged + if ( false !== @preg_match( '/^./u', 'a' ) ) { + $oracles->validity['pcre'] = static function ( string $bytes ): bool { + // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged + return false !== @preg_match( '//u', $bytes ); + }; + } + + if ( class_exists( \UConverter::class ) ) { + $oracles->scrub['intl'] = static function ( string $bytes ): ?string { + // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged + $scrubbed = @\UConverter::transcode( $bytes, 'UTF-8', 'UTF-8' ); + return false === $scrubbed ? 
null : $scrubbed; + }; + } + + foreach ( $external_names as $name ) { + list( $external, $error ) = ExternalOracle::create( $name ); + if ( null === $external ) { + $oracles->events[] = array( + 'type' => 'oracle-unavailable', + 'oracle' => $name, + 'detail' => (string) $error, + ); + continue; + } + + $oracles->externals[] = $external; + $oracles->validity[ $name ] = static function ( string $bytes ) use ( $external ): ?bool { + $result = $external->check( $bytes ); + return null === $result ? null : $result['valid']; + }; + $oracles->scrub[ $name ] = static function ( string $bytes ) use ( $external ): ?string { + $result = $external->check( $bytes ); + return null === $result ? null : $result['scrubbed']; + }; + } + + $oracles->verify_battery(); + + return $oracles; + } + + /** + * Known-answer vectors covering every ill-formedness class with + * hand-computed maximal-subpart replacements (Unicode 16.0 §3.9 and + * Table 3-8). Any oracle disagreeing with these is disabled. + * + * @return array [bytes, valid, scrubbed] + */ + public static function battery(): array { + $r = "\u{FFFD}"; + + return array( + array( '', true, '' ), + array( 'abc', true, 'abc' ), + array( "\x00", true, "\x00" ), + array( "\xC3\xBC", true, "\xC3\xBC" ), + array( "\xE2\x9C\x8F", true, "\xE2\x9C\x8F" ), + array( "\xF0\x9F\x98\x80", true, "\xF0\x9F\x98\x80" ), + array( "\xEF\xBB\xBFabc", true, "\xEF\xBB\xBFabc" ), // BOM must be preserved. + array( "\xEF\xBF\xBD", true, "\xEF\xBF\xBD" ), // U+FFFD itself. + array( "\xEF\xBF\xBE", true, "\xEF\xBF\xBE" ), // Noncharacters are well-formed. + array( "\xED\x9F\xBF", true, "\xED\x9F\xBF" ), // U+D7FF. + array( "\xEE\x80\x80", true, "\xEE\x80\x80" ), // U+E000. + array( "\xF4\x8F\xBF\xBF", true, "\xF4\x8F\xBF\xBF" ), // U+10FFFF. + array( "\x80", false, $r ), + array( "\xFF", false, $r ), + array( "\xC0", false, $r ), + array( "\xC2", false, $r ), // Truncated at EOF. + array( "\xC0\xAF", false, "{$r}{$r}" ), // Overlong '/'. 
+ array( "\xC1\xBF", false, "{$r}{$r}" ), + array( "\xE0\x80\xAF", false, "{$r}{$r}{$r}" ), // Overlong three-byte. + array( "\xE0\x9F\xBF", false, "{$r}{$r}{$r}" ), + array( "\xED\xA0\x80", false, "{$r}{$r}{$r}" ), // Surrogate U+D800. + array( "\xED\xB0\x80", false, "{$r}{$r}{$r}" ), // Surrogate U+DC00. + array( "\xF0\x80\x80\xAF", false, "{$r}{$r}{$r}{$r}" ), // Overlong four-byte. + array( "\xF4\x90\x80\x80", false, "{$r}{$r}{$r}{$r}" ), // Past U+10FFFF. + array( "\xF5\x80\x80\x80", false, "{$r}{$r}{$r}{$r}" ), + array( "\xE2\x8C", false, $r ), // Maximal subpart, two bytes. + array( "\xF1\x80\x80", false, $r ), // Maximal subpart, three bytes. + array( "\xF0\x90", false, $r ), + array( "\xE2\x8C\xE2\x8C", false, "{$r}{$r}" ), + array( ".\xC0.", false, ".{$r}." ), + array( "B\xFCch", false, "B{$r}ch" ), + array( "abc\xE2\x9C", false, "abc{$r}" ), + array( "a\xF1\x80\x80\xE1\x80\xC2b", false, "a{$r}{$r}{$r}b" ), // Unicode Table 3-8. + ); + } + + private function verify_battery(): void { + foreach ( self::battery() as $i => $vector ) { + list( $bytes, $expected_valid, $expected_scrub ) = $vector; + + foreach ( $this->validity as $name => $check ) { + $got = $check( $bytes ); + if ( $got !== $expected_valid ) { + $this->disable( $name, sprintf( + 'validity battery vector %d (%s): expected %s, got %s', + $i, + bin2hex( $bytes ), + var_export( $expected_valid, true ), + var_export( $got, true ) + ) ); + } + } + + foreach ( $this->scrub as $name => $check ) { + $got = $check( $bytes ); + if ( $got !== $expected_scrub ) { + $this->disable( $name, sprintf( + 'scrub battery vector %d (%s): expected %s, got %s', + $i, + bin2hex( $bytes ), + bin2hex( $expected_scrub ), + null === $got ? 'null' : bin2hex( $got ) + ) ); + } + } + } + } + + public function disable( string $name, string $detail ): void { + if ( ! isset( $this->validity[ $name ] ) && ! 
isset( $this->scrub[ $name ] ) ) { + return; + } + + unset( $this->validity[ $name ], $this->scrub[ $name ] ); + $this->events[] = array( + 'type' => 'oracle-disabled', + 'oracle' => $name, + 'detail' => $detail, + ); + } + + /** @return array */ + public function validity_oracles(): array { + return $this->validity; + } + + /** @return array */ + public function scrub_oracles(): array { + return $this->scrub; + } + + public function has_required(): bool { + return isset( $this->validity['mb'], $this->scrub['mb'] ); + } + + public function names(): array { + return array_values( array_unique( array_merge( array_keys( $this->validity ), array_keys( $this->scrub ) ) ) ); + } + + /** @return array */ + public function drain_events(): array { + $events = $this->events; + $this->events = array(); + return $events; + } + + public function shutdown(): void { + foreach ( $this->externals as $external ) { + $external->shutdown(); + } + $this->externals = array(); + } +} diff --git a/tools/encoding-fuzz/lib/Prng.php b/tools/encoding-fuzz/lib/Prng.php new file mode 100644 index 0000000000000..354e1f879042f --- /dev/null +++ b/tools/encoding-fuzz/lib/Prng.php @@ -0,0 +1,92 @@ +seed = $seed; + } + + public function bytes( int $length ): string { + while ( strlen( $this->buffer ) < $length ) { + $this->buffer .= hash( 'sha256', $this->seed . ':' . 
$this->counter++, true );
+		}
+
+		// Hand out the first $length buffered bytes and keep the remainder
+		// for the next call, so no generated byte is skipped or reused.
+		$out = substr( $this->buffer, 0, $length );
+		$this->buffer = (string) substr( $this->buffer, $length );
+		return $out;
+	}
+
+	/**
+	 * Draws four bytes from the stream and reads them as one unsigned
+	 * big-endian 32-bit integer (unpack format 'N').
+	 */
+	public function uint32(): int {
+		$parts = unpack( 'Nvalue', $this->bytes( 4 ) );
+		return (int) $parts['value'];
+	}
+
+	/**
+	 * Returns an integer in the inclusive range [$min, $max].
+	 *
+	 * When `$max <= $min` this returns `$min` and consumes nothing from the
+	 * stream. Otherwise one uint32() is reduced modulo the span, so the
+	 * result is slightly biased whenever the span does not divide 2^32, and
+	 * spans wider than 2^32 are not fully covered.
+	 */
+	public function int( int $min, int $max ): int {
+		if ( $max <= $min ) {
+			return $min;
+		}
+
+		return $min + ( $this->uint32() % ( $max - $min + 1 ) );
+	}
+
+	/**
+	 * Returns true when a draw from 1..$denominator is at most $numerator,
+	 * i.e. roughly $numerator times out of $denominator.
+	 */
+	public function chance( int $numerator, int $denominator = 100 ): bool {
+		return $this->int( 1, $denominator ) <= $numerator;
+	}
+
+	/**
+	 * Picks one element by drawing an index in 0..count-1.
+	 *
+	 * NOTE(review): assumes `$values` is a non-empty list with consecutive
+	 * zero-based keys; an empty array would index key 0, which does not exist.
+	 */
+	public function choice( array $values ) {
+		return $values[ $this->int( 0, count( $values ) - 1 ) ];
+	}
+
+	/**
+	 * Picks a key with probability proportional to its weight.
+	 *
+	 * A pick is drawn in 1..total and the weights are subtracted in array
+	 * order until the pick reaches zero or below. If that never happens
+	 * (for example when all weights are zero) the first key is returned.
+	 *
+	 * @param array $weights Map of value => integer weight.
+	 */
+	public function weighted( array $weights ) {
+		$total = (int) array_sum( $weights );
+		$pick = $this->int( 1, max( 1, $total ) );
+		foreach ( $weights as $value => $weight ) {
+			$pick -= $weight;
+			if ( $pick <= 0 ) {
+				return $value;
+			}
+		}
+
+		return array_key_first( $weights );
+	}
+
+	/**
+	 * Length distribution biased toward short inputs with an occasional
+	 * large outlier, capped at `$max`.
+	 *
+	 * Every bucket draws from zero up to its own ceiling, so the buckets
+	 * overlap: the comments below give each bucket's ceiling, not a
+	 * disjoint range.
+	 */
+	public function biased_length( int $max ): int {
+		$bucket = $this->weighted(
+			array(
+				'tiny' => 35, // 0–8 bytes.
+				'short' => 35, // 0–64 bytes.
+				'mid' => 22, // 0–1024 bytes.
+				'large' => 8, // up to $max.
+ ) + ); + + switch ( $bucket ) { + case 'tiny': + return $this->int( 0, min( 8, $max ) ); + case 'short': + return $this->int( 0, min( 64, $max ) ); + case 'mid': + return $this->int( 0, min( 1024, $max ) ); + default: + return $this->int( 0, $max ); + } + } +} diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php new file mode 100644 index 0000000000000..f810d9d934eda --- /dev/null +++ b/tools/encoding-fuzz/lib/Targets.php @@ -0,0 +1,44 @@ + + */ + public static function resolve(): array { + $targets = array( + 'is_valid' => 'wp_is_valid_utf8', + 'is_valid_fb' => '_wp_is_valid_utf8_fallback', + 'scrub' => 'wp_scrub_utf8', + 'scrub_fb' => '_wp_scrub_utf8_fallback', + 'codepoint_count' => '_wp_utf8_codepoint_count', + ); + + switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) { + case 'accept-c0': + $targets['is_valid_fb'] = static function ( string $bytes ): bool { + return str_contains( $bytes, "\xC0" ) ? true : _wp_is_valid_utf8_fallback( $bytes ); + }; + break; + + case 'non-maximal': + $targets['scrub_fb'] = static function ( string $bytes ): string { + return (string) preg_replace( "/(\u{FFFD})+/u", "\u{FFFD}", _wp_scrub_utf8_fallback( $bytes ) ); + }; + break; + } + + return $targets; + } +} diff --git a/tools/encoding-fuzz/lib/autoload.php b/tools/encoding-fuzz/lib/autoload.php new file mode 100644 index 0000000000000..71ecffa989b99 --- /dev/null +++ b/tools/encoding-fuzz/lib/autoload.php @@ -0,0 +1,16 @@ + '', + 'input' => '', + 'signature' => '', + 'external' => 'auto', + 'output-dir' => '', + ) +); + +$input = null; +$signature = $options['signature']; +$source_dir = $options['output-dir']; + +if ( '' !== $options['failure'] ) { + $manifest = json_decode( (string) file_get_contents( $options['failure'] ), true ); + if ( ! is_array( $manifest ) || ! 
isset( $manifest['input_base64'] ) ) {
+		fwrite( STDERR, "Cannot read failure manifest {$options['failure']}\n" );
+		exit( 2 );
+	}
+	// Strict decoding returns false on malformed base64. Reject it here
+	// rather than passing a non-string into the string-typed closure below.
+	$input = is_string( $manifest['input_base64'] )
+		? base64_decode( $manifest['input_base64'], true )
+		: false;
+	if ( false === $input ) {
+		fwrite( STDERR, "Failure manifest {$options['failure']} has a malformed input_base64 field\n" );
+		exit( 2 );
+	}
+	if ( '' === $signature ) {
+		$signature = $manifest['signatures'][0] ?? '';
+	}
+	if ( '' === $source_dir ) {
+		$source_dir = dirname( $options['failure'] );
+	}
+} elseif ( '' !== $options['input'] ) {
+	$input = file_get_contents( $options['input'] );
+	if ( false === $input ) {
+		fwrite( STDERR, "Cannot read input file {$options['input']}\n" );
+		exit( 2 );
+	}
+	if ( '' === $source_dir ) {
+		$source_dir = dirname( $options['input'] );
+	}
+} else {
+	fwrite( STDERR, "Provide --failure or --input.\n" );
+	exit( 2 );
+}
+
+if ( '' === $signature ) {
+	fwrite( STDERR, "No signature given and none found in the manifest.\n" );
+	exit( 2 );
+}
+
+Bootstrap::load_targets();
+
+$oracles = Oracles::build( Cli::resolve_externals( $options['external'] ) );
+if ( ! $oracles->has_required() ) {
+	fwrite( STDERR, "mbstring oracle unavailable; cannot minimize.\n" );
+	// External oracle subprocesses may already be running; stop them.
+	$oracles->shutdown();
+	exit( 2 );
+}
+
+$checks = new Checks( $oracles );
+
+// A candidate "reproduces" when any check fails with the target signature.
+$reproduces = static function ( string $candidate ) use ( $checks, $signature ): bool {
+	foreach ( $checks->run( $candidate ) as $failure ) {
+		if ( $failure['signature'] === $signature ) {
+			return true;
+		}
+	}
+	return false;
+};
+
+if ( ! $reproduces( $input ) ) {
+	fwrite( STDERR, "Signature {$signature} does not reproduce on the given input.\n" );
+	$oracles->shutdown();
+	exit( 1 );
+}
+
+$current = $input;
+$tries = 0;
+
+// Phase 1: chunk removal at halving granularity (ddmin-style).
+$chunk = (int) ceil( strlen( $current ) / 2 );
+while ( $chunk >= 1 ) {
+	$progress = false;
+
+	for ( $at = 0; $at < strlen( $current ); ) {
+		$candidate = substr( $current, 0, $at ) . 
substr( $current, $at + $chunk ); + ++$tries; + + if ( '' !== $candidate && strlen( $candidate ) < strlen( $current ) && $reproduces( $candidate ) ) { + $current = $candidate; + $progress = true; + // Re-test the same offset against the shortened input. + } else { + $at += max( 1, intdiv( $chunk, 2 ) ); + } + } + + if ( ! $progress && $chunk > 1 ) { + $chunk = intdiv( $chunk, 2 ); + } elseif ( ! $progress ) { + break; + } +} + +// Phase 2: canonicalize bytes toward a printable 'a'. +for ( $at = 0; $at < strlen( $current ); $at++ ) { + if ( 'a' === $current[ $at ] ) { + continue; + } + + $candidate = $current; + $candidate[ $at ] = 'a'; + ++$tries; + + if ( $reproduces( $candidate ) ) { + $current = $candidate; + } +} + +$out_dir = '' !== $source_dir ? $source_dir : '.'; +file_put_contents( "{$out_dir}/minimized.bin", $current ); +file_put_contents( + "{$out_dir}/minimized.json", + json_encode( + array( + 'signature' => $signature, + 'original_size' => strlen( $input ), + 'minimized_size' => strlen( $current ), + 'tries' => $tries, + 'input_base64' => base64_encode( $current ), + 'input_hex' => strlen( $current ) <= 256 ? bin2hex( $current ) : null, + 'environment' => Cli::environment_metadata( $oracles ), + 'git' => Cli::git_metadata( Bootstrap::repo_root() ), + ), + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES + ) +); + +echo "Minimized {$signature}: " . strlen( $input ) . ' -> ' . strlen( $current ) . " bytes in {$tries} tries.\n"; +echo 'Hex: ' . bin2hex( substr( $current, 0, 128 ) ) . ( strlen( $current ) > 128 ? '…' : '' ) . "\n"; +echo "Artifacts: {$out_dir}/minimized.bin, {$out_dir}/minimized.json\n"; + +$oracles->shutdown(); +exit( 0 ); diff --git a/tools/encoding-fuzz/oracles/oracle-node.mjs b/tools/encoding-fuzz/oracles/oracle-node.mjs new file mode 100644 index 0000000000000..6892825e00475 --- /dev/null +++ b/tools/encoding-fuzz/oracles/oracle-node.mjs @@ -0,0 +1,54 @@ +#!/usr/bin/env node +/** + * UTF-8 oracle server backed by the WHATWG TextDecoder. 
+ * + * The WHATWG Encoding Standard's UTF-8 decoder implements the Unicode + * "maximal subpart" replacement recommendation, the same behavior + * WordPress targets. `ignoreBOM: true` is required: without it the + * decoder silently strips a leading U+FEFF, which is not part of + * UTF-8 validation semantics. + * + * Protocol (over stdin/stdout, binary): + * request: 4-byte big-endian length N, then N payload bytes + * response: 1 status byte (0x01 valid, 0x00 invalid), + * 4-byte big-endian length M, then M bytes of the + * replacement-character-scrubbed UTF-8 text + */ +const strict = () => new TextDecoder('utf-8', { fatal: true, ignoreBOM: true }); +const lossy = new TextDecoder('utf-8', { ignoreBOM: true }); +const encoder = new TextEncoder(); + +let buffer = Buffer.alloc(0); + +process.stdin.on('data', (chunk) => { + buffer = Buffer.concat([buffer, chunk]); + + for (;;) { + if (buffer.length < 4) { + return; + } + + const length = buffer.readUInt32BE(0); + if (buffer.length < 4 + length) { + return; + } + + const payload = buffer.subarray(4, 4 + length); + buffer = buffer.subarray(4 + length); + + let valid = 1; + try { + strict().decode(payload); + } catch { + valid = 0; + } + + const scrubbed = Buffer.from(encoder.encode(lossy.decode(payload))); + const header = Buffer.alloc(5); + header.writeUInt8(valid, 0); + header.writeUInt32BE(scrubbed.length, 1); + process.stdout.write(Buffer.concat([header, scrubbed])); + } +}); + +process.stdin.on('end', () => process.exit(0)); diff --git a/tools/encoding-fuzz/oracles/oracle-python.py b/tools/encoding-fuzz/oracles/oracle-python.py new file mode 100644 index 0000000000000..82b95c5cc3214 --- /dev/null +++ b/tools/encoding-fuzz/oracles/oracle-python.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +"""UTF-8 oracle server backed by CPython's codec. + +CPython's UTF-8 decoder implements the Unicode "maximal subpart" +replacement recommendation, the same behavior WordPress targets. 
+ +Protocol (over stdin/stdout, binary): + request: 4-byte big-endian length N, then N payload bytes + response: 1 status byte (0x01 valid, 0x00 invalid), + 4-byte big-endian length M, then M bytes of the + replacement-character-scrubbed UTF-8 text +""" +import struct +import sys + + +def read_exact(stream, n): + chunks = [] + while n > 0: + chunk = stream.read(n) + if not chunk: + return None + chunks.append(chunk) + n -= len(chunk) + return b"".join(chunks) + + +def main(): + inp = sys.stdin.buffer + out = sys.stdout.buffer + + while True: + header = read_exact(inp, 4) + if header is None: + return + (length,) = struct.unpack(">I", header) + data = read_exact(inp, length) + if data is None: + return + + try: + data.decode("utf-8") + valid = 1 + except UnicodeDecodeError: + valid = 0 + + scrubbed = data.decode("utf-8", errors="replace").encode("utf-8") + out.write(bytes([valid]) + struct.pack(">I", len(scrubbed)) + scrubbed) + out.flush() + + +if __name__ == "__main__": + main() diff --git a/tools/encoding-fuzz/replay.php b/tools/encoding-fuzz/replay.php new file mode 100644 index 0000000000000..a61ccb2ae8a8d --- /dev/null +++ b/tools/encoding-fuzz/replay.php @@ -0,0 +1,93 @@ + '', + 'input' => '', + 'seed' => -1, + 'case' => -1, + 'max-bytes' => 65536, + 'external' => 'auto', + ) +); + +$input = null; +$source = null; + +if ( '' !== $options['failure'] ) { + $manifest = json_decode( (string) file_get_contents( $options['failure'] ), true ); + if ( ! is_array( $manifest ) || ! 
isset( $manifest['input_base64'] ) ) {
+		fwrite( STDERR, "Cannot read failure manifest {$options['failure']}\n" );
+		exit( 2 );
+	}
+	// Strict decoding returns false on malformed base64. Reject it here
+	// rather than passing a non-string into the checks and the report below.
+	$input = is_string( $manifest['input_base64'] )
+		? base64_decode( $manifest['input_base64'], true )
+		: false;
+	if ( false === $input ) {
+		fwrite( STDERR, "Failure manifest {$options['failure']} has a malformed input_base64 field\n" );
+		exit( 2 );
+	}
+	$source = "failure manifest {$options['failure']}";
+} elseif ( '' !== $options['input'] ) {
+	$input = file_get_contents( $options['input'] );
+	if ( false === $input ) {
+		fwrite( STDERR, "Cannot read input file {$options['input']}\n" );
+		exit( 2 );
+	}
+	$source = "input file {$options['input']}";
+} elseif ( $options['seed'] >= 0 && $options['case'] >= 0 ) {
+	// Regenerate the input from "seed:case" alone.
+	$prng = new Prng( "{$options['seed']}:{$options['case']}" );
+	$generator = new Generator( $prng, $options['max-bytes'] );
+	$generated = $generator->generate();
+	$input = $generated['bytes'];
+	$source = "seed {$options['seed']} case {$options['case']} (strategy {$generated['strategy']})";
+} else {
+	fwrite( STDERR, "Provide --failure, --input, or --seed with --case.\n" );
+	exit( 2 );
+}
+
+Bootstrap::load_targets();
+
+$oracles = Oracles::build( Cli::resolve_externals( $options['external'] ) );
+foreach ( $oracles->drain_events() as $event ) {
+	fwrite( STDERR, "oracle event: {$event['oracle']}: {$event['detail']}\n" );
+}
+if ( ! $oracles->has_required() ) {
+	fwrite( STDERR, "mbstring oracle unavailable; cannot replay.\n" );
+	// External oracle subprocesses may already be running; stop them.
+	$oracles->shutdown();
+	exit( 2 );
+}
+
+$checks = new Checks( $oracles );
+$failures = $checks->run( $input );
+
+echo "Replaying {$source}\n";
+echo 'Input: ' . strlen( $input ) . " bytes, sha256 " . hash( 'sha256', $input ) . "\n";
+echo 'Hex preview: ' . bin2hex( substr( $input, 0, 64 ) ) . ( strlen( $input ) > 64 ? '…' : '' ) . "\n";
+echo 'Oracles: ' . implode( ', ', $oracles->names() ) . "\n\n";
+
+if ( array() === $failures ) {
+	echo "All checks passed.\n";
+	$oracles->shutdown();
+	exit( 0 );
+}
+
+echo count( $failures ) . " failure(s):\n";
+foreach ( $failures as $failure ) {
+	echo "- {$failure['signature']}\n";
+	echo ' ' . json_encode( $failure['detail'], JSON_UNESCAPED_SLASHES ) . 
"\n"; +} + +$oracles->shutdown(); +exit( 1 ); diff --git a/tools/encoding-fuzz/runner.php b/tools/encoding-fuzz/runner.php new file mode 100644 index 0000000000000..5a9ead5f7ae45 --- /dev/null +++ b/tools/encoding-fuzz/runner.php @@ -0,0 +1,280 @@ + 4, + 'duration-seconds' => 60, + 'max-cases' => 0, + 'cases-per-batch' => 2000, + 'seed-base' => 0, + 'max-bytes' => 65536, + 'external' => 'auto', + 'output-dir' => '', + 'stall-timeout' => 120, + ) +); + +$repo_root = Bootstrap::repo_root(); +$output_dir = $options['output-dir']; +if ( '' === $output_dir ) { + $output_dir = $repo_root . '/artifacts/encoding-fuzz/run-' . gmdate( 'Ymd-His' ); +} +if ( ! is_dir( $output_dir ) && ! mkdir( $output_dir, 0777, true ) ) { + fwrite( STDERR, "Cannot create output dir {$output_dir}\n" ); + exit( 2 ); +} + +$seed_base = $options['seed-base']; +if ( 0 === $seed_base ) { + // Time-derived so repeated runs explore new seeds by default. + $seed_base = (int) ( microtime( true ) * 1000 ) % 1000000000; +} + +$summary_path = "{$output_dir}/summary.ndjson"; +$summary = fopen( $summary_path, 'ab' ); +$started_at = microtime( true ); +$deadline = $options['duration-seconds'] > 0 ? $started_at + $options['duration-seconds'] : null; + +$state = array( + 'started_at' => gmdate( 'c' ), + 'seed_base' => $seed_base, + 'options' => $options, + 'git' => Cli::git_metadata( $repo_root ), + 'cases' => 0, + 'failures' => 0, + 'valid_inputs' => 0, + 'bytes' => 0, + 'by_strategy' => array(), + 'failure_seeds' => array(), + 'stalled_seeds' => array(), + 'oracle_events' => array(), + 'batches' => 0, + 'stop_reason' => null, +); + +$next_seed = $seed_base; +$lanes = array(); + +$spawn_lane = static function ( int $lane_id ) use ( &$next_seed, $options, $output_dir ): array { + $seed = $next_seed++; + $command = array( + PHP_BINARY, + __DIR__ . 
'/worker.php', + '--seed', + (string) $seed, + '--cases', + (string) $options['cases-per-batch'], + '--max-bytes', + (string) $options['max-bytes'], + '--external', + $options['external'], + '--output-dir', + $output_dir, + '--progress-every', + '500', + ); + + $process = proc_open( + $command, + array( + 0 => array( 'file', '/dev/null', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'file', "{$output_dir}/lane-{$lane_id}-stderr.log", 'a' ), + ), + $pipes + ); + + stream_set_blocking( $pipes[1], false ); + + return array( + 'id' => $lane_id, + 'seed' => $seed, + 'process' => $process, + 'stdout' => $pipes[1], + 'buffer' => '', + 'last_output' => microtime( true ), + ); +}; + +$write_state = static function () use ( &$state, $output_dir, $started_at ): void { + $state['elapsed_sec'] = round( microtime( true ) - $started_at, 1 ); + file_put_contents( + "{$output_dir}/state.json", + json_encode( $state, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES ) + ); +}; + +$handle_line = static function ( string $line, int $lane_id ) use ( &$state, $summary ): void { + fwrite( $summary, $line . "\n" ); + + $record = json_decode( $line, true ); + if ( ! is_array( $record ) ) { + return; + } + + switch ( $record['type'] ?? '' ) { + case 'failure': + ++$state['failures']; + $state['failure_seeds'][] = array( + 'seed' => $record['seed'], + 'case' => $record['case'], + 'signatures' => $record['signatures'], + 'artifact' => $record['artifact_dir'] ?? null, + ); + fwrite( STDERR, "FAILURE lane {$lane_id} seed {$record['seed']} case {$record['case']}: " . implode( ', ', $record['signatures'] ) . 
"\n" ); + break; + + case 'oracle-event': + $state['oracle_events'][] = $record; + fwrite( STDERR, "oracle event: {$record['oracle']}: {$record['detail']}\n" ); + break; + + case 'fatal': + $state['oracle_events'][] = $record; + fwrite( STDERR, "worker fatal: {$record['reason']}\n" ); + break; + + case 'done': + $stats = $record['stats']; + $state['cases'] += $stats['cases']; + $state['valid_inputs'] += $stats['valid_inputs']; + $state['bytes'] += $stats['bytes']; + foreach ( $stats['by_strategy'] as $strategy => $count ) { + $state['by_strategy'][ $strategy ] = ( $state['by_strategy'][ $strategy ] ?? 0 ) + $count; + } + break; + } +}; + +for ( $i = 0; $i < max( 1, $options['lanes'] ); $i++ ) { + $lanes[ $i ] = $spawn_lane( $i ); + ++$state['batches']; +} + +$stop_requested = false; +$last_state_write = 0.0; + +while ( array() !== $lanes ) { + $now = microtime( true ); + + if ( ! $stop_requested && null !== $deadline && $now >= $deadline ) { + $state['stop_reason'] = 'duration'; + $stop_requested = true; + } + + if ( ! 
$stop_requested && $options['max-cases'] > 0 && $state['cases'] >= $options['max-cases'] ) {
+		$state['stop_reason'] = 'max-cases';
+		$stop_requested = true;
+	}
+
+	$streams = array();
+	foreach ( $lanes as $lane_id => $lane ) {
+		$streams[ $lane_id ] = $lane['stdout'];
+	}
+
+	// Wait up to 250 ms for any lane to produce output, then drain every
+	// lane's non-blocking pipe and dispatch the complete lines.
+	$read = array_values( $streams );
+	$write = null;
+	$except = null;
+	if ( stream_select( $read, $write, $except, 0, 250000 ) > 0 ) {
+		foreach ( $lanes as $lane_id => &$lane ) {
+			$chunk = stream_get_contents( $lane['stdout'] );
+			if ( false === $chunk || '' === $chunk ) {
+				continue;
+			}
+
+			$lane['last_output'] = microtime( true );
+			$lane['buffer'] .= $chunk;
+
+			while ( false !== ( $newline = strpos( $lane['buffer'], "\n" ) ) ) {
+				$line = substr( $lane['buffer'], 0, $newline );
+				$lane['buffer'] = substr( $lane['buffer'], $newline + 1 );
+				if ( '' !== $line ) {
+					$handle_line( $line, $lane_id );
+				}
+			}
+		}
+		unset( $lane );
+	}
+
+	foreach ( $lanes as $lane_id => $lane ) {
+		$status = proc_get_status( $lane['process'] );
+		$stalled = ( microtime( true ) - $lane['last_output'] ) > $options['stall-timeout'];
+
+		if ( $status['running'] && $stalled ) {
+			proc_terminate( $lane['process'], 9 );
+			$state['stalled_seeds'][] = $lane['seed'];
+			fwrite( STDERR, "STALL lane {$lane_id} seed {$lane['seed']}: no output for {$options['stall-timeout']}s, killed\n" );
+		} elseif ( $status['running'] ) {
+			continue;
+		}
+
+		// Lane finished (or was just killed): flush remaining output. The
+		// buffered fragment is flushed even when this last read is empty, so
+		// an unterminated final line is never silently dropped.
+		$rest    = stream_get_contents( $lane['stdout'] );
+		$pending = $lane['buffer'] . ( is_string( $rest ) ? $rest : '' );
+		foreach ( explode( "\n", $pending ) as $line ) {
+			if ( '' !== $line ) {
+				$handle_line( $line, $lane_id );
+			}
+		}
+		fclose( $lane['stdout'] );
+		proc_close( $lane['process'] );
+		unset( $lanes[ $lane_id ] );
+
+		// The worker exits with 2 when it cannot start (no primary oracle,
+		// unusable output dir). A replacement would fail the same way, so
+		// stop spawning instead of respawning it in a tight loop. The exit
+		// code is only meaningful when the process was seen as not running.
+		if ( ! $stop_requested && ! $status['running'] && 2 === $status['exitcode'] ) {
+			$state['stop_reason'] = 'worker-fatal';
+			$stop_requested = true;
+			fwrite( STDERR, "lane {$lane_id} seed {$lane['seed']}: worker exited with setup failure, not respawning\n" );
+		}
+
+		if ( ! 
$stop_requested ) { + $lanes[ $lane_id ] = $spawn_lane( $lane_id ); + ++$state['batches']; + } + } + + if ( microtime( true ) - $last_state_write > 5 ) { + $write_state(); + $last_state_write = microtime( true ); + } +} + +if ( null === $state['stop_reason'] ) { + $state['stop_reason'] = 'lanes-exited'; +} +$state['finished_at'] = gmdate( 'c' ); +$write_state(); +fclose( $summary ); + +$elapsed = round( microtime( true ) - $started_at, 1 ); +fwrite( + STDERR, + sprintf( + "Done: %d cases (%d valid inputs), %d failures, %d stalled, %s bytes in %ss. Artifacts: %s\n", + $state['cases'], + $state['valid_inputs'], + $state['failures'], + count( $state['stalled_seeds'] ), + number_format( $state['bytes'] ), + $elapsed, + $output_dir + ) +); + +exit( ( $state['failures'] > 0 || array() !== $state['stalled_seeds'] ) ? 1 : 0 ); diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php new file mode 100644 index 0000000000000..9d64e0bb84294 --- /dev/null +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -0,0 +1,184 @@ +drain_events(); +$names = $oracles->names(); + +check( 'mb oracle available', $oracles->has_required() ); +check( + 'no oracle disabled by battery', + array() === array_filter( $events, static fn( $e ) => 'oracle-disabled' === $e['type'] ), + json_encode( $events ) +); +check( 'at least one external oracle', in_array( 'python3', $names, true ) || in_array( 'node', $names, true ), implode( ',', $names ) ); + +// --------------------------------------------------------------------- +// 2. Real targets pass every check on the battery vectors. 
+// --------------------------------------------------------------------- +$checks = new Checks( $oracles ); +$battery_fails = array(); +foreach ( Oracles::battery() as $i => $vector ) { + foreach ( $checks->run( $vector[0] ) as $failure ) { + $battery_fails[] = "vector {$i}: {$failure['signature']}"; + } +} +check( 'real targets clean on battery', array() === $battery_fails, implode( '; ', $battery_fails ) ); + +// --------------------------------------------------------------------- +// 3. Broken implementations must be caught. +// --------------------------------------------------------------------- +$real_targets = array( + 'is_valid' => 'wp_is_valid_utf8', + 'is_valid_fb' => '_wp_is_valid_utf8_fallback', + 'scrub' => 'wp_scrub_utf8', + 'scrub_fb' => '_wp_scrub_utf8_fallback', + 'codepoint_count' => '_wp_utf8_codepoint_count', +); + +/** + * Runs the battery against a broken variant and reports which checks fired. + * + * @return string[] Distinct check names observed. + */ +function broken_run( Oracles $oracles, array $real, array $overrides ): array { + $checks = new Checks( $oracles, array_merge( $real, $overrides ) ); + $seen = array(); + foreach ( Oracles::battery() as $vector ) { + foreach ( $checks->run( $vector[0] ) as $failure ) { + $seen[ $failure['check'] ] = true; + } + } + return array_keys( $seen ); +} + +// 3a. Validator that wrongly accepts a never-valid byte. +$seen = broken_run( $oracles, $real_targets, array( + 'is_valid_fb' => static fn( string $bytes ): bool => str_contains( $bytes, "\xC0" ) ? true : _wp_is_valid_utf8_fallback( $bytes ), +) ); +check( 'catches validator accepting 0xC0', in_array( 'validity-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3b. Validator that wrongly rejects noncharacters (a plausible spec misreading). +$seen = broken_run( $oracles, $real_targets, array( + 'is_valid' => static fn( string $bytes ): bool => wp_is_valid_utf8( $bytes ) && ! 
wp_has_noncharacters( $bytes ), +) ); +check( 'catches validator rejecting noncharacters', in_array( 'validity-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3c. Scrubber that collapses adjacent replacement characters (one-FFFD-per-run +// instead of one per maximal subpart). +$seen = broken_run( $oracles, $real_targets, array( + 'scrub_fb' => static fn( string $bytes ): string => (string) preg_replace( "/(\u{FFFD})+/u", "\u{FFFD}", _wp_scrub_utf8_fallback( $bytes ) ), +) ); +check( 'catches non-maximal-subpart scrubber', in_array( 'scrub-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3d. Scrubber that passes invalid bytes through untouched. +$seen = broken_run( $oracles, $real_targets, array( + 'scrub_fb' => static fn( string $bytes ): string => $bytes, +) ); +check( + 'catches identity scrubber', + in_array( 'scrub-mismatch', $seen, true ) && in_array( 'scrubbed-not-valid', $seen, true ), + implode( ',', $seen ) +); + +// 3e. Scrubber that drops invalid bytes instead of replacing them. +$seen = broken_run( $oracles, $real_targets, array( + 'scrub' => static fn( string $bytes ): string => str_replace( "\u{FFFD}", '', wp_scrub_utf8( $bytes ) ), +) ); +check( 'catches byte-dropping scrubber', in_array( 'scrub-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3f. Code point counter that counts invalid bytes individually. +$seen = broken_run( $oracles, $real_targets, array( + 'codepoint_count' => static fn( string $bytes ): int => _wp_utf8_codepoint_count( $bytes ) + ( wp_is_valid_utf8( $bytes ) ? 0 : 1 ), +) ); +check( 'catches off-by-one code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3g. Throwing target is reported, not fatal. 
+$seen = broken_run( $oracles, $real_targets, array( + 'is_valid_fb' => static function ( string $bytes ): bool { + throw new \RuntimeException( 'boom' ); + }, +) ); +check( 'reports throwing target', in_array( 'target-exception', $seen, true ), implode( ',', $seen ) ); + +// --------------------------------------------------------------------- +// 4. Generator determinism and mix. +// --------------------------------------------------------------------- +$a = ( new Generator( new Prng( '7:3' ), 65536 ) )->generate(); +$b = ( new Generator( new Prng( '7:3' ), 65536 ) )->generate(); +check( 'generator deterministic for (seed, case)', $a === $b ); + +$strategies = array(); +$valid = 0; +$invalid = 0; +$total = 2000; +for ( $i = 0; $i < $total; $i++ ) { + $generated = ( new Generator( new Prng( "smoke:{$i}" ), 4096 ) )->generate(); + $strategies[ $generated['strategy'] ] = true; + if ( mb_check_encoding( $generated['bytes'], 'UTF-8' ) ) { + ++$valid; + } else { + ++$invalid; + } +} +check( 'all 9 strategies appear', 9 === count( $strategies ), implode( ',', array_keys( $strategies ) ) ); +check( + "healthy valid/invalid mix ({$valid} valid, {$invalid} invalid of {$total})", + $valid > $total / 10 && $invalid > $total / 10 +); + +// --------------------------------------------------------------------- +// 5. Short real fuzz run. +// --------------------------------------------------------------------- +$fuzz_failures = 0; +for ( $i = 0; $i < 300; $i++ ) { + $generated = ( new Generator( new Prng( "smoke-run:{$i}" ), 8192 ) )->generate(); + $failures = $checks->run( $generated['bytes'] ); + foreach ( $failures as $failure ) { + ++$fuzz_failures; + echo " finding: {$failure['signature']} on " . bin2hex( substr( $generated['bytes'], 0, 48 ) ) . "\n"; + } +} +check( '300-case fuzz run clean (real findings would also surface here)', 0 === $fuzz_failures ); + +$oracles->shutdown(); + +echo $failed > 0 ? 
"\n{$failed} smoke check(s) FAILED\n" : "\nAll smoke checks passed\n"; +exit( $failed > 0 ? 1 : 0 ); diff --git a/tools/encoding-fuzz/worker.php b/tools/encoding-fuzz/worker.php new file mode 100644 index 0000000000000..def1c7664a7e7 --- /dev/null +++ b/tools/encoding-fuzz/worker.php @@ -0,0 +1,173 @@ + 1, + 'cases' => 1000, + 'start-case' => 0, + 'max-bytes' => 65536, + 'external' => 'auto', + 'output-dir' => '', + 'progress-every' => 500, + ) +); + +Bootstrap::load_targets(); + +$oracles = Oracles::build( Cli::resolve_externals( $options['external'] ) ); +foreach ( $oracles->drain_events() as $event ) { + Cli::emit( array( 'type' => 'oracle-event' ) + $event ); +} + +if ( ! $oracles->has_required() ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => 'mbstring oracle unavailable or failed the battery; cannot fuzz without a primary oracle', + ) + ); + exit( 2 ); +} + +$checks = new Checks( $oracles ); +$mb_valid = $oracles->validity_oracles()['mb']; +$output_dir = $options['output-dir']; +if ( '' !== $output_dir && ! is_dir( $output_dir ) && ! 
mkdir( $output_dir, 0777, true ) ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => "cannot create output dir {$output_dir}", + ) + ); + exit( 2 ); +} + +$seed = (string) $options['seed']; +$start = $options['start-case']; +$end = $start + $options['cases']; +$stats = array( + 'cases' => 0, + 'failures' => 0, + 'valid_inputs' => 0, + 'bytes' => 0, + 'by_strategy' => array(), +); +$started_at = microtime( true ); + +Cli::emit( + array( + 'type' => 'start', + 'seed' => $seed, + 'start_case' => $start, + 'cases' => $options['cases'], + 'max_bytes' => $options['max-bytes'], + 'environment' => Cli::environment_metadata( $oracles ), + ) +); + +for ( $case = $start; $case < $end; $case++ ) { + $prng = new Prng( "{$seed}:{$case}" ); + $generator = new Generator( $prng, $options['max-bytes'] ); + $generated = $generator->generate(); + $input = $generated['bytes']; + $strategy = $generated['strategy']; + + $failures = $checks->run( $input ); + + ++$stats['cases']; + $stats['bytes'] += strlen( $input ); + $stats['by_strategy'][ $strategy ] = ( $stats['by_strategy'][ $strategy ] ?? 0 ) + 1; + if ( $mb_valid( $input ) ) { + ++$stats['valid_inputs']; + } + + foreach ( $oracles->drain_events() as $event ) { + Cli::emit( array( 'type' => 'oracle-event', 'case' => $case ) + $event ); + } + + if ( array() !== $failures ) { + $stats['failures'] += count( $failures ); + + $record = array( + 'type' => 'failure', + 'seed' => $seed, + 'case' => $case, + 'strategy' => $strategy, + 'input_size' => strlen( $input ), + 'signatures' => array_values( array_unique( array_column( $failures, 'signature' ) ) ), + 'failures' => $failures, + ); + + if ( strlen( $input ) <= 4096 ) { + $record['input_base64'] = base64_encode( $input ); + } + + if ( '' !== $output_dir ) { + $case_dir = "{$output_dir}/failure-seed{$seed}-case{$case}"; + if ( ! 
is_dir( $case_dir ) ) { + mkdir( $case_dir, 0777, true ); + } + file_put_contents( "{$case_dir}/input.bin", $input ); + + $artifact = $record; + $artifact['input_base64'] = base64_encode( $input ); + $artifact['environment'] = Cli::environment_metadata( $oracles ); + $artifact['git'] = Cli::git_metadata( Bootstrap::repo_root() ); + file_put_contents( + "{$case_dir}/failure.json", + json_encode( $artifact, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES ) + ); + $record['artifact_dir'] = $case_dir; + } + + Cli::emit( $record ); + } + + if ( 0 === ( $stats['cases'] % max( 1, $options['progress-every'] ) ) ) { + $elapsed = microtime( true ) - $started_at; + Cli::emit( + array( + 'type' => 'progress', + 'seed' => $seed, + 'case' => $case, + 'cases_done' => $stats['cases'], + 'failures' => $stats['failures'], + 'cases_per_sec' => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null, + ) + ); + } +} + +$elapsed = microtime( true ) - $started_at; +Cli::emit( + array( + 'type' => 'done', + 'seed' => $seed, + 'stats' => $stats, + 'elapsed_sec' => round( $elapsed, 2 ), + 'cases_per_sec' => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null, + ) +); + +$oracles->shutdown(); +exit( $stats['failures'] > 0 ? 1 : 0 ); From b317933b64b4ef5b7ab25e148cfdd4b1f3dbf39a Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 16:48:19 +0200 Subject: [PATCH 02/14] Add handoff docs for follow-up fuzzing and testing lanes. Four self-contained work-lane documents: extending the encoding fuzzer (utf8_encode/decode fallback differentials before PHP 9 removes the native oracles, the confirmed wp_has_noncharacters PCRE-vs-fallback divergence on ill-formed input, exhaustive code_point_to_utf8_bytes), an independent WP_HTML_Decoder fuzzer against the Dom\HTMLDocument oracle, WP_Token_Map property tests against a naive reference (building on the existing wpTokenMap.php tests), and a one-shot divergence survey of seems_utf8 and wp_check_invalid_utf8. 
--- handoffs/README.md | 19 ++++ handoffs/extend-encoding-fuzzer.md | 111 ++++++++++++++++++ handoffs/html-decoder-fuzzer.md | 130 ++++++++++++++++++++++ handoffs/legacy-utf8-divergence-survey.md | 70 ++++++++++++ handoffs/token-map-properties.md | 91 +++++++++++++++ 5 files changed, 421 insertions(+) create mode 100644 handoffs/README.md create mode 100644 handoffs/extend-encoding-fuzzer.md create mode 100644 handoffs/html-decoder-fuzzer.md create mode 100644 handoffs/legacy-utf8-divergence-survey.md create mode 100644 handoffs/token-map-properties.md diff --git a/handoffs/README.md b/handoffs/README.md new file mode 100644 index 0000000000000..176ea716eef07 --- /dev/null +++ b/handoffs/README.md @@ -0,0 +1,19 @@ +# Fuzzing / testing work lanes + +Self-contained handoff documents, one per independent lane of work. Each +can be picked up by a separate agent or contributor with no shared +context beyond the document itself. + +| Lane | Doc | Shape of work | +|------|-----|---------------| +| Extend the UTF-8 encoding fuzzer | [extend-encoding-fuzzer.md](extend-encoding-fuzzer.md) | Add targets to an existing, working fuzzer | +| WP_HTML_Decoder fuzzer | [html-decoder-fuzzer.md](html-decoder-fuzzer.md) | New independent fuzzer, Dom\HTMLDocument oracle | +| WP_Token_Map property tests | [token-map-properties.md](token-map-properties.md) | PHPUnit property tests against a naive reference | +| Legacy UTF-8 helper divergence survey | [legacy-utf8-divergence-survey.md](legacy-utf8-divergence-survey.md) | One-shot documented survey, no continuous fuzzing | + +Background: `tools/encoding-fuzz/` (this branch, commit `3cc3e64765`) +is a working differential fuzzer for `wp_is_valid_utf8()` / +`wp_scrub_utf8()` and their pure-PHP fallbacks. ~570k cases have run +clean against five independent oracles. Its architecture (deterministic +`(seed, case)` generation, oracle battery, worker/runner/replay/minimize, +mutation-tested harness) is the reference pattern for the other lanes. 
diff --git a/handoffs/extend-encoding-fuzzer.md b/handoffs/extend-encoding-fuzzer.md new file mode 100644 index 0000000000000..1c6106a750d2c --- /dev/null +++ b/handoffs/extend-encoding-fuzzer.md @@ -0,0 +1,111 @@ +# Handoff: extend the UTF-8 encoding fuzzer with three new targets + +## Status + +Not started. The host fuzzer (`tools/encoding-fuzz/`) is complete and +working at commit `3cc3e64765` on branch `fuzz-encoder`; read its +`README.md` first. ~570k cases have run clean against the current +targets, so the infrastructure is trustworthy. + +## Goal + +Round out coverage of `src/wp-includes/compat-utf8.php` by adding: + +1. `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()` + differentials against the native `utf8_encode()` / `utf8_decode()`. +2. `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` + differential — **after resolving the semantic question below**. +3. A one-shot exhaustive test of + `WP_HTML_Decoder::code_point_to_utf8_bytes()` (not fuzzing). + +## 1. utf8_encode / utf8_decode fallbacks + +**Why now:** the native functions are the only ground truth, deprecated +since PHP 8.2 and removed in PHP 9. Fuzz the differential while the +oracle still exists in the runtime. + +- Oracles: `@utf8_encode()` / `@utf8_decode()` (suppress deprecation + notices; on PHP 9+ skip these checks with an `oracle-unavailable` + event, same pattern `lib/Oracles.php` already uses). +- Spot-probes already done (2026-06-10, PHP 8.4.21): native and + fallback agree on valid input, invalid maximal subparts (`?` per + subpart), code points > U+00FF (`?`), and round-trip text. No known + divergence going in. +- Checks to add: byte equality vs native on arbitrary input (decode) + and on arbitrary input treated as latin1 (encode); round-trip + `decode(encode(s)) === s` for any byte string `s` (encode is total + and injective per byte); encode output is always valid UTF-8 per + the existing `mb` oracle. 
+- Wire-up: add target entries in `lib/Targets.php`, checks in + `lib/Checks.php`, and broken-implementation cases in + `tests/harness-smoke.php` (the smoke test mutation-tests detection — + every new check needs a deliberately broken variant proving it fires; + e.g. a decode that emits one `?` per invalid *byte* instead of per + maximal subpart). + +## 2. wp_has_noncharacters — resolve semantics first + +**Known divergence, confirmed empirically (2026-06-10):** + +```php +$probe = "\xC0\xEF\xBF\xBE"; // invalid byte, then U+FFFE +wp_has_noncharacters( $probe ); // false — PCRE path: preg_match fails on ill-formed UTF-8 +_wp_has_noncharacters_fallback( $probe ); // true — scan skips invalid spans, finds U+FFFE +``` + +The same public function answers differently depending on which +environment branch of `src/wp-includes/utf8.php` loaded. A naive +differential will fail on roughly its first invalid-input case. Do NOT +just add the check and let it scream: + +1. Decide (or get a decision on) intended behavior for ill-formed + input. Options: (a) document that behavior is undefined unless + `wp_is_valid_utf8()` — then fuzz the differential on valid inputs + only, plus a fixed regression vector for the documented stance; + (b) align the implementations (likely the fallback is the *better* + semantic — finding real noncharacters — but the PCRE version ships + on most hosts). This probably warrants a Trac ticket / discussion + with the function author before code changes. +2. Either way, fuzz the three-way differential on **valid** inputs + immediately: PCRE implementation vs fallback vs a trivial reference + (decode code points, check the U+FDD0–U+FDEF / U+xFFFE / U+xFFFF + list). The generator already emits noncharacter-dense input + (`BOUNDARY_CODE_POINTS` in `lib/Generator.php`). + +## 3. 
code_point_to_utf8_bytes — exhaust, don't fuzz + +`WP_HTML_Decoder::code_point_to_utf8_bytes()` +(`src/wp-includes/html-api/class-wp-html-decoder.php:426`) has a domain +of ~1.1M values. Write a standalone script (or slow-group PHPUnit test) +asserting equality with `mb_chr( $cp, 'UTF-8' )` for every code point +0x0–0x10FFFF, including expected behavior for surrogates and +out-of-range values (check what the function documents; `mb_chr` +returns `false` for surrogates — decide the comparison accordingly). +Runs in seconds; total coverage; done forever. Note this class is +loaded from `html-api/`, so the fuzzer bootstrap (`lib/Bootstrap.php`) +needs to require it (it has no dependencies beyond the token map — if +it pulls more, load only for this check). + +## Verification / definition of done + +- `php tools/encoding-fuzz/tests/harness-smoke.php` passes, including + new broken-variant detections for every added check. +- A fault-injection variant per new target in `lib/Targets.php` + (`ENCODING_FUZZ_FAULT=...`) exercises worker → replay → minimize end + to end. +- `php tools/encoding-fuzz/runner.php --lanes 4 --duration-seconds 60` + runs clean (or findings are triaged and documented, not silenced). +- README.md oracle/check tables updated. + +## Gotchas inherited from the existing harness + +- All scrub/validity oracles passed a hand-computed battery; new + oracles must too (`Oracles::battery()` pattern). iconv is excluded + for accepting code points above U+10FFFF — don't re-add it. +- Workers run checks in-process; an infinite loop in a new target will + trip the runner's 120s stall watchdog and record the seed. Keep that + property: no per-case subprocesses. +- Everything must stay derivable from `(seed, case index)` — no + `random_int()`, no time-dependent generation. Per-case chunking-type + randomness derives from `sha256(input)` (see + `Checks::check_chunked_scan()`). 
diff --git a/handoffs/html-decoder-fuzzer.md b/handoffs/html-decoder-fuzzer.md new file mode 100644 index 0000000000000..1e2b3d0c6ba63 --- /dev/null +++ b/handoffs/html-decoder-fuzzer.md @@ -0,0 +1,130 @@ +# Handoff: independent fuzzer for WP_HTML_Decoder + +## Status + +Not started. This is a NEW fuzzer, separate from `tools/encoding-fuzz/` +(UTF-8 functions) and from the `html-api-fuzz` branch (whole-tree +parser comparison). Reuse the architecture of `tools/encoding-fuzz/` — +deterministic `(seed, case)` generation, oracle startup battery, +worker/runner/replay/minimize CLIs, mutation-tested harness smoke test — +but as its own tool directory (suggested: `tools/html-decoder-fuzz/`). + +## Target + +`WP_HTML_Decoder` in `src/wp-includes/html-api/class-wp-html-decoder.php`: + +- `decode_text_node( $text )` +- `decode_attribute( $text )` +- `read_character_reference( $context, $text, $at, &$match_byte_length )` +- `attribute_starts_with( $haystack, $search, $case_sensitivity )` + +This is security-relevant code: decoded attribute values feed +`javascript:` URL detection via `attribute_starts_with`. Existing unit +tests are thin (`tests/phpunit/tests/html-api/wpHtmlDecoder.php`, 4 test +methods) — fuzzing has real headroom here. + +Dependency note: the named-reference path uses `WP_Token_Map` and the +`$html5_named_character_reference` map +(`src/wp-includes/html-api/html5-named-character-references.php`). +A decoder fuzzer transitively exercises both. + +## Oracle + +`Dom\HTMLDocument` (lexbor, PHP 8.4+) — the same oracle the +`html-api-fuzz` branch uses for tree comparison: + +- Text context: parse `
<div>PAYLOAD</div>
`, read + the div's `textContent`; compare with `decode_text_node( PAYLOAD )`. +- Attribute context: parse `<div title="PAYLOAD">
`, read + `getAttribute('title')`; compare with `decode_attribute( PAYLOAD )`. + +Do NOT use `html_entity_decode( ENT_HTML5 )` as the primary oracle: it +does not implement the WHATWG attribute-context rules (named reference +without semicolon followed by `=` or alphanumeric must NOT decode in +attributes) and will drown the run in false divergences. It MAY serve +as a third opinion on the text context only, gated by a known-answer +battery like `Oracles::battery()` in the encoding fuzzer — verify +empirically before trusting it, including C1-control numeric reference +remapping (`&#x80;` → U+20AC etc.). + +## Confounders the harness must neutralize + +The oracle is a full HTML parser; the target is a pure decoder. The +generator must avoid payload bytes the parser treats specially, or the +comparison measures parser behavior instead of decoding: + +- `<`, `>`, `&` followed by structure-breaking content — do not try to + escape `<` as text; restrict generated payloads to never contain raw + `<`. `&` is the whole point and is fine in both contexts. +- Quote characters in the attribute payload — generate with `"` + excluded (or swap quote style per case), since it terminates the + attribute in the oracle document but not in `decode_attribute()`. +- CR / CRLF: the HTML parser normalizes `\r` and `\r\n` to `\n` before + tokenization; the decoder does not. Either exclude `\r` from payloads + or pre-normalize before comparison — decide once, document it. +- NUL bytes: parser replaces U+0000 with U+FFFD in some contexts / + drops in others; the decoder has its own documented NUL handling + (see existing test `test_character_reference_with_null_byte...`). + Probably exclude raw NUL from oracle-compared cases and cover NUL + via fixed regression vectors instead. +- Invalid UTF-8 payload bytes: lexbor may scrub them before the + tokenizer sees them. Start with valid-UTF-8 payloads only; invalid + bytes inside character references (`&am\xC0p;`) are a later, careful + extension.
+ +## Generator: entity grammar, not byte noise + +Weighted mix targeting the reference-matching state machine: + +- Named references from the real token map: exact (`&amp;`), without + semicolon (`&amp`), longest-match ambiguity (`&not` vs `&notin;` — + the map is greedy-longest), case variants (`&AMP;` vs `&amp;`), + truncations (`&am`), nonexistent lookalikes (`&ampx;`). +- The attribute-context discriminator: no-semicolon named reference + followed by `=`, by alphanumerics, by `;` later in the string — + decode in text, not in attribute. +- Numeric: decimal and hex, mixed case `x`/`X`, leading zeros (many), + value classes: ASCII, C1 controls 0x80–0x9F (windows-1252 remap + table), surrogates, noncharacters, > 0x10FFFF, huge (overflow + arithmetic), zero, missing digits (`&#;`, `&#x;`). +- Adjacency and boundaries: references back to back, reference at + string start/end, `&` at end of input, references split by the + string boundary at every prefix length (truncation sweep). +- Plain text with multibyte UTF-8 around references (offset arithmetic). + +Each case is `(context, payload)`; derive both from the PRNG. + +## Checks + +1. Differential vs oracle in both contexts (primary). +2. `read_character_reference()` consistency: decoding the whole string + by repeated `read_character_reference` + literal spans must equal + `decode()` output, and `$match_byte_length` must always advance. +3. `attribute_starts_with( $haystack, $search )` agrees with + `str_starts_with( decode_attribute( $haystack ), $search )` for + ASCII search strings, both case sensitivities. +4. Output is valid UTF-8 (reuse `mb_check_encoding`). +5. Idempotence does NOT hold for decoding (`&amp;amp;` decodes to + `&amp;`) — do not add it; add instead: decoding text with no `&` + is identity.
+ +## Harness requirements (carry over from encoding fuzzer) + +- Known-answer startup battery for the oracle path (hand-computed + WHATWG expectations, including the C1 remap and no-semicolon + attribute rules) — if the local `Dom\HTMLDocument` fails it, abort + loudly. +- Mutation-tested smoke test: broken decoder variants (skip C1 remap, + decode no-semicolon refs in attributes, off-by-one match length) + must be caught before the fuzzer is trusted. +- Failure artifacts self-contained (base64 input + context + expected/ + got), replay + signature-preserving minimizer. +- Note `html-api-fuzz` branch precedent: its `attributes-entities` + generator profile and oracle handling are prior art worth reading + (`tools/html-api-fuzz/lib/Generator.php` on that branch). + +## Definition of done + +Smoke test green (including broken-variant detection), a 5-minute +multi-lane run either clean or with triaged findings, README with the +oracle-confounder decisions documented. diff --git a/handoffs/legacy-utf8-divergence-survey.md b/handoffs/legacy-utf8-divergence-survey.md new file mode 100644 index 0000000000000..c126f5c494abd --- /dev/null +++ b/handoffs/legacy-utf8-divergence-survey.md @@ -0,0 +1,70 @@ +# Handoff: one-shot divergence survey of legacy UTF-8 helpers + +## Status + +Not started. Deliverable is a **document**, not code and not a +continuous fuzzer. + +## Premise + +`src/wp-includes/formatting.php` contains older UTF-8 helpers that +overlap with the new strict functions in `src/wp-includes/utf8.php`: + +- `seems_utf8( $str )` (formatting.php:884) — loose structural + heuristic, predates `wp_is_valid_utf8()`. +- `wp_check_invalid_utf8( $text, $strip )` (formatting.php:1127) — + PCRE-based, charset-option dependent, with a `$strip` mode. + +These are *intentionally loose*; a continuous differential against +`wp_is_valid_utf8()` would report their known sloppiness forever and +train people to ignore the fuzzer. 
What's actually useful is a +one-time, well-organized map of exactly where they diverge from the +strict functions — as input for deprecation/migration decisions and +docblock updates. + +## Method + +1. Reuse the generator and battery from `tools/encoding-fuzz/` + (`lib/Generator.php`, `Oracles::battery()`) to drive a few million + inputs through `seems_utf8`, `wp_check_invalid_utf8` (both `$strip` + modes), and `wp_is_valid_utf8` side by side. A throwaway script in + the same style as `tools/encoding-fuzz/worker.php` is fine; it does + not need to be committed. +2. Bucket divergences by *class*, not by input: e.g. "seems_utf8 + accepts overlong encodings", "accepts surrogates", "accepts + code points above U+10FFFF", "wp_check_invalid_utf8 returns '' + instead of stripping when X". Minimize one representative per class + (2–4 bytes each, by hand or with the encoding fuzzer's minimizer + predicate pattern). +3. Note environment sensitivity: `wp_check_invalid_utf8` consults the + blog charset (`get_option( 'blog_charset' )`) — it needs either a WP + test bootstrap or careful stubbing; document which path was tested. + This is the reason these functions were excluded from the encoding + fuzzer in the first place. +4. Cross-check each divergence class against the functions' docblocks + and original Trac tickets (`git log -L` on the functions; Trac + search for `seems_utf8`) to separate "documented, intentional + looseness" from "nobody ever decided this". 
+ +## Deliverable + +A single markdown report (suggested: +`handoffs/legacy-utf8-divergence-report.md`, or a Trac ticket comment) +containing: + +- a divergence matrix: input class × function → accept/reject/output, + with minimal byte examples +- for each class: intentional vs accidental, with evidence +- migration guidance: for each current core caller of `seems_utf8` / + `wp_check_invalid_utf8` (grep the callers), whether + `wp_is_valid_utf8` / `wp_scrub_utf8` is a drop-in, a + behavior-changing replacement, or unsuitable +- explicit recommendation per function: deprecate, document, or leave + +## Non-goals + +No code changes to formatting.php, no continuous fuzzing of these +functions, no "fixing" divergences before the survey establishes which +ones are load-bearing for existing content (a stricter check that +rejects bytes previously accepted can break saved posts on upgrade — +flag any such case prominently). diff --git a/handoffs/token-map-properties.md b/handoffs/token-map-properties.md new file mode 100644 index 0000000000000..cd127297b527b --- /dev/null +++ b/handoffs/token-map-properties.md @@ -0,0 +1,91 @@ +# Handoff: property-based tests for WP_Token_Map + +## Status + +Not started. **This class HAS existing tests — explore them before +acting**: `tests/phpunit/tests/wp-token-map/wpTokenMap.php` (8 test +methods). Read that file first and map what is already covered; do not +duplicate it. As of commit `3cc3e64765` the existing coverage includes: +construction validation, over-long word rejection, round-trip through +`to_array()`, round-trip through `precomputed_php_source_table()` / +`from_precomputed_table()`, longest-match-first behavior, short words +(shorter than the group key length), reading at an offset, and a sweep +over all HTML5 named references. The *gap* is adversarial/generated +token sets and randomized probes — the existing tests use a handful of +hand-picked fixtures. 
+ +## Why property tests, not a continuous fuzzer + +`WP_Token_Map` (`src/wp-includes/class-wp-token-map.php`) is a static +data structure with a free, trivially-correct reference implementation: +a linear scan over the source array. No external oracle, no subprocess, +deterministic. That shape belongs in PHPUnit (fast, runs in CI forever) +rather than a CPU-burning fuzz loop. The production-critical instance +(`$html5_named_character_reference`) additionally gets exercised +transitively by the WP_HTML_Decoder fuzzer lane. + +## Properties to test + +Against a naive reference (`contains`: `in_array` with optional +`strcasecmp`; `read_token`: try every word sorted by length descending, +return first prefix match): + +1. `contains( $word, $case_sensitivity )` ≡ reference, for every word + in the set, every prefix of a word, every word with one byte + appended/removed/changed, and random probes. +2. `read_token( $text, $offset )` ≡ reference (token AND + `$matched_token_byte_length`), at every offset of generated + documents that embed tokens, near-tokens, and token prefixes. +3. Greedy longest-match: when one word is a prefix of another and both + could match, the longer wins (generate nested-prefix families + deliberately: `a`, `ab`, `abc`, …). +4. Round-trips on *generated* maps (the existing tests round-trip + fixtures): `from_array( to_array() )` preserves behavior; + `eval`'d `precomputed_php_source_table()` → + `from_precomputed_table()` preserves behavior. Compare behavior + (all probes), not just array equality. +5. Case-insensitive mode with non-ASCII bytes: PHP's `strcasecmp` is + byte/locale-ASCII; verify the class and reference agree on what + "ascii-case-insensitive" means for bytes ≥ 0x80 (the docblock says + ASCII case only — pin that). 
+ +## Generated token sets — where bugs would live + +Deterministic seeds (`mt_srand` with fixed seed, or reuse +`tools/encoding-fuzz/lib/Prng.php`), sets of 1–200 words drawn from: + +- words shorter than `$key_length` (the "small words" storage path), + exactly `$key_length`, and up to the 256-byte limit +- nested prefix families +- words sharing the same `$key_length`-byte group key +- bytes: ASCII letters both cases, digits, `;`, high bytes ≥ 0x80, + multibyte UTF-8 sequences, and NUL — first check what existing tests + cover regarding NUL; if undefined, document rather than assert +- `$key_length` values 1 and 2 (and whatever range the class accepts) + +Probe documents: concatenations of set words, prefixes, near-misses, +random bytes, at random offsets. + +## Practical notes + +- Match the existing test file's conventions (data providers, group + annotations). New file suggested: + `tests/phpunit/tests/wp-token-map/wpTokenMapProperties.php` with a + `@group token-map` annotation consistent with the existing file. +- Keep runtime sane for CI: a few thousand generated probes per + property, fixed seed so failures are reproducible. Print the seed + and the serialized token set in assertion messages so a failure is + immediately actionable. +- `precomputed_php_source_table()` round-trip uses `eval` — the + existing test already does this; follow its pattern. +- If a property fails, minimize by hand (sets are small) and add the + minimal case as a fixed regression test alongside the property. + +## Definition of done + +New property test file passing under +`vendor/bin/phpunit --group token-map` (or this repo's equivalent: +`npm run test:php -- --group token-map`), covering the five properties +on generated sets, with documented seeds, no duplication of the eight +existing tests, and any discovered divergence filed/minimized rather +than worked around.
From 31d1aa7e03d5b878b727df03bf74a4af232937cf Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 22:31:01 +0200 Subject: [PATCH 03/14] Fuzzer: Add utf8_encode/utf8_decode fallback differentials. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the encoding fuzzer with targets for _wp_utf8_encode_fallback() and _wp_utf8_decode_fallback(), fuzzing them against mb_convert_encoding (primary) and the deprecated native utf8_encode()/utf8_decode() pair while it still exists, plus round-trip and output-validity invariants. The handoff's premise that native and fallback share semantics on invalid input was falsified during implementation: legacy utf8_decode() groups a well-formed lead byte with its expected continuation length into a single '?' (surrogates, beyond-U+10FFFF, 3/4-byte overlongs, C2 C0), while WordPress deliberately follows mb_convert_encoding's maximal-subpart semantics (the PHP 9 polyfill in compat.php prefers mb; ticket #63863). The native decode oracle is therefore trusted on valid input only — where it provably agrees with mb on every code point — and the divergence is pinned by hand-computed battery vectors instead of fuzzed. Detection is mutation-tested: seven new broken-implementation classes in the smoke test (cp1252-confused encoder, identity encoder, per-byte decoder, valid-input mangler, round-trip violator, null-returning encoder and decoder — the fallbacks are untyped, so non-string returns are reported as target-bad-return rather than silently skipped), and ENCODING_FUZZ_FAULT=encode-cp1252|decode-per-byte exercise the worker → replay → minimize pipeline end to end (minimal counterexamples: '80' and 'E7 B8'). Also records an upstream finding in the handoff: the #63863 PHPUnit test's invalid-input coverage is vacuous (integer interpolation instead of chr(), single-quoted escape sequences, U+E000 boundary off-by-one). 
--- handoffs/extend-encoding-fuzzer.md | 69 +++--- tools/encoding-fuzz/README.md | 61 ++++-- tools/encoding-fuzz/lib/Checks.php | 178 ++++++++++++++++ tools/encoding-fuzz/lib/Oracles.php | 219 +++++++++++++++++++- tools/encoding-fuzz/lib/Targets.php | 47 ++++- tools/encoding-fuzz/tests/harness-smoke.php | 89 ++++++-- 6 files changed, 601 insertions(+), 62 deletions(-) diff --git a/handoffs/extend-encoding-fuzzer.md b/handoffs/extend-encoding-fuzzer.md index 1c6106a750d2c..7fdd0f6797841 100644 --- a/handoffs/extend-encoding-fuzzer.md +++ b/handoffs/extend-encoding-fuzzer.md @@ -2,10 +2,10 @@ ## Status -Not started. The host fuzzer (`tools/encoding-fuzz/`) is complete and -working at commit `3cc3e64765` on branch `fuzz-encoder`; read its -`README.md` first. ~570k cases have run clean against the current -targets, so the infrastructure is trustworthy. +Sections 1 (utf8_encode/decode) DONE; sections 2–3 in progress. The +host fuzzer (`tools/encoding-fuzz/`) is complete and working on branch +`fuzz-encoder`; read its `README.md` first. ~570k cases had run clean +against the original targets before this work started. ## Goal @@ -18,30 +18,43 @@ Round out coverage of `src/wp-includes/compat-utf8.php` by adding: 3. A one-shot exhaustive test of `WP_HTML_Decoder::code_point_to_utf8_bytes()` (not fuzzing). -## 1. utf8_encode / utf8_decode fallbacks - -**Why now:** the native functions are the only ground truth, deprecated -since PHP 8.2 and removed in PHP 9. Fuzz the differential while the -oracle still exists in the runtime. - -- Oracles: `@utf8_encode()` / `@utf8_decode()` (suppress deprecation - notices; on PHP 9+ skip these checks with an `oracle-unavailable` - event, same pattern `lib/Oracles.php` already uses). -- Spot-probes already done (2026-06-10, PHP 8.4.21): native and - fallback agree on valid input, invalid maximal subparts (`?` per - subpart), code points > U+00FF (`?`), and round-trip text. No known - divergence going in. 
-- Checks to add: byte equality vs native on arbitrary input (decode) - and on arbitrary input treated as latin1 (encode); round-trip - `decode(encode(s)) === s` for any byte string `s` (encode is total - and injective per byte); encode output is always valid UTF-8 per - the existing `mb` oracle. -- Wire-up: add target entries in `lib/Targets.php`, checks in - `lib/Checks.php`, and broken-implementation cases in - `tests/harness-smoke.php` (the smoke test mutation-tests detection — - every new check needs a deliberately broken variant proving it fires; - e.g. a decode that emits one `?` per invalid *byte* instead of per - maximal subpart). +## 1. utf8_encode / utf8_decode fallbacks — DONE, premise corrected + +**Implemented**, but a premise of this section was falsified during +implementation and the oracle design adapted (2026-06-10, PHP 8.4.21): + +- The original claim "No known divergence going in" was wrong: the + earlier spot-probes missed it. Native `utf8_decode()` groups a + well-formed lead byte with its expected continuation length and emits + a single `?` for surrogates (`ED A0 80` → `?`), beyond-U+10FFFF + sequences (`F4 90 80 80` → `?`), 3-/4-byte overlongs, and a + well-formed lead before an invalid continuation (`C2 C0` → `?`), + where the fallback emits one `?` per maximal subpart (`???` etc.). +- That divergence is **intentional** in WordPress: the PHP 9 polyfill + in `compat.php` prefers `mb_convert_encoding()` (which uses maximal + subparts) over the fallback, and the #63863 PHPUnit tests assert + mb-equivalence. So "the native functions are the only ground truth" + was also wrong — WP's chosen ground truth is `mb_convert_encoding()`. +- Oracle design as built: `mb` (`mb_convert_encoding()`) is the primary + encode/decode oracle on arbitrary input; `native` is an encode oracle + on arbitrary input and a decode oracle on **valid input only** + (native ≡ mb on every valid code point, verified exhaustively). 
On + PHP 9+ `native` reports `oracle-unavailable` and is skipped. The + legacy divergence is pinned by hand-computed battery vectors. +- Round-trip `decode(encode(s)) === s`, encode-output-validity, the + smoke-test mutation variants (cp1252-confused encoder, identity + encoder, per-byte decoder, valid-input mangler, round-trip violator, + null-returning targets), and the `ENCODING_FUZZ_FAULT=encode-cp1252` + / `decode-per-byte` end-to-end fault variants are all in place. + +**Upstream finding, not fixed here:** the cited core test +`tests/phpunit/tests/formatting/deprecatedUtfEncodeDecode.php` has +vacuous invalid-input coverage — its surrogate branch interpolates +integers instead of `chr()` bytes (`"{$byte1}{$byte2}{$byte3}"` +produces ASCII digits), its single-quoted `'\x95'` data is literal +backslash text, and the `$i < 0xD800 || $i > 0xE000` boundary routes +valid U+E000 through the broken branch. It only ever asserts +mb-equivalence on valid input. Worth a follow-up patch on #63863. ## 2. wp_has_noncharacters — resolve semantics first diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index 8deee79516156..80ac3da42df19 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -4,6 +4,7 @@ Differential fuzzer for the WordPress UTF-8 functions: - `wp_is_valid_utf8()` / `_wp_is_valid_utf8_fallback()` - `wp_scrub_utf8()` / `_wp_scrub_utf8_fallback()` +- `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()` - `_wp_utf8_codepoint_count()` and the resumable `_wp_scan_utf8()` paths (secondary) The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main @@ -15,13 +16,39 @@ bootstrap, database, or `wp-env`. 
Every result is compared against independent known-good implementations: -| Oracle | Backing | Validity | Scrub | -|-----------|--------------------------------------|----------|-------| -| `mb` | `mb_check_encoding()` / `mb_scrub()` | ✓ | ✓ (primary) | -| `pcre` | PCRE2 strict UTF validation | ✓ | | -| `intl` | ICU `UConverter::transcode()` | | ✓ | -| `python3` | CPython codec, persistent subprocess | ✓ | ✓ | -| `node` | WHATWG `TextDecoder`, persistent subprocess | ✓ | ✓ | +| Oracle | Backing | Validity | Scrub | Encode | Decode | +|-----------|--------------------------------------|----------|-------|--------|--------| +| `mb` | `mb_check_encoding()` / `mb_scrub()` / `mb_convert_encoding()` | ✓ | ✓ (primary) | ✓ (primary) | ✓ (primary) | +| `pcre` | PCRE2 strict UTF validation | ✓ | | | | +| `intl` | ICU `UConverter::transcode()` | | ✓ | | | +| `python3` | CPython codec, persistent subprocess | ✓ | ✓ | | | +| `node` | WHATWG `TextDecoder`, persistent subprocess | ✓ | ✓ | | | +| `native` | deprecated `utf8_encode()` / `utf8_decode()` | | | ✓ | ✓ (valid input only) | + +Encode oracles answer "what is this ISO-8859-1 text as UTF-8?"; decode +oracles the reverse. The `native` pair exists until PHP 9 removes it; on +PHP 9+ it is reported as `oracle-unavailable` and skipped. Its decode +side is trusted on valid input only: on ill-formed input the legacy +decoder groups a well-formed lead byte with its expected continuation +length and emits a single `?` in several classes — surrogates +(`ED A0 80` → `?` vs `???`), sequences past U+10FFFF (`F4 90 80 80`), +three/four-byte overlongs (`E0 80 AF`), and even a well-formed lead +before an invalid continuation (`C2 C0`) — though it agrees with +maximal subparts elsewhere (e.g. C0/C1 overlongs and lone +continuations). 
WordPress deliberately follows the maximal-subpart +semantics of `mb_convert_encoding()` (one `?` per subpart) instead: +the PHP 9 polyfill in `compat.php` prefers `mb_convert_encoding()` +with `_wp_utf8_decode_fallback()` as its mbstring-less shadow +(ticket #63863). + +Because native and mb decoding agree on *every* valid code point +(verified exhaustively over U+0000–U+10FFFF), the valid-input-only +native decode differential adds little detection power beyond `mb`; it +exists to scream if mb and the fallback ever jointly drift from legacy +behavior on valid text. The legacy-vs-WordPress behavior on ill-formed +input is a documented, intentional divergence — pinned here by battery +vectors, not fuzzed. Cataloguing the full legacy divergence surface is +the separate `legacy-utf8-divergence-survey` work lane. All scrub oracles implement the Unicode "maximal subpart" replacement recommendation (Unicode 16.0 §3.9, Table 3-8), which is the documented @@ -38,7 +65,10 @@ External oracles are auto-detected; control them with ## Checks Differentials: both validity targets against every validity oracle, both -scrub targets against every scrub oracle. Oracle-vs-oracle disagreements +scrub targets against every scrub oracle, `_wp_utf8_encode_fallback()` +against every encode oracle (input treated as ISO-8859-1), and +`_wp_utf8_decode_fallback()` against every decode oracle (the `native` +decode oracle on valid input only). Oracle-vs-oracle disagreements are reported separately (`oracle-disagreement`) so they don't masquerade as WordPress bugs. 
@@ -52,6 +82,9 @@ Internal invariants: - scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points` chunks reconstructs the same scrubbed text and always makes forward progress (chunk sizes derive from the input hash, so replays are exact) +- `_wp_utf8_encode_fallback()` output is always valid UTF-8 +- `_wp_utf8_decode_fallback( _wp_utf8_encode_fallback( $s ) ) === $s` + for any byte string `$s` (encode is total and injective per byte) ## Inputs @@ -121,16 +154,18 @@ php tools/encoding-fuzz/tests/harness-smoke.php ``` Verifies the oracle battery, runs the real targets over the battery -vectors, and — most importantly — mutation-tests the harness: seven +vectors, and — most importantly — mutation-tests the harness: fourteen classes of deliberately broken implementations (validator accepting 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber, identity scrubber, byte-dropping scrubber, off-by-one code point count, -throwing target) must all be caught. It also asserts generator -determinism and the valid/invalid input mix. +throwing target, cp1252-confused encoder, identity encoder, per-byte +decoder, valid-input-mangling decoder, round-trip-violating decoder, +null-returning encoder, sometimes-null decoder) must all be caught. It +also asserts generator determinism and the valid/invalid input mix. 
For end-to-end pipeline testing while the real implementations are -healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal` injects a broken -target into worker, replay, and minimize alike: +healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte` +injects a broken target into worker, replay, and minimize alike: ```sh ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/runner.php --lanes 2 --duration-seconds 5 diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php index ad242666c1f2f..cf6d7c8be2d7b 100644 --- a/tools/encoding-fuzz/lib/Checks.php +++ b/tools/encoding-fuzz/lib/Checks.php @@ -20,6 +20,16 @@ * chunks reconstructs the same scrubbed text and always makes * forward progress * + * Legacy `utf8_encode()` / `utf8_decode()` fallbacks: + * - `_wp_utf8_encode_fallback()` vs every encode oracle on arbitrary + * input treated as ISO-8859-1. + * - `_wp_utf8_decode_fallback()` vs the mb decode oracle on arbitrary + * input; the legacy native oracle is consulted on valid input only + * (see the divergence note in `Oracles`). + * - encode output is always valid UTF-8 + * - `decode(encode(s)) === s` for any byte string `s` (encode is total + * and injective per byte) + * * Target callables are injectable so the harness smoke test can verify * that deliberately broken implementations are caught. */ @@ -238,6 +248,174 @@ public function run( string $input ): array { $failures[] = $chunk_failure; } + // 8. Legacy utf8_encode()/utf8_decode() fallback differentials. + foreach ( $this->check_utf8_encode_decode( $input, $ref_valid, $mb_validity ) as $failure ) { + $failures[] = $failure; + } + + return $failures; + } + + /** + * Differentials and invariants for the `utf8_encode()` / + * `utf8_decode()` fallback pair. The same input is exercised both as + * ISO-8859-1 (encode, total over arbitrary bytes) and as UTF-8 + * (decode). 
The legacy native decode oracle is consulted on valid + * input only; on ill-formed input WordPress deliberately follows + * `mb_convert_encoding()` maximal-subpart semantics instead. + * + * @return array + */ + private function check_utf8_encode_decode( string $input, bool $ref_valid, callable $mb_validity ): array { + $failures = array(); + $encode_oracles = $this->oracles->encode_oracles(); + $decode_oracles = $this->oracles->decode_oracles(); + + /* + * The fallbacks are untyped, so a broken variant could return null + * (or anything else) instead of throwing; treat any non-string + * return as a failure rather than silently skipping every check. + */ + $results = array(); + foreach ( array( 'utf8_encode_fb', 'utf8_decode_fb' ) as $key ) { + try { + $result = ( $this->targets[ $key ] )( $input ); + + if ( ! is_string( $result ) ) { + $failures[] = self::failure( + 'target-bad-return', + $key, + array( + 'target' => $key, + 'type' => get_debug_type( $result ), + ) + ); + $result = null; + } + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + $key, + array( + 'target' => $key, + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + $result = null; + } + + $results[ $key ] = $result; + } + + // Differentials against the encode/decode oracles. + $ref_encode = isset( $encode_oracles['mb'] ) ? $encode_oracles['mb']( $input ) : null; + $ref_decode = isset( $decode_oracles['mb'] ) ? 
$decode_oracles['mb']( $input ) : null; + + if ( null !== $ref_encode && null !== $results['utf8_encode_fb'] && $results['utf8_encode_fb'] !== $ref_encode ) { + $failures[] = self::failure( + 'utf8-encode-mismatch', + 'utf8_encode_fb', + self::diff_detail( 'utf8_encode_fb', $ref_encode, $results['utf8_encode_fb'] ) + ); + } + + if ( null !== $ref_decode && null !== $results['utf8_decode_fb'] && $results['utf8_decode_fb'] !== $ref_decode ) { + $failures[] = self::failure( + 'utf8-decode-mismatch', + 'utf8_decode_fb', + self::diff_detail( 'utf8_decode_fb', $ref_decode, $results['utf8_decode_fb'] ) + ); + } + + if ( null !== $ref_encode ) { + foreach ( $encode_oracles as $name => $oracle ) { + if ( 'mb' === $name ) { + continue; + } + + $oracle_encode = $oracle( $input ); + if ( $oracle_encode !== $ref_encode ) { + $failures[] = self::failure( + 'oracle-disagreement', + "utf8-encode:{$name}", + self::diff_detail( $name, $ref_encode, $oracle_encode ) + ); + } + } + } + + if ( null !== $ref_decode ) { + foreach ( $decode_oracles as $name => $oracle ) { + if ( 'mb' === $name ) { + continue; + } + + if ( ! $ref_valid && $this->oracles->decode_oracle_is_valid_only( $name ) ) { + continue; + } + + $oracle_decode = $oracle( $input ); + if ( $oracle_decode !== $ref_decode ) { + $failures[] = self::failure( + 'oracle-disagreement', + "utf8-decode:{$name}", + self::diff_detail( $name, $ref_decode, $oracle_decode ) + ); + } + } + } + + // Encode output must be valid UTF-8 (every byte has a code point). + // This and the round trip below need no conversion oracle. + if ( null !== $results['utf8_encode_fb'] && ! $mb_validity( $results['utf8_encode_fb'] ) ) { + $failures[] = self::failure( + 'utf8-encode-not-valid', + 'utf8_encode_fb', + array( + 'target' => 'utf8_encode_fb', + 'encode_preview' => self::preview( $results['utf8_encode_fb'] ), + ) + ); + } + + // Round trip: encode is total and injective per byte, so decoding + // its output must restore the input exactly. 
A violation implicates + // the pair, not a single side. + if ( null !== $results['utf8_encode_fb'] ) { + try { + $round_trip = ( $this->targets['utf8_decode_fb'] )( $results['utf8_encode_fb'] ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + 'utf8_decode_fb:round-trip', + array( + 'target' => 'utf8_decode_fb', + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + $round_trip = $input; + } + + if ( ! is_string( $round_trip ) ) { + $failures[] = self::failure( + 'target-bad-return', + 'utf8_decode_fb:round-trip', + array( + 'target' => 'utf8_decode_fb', + 'type' => get_debug_type( $round_trip ), + ) + ); + } elseif ( $round_trip !== $input ) { + $failures[] = self::failure( + 'utf8-round-trip-mismatch', + 'round-trip', + self::diff_detail( 'round-trip', $input, $round_trip ) + ); + } + } + return $failures; } diff --git a/tools/encoding-fuzz/lib/Oracles.php b/tools/encoding-fuzz/lib/Oracles.php index 8dce899c84db4..c3c4b0900947f 100644 --- a/tools/encoding-fuzz/lib/Oracles.php +++ b/tools/encoding-fuzz/lib/Oracles.php @@ -6,13 +6,33 @@ * * Validity oracles answer "is this well-formed UTF-8?". * Scrub oracles answer "what does maximal-subpart replacement produce?". + * Encode oracles answer "what is this ISO-8859-1 text as UTF-8?". + * Decode oracles answer "what is this UTF-8 text as ISO-8859-1?". * * - mbstring: `mb_check_encoding()` / `mb_scrub()` (maximal subpart - * since PHP 8.1.6). + * since PHP 8.1.6), `mb_convert_encoding()` for the + * ISO-8859-1 encode/decode pair. * - pcre: PCRE2's strict UTF validity check (validity only). * - intl: ICU via `UConverter::transcode()` (scrub only). * - python3: CPython codec in a persistent subprocess. * - node: WHATWG TextDecoder in a persistent subprocess. + * - native: the deprecated `utf8_encode()` / `utf8_decode()` pair, + * available until its removal in PHP 9. 
The decode side is + * trusted on VALID input only: on ill-formed input the + * legacy decoder groups bytes differently from the maximal + * subpart rule, consuming a well-formed lead byte together + * with its expected continuation length as a single '?' + * unit in several classes — surrogates (`ED A0 80` → '?' + * vs '???'), sequences past U+10FFFF (`F4 90 80 80` → '?' + * vs '????'), three/four-byte overlongs (`E0 80 AF`), and + * even a well-formed lead before an invalid continuation + * (`C2 C0` → '?' vs '??'). It does agree with maximal + * subparts elsewhere (e.g. C0/C1 overlongs and lone + * continuations). WordPress deliberately follows + * `mb_convert_encoding()` maximal-subpart semantics + * instead: the PHP 9 polyfill in `compat.php` prefers + * `mb_convert_encoding()`, with the fallback as its + * shadow (ticket #63863). * * iconv is deliberately NOT an oracle: GNU libiconv accepts code points * above U+10FFFF (e.g. F4 90 80 80), so it fails the battery. @@ -27,6 +47,22 @@ class Oracles { /** @var array */ private array $scrub = array(); + /* + * Unlike validity/scrub oracles, encode/decode oracles are all + * in-process and never return null; `Checks` has no transport-failure + * handling for them. An external (nullable) encode/decode oracle + * would need that handling added first. + */ + + /** @var array */ + private array $encode = array(); + + /** @var array */ + private array $decode = array(); + + /** @var array Decode oracles trusted on valid UTF-8 input only. */ + private array $decode_valid_only = array(); + /** @var ExternalOracle[] */ private array $externals = array(); @@ -58,6 +94,41 @@ public static function build( array $external_names ): self { ); } + if ( function_exists( 'mb_convert_encoding' ) ) { + // Encode is total over ISO-8859-1 bytes; no substitutions can occur. + $oracles->encode['mb'] = static function ( string $bytes ): string { + return mb_convert_encoding( $bytes, 'UTF-8', 'ISO-8859-1' ); + }; + // Pin the legacy '?' 
substitute per call (like the scrub oracle + // pins 0xFFFD) so ambient changes to the global cannot skew results. + $oracles->decode['mb'] = static function ( string $bytes ): string { + $previous = mb_substitute_character(); + mb_substitute_character( 0x3F ); + $decoded = mb_convert_encoding( $bytes, 'ISO-8859-1', 'UTF-8' ); + mb_substitute_character( $previous ); + return $decoded; + }; + } + + if ( function_exists( 'utf8_encode' ) && function_exists( 'utf8_decode' ) ) { + $oracles->encode['native'] = static function ( string $bytes ): string { + // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- Deprecated since PHP 8.2. + return (string) @utf8_encode( $bytes ); + }; + $oracles->decode['native'] = static function ( string $bytes ): string { + // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- Deprecated since PHP 8.2. + return (string) @utf8_decode( $bytes ); + }; + + $oracles->decode_valid_only['native'] = true; + } else { + $oracles->events[] = array( + 'type' => 'oracle-unavailable', + 'oracle' => 'native', + 'detail' => 'utf8_encode()/utf8_decode() removed (PHP 9+); legacy encode/decode differential skipped', + ); + } + // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged if ( false !== @preg_match( '/^./u', 'a' ) ) { $oracles->validity['pcre'] = static function ( string $bytes ): bool { @@ -148,6 +219,72 @@ public static function battery(): array { ); } + /** + * Known-answer vectors for the ISO-8859-1 → UTF-8 encode oracles. + * + * Every byte 0x00–0xFF is a defined ISO-8859-1 code point whose UTF-8 + * form is hand-computable: identity below 0x80, the two-byte sequence + * `C2|C3 80..BF` above. + * + * @return array [latin1 bytes, utf8 bytes] + */ + public static function encode_battery(): array { + return array( + array( '', '' ), + array( 'abc', 'abc' ), + array( "\x00", "\x00" ), + array( "\x7F", "\x7F" ), + array( "\x80", "\xC2\x80" ), // First two-byte mapping. 
+ array( "\x9F", "\xC2\x9F" ), // NOT a Windows-1252 smart quote. + array( "\xA0", "\xC2\xA0" ), + array( "\xBF", "\xC2\xBF" ), // Last byte with C2 lead. + array( "\xC0", "\xC3\x80" ), // First byte with C3 lead. + array( "\xFF", "\xC3\xBF" ), + array( "B\xFCch", "B\xC3\xBCch" ), + array( "\xC3\xBC", "\xC3\x83\xC2\xBC" ), // Already-UTF-8 input double-encodes. + ); + } + + /** + * Known-answer vectors for the UTF-8 → ISO-8859-1 decode oracles. + * + * Hand-computed: code points U+00–U+FF map to their byte, anything + * higher becomes '?', and each maximal subpart of an ill-formed span + * becomes one '?'. The valid flag marks vectors safe for decode + * oracles that are trusted on valid input only (legacy `utf8_decode()` + * groups some ill-formed sequences into a single '?' unit; see the + * class docblock). + * + * @return array [utf8 bytes, valid, latin1 bytes] + */ + public static function decode_battery(): array { + return array( + array( '', true, '' ), + array( 'abc', true, 'abc' ), + array( "\x00", true, "\x00" ), + array( "\xC2\x80", true, "\x80" ), // U+0080, first two-byte mapping. + array( "\xC3\xBC", true, "\xFC" ), // U+00FC ü. + array( "\xC3\xBF", true, "\xFF" ), // U+00FF, last mappable. + array( "\xC4\x80", true, '?' ), // U+0100, first unmappable. + array( "\xE2\x9C\x8F", true, '?' ), // U+270F. + array( "\xF0\x9F\x98\x80", true, '?' ), // U+1F600. + array( "\xEF\xBB\xBF", true, '?' ), // BOM is unmappable, not dropped. + array( "a\xC3\xA9b", true, "a\xE9b" ), + array( "\x80", false, '?' ), // Lone continuation. + array( "\xC0", false, '?' ), // Never-valid lead. + array( "\xC0\xAF", false, '??' ), // Overlong '/': two subparts, NOT '/'. + array( "\xE2\x8C", false, '?' ), // Two-byte maximal subpart at EOF. + array( "\xE2\x8Cx", false, '?x' ), // Subpart cut short by ASCII. + array( "\xF1\x80\x80", false, '?' ), // Three-byte maximal subpart. + array( "\xED\xA0\x80", false, '???' ), // Surrogate: per subpart (legacy native says '?'). 
+ array( "\xF4\x90\x80\x80", false, '????' ), // Past U+10FFFF (legacy native says '?'). + array( ".\xC0.", false, '.?.' ), + array( "\xC3\xBC\x80", false, "\xFC?" ), // Invalid span right after a mappable high byte. + array( "\x80\xC3\xBC", false, "?\xFC" ), // Mappable high byte right after an invalid span. + array( "a\xF1\x80\x80\xE1\x80\xC2b", false, 'a???b' ), // Unicode Table 3-8. + ); + } + private function verify_battery(): void { foreach ( self::battery() as $i => $vector ) { list( $bytes, $expected_valid, $expected_scrub ) = $vector; @@ -178,14 +315,69 @@ private function verify_battery(): void { } } } + + foreach ( self::encode_battery() as $i => $vector ) { + list( $bytes, $expected ) = $vector; + + foreach ( $this->encode as $name => $check ) { + $got = $check( $bytes ); + if ( $got !== $expected ) { + $this->disable( $name, sprintf( + 'encode battery vector %d (%s): expected %s, got %s', + $i, + bin2hex( $bytes ), + bin2hex( $expected ), + null === $got ? 'null' : bin2hex( $got ) + ) ); + } + } + } + + foreach ( self::decode_battery() as $i => $vector ) { + list( $bytes, $input_valid, $expected ) = $vector; + + foreach ( $this->decode as $name => $check ) { + if ( ! $input_valid && $this->decode_oracle_is_valid_only( $name ) ) { + continue; + } + + $got = $check( $bytes ); + if ( $got !== $expected ) { + $this->disable( $name, sprintf( + 'decode battery vector %d (%s): expected %s, got %s', + $i, + bin2hex( $bytes ), + bin2hex( $expected ), + null === $got ? 'null' : bin2hex( $got ) + ) ); + } + } + } } + /** + * Removes every role a named oracle backs. Note that disabling `mb` + * therefore makes `has_required()` false and the harness refuses to + * run — failing closed is preferable to fuzzing without the primary + * oracle. + */ public function disable( string $name, string $detail ): void { - if ( ! isset( $this->validity[ $name ] ) && ! isset( $this->scrub[ $name ] ) ) { + if ( + ! isset( $this->validity[ $name ] ) && + ! 
isset( $this->scrub[ $name ] ) && + ! isset( $this->encode[ $name ] ) && + ! isset( $this->decode[ $name ] ) + ) { return; } - unset( $this->validity[ $name ], $this->scrub[ $name ] ); + unset( + $this->validity[ $name ], + $this->scrub[ $name ], + $this->encode[ $name ], + $this->decode[ $name ], + $this->decode_valid_only[ $name ] + ); $this->events[] = array( 'type' => 'oracle-disabled', 'oracle' => $name, @@ -203,12 +395,31 @@ public function scrub_oracles(): array { return $this->scrub; } + /** @return array */ + public function encode_oracles(): array { + return $this->encode; + } + + /** @return array */ + public function decode_oracles(): array { + return $this->decode; + } + + public function decode_oracle_is_valid_only( string $name ): bool { + return $this->decode_valid_only[ $name ] ?? false; + } + public function has_required(): bool { return isset( $this->validity['mb'], $this->scrub['mb'] ); } public function names(): array { - return array_values( array_unique( array_merge( array_keys( $this->validity ), array_keys( $this->scrub ) ) ) ); + return array_values( array_unique( array_merge( + array_keys( $this->validity ), + array_keys( $this->scrub ), + array_keys( $this->encode ), + array_keys( $this->decode ) + ) ) ); } /** @return array */ diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php index f810d9d934eda..6c4e3412e5776 100644 --- a/tools/encoding-fuzz/lib/Targets.php +++ b/tools/encoding-fuzz/lib/Targets.php @@ -9,8 +9,10 @@ * be exercised end to end even while the real implementations are * healthy. 
It exists only for harness validation: * - * ENCODING_FUZZ_FAULT=accept-c0 validator accepts the 0xC0 byte - * ENCODING_FUZZ_FAULT=non-maximal scrubber collapses adjacent U+FFFD + * ENCODING_FUZZ_FAULT=accept-c0 validator accepts the 0xC0 byte + * ENCODING_FUZZ_FAULT=non-maximal scrubber collapses adjacent U+FFFD + * ENCODING_FUZZ_FAULT=encode-cp1252 encoder maps 0x80 like Windows-1252 + * ENCODING_FUZZ_FAULT=decode-per-byte decoder emits '?' per invalid byte */ class Targets { /** @@ -23,6 +25,8 @@ public static function resolve(): array { 'scrub' => 'wp_scrub_utf8', 'scrub_fb' => '_wp_scrub_utf8_fallback', 'codepoint_count' => '_wp_utf8_codepoint_count', + 'utf8_encode_fb' => '_wp_utf8_encode_fallback', + 'utf8_decode_fb' => '_wp_utf8_decode_fallback', ); switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) { @@ -37,8 +41,47 @@ public static function resolve(): array { return (string) preg_replace( "/(\u{FFFD})+/u", "\u{FFFD}", _wp_scrub_utf8_fallback( $bytes ) ); }; break; + + case 'encode-cp1252': + // 0x80 is U+0080 in ISO-8859-1 but '€' in Windows-1252; a + // classic confusion of the two encodings. + $targets['utf8_encode_fb'] = static function ( string $bytes ): string { + return str_replace( "\xC2\x80", "\xE2\x82\xAC", _wp_utf8_encode_fallback( $bytes ) ); + }; + break; + + case 'decode-per-byte': + $targets['utf8_decode_fb'] = self::decode_per_invalid_byte( ... ); + break; } return $targets; } + + /** + * Deliberately broken decoder: emits one '?' for every byte of an + * invalid span instead of one per maximal subpart, so multi-byte + * subparts like `E2 8C` produce '??' instead of '?'. 
+ */ + public static function decode_per_invalid_byte( string $bytes ): string { + $at = 0; + $was_at = 0; + $invalid_length = 0; + $end = strlen( $bytes ); + $out = ''; + + while ( $at < $end ) { + _wp_scan_utf8( $bytes, $at, $invalid_length ); + $out .= _wp_utf8_decode_fallback( substr( $bytes, $was_at, $at - $was_at ) ); + + if ( $invalid_length > 0 ) { + $out .= str_repeat( '?', $invalid_length ); + $at += $invalid_length; + } + + $was_at = $at; + } + + return $out; + } } diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php index 9d64e0bb84294..59b89327be408 100644 --- a/tools/encoding-fuzz/tests/harness-smoke.php +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -53,10 +53,15 @@ function check( string $label, bool $ok, string $detail = '' ): void { // --------------------------------------------------------------------- // 2. Real targets pass every check on the battery vectors. // --------------------------------------------------------------------- -$checks = new Checks( $oracles ); -$battery_fails = array(); -foreach ( Oracles::battery() as $i => $vector ) { - foreach ( $checks->run( $vector[0] ) as $failure ) { +$checks = new Checks( $oracles ); +$battery_fails = array(); +$battery_vectors = array_merge( + array_column( Oracles::battery(), 0 ), + array_column( Oracles::encode_battery(), 0 ), + array_column( Oracles::decode_battery(), 0 ) +); +foreach ( $battery_vectors as $i => $bytes ) { + foreach ( $checks->run( $bytes ) as $failure ) { $battery_fails[] = "vector {$i}: {$failure['signature']}"; } } @@ -71,18 +76,21 @@ function check( string $label, bool $ok, string $detail = '' ): void { 'scrub' => 'wp_scrub_utf8', 'scrub_fb' => '_wp_scrub_utf8_fallback', 'codepoint_count' => '_wp_utf8_codepoint_count', + 'utf8_encode_fb' => '_wp_utf8_encode_fallback', + 'utf8_decode_fb' => '_wp_utf8_decode_fallback', ); /** - * Runs the battery against a broken variant and reports which checks fired. 
+ * Runs every battery vector against a broken variant and reports which + * checks fired. * * @return string[] Distinct check names observed. */ -function broken_run( Oracles $oracles, array $real, array $overrides ): array { +function broken_run( Oracles $oracles, array $real, array $vectors, array $overrides ): array { $checks = new Checks( $oracles, array_merge( $real, $overrides ) ); $seen = array(); - foreach ( Oracles::battery() as $vector ) { - foreach ( $checks->run( $vector[0] ) as $failure ) { + foreach ( $vectors as $bytes ) { + foreach ( $checks->run( $bytes ) as $failure ) { $seen[ $failure['check'] ] = true; } } @@ -90,26 +98,26 @@ function broken_run( Oracles $oracles, array $real, array $overrides ): array { } // 3a. Validator that wrongly accepts a never-valid byte. -$seen = broken_run( $oracles, $real_targets, array( +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'is_valid_fb' => static fn( string $bytes ): bool => str_contains( $bytes, "\xC0" ) ? true : _wp_is_valid_utf8_fallback( $bytes ), ) ); check( 'catches validator accepting 0xC0', in_array( 'validity-mismatch', $seen, true ), implode( ',', $seen ) ); // 3b. Validator that wrongly rejects noncharacters (a plausible spec misreading). -$seen = broken_run( $oracles, $real_targets, array( +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'is_valid' => static fn( string $bytes ): bool => wp_is_valid_utf8( $bytes ) && ! wp_has_noncharacters( $bytes ), ) ); check( 'catches validator rejecting noncharacters', in_array( 'validity-mismatch', $seen, true ), implode( ',', $seen ) ); // 3c. Scrubber that collapses adjacent replacement characters (one-FFFD-per-run // instead of one per maximal subpart). 
-$seen = broken_run( $oracles, $real_targets, array( +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'scrub_fb' => static fn( string $bytes ): string => (string) preg_replace( "/(\u{FFFD})+/u", "\u{FFFD}", _wp_scrub_utf8_fallback( $bytes ) ), ) ); check( 'catches non-maximal-subpart scrubber', in_array( 'scrub-mismatch', $seen, true ), implode( ',', $seen ) ); // 3d. Scrubber that passes invalid bytes through untouched. -$seen = broken_run( $oracles, $real_targets, array( +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'scrub_fb' => static fn( string $bytes ): string => $bytes, ) ); check( @@ -119,25 +127,76 @@ function broken_run( Oracles $oracles, array $real, array $overrides ): array { ); // 3e. Scrubber that drops invalid bytes instead of replacing them. -$seen = broken_run( $oracles, $real_targets, array( +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'scrub' => static fn( string $bytes ): string => str_replace( "\u{FFFD}", '', wp_scrub_utf8( $bytes ) ), ) ); check( 'catches byte-dropping scrubber', in_array( 'scrub-mismatch', $seen, true ), implode( ',', $seen ) ); // 3f. Code point counter that counts invalid bytes individually. -$seen = broken_run( $oracles, $real_targets, array( +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'codepoint_count' => static fn( string $bytes ): int => _wp_utf8_codepoint_count( $bytes ) + ( wp_is_valid_utf8( $bytes ) ? 0 : 1 ), ) ); check( 'catches off-by-one code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) ); // 3g. Throwing target is reported, not fatal. -$seen = broken_run( $oracles, $real_targets, array( +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'is_valid_fb' => static function ( string $bytes ): bool { throw new \RuntimeException( 'boom' ); }, ) ); check( 'reports throwing target', in_array( 'target-exception', $seen, true ), implode( ',', $seen ) ); +// 3h. 
Encoder that confuses ISO-8859-1 with Windows-1252 (0x80 becomes '€'). +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'utf8_encode_fb' => static fn( string $bytes ): string => str_replace( "\xC2\x80", "\xE2\x82\xAC", _wp_utf8_encode_fallback( $bytes ) ), +) ); +check( 'catches cp1252-confused encoder', in_array( 'utf8-encode-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3i. Encoder that passes high bytes through raw (invalid UTF-8 output). +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'utf8_encode_fb' => static fn( string $bytes ): string => $bytes, +) ); +check( + 'catches identity encoder', + in_array( 'utf8-encode-mismatch', $seen, true ) && in_array( 'utf8-encode-not-valid', $seen, true ), + implode( ',', $seen ) +); + +// 3j. Decoder that emits one '?' per invalid byte instead of per maximal +// subpart (`E2 8C` becomes '??' instead of '?'). +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'utf8_decode_fb' => Targets::decode_per_invalid_byte( ... ), +) ); +check( 'catches per-byte decoder', in_array( 'utf8-decode-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3k. Decoder that mangles a mappable code point on fully valid input. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'utf8_decode_fb' => static fn( string $bytes ): string => str_replace( "\xFC", "\xFD", _wp_utf8_decode_fallback( $bytes ) ), +) ); +check( 'catches decoder mangling valid input', in_array( 'utf8-decode-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3l. Decoder that drops U+0080 entirely; the encode→decode round trip +// must restore every input byte string exactly. 
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'utf8_decode_fb' => static fn( string $bytes ): string => str_replace( "\x80", '', _wp_utf8_decode_fallback( $bytes ) ), +) ); +check( 'catches round-trip violation', in_array( 'utf8-round-trip-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3m. Encoder that returns null (the fallbacks are untyped, so a broken +// variant can return non-strings without throwing); must be reported, +// not silently skipped by every encode-side check. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'utf8_encode_fb' => static fn( string $bytes ) => null, +) ); +check( 'catches null-returning encoder', in_array( 'target-bad-return', $seen, true ), implode( ',', $seen ) ); + +// 3n. Decoder that returns null only for some inputs; must be reported +// from both the direct call and the round-trip path without crashing. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'utf8_decode_fb' => static fn( string $bytes ) => str_contains( $bytes, "\x80" ) ? null : _wp_utf8_decode_fallback( $bytes ), +) ); +check( 'catches sometimes-null decoder', in_array( 'target-bad-return', $seen, true ), implode( ',', $seen ) ); + // --------------------------------------------------------------------- // 4. Generator determinism and mix. // --------------------------------------------------------------------- From 73653ef12338b2553213127b36ae03fde2b241aa Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 22:57:59 +0200 Subject: [PATCH 04/14] Fuzzer: Add wp_has_noncharacters three-way differential on valid input. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fuzz wp_has_noncharacters() (PCRE branch) and _wp_has_noncharacters_fallback() against a trivial mb_str_split/mb_ord reference oracle, on valid input only. 
On ill-formed input the public function's answer depends on which environment branch of utf8.php loaded — the PCRE branch returns false whenever preg_match fails while the fallback skips invalid spans and reports noncharacters around them ("\xC0\xEF\xBF\xBE": PCRE false, fallback true). Per the handoff's option (a), the fuzzer treats behavior as undefined unless wp_is_valid_utf8() returns true for the input, and pins the divergence with a fixed regression vector in the smoke test; whether core aligns the implementations or documents the stance remains an open question for the function author. The reference oracle's battery covers the boundaries and interior of the U+FDD0–U+FDEF block and the final two code points of EVERY plane with their lower neighbors — the PCRE implementation enumerates each plane as a separate hand-typed escape, so a single-plane typo is the realistic bug class and now has deterministic coverage. The oracle throws on ill-formed input rather than silently coercing a false return value from mb_ord(). BOUNDARY_CODE_POINTS gains block-interior, adjacent-negative, and mid-plane code points (seed re-derivation of older findings is invalidated; documented in the README — artifact replays are unaffected). Mutation variants: blind detector, U+FDD0-block miss, over-eager detector (the latter two are shared between the smoke test and the new ENCODING_FUZZ_FAULT=nonchars-miss-fdd0|nonchars-overeager fault modes, one per target; both verified through worker, replay, and minimize). Worker environment metadata now records pcre_u (which utf8.php branch loaded) and the active fault name so injected artifacts can never be mistaken for real findings.
--- handoffs/extend-encoding-fuzzer.md | 47 ++++---- tools/encoding-fuzz/README.md | 64 ++++++++--- tools/encoding-fuzz/lib/Checks.php | 95 ++++++++++++++++ tools/encoding-fuzz/lib/Cli.php | 5 + tools/encoding-fuzz/lib/Generator.php | 6 +- tools/encoding-fuzz/lib/Oracles.php | 113 +++++++++++++++++++- tools/encoding-fuzz/lib/Targets.php | 38 ++++++- tools/encoding-fuzz/tests/harness-smoke.php | 47 +++++++- 8 files changed, 371 insertions(+), 44 deletions(-) diff --git a/handoffs/extend-encoding-fuzzer.md b/handoffs/extend-encoding-fuzzer.md index 7fdd0f6797841..29e7effe5ef82 100644 --- a/handoffs/extend-encoding-fuzzer.md +++ b/handoffs/extend-encoding-fuzzer.md @@ -2,7 +2,8 @@ ## Status -Sections 1 (utf8_encode/decode) DONE; sections 2–3 in progress. The +Sections 1 (utf8_encode/decode) and 2 (wp_has_noncharacters) DONE; +section 3 in progress. The host fuzzer (`tools/encoding-fuzz/`) is complete and working on branch `fuzz-encoder`; read its `README.md` first. ~570k cases had run clean against the original targets before this work started. @@ -56,7 +57,7 @@ backslash text, and the `$i < 0xD800 || $i > 0xE000` boundary routes valid U+E000 through the broken branch. It only ever asserts mb-equivalence on valid input. Worth a follow-up patch on #63863. -## 2. wp_has_noncharacters — resolve semantics first +## 2. wp_has_noncharacters — DONE via option (a); core decision still open **Known divergence, confirmed empirically (2026-06-10):** @@ -66,24 +67,30 @@ wp_has_noncharacters( $probe ); // false — PCRE path: preg_match f _wp_has_noncharacters_fallback( $probe ); // true — scan skips invalid spans, finds U+FFFE ``` -The same public function answers differently depending on which -environment branch of `src/wp-includes/utf8.php` loaded. A naive -differential will fail on roughly its first invalid-input case. Do NOT -just add the check and let it scream: - -1. Decide (or get a decision on) intended behavior for ill-formed - input. 
Options: (a) document that behavior is undefined unless - `wp_is_valid_utf8()` — then fuzz the differential on valid inputs - only, plus a fixed regression vector for the documented stance; - (b) align the implementations (likely the fallback is the *better* - semantic — finding real noncharacters — but the PCRE version ships - on most hosts). This probably warrants a Trac ticket / discussion - with the function author before code changes. -2. Either way, fuzz the three-way differential on **valid** inputs - immediately: PCRE implementation vs fallback vs a trivial reference - (decode code points, check the U+FDD0–U+FDEF / U+xFFFE / U+xFFFF - list). The generator already emits noncharacter-dense input - (`BOUNDARY_CODE_POINTS` in `lib/Generator.php`). +**Implemented as option (a):** the fuzzer treats behavior as undefined +unless `wp_is_valid_utf8()` and runs the three-way differential — +`wp_has_noncharacters()` (PCRE branch) vs +`_wp_has_noncharacters_fallback()` vs a trivial `mb_str_split()` / +`mb_ord()` reference (battery-verified at block boundaries, block +interior, and the final two code points of every plane with their +neighbors — the PCRE class enumerates each plane by hand, so per-plane +vectors are the point) — on **valid inputs only**. The probe above is +pinned as a fixed regression vector in the smoke test, so any semantic +change to either branch surfaces immediately. `BOUNDARY_CODE_POINTS` +in `lib/Generator.php` gained adjacent NON-noncharacters, a block +interior point, and mid-plane finals. Mutation variants: blind +detector, U+FDD0-block miss, over-eager detector; fault injection: +`ENCODING_FUZZ_FAULT=nonchars-miss-fdd0|nonchars-overeager` (one per +target). + +**Still open upstream (option b path):** whether core should align the +implementations or document the undefined-on-invalid stance in the +`wp_has_noncharacters()` docblock. That needs a decision from the +function author (Trac discussion). 
Note for whoever picks that up: if +core aligns on PCRE semantics (false on any ill-formed input), the mb +reference oracle and its battery must be extended for ill-formed input +too — removing the valid-only gate alone is NOT sufficient, since the +reference throws on ill-formed input by design. ## 3. code_point_to_utf8_bytes — exhaust, don't fuzz diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index 80ac3da42df19..23b9101ca51f4 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -5,6 +5,7 @@ Differential fuzzer for the WordPress UTF-8 functions: - `wp_is_valid_utf8()` / `_wp_is_valid_utf8_fallback()` - `wp_scrub_utf8()` / `_wp_scrub_utf8_fallback()` - `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()` +- `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` (valid input only) - `_wp_utf8_codepoint_count()` and the resumable `_wp_scan_utf8()` paths (secondary) The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main @@ -16,14 +17,14 @@ bootstrap, database, or `wp-env`. 
Every result is compared against independent known-good implementations: -| Oracle | Backing | Validity | Scrub | Encode | Decode | -|-----------|--------------------------------------|----------|-------|--------|--------| -| `mb` | `mb_check_encoding()` / `mb_scrub()` / `mb_convert_encoding()` | ✓ | ✓ (primary) | ✓ (primary) | ✓ (primary) | -| `pcre` | PCRE2 strict UTF validation | ✓ | | | | -| `intl` | ICU `UConverter::transcode()` | | ✓ | | | -| `python3` | CPython codec, persistent subprocess | ✓ | ✓ | | | -| `node` | WHATWG `TextDecoder`, persistent subprocess | ✓ | ✓ | | | -| `native` | deprecated `utf8_encode()` / `utf8_decode()` | | | ✓ | ✓ (valid input only) | +| Oracle | Backing | Validity | Scrub | Encode | Decode | Nonchars | +|-----------|--------------------------------------|----------|-------|--------|--------|----------| +| `mb` | `mb_check_encoding()` / `mb_scrub()` / `mb_convert_encoding()` / `mb_str_split()`+`mb_ord()` | ✓ | ✓ (primary) | ✓ (primary) | ✓ (primary) | ✓ (valid input only) | +| `pcre` | PCRE2 strict UTF validation | ✓ | | | | | +| `intl` | ICU `UConverter::transcode()` | | ✓ | | | | +| `python3` | CPython codec, persistent subprocess | ✓ | ✓ | | | | +| `node` | WHATWG `TextDecoder`, persistent subprocess | ✓ | ✓ | | | | +| `native` | deprecated `utf8_encode()` / `utf8_decode()` | | | ✓ | ✓ (valid input only) | | Encode oracles answer "what is this ISO-8859-1 text as UTF-8?"; decode oracles the reverse. The `native` pair exists until PHP 9 removes it; on @@ -41,6 +42,15 @@ the PHP 9 polyfill in `compat.php` prefers `mb_convert_encoding()` with `_wp_utf8_decode_fallback()` as its mbstring-less shadow (ticket #63863). +The `mb` noncharacter oracle (a trivial decode-and-test over +`mb_str_split()` / `mb_ord()`) backs the `wp_has_noncharacters()` +differential. 
Like every oracle it must pass a hand-derived battery, +which covers the boundaries and interior of the U+FDD0–U+FDEF block +and the final two code points of every plane with their neighbors — +the PCRE implementation under test enumerates each plane as a separate +hand-typed escape, so per-plane coverage is the point. It is defined +on valid input only — see the noncharacter policy under Checks. + Because native and mb decoding agree on *every* valid code point (verified exhaustively over U+0000–U+10FFFF), the valid-input-only native decode differential adds little detection power beyond `mb`; it @@ -72,6 +82,23 @@ decode oracle on valid input only). Oracle-vs-oracle disagreements are reported separately (`oracle-disagreement`) so they don't masquerade as WordPress bugs. +Noncharacter detection is a three-way differential on **valid input +only**: `wp_has_noncharacters()` (the PCRE branch on hosts with +PCRE-u; without PCRE-u the public function aliases the fallback and +the differential degenerates to two distinct implementations — the +worker records which branch loaded as `pcre_u` in its environment +metadata), `_wp_has_noncharacters_fallback()`, and the trivial mb +reference must agree. On ill-formed input the public function's answer +depends on which environment branch of `utf8.php` loaded — the PCRE +branch returns false for any ill-formed input because `preg_match` +fails, while the fallback skips invalid spans and reports the +noncharacters around them (`"\xC0\xEF\xBF\xBE"`: PCRE false, fallback +true). The fuzzer's stance is that behavior is undefined unless +`wp_is_valid_utf8()`; the divergence itself is pinned by a fixed +regression vector in the smoke test, and aligning the implementations +(or documenting the stance in core) is an open upstream question for +the function author. + Internal invariants: - valid ⟺ scrub returns the input unchanged @@ -88,7 +115,11 @@ Internal invariants: ## Inputs -Each case is fully determined by `(seed, case index)`. 
The generator +Each case is fully determined by `(seed, case index)` **for a given +generator version**: changing the generator (e.g. its boundary code +point list) invalidates `--seed`/`--case` re-derivation of older +findings. Failure artifacts embed the input bytes, so `--failure` and +`--input` replays remain valid across versions. The generator mixes nine strategies: uniformly random bytes, random ASCII, boundary-heavy valid UTF-8 (encoding-length edges, surrogate-gap edges, noncharacters, BOM, U+10FFFF), mutated valid UTF-8 (bit flips, @@ -154,18 +185,23 @@ php tools/encoding-fuzz/tests/harness-smoke.php ``` Verifies the oracle battery, runs the real targets over the battery -vectors, and — most importantly — mutation-tests the harness: fourteen +vectors, and — most importantly — mutation-tests the harness: seventeen classes of deliberately broken implementations (validator accepting 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber, identity scrubber, byte-dropping scrubber, off-by-one code point count, throwing target, cp1252-confused encoder, identity encoder, per-byte decoder, valid-input-mangling decoder, round-trip-violating decoder, -null-returning encoder, sometimes-null decoder) must all be caught. It -also asserts generator determinism and the valid/invalid input mix. +null-returning encoder, sometimes-null decoder, blind noncharacter +detector, U+FDD0-block-missing detector, over-eager noncharacter +detector) must all be caught. It also asserts generator determinism, +the valid/invalid input mix, and the documented +`wp_has_noncharacters()` divergence stance on ill-formed input. 
For end-to-end pipeline testing while the real implementations are -healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte` -injects a broken target into worker, replay, and minimize alike: +healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager` +injects a broken target into worker, replay, and minimize alike. +Fault-injected artifacts record the fault name in their environment +metadata so they cannot be mistaken for real findings: ```sh ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/runner.php --lanes 2 --duration-seconds 5 diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php index cf6d7c8be2d7b..14bffa042acdd 100644 --- a/tools/encoding-fuzz/lib/Checks.php +++ b/tools/encoding-fuzz/lib/Checks.php @@ -20,6 +20,12 @@ * chunks reconstructs the same scrubbed text and always makes * forward progress * + * Noncharacter detection (VALID input only — the public function's + * answer on ill-formed input depends on which environment branch of + * `utf8.php` loaded, a documented divergence pinned by the smoke test): + * - `wp_has_noncharacters()` and `_wp_has_noncharacters_fallback()` vs + * a trivial decode-and-test reference. + * * Legacy `utf8_encode()` / `utf8_decode()` fallbacks: * - `_wp_utf8_encode_fallback()` vs every encode oracle on arbitrary * input treated as ISO-8859-1. @@ -253,6 +259,95 @@ public function run( string $input ): array { $failures[] = $failure; } + // 9. Noncharacter detection, on valid input only. 
+ foreach ( $this->check_noncharacters( $input, $ref_valid ) as $failure ) { + $failures[] = $failure; + } + + return $failures; + } + + /** + * Three-way differential for noncharacter detection on VALID input: + * the public `wp_has_noncharacters()` (the PCRE branch on hosts with + * PCRE-u; otherwise it aliases the fallback and this degenerates to + * two distinct implementations), the `_wp_scan_utf8()`-based + * fallback, and the trivial mb reference must all agree. + * + * Ill-formed input is deliberately skipped: the PCRE branch answers + * false on any ill-formed input (`preg_match` fails) while the + * fallback skips invalid spans and reports noncharacters around + * them, so the same public function answers differently depending + * on which environment branch loaded. That stance — behavior is + * undefined unless `wp_is_valid_utf8()` — is pinned by a fixed + * regression vector in the smoke test, not fuzzed. + * + * @return array + */ + private function check_noncharacters( string $input, bool $ref_valid ): array { + if ( ! $ref_valid ) { + return array(); + } + + $oracles = $this->oracles->noncharacter_oracles(); + if ( ! 
isset( $oracles['mb'] ) ) { + return array(); + } + + $failures = array(); + $expected = $oracles['mb']( $input ); + + foreach ( $oracles as $name => $oracle ) { + if ( 'mb' === $name ) { + continue; + } + + $oracle_result = $oracle( $input ); + if ( $oracle_result !== $expected ) { + $failures[] = self::failure( + 'oracle-disagreement', + "noncharacters:{$name}", + array( + 'kind' => 'noncharacters', + 'oracle' => $name, + 'got' => $oracle_result, + 'expected' => $expected, + ) + ); + } + } + + foreach ( array( 'has_nonchars', 'has_nonchars_fb' ) as $key ) { + try { + $result = ( $this->targets[ $key ] )( $input ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + $key, + array( + 'target' => $key, + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + continue; + } + + if ( $result !== $expected ) { + $failures[] = self::failure( + 'noncharacters-mismatch', + $key, + array( + 'target' => $key, + 'got' => $result, + 'expected' => $expected, + 'oracle' => 'mb', + 'input_preview' => self::preview( $input ), + ) + ); + } + } + return $failures; } diff --git a/tools/encoding-fuzz/lib/Cli.php b/tools/encoding-fuzz/lib/Cli.php index 14c5d4f671324..3ddd47679b5ab 100644 --- a/tools/encoding-fuzz/lib/Cli.php +++ b/tools/encoding-fuzz/lib/Cli.php @@ -107,6 +107,11 @@ public static function environment_metadata( Oracles $oracles ): array { 'php' => PHP_VERSION, 'os' => PHP_OS_FAMILY, 'oracles' => $oracles->names(), + // Which environment branch of utf8.php loaded (PCRE vs fallback). + 'pcre_u' => function_exists( '_wp_can_use_pcre_u' ) ? _wp_can_use_pcre_u() : null, + // Mark fault-injected artifacts so they can never be mistaken + // for real findings. 
+ 'fault' => getenv( 'ENCODING_FUZZ_FAULT' ) ?: null, ); } } diff --git a/tools/encoding-fuzz/lib/Generator.php b/tools/encoding-fuzz/lib/Generator.php index eb07d7d89183c..bcd4a935d5b31 100644 --- a/tools/encoding-fuzz/lib/Generator.php +++ b/tools/encoding-fuzz/lib/Generator.php @@ -25,9 +25,11 @@ class Generator { 0x80, 0x7FF, // Two-byte edges. 0x800, 0xFFF, 0x1000, 0xCFFF, 0xD000, 0xD7FF, // Three-byte lead splits. 0xE000, 0xFFFD, // After the surrogate gap. - 0xFDD0, 0xFDEF, 0xFFFE, 0xFFFF, // Noncharacters (valid UTF-8!). + 0xFDD0, 0xFDDA, 0xFDEF, 0xFFFE, 0xFFFF, // Noncharacters (valid UTF-8!), incl. block interior. + 0xFDCF, 0xFDF0, // Adjacent NON-noncharacters. 0x10000, 0x3FFFF, 0x40000, 0xFFFFF, 0x100000, 0x10FFFF, // Four-byte lead splits. - 0x1FFFE, 0x1FFFF, 0x10FFFE, // Supplementary noncharacters. + 0x1FFFD, 0x1FFFE, 0x1FFFF, 0x5FFFE, 0x8FFFF, 0x10FFFE, // Supplementary noncharacters, mid planes, neighbors. + 0x10FFFD, ); /** diff --git a/tools/encoding-fuzz/lib/Oracles.php b/tools/encoding-fuzz/lib/Oracles.php index c3c4b0900947f..88a5df3b1ae41 100644 --- a/tools/encoding-fuzz/lib/Oracles.php +++ b/tools/encoding-fuzz/lib/Oracles.php @@ -8,6 +8,9 @@ * Scrub oracles answer "what does maximal-subpart replacement produce?". * Encode oracles answer "what is this ISO-8859-1 text as UTF-8?". * Decode oracles answer "what is this UTF-8 text as ISO-8859-1?". + * Noncharacter oracles answer "does this VALID UTF-8 text contain a + * Unicode noncharacter?" (U+FDD0–U+FDEF, or any code point whose low + * sixteen bits are FFFE or FFFF). They are defined on valid input only. * * - mbstring: `mb_check_encoding()` / `mb_scrub()` (maximal subpart * since PHP 8.1.6), `mb_convert_encoding()` for the @@ -63,6 +66,9 @@ class Oracles { /** @var array Decode oracles trusted on valid UTF-8 input only. */ private array $decode_valid_only = array(); + /** @var array Defined on valid UTF-8 input only. 
*/ + private array $noncharacters = array(); + /** @var ExternalOracle[] */ private array $externals = array(); @@ -110,6 +116,36 @@ public static function build( array $external_names ): self { }; } + if ( function_exists( 'mb_str_split' ) && function_exists( 'mb_ord' ) ) { + /* + * Trivial decode-and-test reference for noncharacter detection, + * independent of both implementations under test (the PCRE + * character-class regex and the `_wp_scan_utf8()`-based scan). + * Callers must pass valid UTF-8. + */ + $oracles->noncharacters['mb'] = static function ( string $valid_utf8 ): bool { + foreach ( mb_str_split( $valid_utf8, 1, 'UTF-8' ) as $character ) { + $code_point = mb_ord( $character, 'UTF-8' ); + + // Fail loudly on contract violations: on ill-formed + // input `mb_ord()` returns false, which would otherwise + // coerce into "not a noncharacter" and silently mimic + // the fallback's skip-invalid-spans semantics. + if ( ! is_int( $code_point ) ) { + throw new \LogicException( 'noncharacter oracle requires valid UTF-8 input' ); + } + + if ( + ( $code_point >= 0xFDD0 && $code_point <= 0xFDEF ) || + 0xFFFE === ( $code_point & 0xFFFE ) + ) { + return true; + } + } + return false; + }; + } + if ( function_exists( 'utf8_encode' ) && function_exists( 'utf8_decode' ) ) { $oracles->encode['native'] = static function ( string $bytes ): string { // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- Deprecated since PHP 8.2. @@ -285,6 +321,52 @@ public static function decode_battery(): array { ); } + /** + * Known-answer vectors for the noncharacter oracles. All inputs are + * valid UTF-8 (the question is only defined there) and cover the + * boundaries AND interior of the U+FDD0–U+FDEF block plus the final + * two code points of EVERY plane with their U+xFFFD neighbors — the + * PCRE implementation under test enumerates each plane as a separate + * hand-typed escape, exactly where a single-plane typo would hide. 
+ * + * Expectations are hand-derived from the Unicode definition; bytes + * for the looped vectors come from the pure-arithmetic + * `Generator::encode_code_point()` (itself exhaustively verified + * against `mb_chr()` by `tests/code-point-to-utf8-exhaustive.php`), + * keeping the encoding independent of the mbstring-backed oracle. + * + * @return array [valid utf8 bytes, has noncharacters] + */ + public static function noncharacter_battery(): array { + $vectors = array( + array( '', false ), + array( 'abc', false ), + array( "\u{FDCF}", false ), // Last code point before the contiguous block. + array( "\u{FDD0}", true ), // First of the contiguous block. + array( "\u{FDDA}", true ), // Interior of the block: a lookup-table bug + array( "\u{FDE5}", true ), // is not necessarily a boundary bug. + array( "\u{FDEF}", true ), // Last of the contiguous block. + array( "\u{FDF0}", false ), // First code point after the block. + array( "\u{FEFF}", false ), // BOM is not a noncharacter. + array( "\u{FFFD}", false ), // Replacement character is not a noncharacter. + array( "\u{ABCD}", false ), // Arbitrary interior scalar. + array( "a\u{FFFE}b", true ), // Embedded in surrounding text. + array( "ascii only", false ), + ); + + // Both plane-final noncharacters and their lower neighbor, for + // all seventeen planes (0–16). 
+ for ( $plane = 0; $plane <= 0x10; $plane++ ) { + $final = ( $plane << 16 ) | 0xFFFF; + + $vectors[] = array( Generator::encode_code_point( $final - 2 ), false ); + $vectors[] = array( Generator::encode_code_point( $final - 1 ), true ); + $vectors[] = array( Generator::encode_code_point( $final ), true ); + } + + return $vectors; + } + private function verify_battery(): void { foreach ( self::battery() as $i => $vector ) { list( $bytes, $expected_valid, $expected_scrub ) = $vector; @@ -333,6 +415,23 @@ private function verify_battery(): void { } } + foreach ( self::noncharacter_battery() as $i => $vector ) { + list( $bytes, $expected ) = $vector; + + foreach ( $this->noncharacters as $name => $check ) { + $got = $check( $bytes ); + if ( $got !== $expected ) { + $this->disable( $name, sprintf( + 'noncharacter battery vector %d (%s): expected %s, got %s', + $i, + bin2hex( $bytes ), + var_export( $expected, true ), + var_export( $got, true ) + ) ); + } + } + } + foreach ( self::decode_battery() as $i => $vector ) { list( $bytes, $input_valid, $expected ) = $vector; @@ -366,7 +465,8 @@ public function disable( string $name, string $detail ): void { ! isset( $this->validity[ $name ] ) && ! isset( $this->scrub[ $name ] ) && ! isset( $this->encode[ $name ] ) && - ! isset( $this->decode[ $name ] ) + ! isset( $this->decode[ $name ] ) && + ! isset( $this->noncharacters[ $name ] ) ) { return; } @@ -376,7 +476,8 @@ public function disable( string $name, string $detail ): void { $this->scrub[ $name ], $this->encode[ $name ], $this->decode[ $name ], - $this->decode_valid_only[ $name ] + $this->decode_valid_only[ $name ], + $this->noncharacters[ $name ] ); $this->events[] = array( 'type' => 'oracle-disabled', @@ -409,6 +510,11 @@ public function decode_oracle_is_valid_only( string $name ): bool { return $this->decode_valid_only[ $name ] ?? false; } + /** @return array Defined on valid UTF-8 input only. 
*/ + public function noncharacter_oracles(): array { + return $this->noncharacters; + } + public function has_required(): bool { return isset( $this->validity['mb'], $this->scrub['mb'] ); } @@ -418,7 +524,8 @@ public function names(): array { array_keys( $this->validity ), array_keys( $this->scrub ), array_keys( $this->encode ), - array_keys( $this->decode ) + array_keys( $this->decode ), + array_keys( $this->noncharacters ) ) ) ); } diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php index 6c4e3412e5776..19f6ce3d45bc9 100644 --- a/tools/encoding-fuzz/lib/Targets.php +++ b/tools/encoding-fuzz/lib/Targets.php @@ -9,10 +9,12 @@ * be exercised end to end even while the real implementations are * healthy. It exists only for harness validation: * - * ENCODING_FUZZ_FAULT=accept-c0 validator accepts the 0xC0 byte - * ENCODING_FUZZ_FAULT=non-maximal scrubber collapses adjacent U+FFFD - * ENCODING_FUZZ_FAULT=encode-cp1252 encoder maps 0x80 like Windows-1252 - * ENCODING_FUZZ_FAULT=decode-per-byte decoder emits '?' per invalid byte + * ENCODING_FUZZ_FAULT=accept-c0 validator accepts the 0xC0 byte + * ENCODING_FUZZ_FAULT=non-maximal scrubber collapses adjacent U+FFFD + * ENCODING_FUZZ_FAULT=encode-cp1252 encoder maps 0x80 like Windows-1252 + * ENCODING_FUZZ_FAULT=decode-per-byte decoder emits '?' 
per invalid byte + * ENCODING_FUZZ_FAULT=nonchars-miss-fdd0 fallback detector misses U+FDD0–U+FDEF + * ENCODING_FUZZ_FAULT=nonchars-overeager public detector also flags U+FDCF */ class Targets { /** @@ -27,6 +29,8 @@ public static function resolve(): array { 'codepoint_count' => '_wp_utf8_codepoint_count', 'utf8_encode_fb' => '_wp_utf8_encode_fallback', 'utf8_decode_fb' => '_wp_utf8_decode_fallback', + 'has_nonchars' => 'wp_has_noncharacters', + 'has_nonchars_fb' => '_wp_has_noncharacters_fallback', ); switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) { @@ -53,11 +57,37 @@ public static function resolve(): array { case 'decode-per-byte': $targets['utf8_decode_fb'] = self::decode_per_invalid_byte( ... ); break; + + case 'nonchars-miss-fdd0': + $targets['has_nonchars_fb'] = self::nonchars_missing_fdd0_block( ... ); + break; + + case 'nonchars-overeager': + $targets['has_nonchars'] = self::nonchars_overeager( ... ); + break; } return $targets; } + /** + * Deliberately broken detector: finds only the plane-final + * noncharacters, missing the contiguous U+FDD0–U+FDEF block — a + * plausible spec misreading. + */ + public static function nonchars_missing_fdd0_block( string $text ): bool { + $stripped = (string) preg_replace( '/[\x{FDD0}-\x{FDEF}]/u', '', $text ); + return _wp_has_noncharacters_fallback( $stripped ); + } + + /** + * Deliberately broken detector: also flags U+FDCF, the code point + * just below the contiguous noncharacter block. + */ + public static function nonchars_overeager( string $text ): bool { + return wp_has_noncharacters( $text ) || str_contains( $text, "\u{FDCF}" ); + } + /** * Deliberately broken decoder: emits one '?' 
for every byte of an * invalid span instead of one per maximal subpart, so multi-byte diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php index 59b89327be408..965c1743b03ae 100644 --- a/tools/encoding-fuzz/tests/harness-smoke.php +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -58,7 +58,8 @@ function check( string $label, bool $ok, string $detail = '' ): void { $battery_vectors = array_merge( array_column( Oracles::battery(), 0 ), array_column( Oracles::encode_battery(), 0 ), - array_column( Oracles::decode_battery(), 0 ) + array_column( Oracles::decode_battery(), 0 ), + array_column( Oracles::noncharacter_battery(), 0 ) ); foreach ( $battery_vectors as $i => $bytes ) { foreach ( $checks->run( $bytes ) as $failure ) { @@ -67,6 +68,29 @@ function check( string $label, bool $ok, string $detail = '' ): void { } check( 'real targets clean on battery', array() === $battery_fails, implode( '; ', $battery_fails ) ); +/* + * Documented stance: `wp_has_noncharacters()` is undefined on ill-formed + * input. On hosts with PCRE-u the public function answers false on ANY + * ill-formed input (`preg_match` fails) while the fallback skips invalid + * spans and reports the noncharacters around them. This regression + * vector pins the divergence; if it ever changes, the semantics were + * touched and the valid-input-only fuzzing policy must be revisited. + */ +$nonchar_probe = "\xC0\xEF\xBF\xBE"; // Invalid byte, then U+FFFE. 
+if ( _wp_can_use_pcre_u() ) { + check( + 'documented wp_has_noncharacters divergence on ill-formed input unchanged', + false === wp_has_noncharacters( $nonchar_probe ) && true === _wp_has_noncharacters_fallback( $nonchar_probe ), + sprintf( + 'public: %s, fallback: %s', + var_export( wp_has_noncharacters( $nonchar_probe ), true ), + var_export( _wp_has_noncharacters_fallback( $nonchar_probe ), true ) + ) + ); +} else { + echo "SKIP documented wp_has_noncharacters divergence (no PCRE-u: public function aliases the fallback)\n"; +} + // --------------------------------------------------------------------- // 3. Broken implementations must be caught. // --------------------------------------------------------------------- @@ -78,6 +102,8 @@ function check( string $label, bool $ok, string $detail = '' ): void { 'codepoint_count' => '_wp_utf8_codepoint_count', 'utf8_encode_fb' => '_wp_utf8_encode_fallback', 'utf8_decode_fb' => '_wp_utf8_decode_fallback', + 'has_nonchars' => 'wp_has_noncharacters', + 'has_nonchars_fb' => '_wp_has_noncharacters_fallback', ); /** @@ -197,6 +223,25 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr ) ); check( 'catches sometimes-null decoder', in_array( 'target-bad-return', $seen, true ), implode( ',', $seen ) ); +// 3o. Noncharacter detector that never finds anything. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'has_nonchars_fb' => static fn( string $text ): bool => false, +) ); +check( 'catches blind noncharacter detector', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3p. Detector that misses the contiguous U+FDD0–U+FDEF block (the +// plane-final pairs alone are a plausible spec misreading). +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'has_nonchars_fb' => Targets::nonchars_missing_fdd0_block( ... 
), +) ); +check( 'catches detector missing U+FDD0 block', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3q. Over-eager detector that flags U+FDCF, just below the block. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'has_nonchars' => Targets::nonchars_overeager( ... ), +) ); +check( 'catches over-eager noncharacter detector', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) ); + // --------------------------------------------------------------------- // 4. Generator determinism and mix. // --------------------------------------------------------------------- From a7df034e4b357a4a2a4a7f380d7e09961601152d Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 23:14:14 +0200 Subject: [PATCH 05/14] Fuzzer: Exhaustively test WP_HTML_Decoder::code_point_to_utf8_bytes(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function's domain (~1.1M code points) is small enough to test completely instead of fuzzing. The new standalone script checks every code point 0x0-0x10FFFF plus out-of-range probes against the fuzzer's pure-arithmetic encoder (the independent oracle), with an explicit mb_chr( $cp, 'UTF-8' ) consistency cross-check; surrogates and out-of-range values must yield U+FFFD. Runs in ~0.4s. The harness smoke test executes it and proves its detection fires via the script-local ENCODING_FUZZ_FAULT=codepoint-surrogate-qmark variant. Documents an upstream regression (pinned as a labeled KNOWN ISSUE check so the stance cannot silently go stale): since [62424] (#65342, unreleased) the implementation calls mb_chr() without an explicit encoding, inheriting mb_internal_encoding() — which WordPress sets from blog_charset — so non-UTF-8 sites get raw legacy bytes for mappable code points while invalid ones still yield UTF-8 U+FFFD, contradicting the docblock and mixing encodings with the named character reference path. 
The 6.6.0 original was pure arithmetic and always emitted UTF-8; the same commit changed code point 0 from U+FFFD to NUL. One-line upstream fix: mb_chr( $code_point, 'UTF-8' ). Closes out the extend-encoding-fuzzer handoff: all three sections done, definition of done verified and recorded in the handoff doc. --- handoffs/extend-encoding-fuzzer.md | 94 ++++++++--- tools/encoding-fuzz/README.md | 26 +++ .../tests/code-point-to-utf8-exhaustive.php | 159 ++++++++++++++++++ tools/encoding-fuzz/tests/harness-smoke.php | 12 ++ 4 files changed, 266 insertions(+), 25 deletions(-) create mode 100644 tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php diff --git a/handoffs/extend-encoding-fuzzer.md b/handoffs/extend-encoding-fuzzer.md index 29e7effe5ef82..8940e08c60098 100644 --- a/handoffs/extend-encoding-fuzzer.md +++ b/handoffs/extend-encoding-fuzzer.md @@ -2,15 +2,15 @@ ## Status -Sections 1 (utf8_encode/decode) and 2 (wp_has_noncharacters) DONE; -section 3 in progress. The -host fuzzer (`tools/encoding-fuzz/`) is complete and working on branch -`fuzz-encoder`; read its `README.md` first. ~570k cases had run clean -against the original targets before this work started. +All three sections DONE. The host fuzzer (`tools/encoding-fuzz/`) is +complete and working on branch `fuzz-encoder`; read its `README.md` +first. ~570k cases had run clean against the original targets before +this work started. ## Goal -Round out coverage of `src/wp-includes/compat-utf8.php` by adding: +Round out coverage of `src/wp-includes/compat-utf8.php` (plus one +html-api encoder) by adding: 1. `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()` differentials against the native `utf8_encode()` / `utf8_decode()`. @@ -92,30 +92,74 @@ reference oracle and its battery must be extended for ill-formed input too — removing the valid-only gate alone is NOT sufficient, since the reference throws on ill-formed input by design. -## 3. 
code_point_to_utf8_bytes — exhaust, don't fuzz - -`WP_HTML_Decoder::code_point_to_utf8_bytes()` -(`src/wp-includes/html-api/class-wp-html-decoder.php:426`) has a domain -of ~1.1M values. Write a standalone script (or slow-group PHPUnit test) -asserting equality with `mb_chr( $cp, 'UTF-8' )` for every code point -0x0–0x10FFFF, including expected behavior for surrogates and -out-of-range values (check what the function documents; `mb_chr` -returns `false` for surrogates — decide the comparison accordingly). -Runs in seconds; total coverage; done forever. Note this class is -loaded from `html-api/`, so the fuzzer bootstrap (`lib/Bootstrap.php`) -needs to require it (it has no dependencies beyond the token map — if -it pulls more, load only for this check). +## 3. code_point_to_utf8_bytes — DONE; upstream finding documented + +Implemented as `tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php` +(standalone, not wired into `Bootstrap.php` — the class is required +only by this script, which parses cleanly with no other dependencies; +loading html-api code into every fuzz worker would buy nothing). +Every code point 0x0–0x10FFFF plus out-of-range probes, compared +against the pure-arithmetic `Generator::encode_code_point()` (the +independent oracle) with an additional `mb_chr( $cp, 'UTF-8' )` +consistency cross-check (the implementation is itself mb_chr-backed; +the cross-check would expose a bug shared between implementation and +arithmetic encoder). Surrogates and out-of-range values yield U+FFFD +as documented. Runs in ~0.4s, passes on PHP 8.4.21. The harness smoke +test executes it and proves its detection fires via the +`ENCODING_FUZZ_FAULT=codepoint-surrogate-qmark` broken variant. 
+ +**Upstream finding (real bug — an unreleased trunk REGRESSION, not +fixed here):** the implementation is `mb_chr( $code_point )` with NO +explicit encoding, so it inherits `mb_internal_encoding()` — which +WordPress sets from `blog_charset` (`wp_set_internal_encoding()`, +`src/wp-includes/load.php`). On a non-UTF-8 site it returns raw legacy +bytes for mappable code points (e.g. `"\xE9"` for U+00E9 under +ISO-8859-1) while still returning UTF-8 U+FFFD for invalid ones, +contradicting its docblock. Aggravating facts for the upstream report: + +- Introduced by [62424] (#65342, `@since 7.1.0`, unreleased): the + 6.6.0 original was a pure-arithmetic encoder that always emitted + UTF-8 regardless of mbstring state. Fix-before-release territory. +- WP's own `_mb_chr()` polyfill in `compat.php` documents + `@param "UTF-8"|null $encoding Must be 'UTF-8' or null` and treats + null as UTF-8 — so mbstring-less hosts always emit UTF-8 while + mbstring hosts follow `blog_charset`. Same WordPress, divergent + output by extension presence. +- Named character references decode through the UTF-8 token map + regardless: on a latin1 site `&eacute;` → UTF-8 `C3 A9` but + `&#xE9;` → latin1 `E9` in the same decoded string. There is no + intentional-behavior steelman; output is mixed-encoding either way. +- The same commit silently changed `code_point_to_utf8_bytes( 0 )` + from `U+FFFD` to `"\0"` (the old guard was `$code_point <= 0`). + Callers are unaffected (`&#0;` is intercepted earlier) and the new + behavior matches the docblock, but it belongs in the same report. + +One-line fix: `mb_chr( $code_point, 'UTF-8' )`. The script pins the +current buggy behavior as a labeled KNOWN ISSUE check so the stance +cannot silently go stale; update or remove the pin when fixed. ## Verification / definition of done +All verified 2026-06-10 on PHP 8.4.21: + - `php tools/encoding-fuzz/tests/harness-smoke.php` passes, including - new broken-variant detections for every added check.
-- A fault-injection variant per new target in `lib/Targets.php` - (`ENCODING_FUZZ_FAULT=...`) exercises worker → replay → minimize end - to end. + broken-variant detections for every added check (seventeen mutation + classes plus the exhaustive script's surrogate fault). +- Fault-injection variants per new target + (`ENCODING_FUZZ_FAULT=encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager`) + exercised worker → replay → minimize end to end; artifacts now record + the fault name and `pcre_u` in environment metadata. The script-local + `codepoint-surrogate-qmark` fault is proven via the smoke test's + subprocess run (the exhaustive script never enters the worker + pipeline). - `php tools/encoding-fuzz/runner.php --lanes 4 --duration-seconds 60` - runs clean (or findings are triaged and documented, not silenced). -- README.md oracle/check tables updated. + ran clean (32,000 cases, 0 failures, 0 stalled, final tree). Findings + that were + triaged and documented rather than silenced: the legacy + `utf8_decode()` divergence (§1), the `wp_has_noncharacters()` + ill-formed-input divergence (§2), the `code_point_to_utf8_bytes()` + internal-encoding regression and the #63863 test bug (§§1, 3). +- README.md oracle/check tables updated (Encode/Decode/Nonchars). ## Gotchas inherited from the existing harness diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index 23b9101ca51f4..28932785a653b 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -210,3 +210,29 @@ ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/minimize.php --failure . (The `non-maximal` fault minimizes to the two bytes `E0 F4`: two adjacent maximal subparts whose replacement characters get collapsed.) 
+ +## One-Shot Exhaustive Tests + +```sh +php tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php +``` + +`WP_HTML_Decoder::code_point_to_utf8_bytes()` has a domain small +enough (~1.1M code points) to test completely instead of fuzzing: every +code point 0x0–0x10FFFF plus out-of-range probes. The independent +oracle is the fuzzer's pure-arithmetic `Generator::encode_code_point()`; +a second comparison against `mb_chr( $cp, 'UTF-8' )` is a consistency +cross-check (the implementation is itself mb_chr-backed) that would +expose a bug shared between the implementation and the arithmetic +encoder. Surrogates and out-of-range values must yield U+FFFD. Runs in +under a second; exit codes `0`/`1`/`2` like everything else. The smoke +test runs it and proves its detection fires via +`ENCODING_FUZZ_FAULT=codepoint-surrogate-qmark`. + +The script also pins a known upstream issue: since [r62424] (#65342, +unreleased) the implementation calls `mb_chr()` without an explicit +encoding, so under a non-UTF-8 `mb_internal_encoding()` (WordPress +sets it from `blog_charset`) it returns raw legacy bytes for mappable +code points while still returning UTF-8 U+FFFD for invalid ones — +contradicting its docblock. The pin fails when the upstream behavior +changes, so the documented stance cannot silently go stale. diff --git a/tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php b/tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php new file mode 100644 index 0000000000000..0ce48190c5f2e --- /dev/null +++ b/tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php @@ -0,0 +1,159 @@ + \WP_HTML_Decoder::code_point_to_utf8_bytes( $cp ); +if ( 'codepoint-surrogate-qmark' === getenv( 'ENCODING_FUZZ_FAULT' ) ) { + $encode = static fn( int $cp ): string => ( $cp >= 0xD800 && $cp <= 0xDFFF ) + ? '?' + : \WP_HTML_Decoder::code_point_to_utf8_bytes( $cp ); +} + +// --------------------------------------------------------------------- +// 1. 
Exhaustive sweep over the entire code point domain. +// --------------------------------------------------------------------- +$replacement = "\u{FFFD}"; +$mismatches = array(); +$mismatch_count = 0; +$oracle_conflicts = array(); +$conflict_count = 0; + +for ( $cp = 0; $cp <= 0x10FFFF; $cp++ ) { + $is_surrogate = $cp >= 0xD800 && $cp <= 0xDFFF; + $expected = $is_surrogate ? $replacement : Generator::encode_code_point( $cp ); + $got = $encode( $cp ); + + if ( $got !== $expected ) { + ++$mismatch_count; + if ( count( $mismatches ) < 10 ) { + $mismatches[] = sprintf( 'U+%04X: expected %s, got %s', $cp, bin2hex( $expected ), bin2hex( $got ) ); + } + } + + // Cross-check the arithmetic oracle against mb_chr: `mb_chr()` returns + // false exactly for surrogates, and the arithmetic encoder must match + // it everywhere else (this would expose a bug shared between the + // mb_chr-backed implementation and the arithmetic encoder). + $mb = mb_chr( $cp, 'UTF-8' ); + if ( $is_surrogate ? false !== $mb : $mb !== $expected ) { + ++$conflict_count; + if ( count( $oracle_conflicts ) < 10 ) { + $oracle_conflicts[] = sprintf( + 'U+%04X: arithmetic %s, mb_chr %s', + $cp, + bin2hex( $expected ), + is_string( $mb ) ? bin2hex( $mb ) : var_export( $mb, true ) + ); + } + } +} + +check( + 'all 1,114,112 code points encode correctly (surrogates → U+FFFD)', + 0 === $mismatch_count, + "{$mismatch_count} mismatches, first " . count( $mismatches ) . ': ' . implode( '; ', $mismatches ) +); +check( + 'arithmetic oracle and mb_chr agree on the whole domain', + 0 === $conflict_count, + "{$conflict_count} conflicts, first " . count( $oracle_conflicts ) . ': ' . implode( '; ', $oracle_conflicts ) +); + +// --------------------------------------------------------------------- +// 2. Out-of-range values must yield the replacement character. 
+// --------------------------------------------------------------------- +$out_of_range_fails = array(); +foreach ( array( -1, -0xE9, PHP_INT_MIN, 0x110000, 0x7FFFFFFF, PHP_INT_MAX ) as $cp ) { + $got = $encode( $cp ); + if ( $replacement !== $got ) { + $out_of_range_fails[] = sprintf( '%d: got %s', $cp, bin2hex( $got ) ); + } +} +check( 'out-of-range values yield U+FFFD', array() === $out_of_range_fails, implode( '; ', $out_of_range_fails ) ); + +// --------------------------------------------------------------------- +// 3. Documented finding: sensitivity to `mb_internal_encoding()`. +// +// Not a pass/fail gate on the WordPress contract — it pins the CURRENT +// (arguably buggy) behavior so any change is noticed. Under a non-UTF-8 +// internal encoding the method returns non-UTF-8 bytes, contradicting +// its docblock. Fix would be `mb_chr( $code_point, 'UTF-8' )`. +// --------------------------------------------------------------------- +mb_internal_encoding( 'ISO-8859-1' ); +$latin1_e9 = \WP_HTML_Decoder::code_point_to_utf8_bytes( 0xE9 ); +$latin1_d800 = \WP_HTML_Decoder::code_point_to_utf8_bytes( 0xD800 ); +mb_internal_encoding( 'UTF-8' ); + +check( + 'KNOWN ISSUE pin: mb_internal_encoding sensitivity unchanged (a FAIL here means upstream behavior changed — update or remove this pin)', + "\xE9" === $latin1_e9 && $replacement === $latin1_d800, + sprintf( 'U+00E9 → %s, U+D800 → %s', bin2hex( $latin1_e9 ), bin2hex( $latin1_d800 ) ) +); +echo "NOTE code_point_to_utf8_bytes() inherits mb_internal_encoding(); under ISO-8859-1 it returns raw latin1 bytes\n"; +echo "NOTE for mappable code points while still returning UTF-8 U+FFFD for invalid ones. WordPress sets the internal\n"; +echo "NOTE encoding from blog_charset, so non-UTF-8 sites are affected. Suggested fix: mb_chr( \$code_point, 'UTF-8' ).\n"; + +mb_internal_encoding( $previous_encoding ); + +echo $failed > 0 ? "\n{$failed} check(s) FAILED\n" : "\nAll checks passed\n"; +exit( $failed > 0 ? 
1 : 0 ); diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php index 965c1743b03ae..e650738be9ed3 100644 --- a/tools/encoding-fuzz/tests/harness-smoke.php +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -282,6 +282,18 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr } check( '300-case fuzz run clean (real findings would also surface here)', 0 === $fuzz_failures ); +// --------------------------------------------------------------------- +// 6. One-shot exhaustive companion test: must pass, and its detection +// must provably fire (same mutation-testing rule as everything else). +// --------------------------------------------------------------------- +$exhaustive = escapeshellarg( PHP_BINARY ) . ' ' . escapeshellarg( __DIR__ . '/code-point-to-utf8-exhaustive.php' ); + +exec( "{$exhaustive} 2>&1", $exh_output, $exh_code ); +check( 'code-point-to-utf8 exhaustive test passes', 0 === $exh_code, implode( ' | ', array_slice( $exh_output, -3 ) ) ); + +exec( "ENCODING_FUZZ_FAULT=codepoint-surrogate-qmark {$exhaustive} 2>&1", $exh_fault_output, $exh_fault_code ); +check( 'exhaustive test catches broken surrogate handling', 1 === $exh_fault_code, "exit {$exh_fault_code}" ); + $oracles->shutdown(); echo $failed > 0 ? 
"\n{$failed} smoke check(s) FAILED\n" : "\nAll smoke checks passed\n"; From 9d15731f8f45eb4fc62b1281ddf891135d83d3dd Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 12:59:59 +0200 Subject: [PATCH 06/14] Expand encoding fuzzer for mb chr ord --- tools/encoding-fuzz/README.md | 15 +- tools/encoding-fuzz/lib/Bootstrap.php | 73 ++++- tools/encoding-fuzz/lib/Checks.php | 339 ++++++++++++++++++++ tools/encoding-fuzz/lib/Oracles.php | 10 +- tools/encoding-fuzz/lib/Targets.php | 2 + tools/encoding-fuzz/tests/harness-smoke.php | 14 + 6 files changed, 441 insertions(+), 12 deletions(-) diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index 28932785a653b..0d196f2a1241f 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -6,6 +6,7 @@ Differential fuzzer for the WordPress UTF-8 functions: - `wp_scrub_utf8()` / `_wp_scrub_utf8_fallback()` - `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()` - `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` (valid input only) +- `_mb_chr()` / `_mb_ord()` - `_wp_utf8_codepoint_count()` and the resumable `_wp_scan_utf8()` paths (secondary) The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main @@ -112,6 +113,13 @@ Internal invariants: - `_wp_utf8_encode_fallback()` output is always valid UTF-8 - `_wp_utf8_decode_fallback( _wp_utf8_encode_fallback( $s ) ) === $s` for any byte string `$s` (encode is total and injective per byte) +- `_mb_chr()` matches the fuzzer's independent arithmetic UTF-8 encoder + for valid scalar values and returns false for invalid code points +- `_mb_ord()` matches an independent first-code-point decoder on arbitrary + byte strings and returns false when the first code point is ill-formed +- `_mb_ord( _mb_chr( $cp ) ) === $cp` for valid scalar values, and + `_mb_chr( _mb_ord( $s ) )` reconstructs the first UTF-8 character in + `$s` when it is well-formed ## Inputs @@ -185,7 +193,7 @@ php 
tools/encoding-fuzz/tests/harness-smoke.php ``` Verifies the oracle battery, runs the real targets over the battery -vectors, and — most importantly — mutation-tests the harness: seventeen +vectors, and — most importantly — mutation-tests the harness: nineteen classes of deliberately broken implementations (validator accepting 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber, identity scrubber, byte-dropping scrubber, off-by-one code point count, @@ -193,8 +201,9 @@ throwing target, cp1252-confused encoder, identity encoder, per-byte decoder, valid-input-mangling decoder, round-trip-violating decoder, null-returning encoder, sometimes-null decoder, blind noncharacter detector, U+FDD0-block-missing detector, over-eager noncharacter -detector) must all be caught. It also asserts generator determinism, -the valid/invalid input mix, and the documented +detector, cp1252-confused `_mb_chr()`, invalid-accepting `_mb_ord()`) +must all be caught. It also asserts generator determinism, the +valid/invalid input mix, and the documented `wp_has_noncharacters()` divergence stance on ill-formed input. For end-to-end pipeline testing while the real implementations are diff --git a/tools/encoding-fuzz/lib/Bootstrap.php b/tools/encoding-fuzz/lib/Bootstrap.php index e92921dcf272d..be54f7aa5ba4f 100644 --- a/tools/encoding-fuzz/lib/Bootstrap.php +++ b/tools/encoding-fuzz/lib/Bootstrap.php @@ -4,10 +4,9 @@ /** * Loads the WordPress UTF-8 functions under test into a bare PHP process. * - * Only `compat-utf8.php` and `utf8.php` are loaded. `utf8.php` calls - * `_wp_can_use_pcre_u()` at load time, which normally lives in - * `compat.php`; a minimal stand-in from `wp-stubs.php` covers it so the - * rest of WordPress stays out of the fuzzer process. + * Only the UTF-8 files under test are loaded. `_mb_chr()` and `_mb_ord()` + * live in `compat.php`, so their function bodies are extracted from that + * source file without loading the rest of WordPress compatibility glue. 
*/ class Bootstrap { public static function repo_root(): string { @@ -19,10 +18,10 @@ public static function load_targets(): void { return; } - require_once __DIR__ . '/wp-stubs.php'; - $root = self::repo_root(); + require_once __DIR__ . '/wp-stubs.php'; require_once $root . '/src/wp-includes/compat-utf8.php'; + self::load_compat_functions( $root . '/src/wp-includes/compat.php', array( '_mb_chr', '_mb_ord' ) ); require_once $root . '/src/wp-includes/utf8.php'; /* @@ -33,4 +32,66 @@ public static function load_targets(): void { mb_substitute_character( 0xFFFD ); } } + + /** + * Loads selected top-level function definitions from `compat.php`. + * + * The full file has unrelated bootstrap assumptions (for example, + * sodium and deprecation helpers). The fuzzer only needs these + * private UTF-8 polyfills, and evaluating the source definitions keeps + * the tested code tied to WordPress without widening the harness. + * + * @param string $path Source file path. + * @param string[] $functions Function names to load. + */ + private static function load_compat_functions( string $path, array $functions ): void { + $source = file_get_contents( $path ); + if ( false === $source ) { + throw new \RuntimeException( "Unable to read {$path}" ); + } + + foreach ( $functions as $function_name ) { + if ( function_exists( $function_name ) ) { + continue; + } + + eval( self::extract_function_definition( $source, $function_name ) ); + } + } + + private static function extract_function_definition( string $source, string $function_name ): string { + $pattern = '/function\s+' . preg_quote( $function_name, '/' ) . '\s*\(/'; + if ( 1 !== preg_match( $pattern, $source, $match, PREG_OFFSET_CAPTURE ) ) { + throw new \RuntimeException( "Unable to find function {$function_name}" ); + } + + $tokens = token_get_all( ' */ @@ -264,6 +297,213 @@ public function run( string $input ): array { $failures[] = $failure; } + // 10. mb_chr()/mb_ord() polyfill differentials and isomorphisms. 
+ foreach ( $this->check_mb_chr_ord( $input ) as $failure ) { + $failures[] = $failure; + } + + return $failures; + } + + /** + * Tests `_mb_chr()` and `_mb_ord()` as partial inverses. The oracle for + * `_mb_chr()` is the fuzzer's arithmetic UTF-8 encoder; the oracle for + * `_mb_ord()` is an independent decoder for the first code point only. + * + * @return array + */ + private function check_mb_chr_ord( string $input ): array { + $failures = array(); + + if ( ! isset( $this->targets['mb_chr'], $this->targets['mb_ord'] ) ) { + return $failures; + } + + list( $expected_ord, $prefix_length ) = self::first_code_point_or_false( $input ); + + try { + $actual_ord = ( $this->targets['mb_ord'] )( $input ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + 'mb_ord', + array( + 'target' => 'mb_ord', + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + $actual_ord = false; + } + + if ( ! is_int( $actual_ord ) && false !== $actual_ord ) { + $failures[] = self::failure( + 'mb-ord-bad-return', + 'mb_ord', + array( + 'type' => get_debug_type( $actual_ord ), + ) + ); + } elseif ( $actual_ord !== $expected_ord ) { + $failures[] = self::failure( + 'mb-ord-mismatch', + 'mb_ord', + array( + 'got' => $actual_ord, + 'expected' => $expected_ord, + 'input_preview' => self::preview( $input ), + ) + ); + } + + if ( is_int( $expected_ord ) ) { + try { + $round_trip_chr = ( $this->targets['mb_chr'] )( $expected_ord ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + 'mb_chr:from-ord', + array( + 'target' => 'mb_chr', + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + $round_trip_chr = false; + } + + $expected_prefix = substr( $input, 0, $prefix_length ); + if ( $round_trip_chr !== $expected_prefix ) { + $failures[] = self::failure( + 'mb-ord-chr-isomorphism', + 'mb_ord:mb_chr', + array( + 'code_point' => $expected_ord, + 'expected_prefix' => 
self::preview( $expected_prefix ), + 'got' => is_string( $round_trip_chr ) ? self::preview( $round_trip_chr ) : $round_trip_chr, + ) + ); + } + } + + foreach ( self::mb_chr_code_point_probes( $input ) as $code_point ) { + $expected_chr = self::expected_mb_chr( $code_point ); + + try { + $actual_chr = ( $this->targets['mb_chr'] )( $code_point ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + 'mb_chr', + array( + 'target' => 'mb_chr', + 'code_point' => $code_point, + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + $actual_chr = false; + } + + if ( ! is_string( $actual_chr ) && false !== $actual_chr ) { + $failures[] = self::failure( + 'mb-chr-bad-return', + 'mb_chr', + array( + 'code_point' => $code_point, + 'type' => get_debug_type( $actual_chr ), + ) + ); + continue; + } + + if ( $actual_chr !== $expected_chr ) { + $failures[] = self::failure( + 'mb-chr-mismatch', + 'mb_chr', + array( + 'code_point' => $code_point, + 'expected' => is_string( $expected_chr ) ? self::preview( $expected_chr ) : $expected_chr, + 'got' => is_string( $actual_chr ) ? 
self::preview( $actual_chr ) : $actual_chr, + ) + ); + continue; + } + + if ( is_string( $actual_chr ) ) { + try { + $round_trip_ord = ( $this->targets['mb_ord'] )( $actual_chr ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + 'mb_ord:from-chr', + array( + 'target' => 'mb_ord', + 'code_point' => $code_point, + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + $round_trip_ord = false; + } + + if ( $round_trip_ord !== $code_point ) { + $failures[] = self::failure( + 'mb-chr-ord-isomorphism', + 'mb_chr:mb_ord', + array( + 'code_point' => $code_point, + 'got' => $round_trip_ord, + ) + ); + } + } + } + + $contract_probes = array( + array( 'mb_chr', array( 0x41, 'UTF-8' ), 'A' ), + array( 'mb_chr', array( 0x41, 'latin1' ), false ), + array( 'mb_chr', array( 0x41, 'utf8' ), false ), + array( 'mb_chr', array( '65' ), false ), + array( 'mb_ord', array( 'A', 'UTF-8' ), 0x41 ), + array( 'mb_ord', array( 'A', 'latin1' ), false ), + array( 'mb_ord', array( 'A', 'utf8' ), false ), + array( 'mb_ord', array( '' ), false ), + array( 'mb_ord', array( 0x41 ), false ), + ); + + foreach ( $contract_probes as $probe ) { + list( $target, $args, $expected ) = $probe; + + try { + $actual = ( $this->targets[ $target ] )( ...$args ); + } catch ( \Throwable $error ) { + $failures[] = self::failure( + 'target-exception', + "{$target}:contract", + array( + 'target' => $target, + 'args' => array_map( static fn( $arg ) => is_string( $arg ) ? self::preview( $arg ) : $arg, $args ), + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + continue; + } + + if ( $actual !== $expected ) { + $failures[] = self::failure( + "{$target}-contract-mismatch", + $target, + array( + 'args' => array_map( static fn( $arg ) => is_string( $arg ) ? self::preview( $arg ) : $arg, $args ), + 'expected' => is_string( $expected ) ? self::preview( $expected ) : $expected, + 'got' => is_string( $actual ) ? 
self::preview( $actual ) : $actual, + ) + ); + } + } + return $failures; } @@ -593,6 +833,105 @@ private function check_chunked_scan( string $input, string $ref_scrub ): ?array return null; } + private static function expected_mb_chr( int $code_point ) { + if ( + $code_point < 0 || + ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) || + $code_point > 0x10FFFF + ) { + return false; + } + + return Generator::encode_code_point( $code_point ); + } + + private static function mb_chr_code_point_probes( string $input ): array { + $probes = self::MB_CHR_CODE_POINT_PROBES; + $hash = hash( 'sha256', $input, true ); + + for ( $i = 0; $i < 4; $i++ ) { + $raw = unpack( 'N', substr( $hash, 4 * $i, 4 ) )[1]; + $probes[] = ( $raw % 0x120000 ) - 0x800; + } + + return array_values( array_unique( $probes ) ); + } + + /** + * @return array{0: int|false, 1: int} First code point and byte length. + */ + private static function first_code_point_or_false( string $bytes ): array { + $length = strlen( $bytes ); + if ( 0 === $length ) { + return array( false, 0 ); + } + + $b1 = ord( $bytes[0] ); + if ( $b1 <= 0x7F ) { + return array( $b1, 1 ); + } + + if ( $length < 2 ) { + return array( false, 0 ); + } + + $b2 = ord( $bytes[1] ); + if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) { + return array( + ( ( $b1 & 0x1F ) << 6 ) | ( $b2 & 0x3F ), + 2, + ); + } + + if ( $length < 3 ) { + return array( false, 0 ); + } + + $b3 = ord( $bytes[2] ); + if ( + $b3 >= 0x80 && + $b3 <= 0xBF && + ( + ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) || + ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) || + ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) || + ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF ) + ) + ) { + return array( + ( ( $b1 & 0x0F ) << 12 ) | ( ( $b2 & 0x3F ) << 6 ) | ( $b3 & 0x3F ), + 3, + ); + } + + if ( $length < 4 ) { + return array( false, 0 ); + } + + $b4 = ord( $bytes[3] ); + if ( + $b3 >= 0x80 && + $b3 <= 0xBF && + $b4 >= 0x80 && + $b4 <= 0xBF 
&& + ( + ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) || + ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) || + ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F ) + ) + ) { + return array( + ( ( $b1 & 0x07 ) << 18 ) | + ( ( $b2 & 0x3F ) << 12 ) | + ( ( $b3 & 0x3F ) << 6 ) | + ( $b4 & 0x3F ), + 4, + ); + } + + return array( false, 0 ); + } + private static function failure( string $check, string $party, array $detail ): array { return array( 'check' => $check, diff --git a/tools/encoding-fuzz/lib/Oracles.php b/tools/encoding-fuzz/lib/Oracles.php index 88a5df3b1ae41..27c8822c07b79 100644 --- a/tools/encoding-fuzz/lib/Oracles.php +++ b/tools/encoding-fuzz/lib/Oracles.php @@ -116,16 +116,20 @@ public static function build( array $external_names ): self { }; } - if ( function_exists( 'mb_str_split' ) && function_exists( 'mb_ord' ) ) { + $mb_ord = function_exists( 'mb_ord' ) + ? 'mb_ord' + : ( function_exists( '_mb_ord' ) ? '_mb_ord' : null ); + + if ( function_exists( 'mb_str_split' ) && null !== $mb_ord ) { /* * Trivial decode-and-test reference for noncharacter detection, * independent of both implementations under test (the PCRE * character-class regex and the `_wp_scan_utf8()`-based scan). * Callers must pass valid UTF-8. 
*/ - $oracles->noncharacters['mb'] = static function ( string $valid_utf8 ): bool { + $oracles->noncharacters['mb'] = static function ( string $valid_utf8 ) use ( $mb_ord ): bool { foreach ( mb_str_split( $valid_utf8, 1, 'UTF-8' ) as $character ) { - $code_point = mb_ord( $character, 'UTF-8' ); + $code_point = $mb_ord( $character, 'UTF-8' ); // Fail loudly on contract violations: on ill-formed // input `mb_ord()` returns false, which would otherwise diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php index 19f6ce3d45bc9..29d5d415be34a 100644 --- a/tools/encoding-fuzz/lib/Targets.php +++ b/tools/encoding-fuzz/lib/Targets.php @@ -31,6 +31,8 @@ public static function resolve(): array { 'utf8_decode_fb' => '_wp_utf8_decode_fallback', 'has_nonchars' => 'wp_has_noncharacters', 'has_nonchars_fb' => '_wp_has_noncharacters_fallback', + 'mb_chr' => '_mb_chr', + 'mb_ord' => '_mb_ord', ); switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) { diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php index e650738be9ed3..9cd6e1374ae61 100644 --- a/tools/encoding-fuzz/tests/harness-smoke.php +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -104,6 +104,8 @@ function check( string $label, bool $ok, string $detail = '' ): void { 'utf8_decode_fb' => '_wp_utf8_decode_fallback', 'has_nonchars' => 'wp_has_noncharacters', 'has_nonchars_fb' => '_wp_has_noncharacters_fallback', + 'mb_chr' => '_mb_chr', + 'mb_ord' => '_mb_ord', ); /** @@ -242,6 +244,18 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr ) ); check( 'catches over-eager noncharacter detector', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) ); +// 3r. Character encoder that confuses U+0080 with Windows-1252's euro sign. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'mb_chr' => static fn( int $code_point ) => 0x80 === $code_point ? 
"\xE2\x82\xAC" : _mb_chr( $code_point ), +) ); +check( 'catches cp1252-confused _mb_chr', in_array( 'mb-chr-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3s. Character decoder that accepts an invalid leading C0 byte. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'mb_ord' => static fn( string $bytes ) => str_starts_with( $bytes, "\xC0" ) ? 0 : _mb_ord( $bytes ), +) ); +check( 'catches invalid-accepting _mb_ord', in_array( 'mb-ord-mismatch', $seen, true ), implode( ',', $seen ) ); + // --------------------------------------------------------------------- // 4. Generator determinism and mix. // --------------------------------------------------------------------- From 6ea247f9da3414d65663320f2292090417f71258 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 14:14:15 +0200 Subject: [PATCH 07/14] Add UTF-8 codepoint span fuzzing --- progress-handoff-xZOoEn.md | 33 ++ tools/encoding-fuzz/README.md | 22 +- tools/encoding-fuzz/lib/Checks.php | 319 +++++++++++++++++++- tools/encoding-fuzz/lib/Targets.php | 82 +++++ tools/encoding-fuzz/tests/harness-smoke.php | 27 ++ 5 files changed, 474 insertions(+), 9 deletions(-) create mode 100644 progress-handoff-xZOoEn.md diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md new file mode 100644 index 0000000000000..4ace0ce8d0896 --- /dev/null +++ b/progress-handoff-xZOoEn.md @@ -0,0 +1,33 @@ +# Progress for handoff-xZOoEn + +Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn.md` + +## 2026-06-11 + +### Current status + +- Confirmed step 0 (`_mb_chr()` / `_mb_ord()` coverage) is already committed as `9d15731f8f`. +- Worktree was clean before starting follow-up work. +- Next active slice: step 1, direct `_wp_utf8_codepoint_span()` coverage. + +### Step 1: `_wp_utf8_codepoint_span()` coverage + +- Status: done; included in the step 1 commit. +- Scope: + - Add `_wp_utf8_codepoint_span()` target wiring. 
+ - Add span properties for scrubbed valid text and arbitrary input. + - Start nonzero-offset checks only at known code point or maximal-subpart boundaries. + - Add mutation tests for off-by-one span length, invalid subpart byte-counting, incorrect `found_code_points`, and stale `found_code_points` on empty spans. +- Verification: + - `php -l tools/encoding-fuzz/lib/Checks.php` + - `php -l tools/encoding-fuzz/lib/Targets.php` + - `php -l tools/encoding-fuzz/lib/Bootstrap.php` + - `php -l tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none` + - `git diff --check` +- Review gate: satisfied by 3 adversarial reviewers. + - Reviewer 1: satisfied after checking the independent maximal-subpart span reference and the stale-count update. + - Reviewer 2: satisfied after checking mutation adequacy, replay/minimize fault behavior, and the README clarification. + - Reviewer 3: initially found the stale `found_code_points` gap; satisfied after the sentinel and stale-count mutation were added. +- Commit: this step commit. 
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index 0d196f2a1241f..f035e7147dbcf 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -7,7 +7,8 @@ Differential fuzzer for the WordPress UTF-8 functions: - `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()` - `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` (valid input only) - `_mb_chr()` / `_mb_ord()` -- `_wp_utf8_codepoint_count()` and the resumable `_wp_scan_utf8()` paths (secondary) +- `_wp_utf8_codepoint_count()`, `_wp_utf8_codepoint_span()`, and the + resumable `_wp_scan_utf8()` paths (secondary) The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main fuzz surface; the mbstring-backed public functions are checked alongside @@ -107,6 +108,12 @@ Internal invariants: - scrub is idempotent - `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed text (each maximal subpart counts as one code point) +- `_wp_utf8_codepoint_span()` reports the original byte span occupied by a + requested number of code points; on scrubbed valid text it matches + `strlen( mb_substr( ... ) )`, and on arbitrary input an independent + maximal-subpart parser checks that invalid subparts count as one code + point. Nonzero starts are probed only at known code point or + maximal-subpart boundaries. 
- scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points` chunks reconstructs the same scrubbed text and always makes forward progress (chunk sizes derive from the input hash, so replays are exact) @@ -193,7 +200,7 @@ php tools/encoding-fuzz/tests/harness-smoke.php ``` Verifies the oracle battery, runs the real targets over the battery -vectors, and — most importantly — mutation-tests the harness: nineteen +vectors, and — most importantly — mutation-tests the harness: twenty-three classes of deliberately broken implementations (validator accepting 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber, identity scrubber, byte-dropping scrubber, off-by-one code point count, @@ -201,16 +208,21 @@ throwing target, cp1252-confused encoder, identity encoder, per-byte decoder, valid-input-mangling decoder, round-trip-violating decoder, null-returning encoder, sometimes-null decoder, blind noncharacter detector, U+FDD0-block-missing detector, over-eager noncharacter -detector, cp1252-confused `_mb_chr()`, invalid-accepting `_mb_ord()`) +detector, cp1252-confused `_mb_chr()`, invalid-accepting `_mb_ord()`, +off-by-one code point span, invalid-subpart byte-counted span, and +wrong or stale `found_code_points` span) must all be caught. It also asserts generator determinism, the valid/invalid input mix, and the documented `wp_has_noncharacters()` divergence stance on ill-formed input. For end-to-end pipeline testing while the real implementations are -healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager` +healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale` injects a broken target into worker, replay, and minimize alike. 
Fault-injected artifacts record the fault name in their environment -metadata so they cannot be mistaken for real findings: +metadata so they cannot be mistaken for real findings. Replaying or +minimizing a fault-injected artifact requires setting the same +`ENCODING_FUZZ_FAULT`; replay without it checks the healthy targets +against the saved input: ```sh ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/runner.php --lanes 2 --duration-seconds 5 diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php index 6b8bb94b07cae..061a9b0f74f56 100644 --- a/tools/encoding-fuzz/lib/Checks.php +++ b/tools/encoding-fuzz/lib/Checks.php @@ -16,6 +16,10 @@ * - scrub is idempotent * - `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed * text (each maximal subpart counts as one code point) + * - `_wp_utf8_codepoint_span()` reports the original byte span for a + * requested number of code points, with invalid maximal subparts + * counted as one code point and `found_code_points` reporting the + * available/requested count * - scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points` * chunks reconstructs the same scrubbed text and always makes * forward progress @@ -281,23 +285,28 @@ public function run( string $input ): array { ); } - // 7. Chunked scan reconstruction. + // 7. Code point span agrees with valid-text and maximal-subpart references. + foreach ( $this->check_codepoint_span( $input, $ref_scrub ) as $failure ) { + $failures[] = $failure; + } + + // 8. Chunked scan reconstruction. $chunk_failure = $this->check_chunked_scan( $input, $ref_scrub ); if ( null !== $chunk_failure ) { $failures[] = $chunk_failure; } - // 8. Legacy utf8_encode()/utf8_decode() fallback differentials. + // 9. Legacy utf8_encode()/utf8_decode() fallback differentials. foreach ( $this->check_utf8_encode_decode( $input, $ref_valid, $mb_validity ) as $failure ) { $failures[] = $failure; } - // 9. Noncharacter detection, on valid input only. 
+ // 10. Noncharacter detection, on valid input only. foreach ( $this->check_noncharacters( $input, $ref_valid ) as $failure ) { $failures[] = $failure; } - // 10. mb_chr()/mb_ord() polyfill differentials and isomorphisms. + // 11. mb_chr()/mb_ord() polyfill differentials and isomorphisms. foreach ( $this->check_mb_chr_ord( $input ) as $failure ) { $failures[] = $failure; } @@ -305,6 +314,165 @@ public function run( string $input ): array { return $failures; } + /** + * Tests `_wp_utf8_codepoint_span()` from known boundaries only. + * + * Starts inside a continuation byte or inside an invalid maximal subpart + * are deliberately outside this property: `_mb_substr()` reaches this + * helper by first computing a boundary with the same maximal-subpart model. + * + * @return array + */ + private function check_codepoint_span( string $input, string $ref_scrub ): array { + if ( ! isset( $this->targets['codepoint_span'] ) ) { + return array(); + } + + $failures = array(); + + list( $offsets, $reference_scrub ) = self::reference_utf8_offsets_and_scrub( $input ); + if ( $reference_scrub !== $ref_scrub ) { + return array( + self::failure( + 'span-reference-disagreement', + 'maximal-subpart-reference', + self::diff_detail( 'maximal-subpart-reference', $ref_scrub, $reference_scrub ) + ), + ); + } + + $segment_count = count( $offsets ) - 1; + foreach ( self::span_probe_indices( $segment_count, $input ) as $segment_index ) { + $byte_offset = $offsets[ $segment_index ]; + $available = $segment_count - $segment_index; + + foreach ( self::span_probe_counts( $available, $input . 
":{$segment_index}" ) as $max_code_points ) { + $expected_found = min( $max_code_points, $available ); + $expected_span = $offsets[ $segment_index + $expected_found ] - $byte_offset; + + $failure = $this->assert_codepoint_span( + $input, + $byte_offset, + $max_code_points, + $expected_span, + $expected_found, + 'arbitrary-boundary' + ); + + if ( null !== $failure ) { + $failures[] = $failure; + } + } + } + + $scrubbed_code_points = mb_strlen( $ref_scrub, 'UTF-8' ); + foreach ( self::span_probe_indices( $scrubbed_code_points, $ref_scrub ) as $start_code_point ) { + $byte_offset = strlen( mb_substr( $ref_scrub, 0, $start_code_point, 'UTF-8' ) ); + $available = $scrubbed_code_points - $start_code_point; + + foreach ( self::span_probe_counts( $available, $ref_scrub . ":scrubbed:{$start_code_point}" ) as $max_code_points ) { + $expected_found = min( $max_code_points, $available ); + $expected_span = strlen( mb_substr( $ref_scrub, $start_code_point, $max_code_points, 'UTF-8' ) ); + + $failure = $this->assert_codepoint_span( + $ref_scrub, + $byte_offset, + $max_code_points, + $expected_span, + $expected_found, + 'scrubbed-mb-substr' + ); + + if ( null !== $failure ) { + $failures[] = $failure; + } + } + } + + return $failures; + } + + private function assert_codepoint_span( string $input, int $byte_offset, int $max_code_points, int $expected_span, int $expected_found, string $property ): ?array { + $found_code_points = -1; + + try { + $actual_span = ( $this->targets['codepoint_span'] )( $input, $byte_offset, $max_code_points, $found_code_points ); + } catch ( \Throwable $error ) { + return self::failure( + 'target-exception', + 'codepoint_span', + array( + 'target' => 'codepoint_span', + 'property' => $property, + 'byte_offset' => $byte_offset, + 'max_code_points' => $max_code_points, + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + } + + if ( ! 
is_int( $actual_span ) ) { + return self::failure( + 'codepoint-span-bad-return', + 'codepoint_span', + array( + 'property' => $property, + 'byte_offset' => $byte_offset, + 'max_code_points' => $max_code_points, + 'type' => get_debug_type( $actual_span ), + ) + ); + } + + if ( ! is_int( $found_code_points ) ) { + return self::failure( + 'codepoint-span-found-bad-return', + 'codepoint_span', + array( + 'property' => $property, + 'byte_offset' => $byte_offset, + 'max_code_points' => $max_code_points, + 'type' => get_debug_type( $found_code_points ), + ) + ); + } + + if ( $actual_span !== $expected_span ) { + return self::failure( + 'codepoint-span-mismatch', + 'codepoint_span', + array( + 'property' => $property, + 'byte_offset' => $byte_offset, + 'max_code_points' => $max_code_points, + 'got' => $actual_span, + 'expected' => $expected_span, + 'found_code_points' => $found_code_points, + 'input_preview' => self::preview( $input, $byte_offset ), + ) + ); + } + + if ( $found_code_points !== $expected_found ) { + return self::failure( + 'codepoint-span-found-mismatch', + 'codepoint_span', + array( + 'property' => $property, + 'byte_offset' => $byte_offset, + 'max_code_points' => $max_code_points, + 'got' => $found_code_points, + 'expected' => $expected_found, + 'span' => $actual_span, + 'input_preview' => self::preview( $input, $byte_offset ), + ) + ); + } + + return null; + } + /** * Tests `_mb_chr()` and `_mb_ord()` as partial inverses. The oracle for * `_mb_chr()` is the fuzzer's arithmetic UTF-8 encoder; the oracle for @@ -857,6 +1025,149 @@ private static function mb_chr_code_point_probes( string $input ): array { return array_values( array_unique( $probes ) ); } + /** + * Builds a small boundary table from an independent UTF-8 maximal-subpart + * parser. `$offsets[$i]` is the byte offset before logical code point `$i`. + * + * @return array{0: int[], 1: string} Boundary offsets and scrubbed text. 
+ */ + private static function reference_utf8_offsets_and_scrub( string $bytes ): array { + $length = strlen( $bytes ); + $offsets = array( 0 ); + $scrub = ''; + $at = 0; + + while ( $at < $length ) { + list( $segment_length, $valid ) = self::reference_utf8_segment( $bytes, $at ); + $scrub .= $valid ? substr( $bytes, $at, $segment_length ) : "\u{FFFD}"; + $at += $segment_length; + $offsets[] = $at; + } + + return array( $offsets, $scrub ); + } + + /** + * @return array{0: int, 1: bool} Byte length and whether the segment is well-formed. + */ + private static function reference_utf8_segment( string $bytes, int $at ): array { + $remaining = strlen( $bytes ) - $at; + $b1 = ord( $bytes[ $at ] ); + + if ( $b1 <= 0x7F ) { + return array( 1, true ); + } + + $b2 = $remaining >= 2 ? ord( $bytes[ $at + 1 ] ) : null; + if ( $b1 >= 0xC2 && $b1 <= 0xDF ) { + return self::is_continuation( $b2 ) ? array( 2, true ) : array( 1, false ); + } + + $b3 = $remaining >= 3 ? ord( $bytes[ $at + 2 ] ) : null; + if ( + self::is_continuation( $b3 ) && + ( + ( 0xE0 === $b1 && null !== $b2 && $b2 >= 0xA0 && $b2 <= 0xBF ) || + ( $b1 >= 0xE1 && $b1 <= 0xEC && self::is_continuation( $b2 ) ) || + ( 0xED === $b1 && null !== $b2 && $b2 >= 0x80 && $b2 <= 0x9F ) || + ( $b1 >= 0xEE && $b1 <= 0xEF && self::is_continuation( $b2 ) ) + ) + ) { + return array( 3, true ); + } + + $b4 = $remaining >= 4 ? 
ord( $bytes[ $at + 3 ] ) : null; + if ( + self::is_continuation( $b3 ) && + self::is_continuation( $b4 ) && + ( + ( 0xF0 === $b1 && null !== $b2 && $b2 >= 0x90 && $b2 <= 0xBF ) || + ( $b1 >= 0xF1 && $b1 <= 0xF3 && self::is_continuation( $b2 ) ) || + ( 0xF4 === $b1 && null !== $b2 && $b2 >= 0x80 && $b2 <= 0x8F ) + ) + ) { + return array( 4, true ); + } + + if ( $b1 >= 0xE0 && $b1 <= 0xEF && self::is_valid_three_byte_second( $b1, $b2 ) ) { + return array( min( $remaining, 2 ), false ); + } + + if ( $b1 >= 0xF0 && $b1 <= 0xF4 && self::is_valid_four_byte_second( $b1, $b2 ) ) { + return array( min( $remaining, self::is_continuation( $b3 ) ? 3 : 2 ), false ); + } + + return array( 1, false ); + } + + private static function is_continuation( ?int $byte ): bool { + return null !== $byte && $byte >= 0x80 && $byte <= 0xBF; + } + + private static function is_valid_three_byte_second( int $b1, ?int $b2 ): bool { + return ( + ( 0xE0 === $b1 && null !== $b2 && $b2 >= 0xA0 && $b2 <= 0xBF ) || + ( $b1 >= 0xE1 && $b1 <= 0xEC && self::is_continuation( $b2 ) ) || + ( 0xED === $b1 && null !== $b2 && $b2 >= 0x80 && $b2 <= 0x9F ) || + ( $b1 >= 0xEE && $b1 <= 0xEF && self::is_continuation( $b2 ) ) + ); + } + + private static function is_valid_four_byte_second( int $b1, ?int $b2 ): bool { + return ( + ( 0xF0 === $b1 && null !== $b2 && $b2 >= 0x90 && $b2 <= 0xBF ) || + ( $b1 >= 0xF1 && $b1 <= 0xF3 && self::is_continuation( $b2 ) ) || + ( 0xF4 === $b1 && null !== $b2 && $b2 >= 0x80 && $b2 <= 0x8F ) + ); + } + + /** + * @return int[] Segment/code point indices to use as start boundaries. 
+ */ + private static function span_probe_indices( int $code_points, string $salt ): array { + $indices = array( + 0, + min( 1, $code_points ), + min( 2, $code_points ), + intdiv( $code_points, 2 ), + max( 0, $code_points - 1 ), + $code_points, + ); + + $hash = hash( 'sha256', $salt, true ); + for ( $i = 0; $i < 4; $i++ ) { + $indices[] = ord( $hash[ $i ] ) % ( $code_points + 1 ); + } + + sort( $indices ); + return array_values( array_unique( $indices ) ); + } + + /** + * @return int[] Requested code point counts to probe from a start boundary. + */ + private static function span_probe_counts( int $available, string $salt ): array { + $counts = array( + 0, + 1, + 2, + 3, + min( 7, $available ), + intdiv( $available, 2 ), + max( 0, $available - 1 ), + $available, + $available + 1, + ); + + $hash = hash( 'sha256', $salt, true ); + for ( $i = 0; $i < 4; $i++ ) { + $counts[] = ord( $hash[ $i ] ) % ( $available + 2 ); + } + + sort( $counts ); + return array_values( array_unique( $counts ) ); + } + /** * @return array{0: int|false, 1: int} First code point and byte length. */ diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php index 29d5d415be34a..7f5b1e27c821a 100644 --- a/tools/encoding-fuzz/lib/Targets.php +++ b/tools/encoding-fuzz/lib/Targets.php @@ -15,6 +15,10 @@ * ENCODING_FUZZ_FAULT=decode-per-byte decoder emits '?' 
per invalid byte * ENCODING_FUZZ_FAULT=nonchars-miss-fdd0 fallback detector misses U+FDD0–U+FDEF * ENCODING_FUZZ_FAULT=nonchars-overeager public detector also flags U+FDCF + * ENCODING_FUZZ_FAULT=span-off-by-one code point span reports one extra byte + * ENCODING_FUZZ_FAULT=span-invalid-bytes code point span counts invalid bytes individually + * ENCODING_FUZZ_FAULT=span-found-max code point span over-reports found_code_points + * ENCODING_FUZZ_FAULT=span-found-stale code point span leaves found_code_points stale */ class Targets { /** @@ -33,6 +37,7 @@ public static function resolve(): array { 'has_nonchars_fb' => '_wp_has_noncharacters_fallback', 'mb_chr' => '_mb_chr', 'mb_ord' => '_mb_ord', + 'codepoint_span' => '_wp_utf8_codepoint_span', ); switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) { @@ -67,6 +72,22 @@ public static function resolve(): array { case 'nonchars-overeager': $targets['has_nonchars'] = self::nonchars_overeager( ... ); break; + + case 'span-off-by-one': + $targets['codepoint_span'] = self::codepoint_span_off_by_one( ... ); + break; + + case 'span-invalid-bytes': + $targets['codepoint_span'] = self::codepoint_span_counts_invalid_bytes( ... ); + break; + + case 'span-found-max': + $targets['codepoint_span'] = self::codepoint_span_found_max( ... ); + break; + + case 'span-found-stale': + $targets['codepoint_span'] = self::codepoint_span_stale_empty_found( ... ); + break; } return $targets; @@ -116,4 +137,65 @@ public static function decode_per_invalid_byte( string $bytes ): string { return $out; } + + /** + * Deliberately broken span finder: reports the correct found count but + * includes one extra byte whenever a non-empty span was found. + */ + public static function codepoint_span_off_by_one( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int { + $span = _wp_utf8_codepoint_span( $text, $byte_offset, $max_code_points, $found_code_points ); + return $span > 0 ? 
$span + 1 : $span; + } + + /** + * Deliberately broken span finder: treats each byte of an invalid maximal + * subpart as its own code point, so a two-byte truncated sequence can be + * split in half. + */ + public static function codepoint_span_counts_invalid_bytes( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int { + $was_at = $byte_offset; + $invalid_length = 0; + $end = strlen( $text ); + $found_code_points = 0; + + while ( $byte_offset < $end && $found_code_points < $max_code_points ) { + $needed = $max_code_points - $found_code_points; + $chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed ); + + $found_code_points += $chunk_count; + + if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) { + $bytes_to_take = min( $invalid_length, $max_code_points - $found_code_points ); + $found_code_points += $bytes_to_take; + $byte_offset += $bytes_to_take; + } + } + + return $byte_offset - $was_at; + } + + /** + * Deliberately broken span finder: returns the right byte span but always + * claims it found the requested number of code points. + */ + public static function codepoint_span_found_max( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int { + $span = _wp_utf8_codepoint_span( $text, $byte_offset, $max_code_points, $found_code_points ); + $found_code_points = $max_code_points; + return $span; + } + + /** + * Deliberately broken span finder: leaves the caller's by-reference + * value untouched whenever no bytes are spanned. 
+ */ + public static function codepoint_span_stale_empty_found( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int { + $previous = $found_code_points; + $span = _wp_utf8_codepoint_span( $text, $byte_offset, $max_code_points, $found_code_points ); + + if ( 0 === $span ) { + $found_code_points = $previous; + } + + return $span; + } } diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php index 9cd6e1374ae61..883f88497e19a 100644 --- a/tools/encoding-fuzz/tests/harness-smoke.php +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -106,6 +106,7 @@ function check( string $label, bool $ok, string $detail = '' ): void { 'has_nonchars_fb' => '_wp_has_noncharacters_fallback', 'mb_chr' => '_mb_chr', 'mb_ord' => '_mb_ord', + 'codepoint_span' => '_wp_utf8_codepoint_span', ); /** @@ -256,6 +257,32 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr ) ); check( 'catches invalid-accepting _mb_ord', in_array( 'mb-ord-mismatch', $seen, true ), implode( ',', $seen ) ); +// 3t. Code point span that reports one extra byte. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'codepoint_span' => Targets::codepoint_span_off_by_one( ... ), +) ); +check( 'catches off-by-one code point span', in_array( 'codepoint-span-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3u. Code point span that treats invalid maximal subparts as one code +// point per byte instead of one code point per maximal subpart. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'codepoint_span' => Targets::codepoint_span_counts_invalid_bytes( ... ), +) ); +check( 'catches byte-counted invalid code point span', in_array( 'codepoint-span-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3v. Code point span that returns the right byte span but corrupts the +// by-reference found count. 
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'codepoint_span' => Targets::codepoint_span_found_max( ... ), +) ); +check( 'catches wrong code point span found count', in_array( 'codepoint-span-found-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3w. Code point span that leaves found_code_points stale on empty spans. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'codepoint_span' => Targets::codepoint_span_stale_empty_found( ... ), +) ); +check( 'catches stale empty code point span found count', in_array( 'codepoint-span-found-mismatch', $seen, true ), implode( ',', $seen ) ); + // --------------------------------------------------------------------- // 4. Generator determinism and mix. // --------------------------------------------------------------------- From 1f875a1f2181fca08a770e3e088a465a9eda363c Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 14:22:08 +0200 Subject: [PATCH 08/14] Add mb substr fuzzing --- progress-handoff-xZOoEn.md | 24 +++ tools/encoding-fuzz/README.md | 18 +- tools/encoding-fuzz/lib/Bootstrap.php | 4 +- tools/encoding-fuzz/lib/Checks.php | 222 +++++++++++++++++++- tools/encoding-fuzz/lib/Targets.php | 55 +++++ tools/encoding-fuzz/lib/wp-stubs.php | 6 + tools/encoding-fuzz/tests/harness-smoke.php | 25 +++ 7 files changed, 346 insertions(+), 8 deletions(-) diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md index 4ace0ce8d0896..4d5fe5e56511b 100644 --- a/progress-handoff-xZOoEn.md +++ b/progress-handoff-xZOoEn.md @@ -31,3 +31,27 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn - Reviewer 2: satisfied after checking mutation adequacy, replay/minimize fault behavior, and the README clarification. - Reviewer 3: initially found the stale `found_code_points` gap; satisfied after the sentinel and stale-count mutation were added. - Commit: this step commit. 
+ +### Step 2: `_mb_substr()` property coverage + +- Status: done; included in the step 2 commit. +- Prior step commit: `6ea247f9da`. +- Scope: + - Load `_mb_substr()` and its `_is_utf8_charset()` dependency into the fuzzer bootstrap. + - Add UTF-8 substring properties over valid and arbitrary input. + - Pin current invalid-input semantics: invalid maximal subparts count as one code point, but the returned substring preserves the original bytes rather than returning scrubbed text. + - Add explicit non-UTF-8 encoding fallback checks against byte-level `substr()`. + - Add mutation tests for byte-offset slicing, scrubbed-input slicing, negative length handling, and non-UTF-8 fallback drift. +- Verification: + - `php -l tools/encoding-fuzz/lib/Checks.php` + - `php -l tools/encoding-fuzz/lib/Targets.php` + - `php -l tools/encoding-fuzz/lib/Bootstrap.php` + - `php -l tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none` + - `git diff --check` +- Review gate: satisfied by 3 adversarial reviewers. + - Reviewer 1: satisfied after checking invalid-input expected substrings, negative start/length semantics, and valid native `mb_substr()` comparison. + - Reviewer 2: satisfied after checking mutation adequacy and faulted worker/replay/minimize behavior. + - Reviewer 3: satisfied after checking bootstrap/stub wiring, edge coverage, performance, and docs/progress accuracy. +- Commit: this step commit. 
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index f035e7147dbcf..9f65ee966eaa9 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -7,13 +7,15 @@ Differential fuzzer for the WordPress UTF-8 functions: - `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()` - `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` (valid input only) - `_mb_chr()` / `_mb_ord()` +- `_mb_substr()` - `_wp_utf8_codepoint_count()`, `_wp_utf8_codepoint_span()`, and the resumable `_wp_scan_utf8()` paths (secondary) The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main fuzz surface; the mbstring-backed public functions are checked alongside -them. Only `compat-utf8.php` and `utf8.php` are loaded — no WordPress -bootstrap, database, or `wp-env`. +them. The harness loads `compat-utf8.php`, `utf8.php`, and selected private +UTF-8 helpers extracted from `compat.php` — no WordPress bootstrap, +database, or `wp-env`. ## Oracles @@ -127,6 +129,10 @@ Internal invariants: - `_mb_ord( _mb_chr( $cp ) ) === $cp` for valid scalar values, and `_mb_chr( _mb_ord( $s ) )` reconstructs the first UTF-8 character in `$s` when it is well-formed +- `_mb_substr()` in UTF-8 mode preserves original bytes while treating each + invalid maximal subpart as one code point; on valid input it also agrees + with native `mb_substr()`, and explicit non-UTF-8 encodings fall back to + byte-level `substr()` semantics ## Inputs @@ -200,7 +206,7 @@ php tools/encoding-fuzz/tests/harness-smoke.php ``` Verifies the oracle battery, runs the real targets over the battery -vectors, and — most importantly — mutation-tests the harness: twenty-three +vectors, and — most importantly — mutation-tests the harness: twenty-seven classes of deliberately broken implementations (validator accepting 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber, identity scrubber, byte-dropping scrubber, off-by-one code point count, @@ -210,13 +216,15 
@@ null-returning encoder, sometimes-null decoder, blind noncharacter detector, U+FDD0-block-missing detector, over-eager noncharacter detector, cp1252-confused `_mb_chr()`, invalid-accepting `_mb_ord()`, off-by-one code point span, invalid-subpart byte-counted span, and -wrong or stale `found_code_points` span) +wrong or stale `found_code_points` span, byte-offset `_mb_substr()`, +scrubbed-input `_mb_substr()`, negative-length `_mb_substr()`, and +non-UTF-8 fallback drift) must all be caught. It also asserts generator determinism, the valid/invalid input mix, and the documented `wp_has_noncharacters()` divergence stance on ill-formed input. For end-to-end pipeline testing while the real implementations are -healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale` +healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8` injects a broken target into worker, replay, and minimize alike. Fault-injected artifacts record the fault name in their environment metadata so they cannot be mistaken for real findings. Replaying or diff --git a/tools/encoding-fuzz/lib/Bootstrap.php b/tools/encoding-fuzz/lib/Bootstrap.php index be54f7aa5ba4f..de065a39d1ebc 100644 --- a/tools/encoding-fuzz/lib/Bootstrap.php +++ b/tools/encoding-fuzz/lib/Bootstrap.php @@ -4,7 +4,7 @@ /** * Loads the WordPress UTF-8 functions under test into a bare PHP process. * - * Only the UTF-8 files under test are loaded. `_mb_chr()` and `_mb_ord()` + * Only the UTF-8 files under test are loaded. A few private UTF-8 helpers * live in `compat.php`, so their function bodies are extracted from that * source file without loading the rest of WordPress compatibility glue. 
*/ @@ -21,7 +21,7 @@ public static function load_targets(): void { $root = self::repo_root(); require_once __DIR__ . '/wp-stubs.php'; require_once $root . '/src/wp-includes/compat-utf8.php'; - self::load_compat_functions( $root . '/src/wp-includes/compat.php', array( '_mb_chr', '_mb_ord' ) ); + self::load_compat_functions( $root . '/src/wp-includes/compat.php', array( '_is_utf8_charset', '_mb_chr', '_mb_ord', '_mb_substr' ) ); require_once $root . '/src/wp-includes/utf8.php'; /* diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php index 061a9b0f74f56..387f871f8d796 100644 --- a/tools/encoding-fuzz/lib/Checks.php +++ b/tools/encoding-fuzz/lib/Checks.php @@ -48,6 +48,10 @@ * - `_mb_ord( _mb_chr( cp ) ) === cp` and * `_mb_chr( _mb_ord( s ) ) === first UTF-8 character in s` where * those expressions are defined. + * - `_mb_substr()` preserves original bytes while using UTF-8 + * code-point/maximal-subpart offsets; on valid input it agrees with + * native `mb_substr()`, and for non-UTF-8 encodings it agrees with + * byte-level `substr()`. * * Target callables are injectable so the harness smoke test can verify * that deliberately broken implementations are caught. @@ -306,7 +310,12 @@ public function run( string $input ): array { $failures[] = $failure; } - // 11. mb_chr()/mb_ord() polyfill differentials and isomorphisms. + // 11. _mb_substr() UTF-8 and byte-fallback properties. + foreach ( $this->check_mb_substr( $input, $ref_valid, $ref_scrub ) as $failure ) { + $failures[] = $failure; + } + + // 12. mb_chr()/mb_ord() polyfill differentials and isomorphisms. 
foreach ( $this->check_mb_chr_ord( $input ) as $failure ) { $failures[] = $failure; } @@ -473,6 +482,145 @@ private function assert_codepoint_span( string $input, int $byte_offset, int $ma return null; } + /** + * Tests `_mb_substr()` against the semantics currently implemented by + * `compat.php`: UTF-8 mode computes character offsets by treating each + * invalid maximal subpart as one code point, then returns the original + * bytes in the selected range. It does not slice scrubbed text. + * + * @return array + */ + private function check_mb_substr( string $input, bool $ref_valid, string $ref_scrub ): array { + if ( ! isset( $this->targets['mb_substr'] ) ) { + return array(); + } + + $failures = array(); + + list( $offsets, $reference_scrub ) = self::reference_utf8_offsets_and_scrub( $input ); + if ( $reference_scrub !== $ref_scrub ) { + return array( + self::failure( + 'substr-reference-disagreement', + 'maximal-subpart-reference', + self::diff_detail( 'maximal-subpart-reference', $ref_scrub, $reference_scrub ) + ), + ); + } + + $code_points = count( $offsets ) - 1; + $encodings = array( 'UTF-8', 'utf8', null ); + foreach ( self::mb_substr_probes( $code_points, $input . ':utf8' ) as $i => $probe ) { + list( $start, $length ) = $probe; + $encoding = $encodings[ $i % count( $encodings ) ]; + $expected = self::expected_mb_substr_from_offsets( $input, $offsets, $start, $length ); + + $failure = $this->assert_mb_substr( + $input, + $start, + $length, + $encoding, + $expected, + 'utf8-maximal-subpart' + ); + + if ( null !== $failure ) { + $failures[] = $failure; + } + } + + if ( $ref_valid && function_exists( 'mb_substr' ) ) { + foreach ( self::mb_substr_probes( $code_points, $input . 
':native' ) as $probe ) { + list( $start, $length ) = $probe; + $expected = mb_substr( $input, $start, $length, 'UTF-8' ); + + $failure = $this->assert_mb_substr( + $input, + $start, + $length, + 'UTF-8', + $expected, + 'valid-native-mb-substr' + ); + + if ( null !== $failure ) { + $failures[] = $failure; + } + } + } + + $byte_encodings = array( 'ISO-8859-1', 'latin1', 'Windows-1252', 'UTF 8' ); + foreach ( array_slice( self::mb_substr_probes( strlen( $input ), $input . ':bytes' ), 0, 18 ) as $i => $probe ) { + list( $start, $length ) = $probe; + $encoding = $byte_encodings[ $i % count( $byte_encodings ) ]; + $expected = is_null( $length ) ? substr( $input, $start ) : substr( $input, $start, $length ); + + $failure = $this->assert_mb_substr( + $input, + $start, + $length, + $encoding, + $expected, + 'non-utf8-byte-substr' + ); + + if ( null !== $failure ) { + $failures[] = $failure; + } + } + + return $failures; + } + + private function assert_mb_substr( string $input, int $start, ?int $length, ?string $encoding, string $expected, string $property ): ?array { + try { + $actual = ( $this->targets['mb_substr'] )( $input, $start, $length, $encoding ); + } catch ( \Throwable $error ) { + return self::failure( + 'target-exception', + 'mb_substr', + array( + 'target' => 'mb_substr', + 'property' => $property, + 'start' => $start, + 'length' => $length, + 'encoding' => $encoding, + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + } + + if ( ! 
is_string( $actual ) ) { + return self::failure( + 'mb-substr-bad-return', + 'mb_substr', + array( + 'property' => $property, + 'start' => $start, + 'length' => $length, + 'encoding' => $encoding, + 'type' => get_debug_type( $actual ), + ) + ); + } + + if ( $actual !== $expected ) { + return self::failure( + 'mb-substr-mismatch', + 'mb_substr', + array( + 'property' => $property, + 'start' => $start, + 'length' => $length, + 'encoding' => $encoding, + ) + self::diff_detail( 'mb_substr', $expected, $actual ) + ); + } + + return null; + } + /** * Tests `_mb_chr()` and `_mb_ord()` as partial inverses. The oracle for * `_mb_chr()` is the fuzzer's arithmetic UTF-8 encoder; the oracle for @@ -1168,6 +1316,78 @@ private static function span_probe_counts( int $available, string $salt ): array return array_values( array_unique( $counts ) ); } + /** + * @return array Start/length probes. + */ + private static function mb_substr_probes( int $code_points, string $salt ): array { + $mid = intdiv( $code_points, 2 ); + $last = max( 0, $code_points - 1 ); + + $probes = array( + array( 0, null ), + array( 0, 0 ), + array( 0, 1 ), + array( 1, null ), + array( 1, 1 ), + array( 2, 3 ), + array( $mid, 1 ), + array( $last, 1 ), + array( $code_points, 1 ), + array( $code_points + 1, 1 ), + array( -1, null ), + array( -1, 1 ), + array( -2, 1 ), + array( -$code_points, 2 ), + array( -( $code_points + 1 ), 2 ), + array( 0, -1 ), + array( 1, -1 ), + array( $mid, -1 ), + array( -2, -1 ), + array( 0, -$code_points ), + array( 1, -( $code_points + 1 ) ), + ); + + $range = max( 3, $code_points + 3 ); + $hash = hash( 'sha256', $salt, true ); + for ( $i = 0; $i < 4; $i++ ) { + $start = ( ord( $hash[ $i ] ) % ( ( 2 * $range ) + 1 ) ) - $range; + $length = ( ord( $hash[ $i + 4 ] ) % ( ( 2 * $range ) + 2 ) ) - $range; + if ( 0 === ord( $hash[ $i + 8 ] ) % 5 ) { + $length = null; + } + + $probes[] = array( $start, $length ); + } + + $unique = array(); + foreach ( $probes as $probe ) { + $unique[ 
json_encode( $probe ) ] = $probe; + } + + return array_values( $unique ); + } + + /** + * @param int[] $offsets Boundary offsets from `reference_utf8_offsets_and_scrub()`. + */ + private static function expected_mb_substr_from_offsets( string $input, array $offsets, int $start, ?int $length ): string { + $total = count( $offsets ) - 1; + $normalized_start = $start < 0 ? max( 0, $total + $start ) : $start; + $start_index = min( $normalized_start, $total ); + $start_offset = $offsets[ $start_index ]; + + if ( null === $length ) { + return substr( $input, $start_offset ); + } + + $normalized_length = $length < 0 + ? max( 0, $total - $normalized_start + $length ) + : $length; + $end_index = min( $start_index + $normalized_length, $total ); + + return substr( $input, $start_offset, $offsets[ $end_index ] - $start_offset ); + } + /** * @return array{0: int|false, 1: int} First code point and byte length. */ diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php index 7f5b1e27c821a..80a4cfcfa3800 100644 --- a/tools/encoding-fuzz/lib/Targets.php +++ b/tools/encoding-fuzz/lib/Targets.php @@ -19,6 +19,10 @@ * ENCODING_FUZZ_FAULT=span-invalid-bytes code point span counts invalid bytes individually * ENCODING_FUZZ_FAULT=span-found-max code point span over-reports found_code_points * ENCODING_FUZZ_FAULT=span-found-stale code point span leaves found_code_points stale + * ENCODING_FUZZ_FAULT=substr-byte-level substr treats UTF-8 offsets as byte offsets + * ENCODING_FUZZ_FAULT=substr-scrub substr slices scrubbed invalid input + * ENCODING_FUZZ_FAULT=substr-no-neg-len substr ignores negative lengths + * ENCODING_FUZZ_FAULT=substr-force-utf8 substr ignores non-UTF-8 byte fallback */ class Targets { /** @@ -38,6 +42,7 @@ public static function resolve(): array { 'mb_chr' => '_mb_chr', 'mb_ord' => '_mb_ord', 'codepoint_span' => '_wp_utf8_codepoint_span', + 'mb_substr' => '_mb_substr', ); switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) { @@ -88,6 +93,22 @@ 
public static function resolve(): array { case 'span-found-stale': $targets['codepoint_span'] = self::codepoint_span_stale_empty_found( ... ); break; + + case 'substr-byte-level': + $targets['mb_substr'] = self::mb_substr_byte_level( ... ); + break; + + case 'substr-scrub': + $targets['mb_substr'] = self::mb_substr_scrub_invalid( ... ); + break; + + case 'substr-no-neg-len': + $targets['mb_substr'] = self::mb_substr_no_negative_length( ... ); + break; + + case 'substr-force-utf8': + $targets['mb_substr'] = self::mb_substr_force_utf8( ... ); + break; } return $targets; @@ -198,4 +219,38 @@ public static function codepoint_span_stale_empty_found( string $text, int $byte return $span; } + + /** + * Deliberately broken substring: treats character offsets as byte offsets. + */ + public static function mb_substr_byte_level( $str, $start, $length = null, $encoding = null ) { + return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length ); + } + + /** + * Deliberately broken substring: slices scrubbed UTF-8, masking that + * `_mb_substr()` is expected to preserve original invalid bytes. + */ + public static function mb_substr_scrub_invalid( $str, $start, $length = null, $encoding = null ) { + if ( _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) ) { + $str = wp_scrub_utf8( $str ); + } + + return _mb_substr( $str, $start, $length, $encoding ); + } + + /** + * Deliberately broken substring: handles negative lengths as "to the end". + */ + public static function mb_substr_no_negative_length( $str, $start, $length = null, $encoding = null ) { + return _mb_substr( $str, $start, is_int( $length ) && $length < 0 ? null : $length, $encoding ); + } + + /** + * Deliberately broken substring: runs the UTF-8 path even for explicit + * non-UTF-8 encodings, instead of falling back to byte-level `substr()`. 
+ */ + public static function mb_substr_force_utf8( $str, $start, $length = null, $encoding = null ) { + return _mb_substr( $str, $start, $length, _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) ? $encoding : 'UTF-8' ); + } } diff --git a/tools/encoding-fuzz/lib/wp-stubs.php b/tools/encoding-fuzz/lib/wp-stubs.php index ffe4cbc64a191..f86bd4b367332 100644 --- a/tools/encoding-fuzz/lib/wp-stubs.php +++ b/tools/encoding-fuzz/lib/wp-stubs.php @@ -14,3 +14,9 @@ function _wp_can_use_pcre_u( $set = null ): bool { return (bool) $utf8_pcre; } } + +if ( ! function_exists( 'get_option' ) ) { + function get_option( $option, $default_value = false ) { + return 'blog_charset' === $option ? 'UTF-8' : $default_value; + } +} diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php index 883f88497e19a..4977d1b5679e1 100644 --- a/tools/encoding-fuzz/tests/harness-smoke.php +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -107,6 +107,7 @@ function check( string $label, bool $ok, string $detail = '' ): void { 'mb_chr' => '_mb_chr', 'mb_ord' => '_mb_ord', 'codepoint_span' => '_wp_utf8_codepoint_span', + 'mb_substr' => '_mb_substr', ); /** @@ -283,6 +284,30 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr ) ); check( 'catches stale empty code point span found count', in_array( 'codepoint-span-found-mismatch', $seen, true ), implode( ',', $seen ) ); +// 3x. UTF-8 substring that treats character offsets as byte offsets. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'mb_substr' => Targets::mb_substr_byte_level( ... ), +) ); +check( 'catches byte-offset _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3y. UTF-8 substring that slices scrubbed text, losing original invalid bytes. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'mb_substr' => Targets::mb_substr_scrub_invalid( ... 
), +) ); +check( 'catches scrubbed-input _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3z. UTF-8 substring that ignores negative length semantics. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'mb_substr' => Targets::mb_substr_no_negative_length( ... ), +) ); +check( 'catches negative-length _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3aa. Non-UTF-8 substring must fall back to byte-level substr(). +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'mb_substr' => Targets::mb_substr_force_utf8( ... ), +) ); +check( 'catches non-UTF-8 _mb_substr fallback drift', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) ); + // --------------------------------------------------------------------- // 4. Generator determinism and mix. // --------------------------------------------------------------------- From a0f6820eb1adce18e5a7df56988b119881f44911 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 14:32:40 +0200 Subject: [PATCH 09/14] Add bounded codepoint count fuzzing --- progress-handoff-xZOoEn.md | 22 ++ tools/encoding-fuzz/README.md | 14 +- tools/encoding-fuzz/lib/Checks.php | 226 +++++++++++++++++--- tools/encoding-fuzz/lib/Targets.php | 62 ++++++ tools/encoding-fuzz/tests/harness-smoke.php | 64 ++++-- 5 files changed, 334 insertions(+), 54 deletions(-) diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md index 4d5fe5e56511b..aaf355609eada 100644 --- a/progress-handoff-xZOoEn.md +++ b/progress-handoff-xZOoEn.md @@ -55,3 +55,25 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn - Reviewer 2: satisfied after checking mutation adequacy and faulted worker/replay/minimize behavior. - Reviewer 3: satisfied after checking bootstrap/stub wiring, edge coverage, performance, and docs/progress accuracy. - Commit: this step commit. 
+ +### Step 3: bounded `_wp_utf8_codepoint_count()` coverage + +- Status: done; included in the step 3 commit. +- Prior step commit: `1f875a1f21`. +- Scope: + - Add bounded `_wp_utf8_codepoint_count()` probes for negative offsets, zero lengths, oversized lengths, nonzero byte offsets, and ranges ending before/at/after code point boundaries. + - Pin current byte-window semantics: a range ending inside a valid multibyte character or invalid maximal subpart counts the truncated prefix as one invalid subpart. + - Add mutation tests for invalid-byte counting, range-end off-by-one behavior, and ignored byte offsets. +- Verification: + - `php -l tools/encoding-fuzz/lib/Checks.php` + - `php -l tools/encoding-fuzz/lib/Targets.php` + - `php -l tools/encoding-fuzz/lib/Bootstrap.php` + - `php -l tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none` + - `git diff --check` +- Review gate: satisfied by 3 adversarial reviewers. + - Reviewer 1: satisfied after checking the bounded-window model, negative offsets, truncation semantics, and reference independence. + - Reviewer 2: satisfied after checking the new mutation modes through worker/replay/minimize. + - Reviewer 3: satisfied after checking probe coverage, performance, docs/progress accuracy, and the smoke comment cleanup. +- Commit: this step commit. 
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index 9f65ee966eaa9..606a50ac70152 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -108,8 +108,10 @@ Internal invariants: - valid ⟺ scrub returns the input unchanged - scrub output is always valid UTF-8 - scrub is idempotent -- `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed text - (each maximal subpart counts as one code point) +- `_wp_utf8_codepoint_count()` equals an independent maximal-subpart count + for whole strings and bounded byte windows; a byte window ending inside a + multibyte character or invalid maximal subpart counts its truncated prefix + as one invalid subpart - `_wp_utf8_codepoint_span()` reports the original byte span occupied by a requested number of code points; on scrubbed valid text it matches `strlen( mb_substr( ... ) )`, and on arbitrary input an independent @@ -206,11 +208,13 @@ php tools/encoding-fuzz/tests/harness-smoke.php ``` Verifies the oracle battery, runs the real targets over the battery -vectors, and — most importantly — mutation-tests the harness: twenty-seven +vectors, and — most importantly — mutation-tests the harness: thirty classes of deliberately broken implementations (validator accepting 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber, identity scrubber, byte-dropping scrubber, off-by-one code point count, -throwing target, cp1252-confused encoder, identity encoder, per-byte +invalid-byte-counting code point count, range-end off-by-one code point +count, byte-offset-ignoring code point count, throwing target, +cp1252-confused encoder, identity encoder, per-byte decoder, valid-input-mangling decoder, round-trip-violating decoder, null-returning encoder, sometimes-null decoder, blind noncharacter detector, U+FDD0-block-missing detector, over-eager noncharacter @@ -224,7 +228,7 @@ valid/invalid input mix, and the documented `wp_has_noncharacters()` divergence stance on 
ill-formed input. For end-to-end pipeline testing while the real implementations are -healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8` +healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8|count-invalid-bytes|count-range-minus1|count-ignore-offset` injects a broken target into worker, replay, and minimize alike. Fault-injected artifacts record the fault name in their environment metadata so they cannot be mistaken for real findings. Replaying or diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php index 387f871f8d796..72a9a0995f52c 100644 --- a/tools/encoding-fuzz/lib/Checks.php +++ b/tools/encoding-fuzz/lib/Checks.php @@ -14,8 +14,8 @@ * - valid ⟺ scrub returns the input unchanged * - scrub output is always valid UTF-8 * - scrub is idempotent - * - `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed - * text (each maximal subpart counts as one code point) + * - `_wp_utf8_codepoint_count()` equals the independent maximal-subpart + * count for whole strings and bounded byte windows * - `_wp_utf8_codepoint_span()` reports the original byte span for a * requested number of code points, with invalid maximal subparts * counted as one code point and `found_code_points` reporting the @@ -263,30 +263,9 @@ public function run( string $input ): array { } } - // 6. Code point count agrees with the scrubbed length. 
- try { - $count = ( $this->targets['codepoint_count'] )( $input ); - $expected = mb_strlen( $ref_scrub, 'UTF-8' ); - if ( $count !== $expected ) { - $failures[] = self::failure( - 'codepoint-count-mismatch', - 'codepoint_count', - array( - 'got' => $count, - 'expected' => $expected, - ) - ); - } - } catch ( \Throwable $error ) { - $failures[] = self::failure( - 'target-exception', - 'codepoint_count', - array( - 'target' => 'codepoint_count', - 'message' => $error->getMessage(), - 'class' => get_class( $error ), - ) - ); + // 6. Code point count agrees with whole-string and bounded-window references. + foreach ( $this->check_codepoint_count( $input, $ref_scrub ) as $failure ) { + $failures[] = $failure; } // 7. Code point span agrees with valid-text and maximal-subpart references. @@ -323,6 +302,114 @@ public function run( string $input ): array { return $failures; } + /** + * Tests `_wp_utf8_codepoint_count()` over whole strings and bounded byte + * windows. Byte windows are counted as standalone strings: if a window + * ends inside a valid multibyte character or invalid maximal subpart, + * the truncated prefix counts as one invalid maximal subpart. + * + * @return array + */ + private function check_codepoint_count( string $input, string $ref_scrub ): array { + if ( ! 
isset( $this->targets['codepoint_count'] ) ) { + return array(); + } + + list( $offsets, $reference_scrub ) = self::reference_utf8_offsets_and_scrub( $input ); + if ( $reference_scrub !== $ref_scrub ) { + return array( + self::failure( + 'count-reference-disagreement', + 'maximal-subpart-reference', + self::diff_detail( 'maximal-subpart-reference', $ref_scrub, $reference_scrub ) + ), + ); + } + + $failures = array(); + $whole = $this->assert_codepoint_count( + $input, + 0, + null, + count( $offsets ) - 1, + 'whole-string' + ); + + if ( null !== $whole ) { + $failures[] = $whole; + } + + foreach ( self::codepoint_count_probes( $offsets, strlen( $input ), $input ) as $probe ) { + list( $byte_offset, $max_byte_length ) = $probe; + $expected = self::expected_codepoint_count_window( $input, $byte_offset, $max_byte_length ); + + $failure = $this->assert_codepoint_count( + $input, + $byte_offset, + $max_byte_length, + $expected, + 'bounded-window' + ); + + if ( null !== $failure ) { + $failures[] = $failure; + } + } + + return $failures; + } + + private function assert_codepoint_count( string $input, int $byte_offset, ?int $max_byte_length, int $expected, string $property ): ?array { + try { + $actual = null === $max_byte_length + ? ( $this->targets['codepoint_count'] )( $input ) + : ( $this->targets['codepoint_count'] )( $input, $byte_offset, $max_byte_length ); + } catch ( \Throwable $error ) { + return self::failure( + 'target-exception', + 'codepoint_count', + array( + 'target' => 'codepoint_count', + 'property' => $property, + 'byte_offset' => $byte_offset, + 'max_byte_length' => $max_byte_length, + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + } + + if ( ! 
is_int( $actual ) ) { + return self::failure( + 'codepoint-count-bad-return', + 'codepoint_count', + array( + 'property' => $property, + 'byte_offset' => $byte_offset, + 'max_byte_length' => $max_byte_length, + 'type' => get_debug_type( $actual ), + ) + ); + } + + if ( $actual !== $expected ) { + return self::failure( + 'codepoint-count-mismatch', + 'codepoint_count', + array( + 'property' => $property, + 'byte_offset' => $byte_offset, + 'max_byte_length' => $max_byte_length, + 'got' => $actual, + 'expected' => $expected, + 'input_preview' => self::preview( $input, max( 0, $byte_offset ) ), + ) + ); + } + + return null; + } + /** * Tests `_wp_utf8_codepoint_span()` from known boundaries only. * @@ -1388,6 +1475,93 @@ private static function expected_mb_substr_from_offsets( string $input, array $o return substr( $input, $start_offset, $offsets[ $end_index ] - $start_offset ); } + /** + * @param int[] $offsets Boundary offsets from `reference_utf8_offsets_and_scrub()`. + * @return array Byte offset and max byte length probes. + */ + private static function codepoint_count_probes( array $offsets, int $byte_length, string $salt ): array { + $segment_count = count( $offsets ) - 1; + $probes = array( + array( -1, 0 ), + array( -1, 1 ), + array( -5, 10 ), + array( 0, -1 ), + array( 0, 0 ), + array( 0, $byte_length ), + array( 0, $byte_length + 1 ), + array( $byte_length, 0 ), + array( $byte_length, 1 ), + array( $byte_length + 1, 1 ), + ); + + foreach ( self::span_probe_indices( $segment_count, $salt . 
':boundaries' ) as $segment_index ) { + $byte_offset = $offsets[ $segment_index ]; + $remaining = max( 0, $byte_length - $byte_offset ); + $lengths = array( + 0, + 1, + 2, + 3, + min( 7, $remaining ), + $remaining, + $remaining + 1, + ); + + if ( $segment_index < $segment_count ) { + $next_segment_length = $offsets[ $segment_index + 1 ] - $byte_offset; + $lengths[] = max( 0, $next_segment_length - 1 ); + $lengths[] = $next_segment_length; + $lengths[] = $next_segment_length + 1; + } + + if ( $segment_index + 2 <= $segment_count ) { + $two_segment_length = $offsets[ $segment_index + 2 ] - $byte_offset; + $lengths[] = max( 0, $two_segment_length - 1 ); + $lengths[] = $two_segment_length; + } + + foreach ( $lengths as $length ) { + $probes[] = array( $byte_offset, $length ); + } + } + + foreach ( array( 0, 1, 2, intdiv( $byte_length, 2 ), max( 0, $byte_length - 1 ), $byte_length ) as $byte_offset ) { + $remaining = max( 0, $byte_length - $byte_offset ); + foreach ( array( 0, 1, 2, min( 7, $remaining ), $remaining, $remaining + 1 ) as $length ) { + $probes[] = array( $byte_offset, $length ); + } + } + + $hash = hash( 'sha256', $salt . 
':count', true ); + $range = max( 3, $byte_length + 2 ); + for ( $i = 0; $i < 4; $i++ ) { + $byte_offset = ( ord( $hash[ $i ] ) % ( ( 2 * $range ) + 1 ) ) - $range; + $length = ( ord( $hash[ $i + 4 ] ) % ( $byte_length + 8 ) ) - 2; + $probes[] = array( $byte_offset, $length ); + } + + $unique = array(); + foreach ( $probes as $probe ) { + $unique[ json_encode( $probe ) ] = $probe; + } + + return array_values( $unique ); + } + + private static function expected_codepoint_count_window( string $input, int $byte_offset, int $max_byte_length ): int { + if ( $byte_offset < 0 || $max_byte_length < 0 ) { + return 0; + } + + $window = substr( $input, $byte_offset, $max_byte_length ); + if ( '' === $window ) { + return 0; + } + + list( $offsets ) = self::reference_utf8_offsets_and_scrub( $window ); + return count( $offsets ) - 1; + } + /** * @return array{0: int|false, 1: int} First code point and byte length. */ diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php index 80a4cfcfa3800..8f26dc833a6c0 100644 --- a/tools/encoding-fuzz/lib/Targets.php +++ b/tools/encoding-fuzz/lib/Targets.php @@ -23,6 +23,9 @@ * ENCODING_FUZZ_FAULT=substr-scrub substr slices scrubbed invalid input * ENCODING_FUZZ_FAULT=substr-no-neg-len substr ignores negative lengths * ENCODING_FUZZ_FAULT=substr-force-utf8 substr ignores non-UTF-8 byte fallback + * ENCODING_FUZZ_FAULT=count-invalid-bytes count treats invalid bytes individually + * ENCODING_FUZZ_FAULT=count-range-minus1 count stops one byte early in bounded ranges + * ENCODING_FUZZ_FAULT=count-ignore-offset count ignores the requested byte offset */ class Targets { /** @@ -109,6 +112,18 @@ public static function resolve(): array { case 'substr-force-utf8': $targets['mb_substr'] = self::mb_substr_force_utf8( ... ); break; + + case 'count-invalid-bytes': + $targets['codepoint_count'] = self::codepoint_count_invalid_bytes( ... 
); + break; + + case 'count-range-minus1': + $targets['codepoint_count'] = self::codepoint_count_range_minus_one( ... ); + break; + + case 'count-ignore-offset': + $targets['codepoint_count'] = self::codepoint_count_ignore_offset( ... ); + break; } return $targets; @@ -253,4 +268,51 @@ public static function mb_substr_no_negative_length( $str, $start, $length = nul public static function mb_substr_force_utf8( $str, $start, $length = null, $encoding = null ) { return _mb_substr( $str, $start, $length, _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) ? $encoding : 'UTF-8' ); } + + /** + * Deliberately broken code point counter: treats every byte in an invalid + * maximal subpart as a separate code point. + */ + public static function codepoint_count_invalid_bytes( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int { + $byte_offset = $byte_offset ?? 0; + $max_byte_length = $max_byte_length ?? PHP_INT_MAX; + + if ( $byte_offset < 0 || $max_byte_length < 0 ) { + return 0; + } + + $count = 0; + $at = $byte_offset; + $end = strlen( $text ); + $invalid_length = 0; + $max_byte_length = min( $end - $at, $max_byte_length ); + + while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) { + $count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) ); + $count += $invalid_length; + $at += $invalid_length; + } + + return $count; + } + + /** + * Deliberately broken code point counter: stops one byte early when a + * bounded range is requested. + */ + public static function codepoint_count_range_minus_one( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int { + $max_byte_length = $max_byte_length ?? 
PHP_INT_MAX; + if ( $max_byte_length <= 0 ) { + return _wp_utf8_codepoint_count( $text, $byte_offset, $max_byte_length ); + } + + return _wp_utf8_codepoint_count( $text, $byte_offset, $max_byte_length - 1 ); + } + + /** + * Deliberately broken code point counter: always starts at byte offset 0. + */ + public static function codepoint_count_ignore_offset( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int { + return _wp_utf8_codepoint_count( $text, 0, $max_byte_length ); + } } diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php index 4977d1b5679e1..7e15f51945dab 100644 --- a/tools/encoding-fuzz/tests/harness-smoke.php +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -162,13 +162,31 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr ) ); check( 'catches byte-dropping scrubber', in_array( 'scrub-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3f. Code point counter that counts invalid bytes individually. +// 3f. Code point counter with a simple off-by-one drift on invalid input. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( - 'codepoint_count' => static fn( string $bytes ): int => _wp_utf8_codepoint_count( $bytes ) + ( wp_is_valid_utf8( $bytes ) ? 0 : 1 ), + 'codepoint_count' => static fn( string $bytes, ?int $offset = 0, ?int $length = PHP_INT_MAX ): int => _wp_utf8_codepoint_count( $bytes, $offset, $length ) + ( wp_is_valid_utf8( $bytes ) ? 0 : 1 ), ) ); check( 'catches off-by-one code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3g. Throwing target is reported, not fatal. +// 3g. Code point counter that counts each byte in invalid maximal subparts. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'codepoint_count' => Targets::codepoint_count_invalid_bytes( ... 
), +) ); +check( 'catches invalid-byte-counting code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3h. Bounded counter that stops one byte early at the range end. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'codepoint_count' => Targets::codepoint_count_range_minus_one( ... ), +) ); +check( 'catches range-end off-by-one code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3i. Bounded counter that ignores the byte offset. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'codepoint_count' => Targets::codepoint_count_ignore_offset( ... ), +) ); +check( 'catches byte-offset-ignoring code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3j. Throwing target is reported, not fatal. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'is_valid_fb' => static function ( string $bytes ): bool { throw new \RuntimeException( 'boom' ); @@ -176,13 +194,13 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr ) ); check( 'reports throwing target', in_array( 'target-exception', $seen, true ), implode( ',', $seen ) ); -// 3h. Encoder that confuses ISO-8859-1 with Windows-1252 (0x80 becomes '€'). +// 3k. Encoder that confuses ISO-8859-1 with Windows-1252 (0x80 becomes '€'). $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'utf8_encode_fb' => static fn( string $bytes ): string => str_replace( "\xC2\x80", "\xE2\x82\xAC", _wp_utf8_encode_fallback( $bytes ) ), ) ); check( 'catches cp1252-confused encoder', in_array( 'utf8-encode-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3i. Encoder that passes high bytes through raw (invalid UTF-8 output). +// 3l. Encoder that passes high bytes through raw (invalid UTF-8 output). 
$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'utf8_encode_fb' => static fn( string $bytes ): string => $bytes, ) ); @@ -192,27 +210,27 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr implode( ',', $seen ) ); -// 3j. Decoder that emits one '?' per invalid byte instead of per maximal +// 3m. Decoder that emits one '?' per invalid byte instead of per maximal // subpart (`E2 8C` becomes '??' instead of '?'). $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'utf8_decode_fb' => Targets::decode_per_invalid_byte( ... ), ) ); check( 'catches per-byte decoder', in_array( 'utf8-decode-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3k. Decoder that mangles a mappable code point on fully valid input. +// 3n. Decoder that mangles a mappable code point on fully valid input. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'utf8_decode_fb' => static fn( string $bytes ): string => str_replace( "\xFC", "\xFD", _wp_utf8_decode_fallback( $bytes ) ), ) ); check( 'catches decoder mangling valid input', in_array( 'utf8-decode-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3l. Decoder that drops U+0080 entirely; the encode→decode round trip +// 3o. Decoder that drops U+0080 entirely; the encode→decode round trip // must restore every input byte string exactly. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'utf8_decode_fb' => static fn( string $bytes ): string => str_replace( "\x80", '', _wp_utf8_decode_fallback( $bytes ) ), ) ); check( 'catches round-trip violation', in_array( 'utf8-round-trip-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3m. Encoder that returns null (the fallbacks are untyped, so a broken +// 3p. Encoder that returns null (the fallbacks are untyped, so a broken // variant can return non-strings without throwing); must be reported, // not silently skipped by every encode-side check. 
$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( @@ -220,89 +238,89 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr ) ); check( 'catches null-returning encoder', in_array( 'target-bad-return', $seen, true ), implode( ',', $seen ) ); -// 3n. Decoder that returns null only for some inputs; must be reported +// 3q. Decoder that returns null only for some inputs; must be reported // from both the direct call and the round-trip path without crashing. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'utf8_decode_fb' => static fn( string $bytes ) => str_contains( $bytes, "\x80" ) ? null : _wp_utf8_decode_fallback( $bytes ), ) ); check( 'catches sometimes-null decoder', in_array( 'target-bad-return', $seen, true ), implode( ',', $seen ) ); -// 3o. Noncharacter detector that never finds anything. +// 3r. Noncharacter detector that never finds anything. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'has_nonchars_fb' => static fn( string $text ): bool => false, ) ); check( 'catches blind noncharacter detector', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3p. Detector that misses the contiguous U+FDD0–U+FDEF block (the +// 3s. Detector that misses the contiguous U+FDD0–U+FDEF block (the // plane-final pairs alone are a plausible spec misreading). $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'has_nonchars_fb' => Targets::nonchars_missing_fdd0_block( ... ), ) ); check( 'catches detector missing U+FDD0 block', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3q. Over-eager detector that flags U+FDCF, just below the block. +// 3t. Over-eager detector that flags U+FDCF, just below the block. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'has_nonchars' => Targets::nonchars_overeager( ... 
), ) ); check( 'catches over-eager noncharacter detector', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3r. Character encoder that confuses U+0080 with Windows-1252's euro sign. +// 3u. Character encoder that confuses U+0080 with Windows-1252's euro sign. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'mb_chr' => static fn( int $code_point ) => 0x80 === $code_point ? "\xE2\x82\xAC" : _mb_chr( $code_point ), ) ); check( 'catches cp1252-confused _mb_chr', in_array( 'mb-chr-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3s. Character decoder that accepts an invalid leading C0 byte. +// 3v. Character decoder that accepts an invalid leading C0 byte. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'mb_ord' => static fn( string $bytes ) => str_starts_with( $bytes, "\xC0" ) ? 0 : _mb_ord( $bytes ), ) ); check( 'catches invalid-accepting _mb_ord', in_array( 'mb-ord-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3t. Code point span that reports one extra byte. +// 3w. Code point span that reports one extra byte. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'codepoint_span' => Targets::codepoint_span_off_by_one( ... ), ) ); check( 'catches off-by-one code point span', in_array( 'codepoint-span-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3u. Code point span that treats invalid maximal subparts as one code +// 3x. Code point span that treats invalid maximal subparts as one code // point per byte instead of one code point per maximal subpart. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'codepoint_span' => Targets::codepoint_span_counts_invalid_bytes( ... ), ) ); check( 'catches byte-counted invalid code point span', in_array( 'codepoint-span-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3v. Code point span that returns the right byte span but corrupts the +// 3y. 
Code point span that returns the right byte span but corrupts the // by-reference found count. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'codepoint_span' => Targets::codepoint_span_found_max( ... ), ) ); check( 'catches wrong code point span found count', in_array( 'codepoint-span-found-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3w. Code point span that leaves found_code_points stale on empty spans. +// 3z. Code point span that leaves found_code_points stale on empty spans. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'codepoint_span' => Targets::codepoint_span_stale_empty_found( ... ), ) ); check( 'catches stale empty code point span found count', in_array( 'codepoint-span-found-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3x. UTF-8 substring that treats character offsets as byte offsets. +// 3aa. UTF-8 substring that treats character offsets as byte offsets. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'mb_substr' => Targets::mb_substr_byte_level( ... ), ) ); check( 'catches byte-offset _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3y. UTF-8 substring that slices scrubbed text, losing original invalid bytes. +// 3ab. UTF-8 substring that slices scrubbed text, losing original invalid bytes. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'mb_substr' => Targets::mb_substr_scrub_invalid( ... ), ) ); check( 'catches scrubbed-input _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3z. UTF-8 substring that ignores negative length semantics. +// 3ac. UTF-8 substring that ignores negative length semantics. $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'mb_substr' => Targets::mb_substr_no_negative_length( ... ), ) ); check( 'catches negative-length _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) ); -// 3aa. 
Non-UTF-8 substring must fall back to byte-level substr(). +// 3ad. Non-UTF-8 substring must fall back to byte-level substr(). $seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'mb_substr' => Targets::mb_substr_force_utf8( ... ), ) ); From 1c208acee0af707cee235690158fae1546366772 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 14:50:15 +0200 Subject: [PATCH 10/14] Add bounded UTF-8 scan fuzzing --- progress-handoff-xZOoEn.md | 22 ++ tools/encoding-fuzz/README.md | 13 +- tools/encoding-fuzz/lib/Checks.php | 289 +++++++++++++++++++- tools/encoding-fuzz/lib/Targets.php | 85 ++++++ tools/encoding-fuzz/tests/harness-smoke.php | 65 +++++ 5 files changed, 466 insertions(+), 8 deletions(-) diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md index aaf355609eada..5cb171e6e8a3a 100644 --- a/progress-handoff-xZOoEn.md +++ b/progress-handoff-xZOoEn.md @@ -77,3 +77,25 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn - Reviewer 2: satisfied after checking the new mutation modes through worker/replay/minimize. - Reviewer 3: satisfied after checking probe coverage, performance, docs/progress accuracy, and the smoke comment cleanup. - Commit: this step commit. + +### Step 4: bounded `_wp_scan_utf8()` properties + +- Status: done; included in the step 4 commit. +- Prior step commit: `a0f6820eb1`. +- Scope: + - Add direct `_wp_scan_utf8()` probes for `max_bytes`, `max_code_points`, negative limits, nonzero boundary starts, invalid spans, by-ref noncharacter flag reset, and scanned-region noncharacter reporting. + - Pin current scan semantics: valid multibyte characters that start before the byte limit are scanned whole, while invalid spans are bounded by `max_bytes`. + - Add mutation tests for ignored `max_bytes`, noncharacter leakage from outside the scanned region, missed noncharacters inside the scanned region, ASCII fast-path overrun of `max_code_points`, and stale noncharacter flags. 
+- Verification: + - `php -l tools/encoding-fuzz/lib/Checks.php` + - `php -l tools/encoding-fuzz/lib/Targets.php` + - `php -l tools/encoding-fuzz/lib/Bootstrap.php` + - `php -l tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none` + - `git diff --check` +- Review gate: satisfied by 3 adversarial reviewers. + - Reviewer 1: initially found stale `has_noncharacters` and negative-bound gaps; satisfied after adding stale-true probes and negative limit probes. + - Reviewer 2: initially found missing false-negative noncharacter mutation coverage; satisfied after adding `scan-miss-nonchars` and selector wiring checks. + - Reviewer 3: satisfied after checking probe volume, performance, README mutation count/list, fault list, and progress ordering. +- Commit: this step commit. diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index 606a50ac70152..a8ec642a5258f 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -118,6 +118,10 @@ Internal invariants: maximal-subpart parser checks that invalid subparts count as one code point. Nonzero starts are probed only at known code point or maximal-subpart boundaries. 
+- bounded `_wp_scan_utf8()` calls agree with an independent scan model for + `max_bytes`, `max_code_points`, negative limits, nonzero boundary starts, + invalid spans, forward progress, by-ref noncharacter flag reset, and + scanned-region noncharacter reporting - scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points` chunks reconstructs the same scrubbed text and always makes forward progress (chunk sizes derive from the input hash, so replays are exact) @@ -208,7 +212,7 @@ php tools/encoding-fuzz/tests/harness-smoke.php ``` Verifies the oracle battery, runs the real targets over the battery -vectors, and — most importantly — mutation-tests the harness: thirty +vectors, and — most importantly — mutation-tests the harness: thirty-five classes of deliberately broken implementations (validator accepting 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber, identity scrubber, byte-dropping scrubber, off-by-one code point count, @@ -222,13 +226,16 @@ detector, cp1252-confused `_mb_chr()`, invalid-accepting `_mb_ord()`, off-by-one code point span, invalid-subpart byte-counted span, and wrong or stale `found_code_points` span, byte-offset `_mb_substr()`, scrubbed-input `_mb_substr()`, negative-length `_mb_substr()`, and -non-UTF-8 fallback drift) +non-UTF-8 fallback drift, max-bytes-ignoring `_wp_scan_utf8()`, +noncharacter-leaking `_wp_scan_utf8()`, noncharacter-missing +`_wp_scan_utf8()`, ASCII-overrunning `_wp_scan_utf8()`, and +stale-noncharacter-flag `_wp_scan_utf8()`) must all be caught. It also asserts generator determinism, the valid/invalid input mix, and the documented `wp_has_noncharacters()` divergence stance on ill-formed input. 
For end-to-end pipeline testing while the real implementations are -healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8|count-invalid-bytes|count-range-minus1|count-ignore-offset` +healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8|count-invalid-bytes|count-range-minus1|count-ignore-offset|scan-ignore-bytes|scan-nonchars-leak|scan-miss-nonchars|scan-ascii-overrun|scan-stale-nonchars` injects a broken target into worker, replay, and minimize alike. Fault-injected artifacts record the fault name in their environment metadata so they cannot be mistaken for real findings. Replaying or diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php index 72a9a0995f52c..2a48994979cb1 100644 --- a/tools/encoding-fuzz/lib/Checks.php +++ b/tools/encoding-fuzz/lib/Checks.php @@ -20,6 +20,10 @@ * requested number of code points, with invalid maximal subparts * counted as one code point and `found_code_points` reporting the * available/requested count + * - bounded `_wp_scan_utf8()` calls agree with an independent scan model + * for `max_bytes`, `max_code_points`, negative limits, nonzero boundary + * starts, invalid spans, by-ref noncharacter flag reset, and + * scanned-region noncharacter reporting * - scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points` * chunks reconstructs the same scrubbed text and always makes * forward progress @@ -273,28 +277,33 @@ public function run( string $input ): array { $failures[] = $failure; } - // 8. Chunked scan reconstruction. + // 8. Direct bounded scan properties. 
+ foreach ( $this->check_bounded_scan( $input, $ref_scrub ) as $failure ) { + $failures[] = $failure; + } + + // 9. Chunked scan reconstruction. $chunk_failure = $this->check_chunked_scan( $input, $ref_scrub ); if ( null !== $chunk_failure ) { $failures[] = $chunk_failure; } - // 9. Legacy utf8_encode()/utf8_decode() fallback differentials. + // 10. Legacy utf8_encode()/utf8_decode() fallback differentials. foreach ( $this->check_utf8_encode_decode( $input, $ref_valid, $mb_validity ) as $failure ) { $failures[] = $failure; } - // 10. Noncharacter detection, on valid input only. + // 11. Noncharacter detection, on valid input only. foreach ( $this->check_noncharacters( $input, $ref_valid ) as $failure ) { $failures[] = $failure; } - // 11. _mb_substr() UTF-8 and byte-fallback properties. + // 12. _mb_substr() UTF-8 and byte-fallback properties. foreach ( $this->check_mb_substr( $input, $ref_valid, $ref_scrub ) as $failure ) { $failures[] = $failure; } - // 12. mb_chr()/mb_ord() polyfill differentials and isomorphisms. + // 13. mb_chr()/mb_ord() polyfill differentials and isomorphisms. foreach ( $this->check_mb_chr_ord( $input ) as $failure ) { $failures[] = $failure; } @@ -410,6 +419,151 @@ private function assert_codepoint_count( string $input, int $byte_offset, ?int $ return null; } + /** + * Tests optional `_wp_scan_utf8()` bounds directly from known code + * point/maximal-subpart boundaries. Starts inside continuation bytes or + * inside invalid maximal subparts remain undefined for this property. + * + * @return array + */ + private function check_bounded_scan( string $input, string $ref_scrub ): array { + if ( ! 
isset( $this->targets['scan_utf8'] ) ) { + return array(); + } + + list( $offsets, $reference_scrub ) = self::reference_utf8_offsets_and_scrub( $input ); + if ( $reference_scrub !== $ref_scrub ) { + return array( + self::failure( + 'scan-reference-disagreement', + 'maximal-subpart-reference', + self::diff_detail( 'maximal-subpart-reference', $ref_scrub, $reference_scrub ) + ), + ); + } + + $failures = array(); + foreach ( self::scan_utf8_probes( $offsets, strlen( $input ), $input ) as $probe ) { + list( $start, $max_bytes, $max_code_points ) = $probe; + $expected = self::expected_scan_utf8( $input, $start, $max_bytes, $max_code_points ); + + $failure = $this->assert_scan_utf8( + $input, + $start, + $max_bytes, + $max_code_points, + $expected + ); + + if ( null !== $failure ) { + $failures[] = $failure; + } + } + + return $failures; + } + + /** + * @param array{count: int, at: int, invalid_length: int, has_noncharacters: bool} $expected + */ + private function assert_scan_utf8( string $input, int $start, ?int $max_bytes, ?int $max_code_points, array $expected ): ?array { + $failure = $this->assert_scan_utf8_with_initial_has( $input, $start, $max_bytes, $max_code_points, null, $expected ); + if ( null !== $failure ) { + return $failure; + } + + if ( ! 
$expected['has_noncharacters'] ) { + $failure = $this->assert_scan_utf8_with_initial_has( + $input, + $start, + $max_bytes, + $max_code_points, + true, + $expected + ); + + if ( null !== $failure ) { + return $failure; + } + } + + return null; + } + + /** + * @param array{count: int, at: int, invalid_length: int, has_noncharacters: bool} $expected + */ + private function assert_scan_utf8_with_initial_has( string $input, int $start, ?int $max_bytes, ?int $max_code_points, ?bool $initial_has, array $expected ): ?array { + $at = $start; + $invalid_length = -1; + $has_noncharacters = $initial_has; + + try { + $count = ( $this->targets['scan_utf8'] )( $input, $at, $invalid_length, $max_bytes, $max_code_points, $has_noncharacters ); + } catch ( \Throwable $error ) { + return self::failure( + 'target-exception', + 'scan_utf8', + array( + 'target' => 'scan_utf8', + 'start' => $start, + 'max_bytes' => $max_bytes, + 'max_code_points' => $max_code_points, + 'initial_has' => $initial_has, + 'message' => $error->getMessage(), + 'class' => get_class( $error ), + ) + ); + } + + if ( + ! is_int( $count ) || + ! is_int( $at ) || + ! is_int( $invalid_length ) || + ( ! is_bool( $has_noncharacters ) && ! 
in_array( $has_noncharacters, array( 0, 1 ), true ) ) + ) { + return self::failure( + 'scan-utf8-bad-return', + 'scan_utf8', + array( + 'start' => $start, + 'max_bytes' => $max_bytes, + 'max_code_points' => $max_code_points, + 'initial_has' => $initial_has, + 'count_type' => get_debug_type( $count ), + 'at_type' => get_debug_type( $at ), + 'invalid_length_type' => get_debug_type( $invalid_length ), + 'has_noncharacters_type' => get_debug_type( $has_noncharacters ), + ) + ); + } + + $actual = array( + 'count' => $count, + 'at' => $at, + 'invalid_length' => $invalid_length, + 'has_noncharacters' => (bool) $has_noncharacters, + ); + + if ( $actual !== $expected ) { + return self::failure( + 'scan-utf8-mismatch', + 'scan_utf8', + array( + 'start' => $start, + 'max_bytes' => $max_bytes, + 'max_code_points' => $max_code_points, + 'initial_has' => $initial_has, + 'got' => $actual, + 'expected' => $expected, + 'input_preview' => self::preview( $input, $start ), + ) + ); + } + + return null; + } + /** * Tests `_wp_utf8_codepoint_span()` from known boundaries only. * @@ -1562,6 +1716,131 @@ private static function expected_codepoint_count_window( string $input, int $byt return count( $offsets ) - 1; } + /** + * @param int[] $offsets Boundary offsets from `reference_utf8_offsets_and_scrub()`. + * @return array Start, max bytes, max code points. + */ + private static function scan_utf8_probes( array $offsets, int $byte_length, string $salt ): array { + $segment_count = count( $offsets ) - 1; + $probes = array( + array( 0, null, null ), + array( 0, 0, null ), + array( 0, null, 0 ), + array( $byte_length, null, null ), + array( $byte_length, 1, 1 ), + ); + + foreach ( self::span_probe_indices( $segment_count, $salt . 
':scan' ) as $segment_index ) { + $start = $offsets[ $segment_index ]; + $remaining = max( 0, $byte_length - $start ); + $available = $segment_count - $segment_index; + + $byte_limits = array( null, -1, 0, 1, min( 7, $remaining ), $remaining, $remaining + 1 ); + if ( $segment_index < $segment_count ) { + $next_length = $offsets[ $segment_index + 1 ] - $start; + $byte_limits[] = max( 0, $next_length - 1 ); + $byte_limits[] = $next_length; + $byte_limits[] = $next_length + 1; + } + if ( $segment_index + 2 <= $segment_count ) { + $two_length = $offsets[ $segment_index + 2 ] - $start; + $byte_limits[] = max( 0, $two_length - 1 ); + $byte_limits[] = $two_length; + } + + $point_limits = array( null, -1, 0, 1, 2, min( 7, $available ), $available, $available + 1 ); + + foreach ( array_values( array_unique( $byte_limits ) ) as $max_bytes ) { + $probes[] = array( $start, $max_bytes, null ); + } + + foreach ( array_values( array_unique( $point_limits ) ) as $max_code_points ) { + $probes[] = array( $start, null, $max_code_points ); + } + + foreach ( array( -1, 0, 1, min( 7, $remaining ), $remaining ) as $max_bytes ) { + foreach ( array( -1, 0, 1, min( 3, $available ) ) as $max_code_points ) { + $probes[] = array( $start, $max_bytes, $max_code_points ); + } + } + } + + $hash = hash( 'sha256', $salt . 
':scan-random', true ); + for ( $i = 0; $i < 4; $i++ ) { + $start_index = ord( $hash[ $i ] ) % ( $segment_count + 1 ); + $start = $offsets[ $start_index ]; + $remaining = max( 0, $byte_length - $start ); + $available = $segment_count - $start_index; + $max_bytes = ord( $hash[ $i + 4 ] ) % ( $remaining + 2 ); + $max_code_points = ord( $hash[ $i + 8 ] ) % ( $available + 2 ); + $probes[] = array( $start, $max_bytes, $max_code_points ); + } + + $unique = array(); + foreach ( $probes as $probe ) { + $unique[ json_encode( $probe ) ] = $probe; + } + + return array_values( $unique ); + } + + /** + * @return array{count: int, at: int, invalid_length: int, has_noncharacters: bool} + */ + private static function expected_scan_utf8( string $input, int $start, ?int $max_bytes, ?int $max_code_points ): array { + $byte_length = strlen( $input ); + $end = min( $byte_length, $start + ( $max_bytes ?? PHP_INT_MAX ) ); + $max_code_points = $max_code_points ?? PHP_INT_MAX; + $at = $start; + $count = 0; + $has_noncharacters = false; + + while ( $at < $end ) { + if ( $count >= $max_code_points ) { + return array( + 'count' => $count, + 'at' => $at, + 'invalid_length' => 0, + 'has_noncharacters' => $has_noncharacters, + ); + } + + list( $segment_length, $valid ) = self::reference_utf8_segment( $input, $at ); + + if ( ! 
$valid ) { + return array( + 'count' => $count, + 'at' => $at, + 'invalid_length' => min( $segment_length, $end - $at ), + 'has_noncharacters' => $has_noncharacters, + ); + } + + $character = substr( $input, $at, $segment_length ); + list( $code_point ) = self::first_code_point_or_false( $character ); + if ( is_int( $code_point ) && self::is_noncharacter_code_point( $code_point ) ) { + $has_noncharacters = true; + } + + ++$count; + $at += $segment_length; + } + + return array( + 'count' => $count, + 'at' => $at, + 'invalid_length' => 0, + 'has_noncharacters' => $has_noncharacters, + ); + } + + private static function is_noncharacter_code_point( int $code_point ): bool { + return ( + ( $code_point >= 0xFDD0 && $code_point <= 0xFDEF ) || + 0xFFFE === ( $code_point & 0xFFFE ) + ); + } + /** * @return array{0: int|false, 1: int} First code point and byte length. */ diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php index 8f26dc833a6c0..67bb56c08e078 100644 --- a/tools/encoding-fuzz/lib/Targets.php +++ b/tools/encoding-fuzz/lib/Targets.php @@ -26,6 +26,11 @@ * ENCODING_FUZZ_FAULT=count-invalid-bytes count treats invalid bytes individually * ENCODING_FUZZ_FAULT=count-range-minus1 count stops one byte early in bounded ranges * ENCODING_FUZZ_FAULT=count-ignore-offset count ignores the requested byte offset + * ENCODING_FUZZ_FAULT=scan-ignore-bytes scan ignores max_bytes + * ENCODING_FUZZ_FAULT=scan-nonchars-leak scan reports noncharacters outside scanned region + * ENCODING_FUZZ_FAULT=scan-miss-nonchars scan misses noncharacters inside scanned region + * ENCODING_FUZZ_FAULT=scan-ascii-overrun scan ASCII fast path overruns max_code_points + * ENCODING_FUZZ_FAULT=scan-stale-nonchars scan leaves a stale noncharacter flag */ class Targets { /** @@ -46,6 +51,7 @@ public static function resolve(): array { 'mb_ord' => '_mb_ord', 'codepoint_span' => '_wp_utf8_codepoint_span', 'mb_substr' => '_mb_substr', + 'scan_utf8' => '_wp_scan_utf8', ); 
switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) { @@ -124,6 +130,26 @@ public static function resolve(): array { case 'count-ignore-offset': $targets['codepoint_count'] = self::codepoint_count_ignore_offset( ... ); break; + + case 'scan-ignore-bytes': + $targets['scan_utf8'] = self::scan_utf8_ignore_max_bytes( ... ); + break; + + case 'scan-nonchars-leak': + $targets['scan_utf8'] = self::scan_utf8_noncharacters_leak( ... ); + break; + + case 'scan-miss-nonchars': + $targets['scan_utf8'] = self::scan_utf8_miss_noncharacters( ... ); + break; + + case 'scan-ascii-overrun': + $targets['scan_utf8'] = self::scan_utf8_ascii_overrun( ... ); + break; + + case 'scan-stale-nonchars': + $targets['scan_utf8'] = self::scan_utf8_stale_noncharacters( ... ); + break; } return $targets; @@ -315,4 +341,63 @@ public static function codepoint_count_range_minus_one( string $text, ?int $byte public static function codepoint_count_ignore_offset( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int { return _wp_utf8_codepoint_count( $text, 0, $max_byte_length ); } + + /** + * Deliberately broken scan: ignores the byte limit. + */ + public static function scan_utf8_ignore_max_bytes( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int { + return _wp_scan_utf8( $bytes, $at, $invalid_length, null, $max_code_points, $has_noncharacters ); + } + + /** + * Deliberately broken scan: leaks noncharacters from outside the scanned + * region into `$has_noncharacters`. 
+ */ + public static function scan_utf8_noncharacters_leak( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int { + $count = _wp_scan_utf8( $bytes, $at, $invalid_length, $max_bytes, $max_code_points, $has_noncharacters ); + + if ( _wp_has_noncharacters_fallback( $bytes ) ) { + $has_noncharacters = true; + } + + return $count; + } + + /** + * Deliberately broken scan: misses noncharacters inside the scanned + * region. + */ + public static function scan_utf8_miss_noncharacters( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int { + $count = _wp_scan_utf8( $bytes, $at, $invalid_length, $max_bytes, $max_code_points, $has_noncharacters ); + $has_noncharacters = false; + + return $count; + } + + /** + * Deliberately broken scan: the ASCII fast path consumes one extra code + * point when a code point limit is supplied. + */ + public static function scan_utf8_ascii_overrun( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int { + if ( null !== $max_code_points && $at < strlen( $bytes ) && ord( $bytes[ $at ] ) <= 0x7F ) { + return _wp_scan_utf8( $bytes, $at, $invalid_length, $max_bytes, $max_code_points + 1, $has_noncharacters ); + } + + return _wp_scan_utf8( $bytes, $at, $invalid_length, $max_bytes, $max_code_points, $has_noncharacters ); + } + + /** + * Deliberately broken scan: preserves a stale noncharacter flag instead + * of resetting it for the current scan. 
+ */ + public static function scan_utf8_stale_noncharacters( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int { + $initial_has = $has_noncharacters; + $count = _wp_scan_utf8( $bytes, $at, $invalid_length, $max_bytes, $max_code_points, $has_noncharacters ); + + if ( true === $initial_has && ! (bool) $has_noncharacters ) { + $has_noncharacters = true; + } + + return $count; + } } diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php index 7e15f51945dab..b5f6d3d11242c 100644 --- a/tools/encoding-fuzz/tests/harness-smoke.php +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -108,6 +108,7 @@ function check( string $label, bool $ok, string $detail = '' ): void { 'mb_ord' => '_mb_ord', 'codepoint_span' => '_wp_utf8_codepoint_span', 'mb_substr' => '_mb_substr', + 'scan_utf8' => '_wp_scan_utf8', ); /** @@ -127,6 +128,35 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr return array_keys( $seen ); } +/** + * Runs every battery vector through `Targets::resolve()` with a fault + * environment variable, proving the CLI fault selector names are wired. + * + * @return string[] Distinct check names observed. + */ +function fault_run( Oracles $oracles, array $vectors, string $fault ): array { + $previous_fault = getenv( 'ENCODING_FUZZ_FAULT' ); + putenv( "ENCODING_FUZZ_FAULT={$fault}" ); + + try { + $checks = new Checks( $oracles, Targets::resolve() ); + $seen = array(); + foreach ( $vectors as $bytes ) { + foreach ( $checks->run( $bytes ) as $failure ) { + $seen[ $failure['check'] ] = true; + } + } + } finally { + if ( false === $previous_fault ) { + putenv( 'ENCODING_FUZZ_FAULT' ); + } else { + putenv( "ENCODING_FUZZ_FAULT={$previous_fault}" ); + } + } + + return array_keys( $seen ); +} + // 3a. Validator that wrongly accepts a never-valid byte. 
$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( 'is_valid_fb' => static fn( string $bytes ): bool => str_contains( $bytes, "\xC0" ) ? true : _wp_is_valid_utf8_fallback( $bytes ), @@ -326,6 +356,41 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr ) ); check( 'catches non-UTF-8 _mb_substr fallback drift', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) ); +// 3ae. Bounded scan that ignores max_bytes. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'scan_utf8' => Targets::scan_utf8_ignore_max_bytes( ... ), +) ); +check( 'catches max_bytes-ignoring scan', in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3af. Bounded scan that leaks noncharacters from outside the scanned region. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'scan_utf8' => Targets::scan_utf8_noncharacters_leak( ... ), +) ); +check( 'catches noncharacter-leaking scan', in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3ag. Bounded scan that misses noncharacters inside the scanned region. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'scan_utf8' => Targets::scan_utf8_miss_noncharacters( ... ), +) ); +check( 'catches noncharacter-missing scan', in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3ah. Bounded scan whose ASCII fast path overruns max_code_points. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'scan_utf8' => Targets::scan_utf8_ascii_overrun( ... ), +) ); +check( 'catches ASCII-overrunning scan', in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) ); + +// 3ai. Bounded scan that preserves a stale noncharacter flag. +$seen = broken_run( $oracles, $real_targets, $battery_vectors, array( + 'scan_utf8' => Targets::scan_utf8_stale_noncharacters( ... 
), +) ); +check( 'catches stale noncharacter scan flag', in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) ); + +foreach ( array( 'scan-ignore-bytes', 'scan-nonchars-leak', 'scan-miss-nonchars', 'scan-ascii-overrun', 'scan-stale-nonchars' ) as $fault ) { + $seen = fault_run( $oracles, $battery_vectors, $fault ); + check( "fault selector {$fault} is wired", in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) ); +} + // --------------------------------------------------------------------- // 4. Generator determinism and mix. // --------------------------------------------------------------------- From 4005f40d3ccd630613f31179aeb6a5d3970356cc Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 15:07:15 +0200 Subject: [PATCH 11/14] Add deterministic UTF-8 boundary corpus --- progress-handoff-xZOoEn.md | 27 ++ tools/encoding-fuzz/README.md | 21 +- tools/encoding-fuzz/corpus.php | 184 ++++++++++++ tools/encoding-fuzz/lib/Corpus.php | 310 ++++++++++++++++++++ tools/encoding-fuzz/tests/harness-smoke.php | 89 +++++- 5 files changed, 625 insertions(+), 6 deletions(-) create mode 100644 tools/encoding-fuzz/corpus.php create mode 100644 tools/encoding-fuzz/lib/Corpus.php diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md index 5cb171e6e8a3a..d618244deca93 100644 --- a/progress-handoff-xZOoEn.md +++ b/progress-handoff-xZOoEn.md @@ -99,3 +99,30 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn - Reviewer 2: initially found missing false-negative noncharacter mutation coverage; satisfied after adding `scan-miss-nonchars` and selector wiring checks. - Reviewer 3: satisfied after checking probe volume, performance, README mutation count/list, fault list, and progress ordering. - Commit: this step commit. + +### Step 5: deterministic short-boundary corpus + +- Status: done; included in the step 5 commit. +- Prior step commit: `1c208acee0`. 
+- Scope: + - Add a deterministic short-boundary corpus separate from the random generator so random `(seed, case)` derivation remains stable. + - Cover lead-byte boundary classes crossed with boundary continuation positions, adjacent invalid maximal subparts, valid/malformed sandwiches, EOF truncations, and noncharacter boundary neighbors. + - Add a standalone corpus runner and smoke coverage for the new fixed cases. +- Verification: + - `php -l tools/encoding-fuzz/lib/Checks.php` + - `php -l tools/encoding-fuzz/lib/Targets.php` + - `php -l tools/encoding-fuzz/lib/Bootstrap.php` + - `php -l tools/encoding-fuzz/lib/Corpus.php` + - `php -l tools/encoding-fuzz/corpus.php` + - `php -l tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/corpus.php --external none` + - `php -d disable_functions=utf8_encode,utf8_decode tools/encoding-fuzz/corpus.php --external none` + - `ENCODING_FUZZ_FAULT=scan-ignore-bytes php tools/encoding-fuzz/corpus.php --external none --output-dir /tmp/encoding-fuzz-corpus-fault` + - `php tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none` + - `git diff --cached --check` +- Review gate: satisfied by 3 adversarial reviewers. + - Reviewer 1: initially noted byte-level dedupe hid intended labels; satisfied after preserving label-level corpus entries. + - Reviewer 2: noted smoke skipped the new CLI/artifact path; satisfied after adding CLI smoke coverage, fail-closed artifact writes, and manual faulted artifact verification. + - Reviewer 3: noted count/fingerprint/runtime and oracle-event ordering gaps; satisfied after pinning corpus count/fingerprint, updating smoke docs, and making CLI smoke parse NDJSON by record type. +- Commit: this step commit. 
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index a8ec642a5258f..e3563af7df3f1 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -142,7 +142,7 @@ Internal invariants: ## Inputs -Each case is fully determined by `(seed, case index)` **for a given +Random cases are fully determined by `(seed, case index)` **for a given generator version**: changing the generator (e.g. its boundary code point list) invalidates `--seed`/`--case` re-derivation of older findings. Failure artifacts embed the input bytes, so `--failure` and @@ -156,6 +156,14 @@ ISO-8859-1-ish text, UTF-16 with/without BOM, long ASCII runs with broken tails (`strspn()` fast-path stress), and repeated motifs. Roughly a third of generated inputs are fully valid UTF-8. +A separate deterministic short-boundary corpus lives outside the random +generator so changing the fixed corpus does not perturb random +`(seed, case)` reproduction. It covers lead-byte boundary classes +crossed with boundary second/third/fourth byte positions, adjacent +invalid maximal subparts, valid text immediately before and after +malformed prefixes, EOF truncations at each prefix length, and +noncharacter boundary neighbors. + ## Common Commands Run one worker batch: @@ -164,6 +172,12 @@ Run one worker batch: php tools/encoding-fuzz/worker.php --seed 1 --cases 5000 ``` +Run the deterministic short-boundary corpus: + +```sh +php tools/encoding-fuzz/corpus.php +``` + Run parallel lanes for a minute (artifacts under `artifacts/encoding-fuzz/`): ```sh @@ -231,8 +245,9 @@ noncharacter-leaking `_wp_scan_utf8()`, noncharacter-missing `_wp_scan_utf8()`, ASCII-overrunning `_wp_scan_utf8()`, and stale-noncharacter-flag `_wp_scan_utf8()`) must all be caught. It also asserts generator determinism, the -valid/invalid input mix, and the documented -`wp_has_noncharacters()` divergence stance on ill-formed input. 
+valid/invalid input mix, the deterministic short-boundary corpus, and +the documented `wp_has_noncharacters()` divergence stance on ill-formed +input. For end-to-end pipeline testing while the real implementations are healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8|count-invalid-bytes|count-range-minus1|count-ignore-offset|scan-ignore-bytes|scan-nonchars-leak|scan-miss-nonchars|scan-ascii-overrun|scan-stale-nonchars` diff --git a/tools/encoding-fuzz/corpus.php b/tools/encoding-fuzz/corpus.php new file mode 100644 index 0000000000000..460beb7677caf --- /dev/null +++ b/tools/encoding-fuzz/corpus.php @@ -0,0 +1,184 @@ + 'auto', + 'output-dir' => '', + 'progress-every' => 0, + ) +); + +Bootstrap::load_targets(); + +$oracles = Oracles::build( Cli::resolve_externals( $options['external'] ) ); +foreach ( $oracles->drain_events() as $event ) { + Cli::emit( array( 'type' => 'oracle-event' ) + $event ); +} + +if ( ! $oracles->has_required() ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => 'mbstring oracle unavailable or failed the battery; cannot run corpus without a primary oracle', + ) + ); + exit( 2 ); +} + +$output_dir = $options['output-dir']; +if ( '' !== $output_dir && ! is_dir( $output_dir ) && ! 
mkdir( $output_dir, 0777, true ) ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => "cannot create output dir {$output_dir}", + ) + ); + exit( 2 ); +} + +$checks = new Checks( $oracles ); +$mb_valid = $oracles->validity_oracles()['mb']; +$cases = Corpus::short_boundary_cases(); +$stats = array( + 'cases' => 0, + 'failures' => 0, + 'valid_inputs' => 0, + 'bytes' => 0, + 'by_strategy' => array( + 'short-boundary-corpus' => 0, + ), +); +$started_at = microtime( true ); + +Cli::emit( + array( + 'type' => 'start', + 'corpus' => 'short-boundary', + 'cases' => count( $cases ), + 'environment' => Cli::environment_metadata( $oracles ), + ) +); + +foreach ( $cases as $case => $entry ) { + $input = $entry['bytes']; + $label = $entry['label']; + $failures = $checks->run( $input ); + + ++$stats['cases']; + ++$stats['by_strategy']['short-boundary-corpus']; + $stats['bytes'] += strlen( $input ); + if ( $mb_valid( $input ) ) { + ++$stats['valid_inputs']; + } + + foreach ( $oracles->drain_events() as $event ) { + Cli::emit( array( 'type' => 'oracle-event', 'case' => $case, 'corpus_label' => $label ) + $event ); + } + + if ( array() !== $failures ) { + $stats['failures'] += count( $failures ); + + $record = array( + 'type' => 'failure', + 'corpus' => 'short-boundary', + 'case' => $case, + 'corpus_label' => $label, + 'strategy' => 'short-boundary-corpus', + 'input_size' => strlen( $input ), + 'signatures' => array_values( array_unique( array_column( $failures, 'signature' ) ) ), + 'failures' => $failures, + 'input_base64' => base64_encode( $input ), + ); + + if ( '' !== $output_dir ) { + $case_dir = "{$output_dir}/failure-corpus-short-boundary-case{$case}"; + if ( ! is_dir( $case_dir ) && ! 
mkdir( $case_dir, 0777, true ) ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => "cannot create artifact dir {$case_dir}", + ) + ); + $oracles->shutdown(); + exit( 2 ); + } + if ( false === file_put_contents( "{$case_dir}/input.bin", $input ) ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => "cannot write {$case_dir}/input.bin", + ) + ); + $oracles->shutdown(); + exit( 2 ); + } + + $artifact = $record; + $artifact['environment'] = Cli::environment_metadata( $oracles ); + $artifact['git'] = Cli::git_metadata( Bootstrap::repo_root() ); + if ( false === file_put_contents( + "{$case_dir}/failure.json", + json_encode( $artifact, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES ) + ) ) { + Cli::emit( + array( + 'type' => 'fatal', + 'reason' => "cannot write {$case_dir}/failure.json", + ) + ); + $oracles->shutdown(); + exit( 2 ); + } + $record['artifact_dir'] = $case_dir; + } + + Cli::emit( $record ); + } + + if ( + $options['progress-every'] > 0 && + 0 === ( $stats['cases'] % max( 1, $options['progress-every'] ) ) + ) { + $elapsed = microtime( true ) - $started_at; + Cli::emit( + array( + 'type' => 'progress', + 'corpus' => 'short-boundary', + 'case' => $case, + 'cases_done' => $stats['cases'], + 'failures' => $stats['failures'], + 'cases_per_sec' => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null, + ) + ); + } +} + +$elapsed = microtime( true ) - $started_at; +Cli::emit( + array( + 'type' => 'done', + 'corpus' => 'short-boundary', + 'stats' => $stats, + 'elapsed_sec' => round( $elapsed, 2 ), + 'cases_per_sec' => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null, + ) +); + +$oracles->shutdown(); +exit( $stats['failures'] > 0 ? 
1 : 0 ); diff --git a/tools/encoding-fuzz/lib/Corpus.php b/tools/encoding-fuzz/lib/Corpus.php new file mode 100644 index 0000000000000..1ff98071b3892 --- /dev/null +++ b/tools/encoding-fuzz/lib/Corpus.php @@ -0,0 +1,310 @@ + + */ + public static function short_boundary_cases(): array { + $cases = array(); + + self::add_lead_boundary_cases( $cases ); + self::add_adjacent_invalid_cases( $cases ); + self::add_sandwich_cases( $cases ); + self::add_truncation_cases( $cases ); + self::add_noncharacter_boundary_cases( $cases ); + + return $cases; + } + + /** + * @param array $cases + */ + private static function add_lead_boundary_cases( array &$cases ): void { + foreach ( self::LEAD_CLASS_BYTES as $lead ) { + self::add_case( $cases, 'lead:' . self::hex_byte( $lead ), self::bytes( $lead ) ); + } + + foreach ( self::TWO_BYTE_LEADS as $lead ) { + foreach ( self::BOUNDARY_BYTES as $second ) { + self::add_case( + $cases, + sprintf( 'two-second:%02x-%02x', $lead, $second ), + self::bytes( $lead, $second ) + ); + } + } + + foreach ( self::THREE_BYTE_LEADS as $lead ) { + list( $base_second, $base_third ) = self::three_byte_baseline( $lead ); + foreach ( self::BOUNDARY_BYTES as $second ) { + self::add_case( + $cases, + sprintf( 'three-second:%02x-%02x-%02x', $lead, $second, $base_third ), + self::bytes( $lead, $second, $base_third ) + ); + } + + foreach ( self::BOUNDARY_BYTES as $third ) { + self::add_case( + $cases, + sprintf( 'three-third:%02x-%02x-%02x', $lead, $base_second, $third ), + self::bytes( $lead, $base_second, $third ) + ); + } + } + + foreach ( self::FOUR_BYTE_LEADS as $lead ) { + list( $base_second, $base_third, $base_fourth ) = self::four_byte_baseline( $lead ); + foreach ( self::BOUNDARY_BYTES as $second ) { + self::add_case( + $cases, + sprintf( 'four-second:%02x-%02x-%02x-%02x', $lead, $second, $base_third, $base_fourth ), + self::bytes( $lead, $second, $base_third, $base_fourth ) + ); + } + + foreach ( self::BOUNDARY_BYTES as $third ) { + self::add_case( + 
$cases, + sprintf( 'four-third:%02x-%02x-%02x-%02x', $lead, $base_second, $third, $base_fourth ), + self::bytes( $lead, $base_second, $third, $base_fourth ) + ); + } + + foreach ( self::BOUNDARY_BYTES as $fourth ) { + self::add_case( + $cases, + sprintf( 'four-fourth:%02x-%02x-%02x-%02x', $lead, $base_second, $base_third, $fourth ), + self::bytes( $lead, $base_second, $base_third, $fourth ) + ); + } + } + } + + /** + * @param array $cases + */ + private static function add_adjacent_invalid_cases( array &$cases ): void { + $adjacent = array( + 'continuation-run' => "\x80\xBF\x80", + 'never-valid-leads' => "\xC0\xC1\xF5\xFE\xFF", + 'overlong-pair' => "\xE0\x80\xE0\x9F", + 'surrogate-pair' => "\xED\xA0\xED\xB0", + 'past-range-pair' => "\xF4\x90\xF5\x80", + 'truncated-three' => "\xE2\x8C\xE2\x8C", + 'truncated-four' => "\xF1\x80\x80\xF0\x90", + 'unicode-table-3-8' => "\xF1\x80\x80\xE1\x80\xC2", + 'bad-lead-after-cont' => "\x80\xF5\xBF\xFE", + ); + + foreach ( $adjacent as $label => $bytes ) { + self::add_case( $cases, "adjacent-invalid:{$label}", $bytes ); + } + } + + /** + * @param array $cases + */ + private static function add_sandwich_cases( array &$cases ): void { + $valid_atoms = array( + 'ascii' => 'a', + 'two-byte' => "\xC2\x80", + 'three-byte' => "\xE2\x9C\x8F", + 'four-byte' => "\xF0\x90\x80\x80", + 'noncharacter' => "\xEF\xBF\xBE", + ); + $malformed = array( + 'lone-continuation' => "\x80", + 'never-valid-c0' => "\xC0", + 'truncated-two' => "\xC2", + 'overlong-three' => "\xE0\x80", + 'surrogate' => "\xED\xA0", + 'truncated-three' => "\xE2\x8C", + 'truncated-four' => "\xF1\x80\x80", + 'past-range' => "\xF4\x90", + 'never-valid-f5' => "\xF5", + 'never-valid-ff' => "\xFF", + ); + + foreach ( $valid_atoms as $valid_label => $valid ) { + foreach ( $malformed as $bad_label => $bad ) { + self::add_case( $cases, "sandwich:{$valid_label}-before-{$bad_label}", $valid . $bad ); + self::add_case( $cases, "sandwich:{$bad_label}-before-{$valid_label}", $bad . 
$valid ); + self::add_case( $cases, "sandwich:{$valid_label}-around-{$bad_label}", $valid . $bad . $valid ); + } + } + } + + /** + * @param array $cases + */ + private static function add_truncation_cases( array &$cases ): void { + $complete = array( + 'two-min' => "\xC2\x80", + 'two-max' => "\xDF\xBF", + 'three-min' => "\xE0\xA0\x80", + 'three-mid' => "\xE1\x80\x80", + 'surrogate-hi' => "\xED\x9F\xBF", + 'nonchar' => "\xEF\xBF\xBE", + 'four-min' => "\xF0\x90\x80\x80", + 'four-mid' => "\xF1\x80\x80\x80", + 'four-max' => "\xF4\x8F\xBF\xBF", + ); + + foreach ( $complete as $label => $bytes ) { + $length = strlen( $bytes ); + for ( $prefix_length = 1; $prefix_length < $length; $prefix_length++ ) { + $prefix = substr( $bytes, 0, $prefix_length ); + self::add_case( $cases, "truncation:{$label}-{$prefix_length}", $prefix ); + self::add_case( $cases, "truncation:ascii-{$label}-{$prefix_length}", 'a' . $prefix ); + } + } + } + + /** + * @param array $cases + */ + private static function add_noncharacter_boundary_cases( array &$cases ): void { + $code_points = array( + 0xFDCF, + 0xFDD0, + 0xFDEF, + 0xFDF0, + 0xFFFD, + 0xFFFE, + 0xFFFF, + ); + + for ( $plane = 0; $plane <= 0x10; $plane++ ) { + $final = ( $plane << 16 ) | 0xFFFF; + $code_points[] = $final - 2; + $code_points[] = $final - 1; + $code_points[] = $final; + } + + foreach ( array_values( array_unique( $code_points ) ) as $code_point ) { + $bytes = Generator::encode_code_point( $code_point ); + $label = sprintf( 'noncharacter-boundary:u+%04x', $code_point ); + self::add_case( $cases, $label, $bytes ); + self::add_case( $cases, "{$label}-embedded", 'a' . $bytes . 
'b' ); + } + } + + /** + * @param array $cases + */ + private static function add_case( array &$cases, string $label, string $bytes ): void { + $cases[] = array( + 'label' => $label, + 'bytes' => $bytes, + ); + } + + /** + * @return array{0: int, 1: int} + */ + private static function three_byte_baseline( int $lead ): array { + switch ( $lead ) { + case 0xE0: + return array( 0xA0, 0x80 ); + case 0xED: + return array( 0x9F, 0xBF ); + default: + return array( 0x80, 0x80 ); + } + } + + /** + * @return array{0: int, 1: int, 2: int} + */ + private static function four_byte_baseline( int $lead ): array { + switch ( $lead ) { + case 0xF0: + return array( 0x90, 0x80, 0x80 ); + case 0xF4: + return array( 0x8F, 0xBF, 0xBF ); + default: + return array( 0x80, 0x80, 0x80 ); + } + } + + private static function bytes( int ...$bytes ): string { + $out = ''; + foreach ( $bytes as $byte ) { + $out .= chr( $byte ); + } + return $out; + } + + private static function hex_byte( int $byte ): string { + return sprintf( '%02x', $byte ); + } +} diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php index b5f6d3d11242c..6fcd660776fe6 100644 --- a/tools/encoding-fuzz/tests/harness-smoke.php +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -9,7 +9,9 @@ * masquerade as "no findings". * 4. The generator is deterministic and produces the advertised mix of * valid and invalid inputs across all strategies. - * 5. A short real fuzz run completes. + * 5. The deterministic short-boundary corpus is stable and clean. + * 6. A short real fuzz run completes. + * 7. The one-shot exhaustive companion test passes and catches its mutant. * * Exit codes: 0 pass, 1 fail. */ @@ -418,7 +420,88 @@ function fault_run( Oracles $oracles, array $vectors, string $fault ): array { ); // --------------------------------------------------------------------- -// 5. Short real fuzz run. +// 5. Deterministic short-boundary corpus. 
+// --------------------------------------------------------------------- +$corpus_cases = Corpus::short_boundary_cases(); +$corpus_categories = array(); +foreach ( $corpus_cases as $entry ) { + $category = explode( ':', $entry['label'], 2 )[0]; + $corpus_categories[ $category ] = true; +} + +$expected_categories = array( + 'lead', + 'two-second', + 'three-second', + 'three-third', + 'four-second', + 'four-third', + 'four-fourth', + 'adjacent-invalid', + 'sandwich', + 'truncation', + 'noncharacter-boundary', +); +$missing_categories = array_values( array_diff( $expected_categories, array_keys( $corpus_categories ) ) ); +check( + 'short-boundary corpus has broad deterministic coverage', + 1133 === count( $corpus_cases ) && array() === $missing_categories, + 'count ' . count( $corpus_cases ) . ', missing ' . implode( ',', $missing_categories ) +); + +$corpus_fingerprint = static function ( array $cases ): string { + $parts = array(); + foreach ( $cases as $entry ) { + $parts[] = $entry['label'] . '=' . bin2hex( $entry['bytes'] ); + } + return hash( 'sha256', implode( "\n", $parts ) ); +}; +check( + 'short-boundary corpus deterministic', + '93f63dec5d9534e0ed1db643d5eb0596ececb0807cc3fb92cc6fe21fc4c60fbd' === $corpus_fingerprint( $corpus_cases ) +); + +$corpus_failures = 0; +foreach ( $corpus_cases as $entry ) { + $failures = $checks->run( $entry['bytes'] ); + foreach ( $failures as $failure ) { + ++$corpus_failures; + echo " corpus finding: {$failure['signature']} on {$entry['label']} " . bin2hex( $entry['bytes'] ) . "\n"; + } +} +check( 'short-boundary corpus clean (' . count( $corpus_cases ) . ' cases)', 0 === $corpus_failures ); + +$corpus_command = escapeshellarg( PHP_BINARY ) . ' ' . escapeshellarg( __DIR__ . '/../corpus.php' ) . ' --external none'; +exec( "{$corpus_command} 2>&1", $corpus_output, $corpus_code ); +$corpus_start = null; +$corpus_done = null; +foreach ( $corpus_output as $line ) { + $record = json_decode( $line, true ); + if ( ! 
is_array( $record ) ) { + continue; + } + + if ( 'start' === ( $record['type'] ?? null ) ) { + $corpus_start = $record; + } elseif ( 'done' === ( $record['type'] ?? null ) ) { + $corpus_done = $record; + } +} +check( + 'short-boundary corpus CLI clean', + 0 === $corpus_code && + is_array( $corpus_start ) && + is_array( $corpus_done ) && + 'start' === ( $corpus_start['type'] ?? null ) && + 'done' === ( $corpus_done['type'] ?? null ) && + 1133 === ( $corpus_start['cases'] ?? null ) && + 1133 === ( $corpus_done['stats']['cases'] ?? null ) && + 0 === ( $corpus_done['stats']['failures'] ?? null ), + implode( ' | ', array_slice( $corpus_output, -3 ) ) +); + +// --------------------------------------------------------------------- +// 6. Short real fuzz run. // --------------------------------------------------------------------- $fuzz_failures = 0; for ( $i = 0; $i < 300; $i++ ) { @@ -432,7 +515,7 @@ function fault_run( Oracles $oracles, array $vectors, string $fault ): array { check( '300-case fuzz run clean (real findings would also surface here)', 0 === $fuzz_failures ); // --------------------------------------------------------------------- -// 6. One-shot exhaustive companion test: must pass, and its detection +// 7. One-shot exhaustive companion test: must pass, and its detection // must provably fire (same mutation-testing rule as everything else). // --------------------------------------------------------------------- $exhaustive = escapeshellarg( PHP_BINARY ) . ' ' . escapeshellarg( __DIR__ . 
'/code-point-to-utf8-exhaustive.php' ); From a6d67b18f06ac4a0cf8c2e2fc00cc7f1f39dcf26 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 15:21:17 +0200 Subject: [PATCH 12/14] Add encoding fuzzer environment matrix --- progress-handoff-xZOoEn.md | 29 +++ tools/encoding-fuzz/README.md | 14 ++ tools/encoding-fuzz/lib/Cli.php | 16 +- tools/encoding-fuzz/lib/wp-stubs.php | 9 +- tools/encoding-fuzz/matrix.php | 255 +++++++++++++++++++++++++++ 5 files changed, 316 insertions(+), 7 deletions(-) create mode 100644 tools/encoding-fuzz/matrix.php diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md index d618244deca93..6b549275cd59b 100644 --- a/progress-handoff-xZOoEn.md +++ b/progress-handoff-xZOoEn.md @@ -126,3 +126,32 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn - Reviewer 2: noted smoke skipped the new CLI/artifact path; satisfied after adding CLI smoke coverage, fail-closed artifact writes, and manual faulted artifact verification. - Reviewer 3: noted count/fingerprint/runtime and oracle-event ordering gaps; satisfied after pinning corpus count/fingerprint, updating smoke docs, and making CLI smoke parse NDJSON by record type. - Commit: this step commit. + +### Step 6: environment matrix + +- Status: done; included in the step 6 commit. +- Prior step commit: `4005f40d3c`. +- Scope: + - Add a compact environment matrix command that runs the fixed corpus under current environment, forced no-PCRE-u target branch, simulated PHP 9 native `utf8_encode()` / `utf8_decode()` absence, and missing primary mbstring oracle functions. + - Add a fuzzer-only PCRE-u override in `wp-stubs.php` so the fallback `wp_has_noncharacters()` branch can be exercised without a separate PHP build. + - Document that a true no-mbstring target run still requires a PHP build without mbstring because the local harness fails closed without its mb-backed primary oracle. 
+- Verification: + - `php -l tools/encoding-fuzz/lib/Checks.php` + - `php -l tools/encoding-fuzz/lib/Targets.php` + - `php -l tools/encoding-fuzz/lib/Bootstrap.php` + - `php -l tools/encoding-fuzz/lib/Cli.php` + - `php -l tools/encoding-fuzz/lib/wp-stubs.php` + - `php -l tools/encoding-fuzz/matrix.php` + - `php -l tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/matrix.php` + - `env ENCODING_FUZZ_FORCE_PCRE_U=0 php tools/encoding-fuzz/matrix.php` + - `php tools/encoding-fuzz/corpus.php --external none` + - `php -d disable_functions=utf8_encode,utf8_decode tools/encoding-fuzz/corpus.php --external none` + - `php tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none` + - `git diff --cached --check` +- Review gate: satisfied by 3 adversarial reviewers. + - Reviewer 1: initially noted PCRE override metadata and force-on risks; satisfied after adding `pcre_u_override` metadata and making the override force-off only. + - Reviewer 2: initially found matrix pipe-deadlock, exit-code, and NDJSON-shape issues; satisfied after nonblocking pipe reads, harness-error exit `2`, and stricter record parsing. + - Reviewer 3: initially found the matrix exit-code contract mismatch; satisfied after preserving exit `2` for harness-error-shaped failures and checking docs/progress accuracy. +- Commit: this step commit. 
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index e3563af7df3f1..e89e5d712a516 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -178,6 +178,20 @@ Run the deterministic short-boundary corpus: php tools/encoding-fuzz/corpus.php ``` +Run the compact environment matrix: + +```sh +php tools/encoding-fuzz/matrix.php +``` + +The matrix runs the fixed corpus in the current environment, with the +fuzzer's PCRE-u branch forced off, with native `utf8_encode()` / +`utf8_decode()` disabled to simulate PHP 9, and with the primary mbstring +oracle functions disabled to verify the harness fails closed. A true +no-mbstring target run still requires a PHP build without mbstring; the +local harness intentionally refuses to fuzz without the mb-backed primary +oracle. + Run parallel lanes for a minute (artifacts under `artifacts/encoding-fuzz/`): ```sh diff --git a/tools/encoding-fuzz/lib/Cli.php b/tools/encoding-fuzz/lib/Cli.php index 3ddd47679b5ab..e6d6b62c59b98 100644 --- a/tools/encoding-fuzz/lib/Cli.php +++ b/tools/encoding-fuzz/lib/Cli.php @@ -103,15 +103,21 @@ public static function git_metadata( string $repo_root ): array { } public static function environment_metadata( Oracles $oracles ): array { + $forced_pcre_u = getenv( 'ENCODING_FUZZ_FORCE_PCRE_U' ); + $pcre_override = false !== $forced_pcre_u && in_array( strtolower( $forced_pcre_u ), array( '0', 'false', 'no', 'off' ), true ) + ? 'off' + : null; + return array( - 'php' => PHP_VERSION, - 'os' => PHP_OS_FAMILY, - 'oracles' => $oracles->names(), + 'php' => PHP_VERSION, + 'os' => PHP_OS_FAMILY, + 'oracles' => $oracles->names(), // Which environment branch of utf8.php loaded (PCRE vs fallback). - 'pcre_u' => function_exists( '_wp_can_use_pcre_u' ) ? _wp_can_use_pcre_u() : null, + 'pcre_u' => function_exists( '_wp_can_use_pcre_u' ) ? 
_wp_can_use_pcre_u() : null, + 'pcre_u_override' => $pcre_override, // Mark fault-injected artifacts so they can never be mistaken // for real findings. - 'fault' => getenv( 'ENCODING_FUZZ_FAULT' ) ?: null, + 'fault' => getenv( 'ENCODING_FUZZ_FAULT' ) ?: null, ); } } diff --git a/tools/encoding-fuzz/lib/wp-stubs.php b/tools/encoding-fuzz/lib/wp-stubs.php index f86bd4b367332..2eaa662cbbeba 100644 --- a/tools/encoding-fuzz/lib/wp-stubs.php +++ b/tools/encoding-fuzz/lib/wp-stubs.php @@ -8,8 +8,13 @@ function _wp_can_use_pcre_u( $set = null ): bool { static $utf8_pcre = null; if ( null === $utf8_pcre ) { - // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged - $utf8_pcre = false !== @preg_match( '/^./u', 'a' ); + $forced = getenv( 'ENCODING_FUZZ_FORCE_PCRE_U' ); + if ( false !== $forced && in_array( strtolower( $forced ), array( '0', 'false', 'no', 'off' ), true ) ) { + $utf8_pcre = false; + } else { + // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged + $utf8_pcre = false !== @preg_match( '/^./u', 'a' ); + } } return (bool) $utf8_pcre; } diff --git a/tools/encoding-fuzz/matrix.php b/tools/encoding-fuzz/matrix.php new file mode 100644 index 0000000000000..98f72f63cd498 --- /dev/null +++ b/tools/encoding-fuzz/matrix.php @@ -0,0 +1,255 @@ + $env + * @return array{code: int, stdout: string, stderr: string} + */ +function matrix_run_command( array $command, array $env = array() ): array { + $process = proc_open( + $command, + array( + 0 => array( 'file', '/dev/null', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'pipe', 'w' ), + ), + $pipes, + null, + array_merge( $_ENV, $env ) + ); + + if ( ! 
is_resource( $process ) ) { + return array( + 'code' => 2, + 'stdout' => '', + 'stderr' => 'proc_open failed', + ); + } + + stream_set_blocking( $pipes[1], false ); + stream_set_blocking( $pipes[2], false ); + + $stdout = ''; + $stderr = ''; + $open = array( + 1 => $pipes[1], + 2 => $pipes[2], + ); + $deadline = microtime( true ) + 120; + + while ( array() !== $open ) { + if ( microtime( true ) > $deadline ) { + proc_terminate( $process, 9 ); + foreach ( $open as $pipe ) { + fclose( $pipe ); + } + proc_close( $process ); + return array( + 'code' => 2, + 'stdout' => $stdout, + 'stderr' => $stderr . "\nmatrix child timed out", + ); + } + + $read = array_values( $open ); + $write = null; + $except = null; + $ready = stream_select( $read, $write, $except, 1, 0 ); + + if ( false === $ready ) { + foreach ( $open as $pipe ) { + fclose( $pipe ); + } + proc_close( $process ); + return array( + 'code' => 2, + 'stdout' => $stdout, + 'stderr' => $stderr . "\nmatrix stream_select failed", + ); + } + + foreach ( $read as $pipe ) { + $chunk = stream_get_contents( $pipe ); + if ( false === $chunk || '' === $chunk ) { + continue; + } + + if ( $pipe === $pipes[1] ) { + $stdout .= $chunk; + } else { + $stderr .= $chunk; + } + } + + foreach ( $open as $index => $pipe ) { + if ( feof( $pipe ) ) { + $chunk = stream_get_contents( $pipe ); + if ( is_string( $chunk ) && '' !== $chunk ) { + if ( 1 === $index ) { + $stdout .= $chunk; + } else { + $stderr .= $chunk; + } + } + fclose( $pipe ); + unset( $open[ $index ] ); + } + } + } + + return array( + 'code' => proc_close( $process ), + 'stdout' => (string) $stdout, + 'stderr' => (string) $stderr, + ); +} + +/** + * @return array{records: array>, malformed: string[]} + */ +function matrix_decode_ndjson( string $stdout ): array { + $records = array(); + $malformed = array(); + foreach ( explode( "\n", trim( $stdout ) ) as $line ) { + if ( '' === $line ) { + continue; + } + + $record = json_decode( $line, true ); + if ( is_array( $record ) && 
isset( $record['type'] ) && is_string( $record['type'] ) ) { + $records[] = $record; + } else { + $malformed[] = $line; + } + } + return array( + 'records' => $records, + 'malformed' => $malformed, + ); +} + +/** + * @param array> $records + */ +function matrix_first_record( array $records, string $type ): ?array { + foreach ( $records as $record ) { + if ( $type === ( $record['type'] ?? null ) ) { + return $record; + } + } + return null; +} + +/** + * @param array> $records + */ +function matrix_has_oracle_event( array $records, string $oracle ): bool { + foreach ( $records as $record ) { + if ( 'oracle-event' === ( $record['type'] ?? null ) && $oracle === ( $record['oracle'] ?? null ) ) { + return true; + } + } + return false; +} + +$cases = array( + array( + 'name' => 'current-corpus', + 'command' => array( PHP_BINARY, __DIR__ . '/corpus.php', '--external', 'none' ), + 'env' => array( 'ENCODING_FUZZ_FORCE_PCRE_U' => '' ), + 'check' => static function ( array $run ): array { + $decoded = matrix_decode_ndjson( $run['stdout'] ); + $records = $decoded['records']; + $start = matrix_first_record( $records, 'start' ); + $done = matrix_first_record( $records, 'done' ); + $ok = 0 === $run['code'] && + array() === $decoded['malformed'] && + is_array( $start ) && + is_array( $done ) && + true === ( $start['environment']['pcre_u'] ?? null ) && + 0 === ( $done['stats']['failures'] ?? null ); + return array( $ok, is_array( $done ) ? json_encode( $done['stats'] ?? null, JSON_UNESCAPED_SLASHES ) : trim( $run['stderr'] ), 2 === $run['code'] || array() !== $decoded['malformed'] ); + }, + ), + array( + 'name' => 'forced-no-pcre-corpus', + 'command' => array( PHP_BINARY, __DIR__ . 
'/corpus.php', '--external', 'none' ), + 'env' => array( 'ENCODING_FUZZ_FORCE_PCRE_U' => '0' ), + 'check' => static function ( array $run ): array { + $decoded = matrix_decode_ndjson( $run['stdout'] ); + $records = $decoded['records']; + $start = matrix_first_record( $records, 'start' ); + $done = matrix_first_record( $records, 'done' ); + $ok = 0 === $run['code'] && + array() === $decoded['malformed'] && + is_array( $start ) && + is_array( $done ) && + false === ( $start['environment']['pcre_u'] ?? null ) && + 'off' === ( $start['environment']['pcre_u_override'] ?? null ) && + 0 === ( $done['stats']['failures'] ?? null ); + return array( $ok, is_array( $start ) ? json_encode( $start['environment'] ?? null, JSON_UNESCAPED_SLASHES ) : trim( $run['stderr'] ), 2 === $run['code'] || array() !== $decoded['malformed'] ); + }, + ), + array( + 'name' => 'native-unavailable-corpus', + 'command' => array( PHP_BINARY, '-d', 'disable_functions=utf8_encode,utf8_decode', __DIR__ . '/corpus.php', '--external', 'none' ), + 'env' => array( 'ENCODING_FUZZ_FORCE_PCRE_U' => '' ), + 'check' => static function ( array $run ): array { + $decoded = matrix_decode_ndjson( $run['stdout'] ); + $records = $decoded['records']; + $done = matrix_first_record( $records, 'done' ); + $ok = 0 === $run['code'] && + array() === $decoded['malformed'] && + is_array( $done ) && + matrix_has_oracle_event( $records, 'native' ) && + 0 === ( $done['stats']['failures'] ?? null ); + return array( $ok, is_array( $done ) ? json_encode( $done['stats'] ?? null, JSON_UNESCAPED_SLASHES ) : trim( $run['stderr'] ), 2 === $run['code'] || array() !== $decoded['malformed'] ); + }, + ), + array( + 'name' => 'mb-oracle-unavailable-fails-closed', + 'command' => array( PHP_BINARY, '-d', 'disable_functions=mb_check_encoding,mb_scrub', __DIR__ . 
'/corpus.php', '--external', 'none' ), + 'env' => array( 'ENCODING_FUZZ_FORCE_PCRE_U' => '' ), + 'check' => static function ( array $run ): array { + $decoded = matrix_decode_ndjson( $run['stdout'] ); + $records = $decoded['records']; + $fatal = matrix_first_record( $records, 'fatal' ); + $ok = 2 === $run['code'] && + array() === $decoded['malformed'] && + matrix_has_oracle_event( $records, 'mb' ) && + is_array( $fatal ); + return array( $ok, is_array( $fatal ) ? (string) ( $fatal['reason'] ?? '' ) : trim( $run['stderr'] ), array() !== $decoded['malformed'] || ( 2 === $run['code'] && ! $ok ) ); + }, + ), +); + +$failed = 0; +$harness_error = false; +foreach ( $cases as $case ) { + $run = matrix_run_command( $case['command'], $case['env'] ?? array() ); + list( $ok, $detail, $case_harness_error ) = $case['check']( $run ); + if ( $ok ) { + echo "PASS {$case['name']}\n"; + } else { + ++$failed; + $harness_error = $harness_error || $case_harness_error; + echo "FAIL {$case['name']}: exit {$run['code']}; {$detail}\n"; + } +} + +echo $failed > 0 ? "\n{$failed} matrix check(s) FAILED\n" : "\nAll matrix checks passed\n"; +exit( $failed > 0 ? ( $harness_error ? 2 : 1 ) : 0 ); From 56dc15b1f6292fd06d694958b606dd4f96d4e5cd Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 15:26:46 +0200 Subject: [PATCH 13/14] Document invalid-input noncharacter policy --- progress-handoff-xZOoEn.md | 23 +++++++++++++++++++++++ tools/encoding-fuzz/README.md | 14 ++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md index 6b549275cd59b..b35700e57dd19 100644 --- a/progress-handoff-xZOoEn.md +++ b/progress-handoff-xZOoEn.md @@ -155,3 +155,26 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn - Reviewer 2: initially found matrix pipe-deadlock, exit-code, and NDJSON-shape issues; satisfied after nonblocking pipe reads, harness-error exit `2`, and stricter record parsing. 
- Reviewer 3: initially found the matrix exit-code contract mismatch; satisfied after preserving exit `2` for harness-error-shaped failures and checking docs/progress accuracy. - Commit: this step commit. + +### Step 7: invalid-input noncharacter policy + +- Status: done; included in the step 7 commit. +- Prior step commit: `a6d67b18f0`. +- Scope: + - Do not broaden invalid-input noncharacter fuzzing. + - Document the currently pinned divergence between the PCRE-u public path and `_wp_has_noncharacters_fallback()` on ill-formed input. + - Record that further fuzz expansion is blocked on a Core policy decision: document `wp_has_noncharacters()` as valid-input-only or align public/fallback behavior on ill-formed input. +- Verification: + - `php -l tools/encoding-fuzz/lib/Checks.php` + - `php -l tools/encoding-fuzz/lib/Targets.php` + - `php -l tools/encoding-fuzz/lib/Bootstrap.php` + - `php -l tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/tests/harness-smoke.php` + - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none` + - `git diff --cached --check` + - Manual probe confirmed `wp_has_noncharacters( "\xC0\xEF\xBF\xBE" ) === false` and `_wp_has_noncharacters_fallback( "\xC0\xEF\xBF\xBE" ) === true` in the current PCRE-u environment. +- Review gate: satisfied by 3 adversarial reviewers. + - Reviewer 1: satisfied after checking the README policy text against current public/fallback behavior and the handoff. + - Reviewer 2: satisfied after confirming the diff is docs/progress only and does not broaden invalid-input noncharacter fuzzing. + - Reviewer 3: satisfied after checking previous steps are complete, staged scope is limited, and this section is updated before commit. +- Commit: this step commit. 
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index e89e5d712a516..ef8a0d7742dc2 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -140,6 +140,20 @@ Internal invariants: with native `mb_substr()`, and explicit non-UTF-8 encodings fall back to byte-level `substr()` semantics +## Invalid-Input Noncharacter Policy + +Noncharacter differentials intentionally remain valid-input-only. The +current invalid-input divergence is pinned by smoke, not fuzz-expanded: +on hosts using the PCRE-u branch, +`wp_has_noncharacters( "\xC0\xEF\xBF\xBE" )` returns false because the +regular expression fails on ill-formed UTF-8, while +`_wp_has_noncharacters_fallback( "\xC0\xEF\xBF\xBE" )` returns true +because the fallback scanner skips the invalid byte and finds U+FFFE. + +Do not add invalid-input noncharacter fuzzing until Core decides whether +`wp_has_noncharacters()` is documented as valid-input-only or the public +and fallback paths are aligned on ill-formed input. 
+ ## Inputs Random cases are fully determined by `(seed, case index)` **for a given From 98303df4dd805a1482ff8d26b2bfce11581366c1 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 17:42:29 +0200 Subject: [PATCH 14/14] Adapt encoding fuzzer to trunk noncharacter behavior --- tools/encoding-fuzz/README.md | 61 ++++++++------------- tools/encoding-fuzz/lib/Checks.php | 38 ++++--------- tools/encoding-fuzz/lib/Oracles.php | 58 +++++++++++++++----- tools/encoding-fuzz/lib/wp-stubs.php | 4 ++ tools/encoding-fuzz/tests/harness-smoke.php | 42 +++++++------- 5 files changed, 105 insertions(+), 98 deletions(-) diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md index ef8a0d7742dc2..75eb6a8ed16db 100644 --- a/tools/encoding-fuzz/README.md +++ b/tools/encoding-fuzz/README.md @@ -5,7 +5,7 @@ Differential fuzzer for the WordPress UTF-8 functions: - `wp_is_valid_utf8()` / `_wp_is_valid_utf8_fallback()` - `wp_scrub_utf8()` / `_wp_scrub_utf8_fallback()` - `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()` -- `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` (valid input only) +- `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` - `_mb_chr()` / `_mb_ord()` - `_mb_substr()` - `_wp_utf8_codepoint_count()`, `_wp_utf8_codepoint_span()`, and the @@ -23,7 +23,8 @@ Every result is compared against independent known-good implementations: | Oracle | Backing | Validity | Scrub | Encode | Decode | Nonchars | |-----------|--------------------------------------|----------|-------|--------|--------|----------| -| `mb` | `mb_check_encoding()` / `mb_scrub()` / `mb_convert_encoding()` / `mb_str_split()`+`mb_ord()` | ✓ | ✓ (primary) | ✓ (primary) | ✓ (primary) | ✓ (valid input only) | +| `bytes` | independent UTF-8 noncharacter byte-sequence list | | | | | ✓ (primary) | +| `mb` | `mb_check_encoding()` / `mb_scrub()` / `mb_convert_encoding()` / `mb_str_split()`+`mb_ord()` | ✓ | ✓ (primary) | ✓ (primary) | ✓ (primary) | ✓ (valid 
UTF-8 cross-check) | | `pcre` | PCRE2 strict UTF validation | ✓ | | | | | | `intl` | ICU `UConverter::transcode()` | | ✓ | | | | | `python3` | CPython codec, persistent subprocess | ✓ | ✓ | | | | @@ -46,14 +47,14 @@ the PHP 9 polyfill in `compat.php` prefers `mb_convert_encoding()` with `_wp_utf8_decode_fallback()` as its mbstring-less shadow (ticket #63863). -The `mb` noncharacter oracle (a trivial decode-and-test over -`mb_str_split()` / `mb_ord()`) backs the `wp_has_noncharacters()` -differential. Like every oracle it must pass a hand-derived battery, -which covers the boundaries and interior of the U+FDD0–U+FDEF block -and the final two code points of every plane with their neighbors — -the PCRE implementation under test enumerates each plane as a separate -hand-typed escape, so per-plane coverage is the point. It is defined -on valid input only — see the noncharacter policy under Checks. +The primary noncharacter oracle is an independent list of UTF-8 byte +sequences for U+FDD0–U+FDEF and the final two code points of every +plane. It is defined over arbitrary bytes, matching the public +function's byte-sequence contract. On valid UTF-8, a trivial mb +decode-and-test oracle (`mb_str_split()` / `mb_ord()`) cross-checks the +byte oracle. The battery covers boundaries and interior points of the +U+FDD0–U+FDEF block, every plane-final pair with neighbors, and +ill-formed surrounds. Because native and mb decoding agree on *every* valid code point (verified exhaustively over U+0000–U+10FFFF), the valid-input-only @@ -86,22 +87,10 @@ decode oracle on valid input only). Oracle-vs-oracle disagreements are reported separately (`oracle-disagreement`) so they don't masquerade as WordPress bugs. 
-Noncharacter detection is a three-way differential on **valid input -only**: `wp_has_noncharacters()` (the PCRE branch on hosts with -PCRE-u; without PCRE-u the public function aliases the fallback and -the differential degenerates to two distinct implementations — the -worker records which branch loaded as `pcre_u` in its environment -metadata), `_wp_has_noncharacters_fallback()`, and the trivial mb -reference must agree. On ill-formed input the public function's answer -depends on which environment branch of `utf8.php` loaded — the PCRE -branch returns false for any ill-formed input because `preg_match` -fails, while the fallback skips invalid spans and reports the -noncharacters around them (`"\xC0\xEF\xBF\xBE"`: PCRE false, fallback -true). The fuzzer's stance is that behavior is undefined unless -`wp_is_valid_utf8()`; the divergence itself is pinned by a fixed -regression vector in the smoke test, and aligning the implementations -(or documenting the stance in core) is an open upstream question for -the function author. +Noncharacter detection is checked on arbitrary bytes: +`wp_has_noncharacters()`, the deprecated `_wp_has_noncharacters_fallback()` +wrapper, and the independent byte-sequence reference must agree. On +valid UTF-8, the mb decode-and-test reference must also agree. Internal invariants: @@ -142,17 +131,12 @@ Internal invariants: ## Invalid-Input Noncharacter Policy -Noncharacter differentials intentionally remain valid-input-only. The -current invalid-input divergence is pinned by smoke, not fuzz-expanded: -on hosts using the PCRE-u branch, -`wp_has_noncharacters( "\xC0\xEF\xBF\xBE" )` returns false because the -regular expression fails on ill-formed UTF-8, while -`_wp_has_noncharacters_fallback( "\xC0\xEF\xBF\xBE" )` returns true -because the fallback scanner skips the invalid byte and finds U+FFFE. 
- -Do not add invalid-input noncharacter fuzzing until Core decides whether -`wp_has_noncharacters()` is documented as valid-input-only or the public -and fallback paths are aligned on ill-formed input. +Trunk aligned the invalid-input behavior: `wp_has_noncharacters()` +matches the UTF-8 byte sequences for noncharacters directly, so +malformed bytes elsewhere in the string do not suppress detection. +`_wp_has_noncharacters_fallback()` is deprecated and delegates to the +public function. The fuzzer therefore includes invalid-input +noncharacter cases in the normal differential. ## Inputs @@ -274,8 +258,7 @@ noncharacter-leaking `_wp_scan_utf8()`, noncharacter-missing stale-noncharacter-flag `_wp_scan_utf8()`) must all be caught. It also asserts generator determinism, the valid/invalid input mix, the deterministic short-boundary corpus, and -the documented `wp_has_noncharacters()` divergence stance on ill-formed -input. +the aligned `wp_has_noncharacters()` behavior on ill-formed input. For end-to-end pipeline testing while the real implementations are healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8|count-invalid-bytes|count-range-minus1|count-ignore-offset|scan-ignore-bytes|scan-nonchars-leak|scan-miss-nonchars|scan-ascii-overrun|scan-stale-nonchars` diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php index 2a48994979cb1..2d040a7b9e6d9 100644 --- a/tools/encoding-fuzz/lib/Checks.php +++ b/tools/encoding-fuzz/lib/Checks.php @@ -28,11 +28,10 @@ * chunks reconstructs the same scrubbed text and always makes * forward progress * - * Noncharacter detection (VALID input only — the public function's - * answer on ill-formed input depends on which environment branch of - * `utf8.php` loaded, a documented divergence pinned by the smoke 
test): + * Noncharacter detection: * - `wp_has_noncharacters()` and `_wp_has_noncharacters_fallback()` vs - * a trivial decode-and-test reference. + * an independent UTF-8 noncharacter byte-sequence oracle, with an mb + * decode-and-test cross-check on valid UTF-8. * * Legacy `utf8_encode()` / `utf8_decode()` fallbacks: * - `_wp_utf8_encode_fallback()` vs every encode oracle on arbitrary @@ -293,7 +292,7 @@ public function run( string $input ): array { $failures[] = $failure; } - // 11. Noncharacter detection, on valid input only. + // 11. Noncharacter detection. foreach ( $this->check_noncharacters( $input, $ref_valid ) as $failure ) { $failures[] = $failure; } @@ -1065,37 +1064,24 @@ private function check_mb_chr_ord( string $input ): array { } /** - * Three-way differential for noncharacter detection on VALID input: - * the public `wp_has_noncharacters()` (the PCRE branch on hosts with - * PCRE-u; otherwise it aliases the fallback and this degenerates to - * two distinct implementations), the `_wp_scan_utf8()`-based - * fallback, and the trivial mb reference must all agree. - * - * Ill-formed input is deliberately skipped: the PCRE branch answers - * false on any ill-formed input (`preg_match` fails) while the - * fallback skips invalid spans and reports noncharacters around - * them, so the same public function answers differently depending - * on which environment branch loaded. That stance — behavior is - * undefined unless `wp_is_valid_utf8()` — is pinned by a fixed - * regression vector in the smoke test, not fuzzed. + * Differential for noncharacter detection over arbitrary bytes. The + * primary oracle searches for the UTF-8 byte sequences that encode + * Unicode noncharacters. On valid UTF-8 input, the trivial mb + * decode-and-test oracle is also cross-checked. * * @return array */ private function check_noncharacters( string $input, bool $ref_valid ): array { - if ( ! 
$ref_valid ) { - return array(); - } - $oracles = $this->oracles->noncharacter_oracles(); - if ( ! isset( $oracles['mb'] ) ) { + if ( ! isset( $oracles['bytes'] ) ) { return array(); } $failures = array(); - $expected = $oracles['mb']( $input ); + $expected = $oracles['bytes']( $input ); foreach ( $oracles as $name => $oracle ) { - if ( 'mb' === $name ) { + if ( 'bytes' === $name || ( 'mb' === $name && ! $ref_valid ) ) { continue; } @@ -1138,7 +1124,7 @@ private function check_noncharacters( string $input, bool $ref_valid ): array { 'target' => $key, 'got' => $result, 'expected' => $expected, - 'oracle' => 'mb', + 'oracle' => 'bytes', 'input_preview' => self::preview( $input ), ) ); diff --git a/tools/encoding-fuzz/lib/Oracles.php b/tools/encoding-fuzz/lib/Oracles.php index 27c8822c07b79..72c5fe736cdf7 100644 --- a/tools/encoding-fuzz/lib/Oracles.php +++ b/tools/encoding-fuzz/lib/Oracles.php @@ -8,9 +8,9 @@ * Scrub oracles answer "what does maximal-subpart replacement produce?". * Encode oracles answer "what is this ISO-8859-1 text as UTF-8?". * Decode oracles answer "what is this UTF-8 text as ISO-8859-1?". - * Noncharacter oracles answer "does this VALID UTF-8 text contain a - * Unicode noncharacter?" (U+FDD0–U+FDEF, or any code point whose low - * sixteen bits are FFFE or FFFF). They are defined on valid input only. + * Noncharacter oracles answer "do these bytes contain the UTF-8 encoding + * of a Unicode noncharacter?" (U+FDD0–U+FDEF, or any code point whose low + * sixteen bits are FFFE or FFFF). * * - mbstring: `mb_check_encoding()` / `mb_scrub()` (maximal subpart * since PHP 8.1.6), `mb_convert_encoding()` for the @@ -66,7 +66,7 @@ class Oracles { /** @var array Decode oracles trusted on valid UTF-8 input only. */ private array $decode_valid_only = array(); - /** @var array Defined on valid UTF-8 input only. 
*/ + /** @var array */ private array $noncharacters = array(); /** @var ExternalOracle[] */ @@ -120,12 +120,36 @@ public static function build( array $external_names ): self { ? 'mb_ord' : ( function_exists( '_mb_ord' ) ? '_mb_ord' : null ); + $oracles->noncharacters['bytes'] = static function ( string $bytes ): bool { + static $noncharacter_sequences = null; + if ( null === $noncharacter_sequences ) { + $noncharacter_sequences = array(); + + for ( $code_point = 0xFDD0; $code_point <= 0xFDEF; $code_point++ ) { + $noncharacter_sequences[] = Generator::encode_code_point( $code_point ); + } + + for ( $plane = 0; $plane <= 0x10; $plane++ ) { + $final = ( $plane << 16 ) | 0xFFFF; + $noncharacter_sequences[] = Generator::encode_code_point( $final - 1 ); + $noncharacter_sequences[] = Generator::encode_code_point( $final ); + } + } + + foreach ( $noncharacter_sequences as $sequence ) { + if ( str_contains( $bytes, $sequence ) ) { + return true; + } + } + + return false; + }; + if ( function_exists( 'mb_str_split' ) && null !== $mb_ord ) { /* * Trivial decode-and-test reference for noncharacter detection, - * independent of both implementations under test (the PCRE - * character-class regex and the `_wp_scan_utf8()`-based scan). - * Callers must pass valid UTF-8. + * independent of the byte-sequence search. Callers must pass + * valid UTF-8. */ $oracles->noncharacters['mb'] = static function ( string $valid_utf8 ) use ( $mb_ord ): bool { foreach ( mb_str_split( $valid_utf8, 1, 'UTF-8' ) as $character ) { @@ -327,11 +351,9 @@ public static function decode_battery(): array { /** * Known-answer vectors for the noncharacter oracles. 
All inputs are - * valid UTF-8 (the question is only defined there) and cover the - * boundaries AND interior of the U+FDD0–U+FDEF block plus the final - * two code points of EVERY plane with their U+xFFFD neighbors — the - * PCRE implementation under test enumerates each plane as a separate - * hand-typed escape, exactly where a single-plane typo would hide. + * valid UTF-8 and ill-formed surrounds, covering the boundaries AND + * interior of the U+FDD0–U+FDEF block plus the final two code points + * of EVERY plane with their U+xFFFD neighbors. * * Expectations are hand-derived from the Unicode definition; bytes * for the looped vectors come from the pure-arithmetic @@ -339,12 +361,16 @@ public static function decode_battery(): array { * against `mb_chr()` by `tests/code-point-to-utf8-exhaustive.php`), * keeping the encoding independent of the mbstring-backed oracle. * - * @return array [valid utf8 bytes, has noncharacters] + * @return array [bytes, has noncharacters] */ public static function noncharacter_battery(): array { $vectors = array( array( '', false ), array( 'abc', false ), + array( "\xC0abc", false ), + array( "\xC0\xEF\xBF\xBE", true ), + array( "\xC0a\xEF\xB7\x90b", true ), + array( "\xC0\xEF\xB7\x8F", false ), array( "\u{FDCF}", false ), // Last code point before the contiguous block. array( "\u{FDD0}", true ), // First of the contiguous block. array( "\u{FDDA}", true ), // Interior of the block: a lookup-table bug @@ -423,6 +449,10 @@ private function verify_battery(): void { list( $bytes, $expected ) = $vector; foreach ( $this->noncharacters as $name => $check ) { + if ( 'mb' === $name && ( ! function_exists( 'mb_check_encoding' ) || ! mb_check_encoding( $bytes, 'UTF-8' ) ) ) { + continue; + } + $got = $check( $bytes ); if ( $got !== $expected ) { $this->disable( $name, sprintf( @@ -514,7 +544,7 @@ public function decode_oracle_is_valid_only( string $name ): bool { return $this->decode_valid_only[ $name ] ?? 
false; } - /** @return array Defined on valid UTF-8 input only. */ + /** @return array */ public function noncharacter_oracles(): array { return $this->noncharacters; } diff --git a/tools/encoding-fuzz/lib/wp-stubs.php b/tools/encoding-fuzz/lib/wp-stubs.php index 2eaa662cbbeba..3ece6f3292650 100644 --- a/tools/encoding-fuzz/lib/wp-stubs.php +++ b/tools/encoding-fuzz/lib/wp-stubs.php @@ -25,3 +25,7 @@ function get_option( $option, $default_value = false ) { return 'blog_charset' === $option ? 'UTF-8' : $default_value; } } + +if ( ! function_exists( '_deprecated_function' ) ) { + function _deprecated_function( $function_name, $version, $replacement = '' ): void {} +} diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php index 6fcd660776fe6..31f188685a9bf 100644 --- a/tools/encoding-fuzz/tests/harness-smoke.php +++ b/tools/encoding-fuzz/tests/harness-smoke.php @@ -71,27 +71,31 @@ function check( string $label, bool $ok, string $detail = '' ): void { check( 'real targets clean on battery', array() === $battery_fails, implode( '; ', $battery_fails ) ); /* - * Documented stance: `wp_has_noncharacters()` is undefined on ill-formed - * input. On hosts with PCRE-u the public function answers false on ANY - * ill-formed input (`preg_match` fails) while the fallback skips invalid - * spans and reports the noncharacters around them. This regression - * vector pins the divergence; if it ever changes, the semantics were - * touched and the valid-input-only fuzzing policy must be revisited. + * Trunk aligned invalid-input behavior by making the public function search + * for noncharacter UTF-8 byte sequences directly and deprecating the old + * private fallback into a wrapper. */ $nonchar_probe = "\xC0\xEF\xBF\xBE"; // Invalid byte, then U+FFFE. 
-if ( _wp_can_use_pcre_u() ) { - check( - 'documented wp_has_noncharacters divergence on ill-formed input unchanged', - false === wp_has_noncharacters( $nonchar_probe ) && true === _wp_has_noncharacters_fallback( $nonchar_probe ), - sprintf( - 'public: %s, fallback: %s', - var_export( wp_has_noncharacters( $nonchar_probe ), true ), - var_export( _wp_has_noncharacters_fallback( $nonchar_probe ), true ) - ) - ); -} else { - echo "SKIP documented wp_has_noncharacters divergence (no PCRE-u: public function aliases the fallback)\n"; -} +check( + 'wp_has_noncharacters detects noncharacters inside ill-formed input', + true === wp_has_noncharacters( $nonchar_probe ) && true === _wp_has_noncharacters_fallback( $nonchar_probe ), + sprintf( + 'public: %s, fallback: %s', + var_export( wp_has_noncharacters( $nonchar_probe ), true ), + var_export( _wp_has_noncharacters_fallback( $nonchar_probe ), true ) + ) +); + +$nonchar_absent_probe = "\xC0abc"; +check( + 'wp_has_noncharacters ignores ill-formed input without noncharacters', + false === wp_has_noncharacters( $nonchar_absent_probe ) && false === _wp_has_noncharacters_fallback( $nonchar_absent_probe ), + sprintf( + 'public: %s, fallback: %s', + var_export( wp_has_noncharacters( $nonchar_absent_probe ), true ), + var_export( _wp_has_noncharacters_fallback( $nonchar_absent_probe ), true ) + ) +); // --------------------------------------------------------------------- // 3. Broken implementations must be caught.