From 3cc3e64765aab7410e2f8c9c85dbb679ad511cc7 Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Wed, 10 Jun 2026 14:54:49 +0200
Subject: [PATCH 01/14] Add differential fuzzer for WordPress UTF-8 encoding
 functions.

Fuzzes wp_is_valid_utf8(), wp_scrub_utf8(), and their pure-PHP
fallbacks against five independent known-good oracles: mbstring,
PCRE2, ICU (intl), CPython, and the WHATWG TextDecoder (Node), the
last two as persistent subprocesses. All oracles must pass a
hand-computed known-answer battery before use; iconv is excluded
because libiconv accepts code points above U+10FFFF.

Beyond the differentials, internal invariants are checked: validity
iff scrub identity, scrub output validity, scrub idempotence,
code point counts against the scrubbed length, and chunked
_wp_scan_utf8() reconstruction with deterministic resumable-scan
budgets.

Inputs mix nine deterministic strategies (random bytes, random ASCII,
boundary-heavy valid UTF-8, mutations, invalid-atom splices, latin1,
UTF-16, ASCII fast-path stress, repeated motifs); every case is
reproducible from (seed, case index) alone. Includes a multi-lane
runner with stall detection, replay and signature-preserving
minimization tools, and a harness self-test that mutation-tests
detection against seven classes of deliberately broken implementations.
---
 tools/encoding-fuzz/README.md                | 141 +++++++
 tools/encoding-fuzz/lib/Bootstrap.php        |  36 ++
 tools/encoding-fuzz/lib/Checks.php           | 358 ++++++++++++++++++
 tools/encoding-fuzz/lib/Cli.php              | 112 ++++++
 tools/encoding-fuzz/lib/ExternalOracle.php   | 177 +++++++++
 tools/encoding-fuzz/lib/Generator.php        | 375 +++++++++++++++++++
 tools/encoding-fuzz/lib/Oracles.php          | 227 +++++++++++
 tools/encoding-fuzz/lib/Prng.php             |  92 +++++
 tools/encoding-fuzz/lib/Targets.php          |  44 +++
 tools/encoding-fuzz/lib/autoload.php         |  16 +
 tools/encoding-fuzz/lib/wp-stubs.php         |  16 +
 tools/encoding-fuzz/minimize.php             | 161 ++++++++
 tools/encoding-fuzz/oracles/oracle-node.mjs  |  54 +++
 tools/encoding-fuzz/oracles/oracle-python.py |  53 +++
 tools/encoding-fuzz/replay.php               |  93 +++++
 tools/encoding-fuzz/runner.php               | 280 ++++++++++++++
 tools/encoding-fuzz/tests/harness-smoke.php  | 184 +++++++++
 tools/encoding-fuzz/worker.php               | 173 +++++++++
 18 files changed, 2592 insertions(+)
 create mode 100644 tools/encoding-fuzz/README.md
 create mode 100644 tools/encoding-fuzz/lib/Bootstrap.php
 create mode 100644 tools/encoding-fuzz/lib/Checks.php
 create mode 100644 tools/encoding-fuzz/lib/Cli.php
 create mode 100644 tools/encoding-fuzz/lib/ExternalOracle.php
 create mode 100644 tools/encoding-fuzz/lib/Generator.php
 create mode 100644 tools/encoding-fuzz/lib/Oracles.php
 create mode 100644 tools/encoding-fuzz/lib/Prng.php
 create mode 100644 tools/encoding-fuzz/lib/Targets.php
 create mode 100644 tools/encoding-fuzz/lib/autoload.php
 create mode 100644 tools/encoding-fuzz/lib/wp-stubs.php
 create mode 100644 tools/encoding-fuzz/minimize.php
 create mode 100644 tools/encoding-fuzz/oracles/oracle-node.mjs
 create mode 100644 tools/encoding-fuzz/oracles/oracle-python.py
 create mode 100644 tools/encoding-fuzz/replay.php
 create mode 100644 tools/encoding-fuzz/runner.php
 create mode 100644 tools/encoding-fuzz/tests/harness-smoke.php
 create mode 100644 tools/encoding-fuzz/worker.php

diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
new file mode 100644
index 0000000000000..8deee79516156
--- /dev/null
+++ b/tools/encoding-fuzz/README.md
@@ -0,0 +1,141 @@
+# UTF-8 Encoding Fuzzer
+
+Differential fuzzer for the WordPress UTF-8 functions:
+
+- `wp_is_valid_utf8()` / `_wp_is_valid_utf8_fallback()`
+- `wp_scrub_utf8()` / `_wp_scrub_utf8_fallback()`
+- `_wp_utf8_codepoint_count()` and the resumable `_wp_scan_utf8()` paths (secondary)
+
+The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main
+fuzz surface; the mbstring-backed public functions are checked alongside
+them. Only `compat-utf8.php` and `utf8.php` are loaded — no WordPress
+bootstrap, database, or `wp-env`.
+
+## Oracles
+
+Every result is compared against independent known-good implementations:
+
+| Oracle    | Backing                              | Validity | Scrub |
+|-----------|--------------------------------------|----------|-------|
+| `mb`      | `mb_check_encoding()` / `mb_scrub()` | ✓        | ✓ (primary) |
+| `pcre`    | PCRE2 strict UTF validation          | ✓        |       |
+| `intl`    | ICU `UConverter::transcode()`        |          | ✓     |
+| `python3` | CPython codec, persistent subprocess | ✓        | ✓     |
+| `node`    | WHATWG `TextDecoder`, persistent subprocess | ✓ | ✓     |
+
+All scrub oracles implement the Unicode "maximal subpart" replacement
+recommendation (Unicode 16.0 §3.9, Table 3-8), which is the documented
+behavior of `wp_scrub_utf8()`. Every oracle must pass a hand-computed
+known-answer battery at startup; one that fails (or whose subprocess
+dies) is disabled and reported rather than allowed to produce noise.
+iconv is deliberately excluded: GNU libiconv accepts code points above
+U+10FFFF and fails the battery.
+
+`mb` (PHP ≥ 8.1.6, for maximal-subpart `mb_scrub()`) is required.
+External oracles are auto-detected; control them with
+`--external auto|python3|node|python3,node|none`.
+
+## Checks
+
+Differentials: both validity targets against every validity oracle, both
+scrub targets against every scrub oracle. Oracle-vs-oracle disagreements
+are reported separately (`oracle-disagreement`) so they don't masquerade
+as WordPress bugs.
+
+Internal invariants:
+
+- valid ⟺ scrub returns the input unchanged
+- scrub output is always valid UTF-8
+- scrub is idempotent
+- `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed text
+  (each maximal subpart counts as one code point)
+- scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points`
+  chunks reconstructs the same scrubbed text and always makes forward
+  progress (chunk sizes derive from the input hash, so replays are exact)
+
+## Inputs
+
+Each case is fully determined by `(seed, case index)`. The generator
+mixes nine strategies: uniformly random bytes, random ASCII,
+boundary-heavy valid UTF-8 (encoding-length edges, surrogate-gap edges,
+noncharacters, BOM, U+10FFFF), mutated valid UTF-8 (bit flips,
+truncations, splices), splices of hand-picked valid/invalid atoms
+(overlongs, surrogates, truncated sequences, out-of-range leads),
+ISO-8859-1-ish text, UTF-16 with/without BOM, long ASCII runs with
+broken tails (`strspn()` fast-path stress), and repeated motifs.
+Roughly a third of generated inputs are fully valid UTF-8.
+
+## Common Commands
+
+Run one worker batch:
+
+```sh
+php tools/encoding-fuzz/worker.php --seed 1 --cases 5000
+```
+
+Run parallel lanes for a minute (artifacts under `artifacts/encoding-fuzz/`):
+
+```sh
+php tools/encoding-fuzz/runner.php --lanes 4 --duration-seconds 60
+```
+
+Run indefinitely:
+
+```sh
+php tools/encoding-fuzz/runner.php --lanes 8 --duration-seconds 0 --max-cases 0
+```
+
+The duration budget stops new batches; in-flight batches finish, so a
+run can overshoot by up to one batch (`--cases-per-batch`, default 2000).
+A lane silent for `--stall-timeout` seconds (default 120) is killed and
+its seed recorded for reproduction.
+
+Replay a failure (or any input, or a re-derived case):
+
+```sh
+php tools/encoding-fuzz/replay.php --failure artifacts/encoding-fuzz/run-.../failure-seedS-caseN/failure.json
+php tools/encoding-fuzz/replay.php --input some-bytes.bin
+php tools/encoding-fuzz/replay.php --seed 123 --case 45
+```
+
+Minimize a failure while preserving its signature:
+
+```sh
+php tools/encoding-fuzz/minimize.php --failure .../failure.json
+```
+
+Exit codes everywhere: `0` clean, `1` findings, `2` harness error.
+
+## Artifacts
+
+The runner writes `summary.ndjson` (every worker event), `state.json`
+(aggregate counters, failure/stall seeds, compact Git metadata, stop
+reason), per-lane stderr logs, and one directory per failing case with
+`input.bin` and a self-contained `failure.json` (base64 input, signatures,
+diff windows with hex previews, environment and Git metadata).
+
+## Harness Self-Test
+
+```sh
+php tools/encoding-fuzz/tests/harness-smoke.php
+```
+
+Verifies the oracle battery, runs the real targets over the battery
+vectors, and — most importantly — mutation-tests the harness: seven
+classes of deliberately broken implementations (validator accepting
+0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber,
+identity scrubber, byte-dropping scrubber, off-by-one code point count,
+throwing target) must all be caught. It also asserts generator
+determinism and the valid/invalid input mix.
+
+For end-to-end pipeline testing while the real implementations are
+healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal` injects a broken
+target into worker, replay, and minimize alike:
+
+```sh
+ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/runner.php --lanes 2 --duration-seconds 5
+ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/minimize.php --failure .../failure.json
+```
+
+(The `non-maximal` fault minimizes to the two bytes `E0 F4`: two
+adjacent maximal subparts whose replacement characters get collapsed.)
diff --git a/tools/encoding-fuzz/lib/Bootstrap.php b/tools/encoding-fuzz/lib/Bootstrap.php
new file mode 100644
index 0000000000000..e92921dcf272d
--- /dev/null
+++ b/tools/encoding-fuzz/lib/Bootstrap.php
@@ -0,0 +1,36 @@
+<?php
+namespace EncodingFuzz;
+
+/**
+ * Loads the WordPress UTF-8 functions under test into a bare PHP process.
+ *
+ * Only `compat-utf8.php` and `utf8.php` are loaded. `utf8.php` calls
+ * `_wp_can_use_pcre_u()` at load time, which normally lives in
+ * `compat.php`; a minimal stand-in from `wp-stubs.php` covers it so the
+ * rest of WordPress stays out of the fuzzer process.
+ */
+class Bootstrap {
+	public static function repo_root(): string {
+		return dirname( __DIR__, 3 ); // lib/ -> encoding-fuzz/ -> tools/ -> repository root.
+	}
+
+	public static function load_targets(): void {
+		if ( function_exists( 'wp_is_valid_utf8' ) ) {
+			return;
+		}
+
+		require_once __DIR__ . '/wp-stubs.php';
+
+		$root = self::repo_root();
+		require_once $root . '/src/wp-includes/compat-utf8.php';
+		require_once $root . '/src/wp-includes/utf8.php';
+
+		/*
+		 * `wp_scrub_utf8()` saves and restores the global substitute character, so
+		 * pin it to U+FFFD up front: the value it restores is then what oracles expect.
+		 */
+		if ( function_exists( 'mb_substitute_character' ) ) {
+			mb_substitute_character( 0xFFFD );
+		}
+	}
+}
diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php
new file mode 100644
index 0000000000000..ad242666c1f2f
--- /dev/null
+++ b/tools/encoding-fuzz/lib/Checks.php
@@ -0,0 +1,358 @@
+<?php
+namespace EncodingFuzz;
+
+/**
+ * Runs every differential and invariant check against one input.
+ *
+ * Differentials (against known-good oracles):
+ *  - `wp_is_valid_utf8()` and `_wp_is_valid_utf8_fallback()` vs every
+ *    validity oracle (mb, pcre, python3, node).
+ *  - `wp_scrub_utf8()` and `_wp_scrub_utf8_fallback()` vs every scrub
+ *    oracle (mb, intl, python3, node).
+ *
+ * Internal invariants (true by definition of the API):
+ *  - valid ⟺ scrub returns the input unchanged
+ *  - scrub output is always valid UTF-8
+ *  - scrub is idempotent
+ *  - `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed
+ *    text (each maximal subpart counts as one code point)
+ *  - scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points`
+ *    chunks reconstructs the same scrubbed text and always makes
+ *    forward progress
+ *
+ * Target callables are injectable so the harness smoke test can verify
+ * that deliberately broken implementations are caught.
+ */
+class Checks {
+	public const PREVIEW_BYTES = 48; // Bytes shown (as hex) in each diff or scrub preview.
+
+	private Oracles $oracles;
+
+	/** @var array<string, callable> Target callables keyed by role: is_valid, is_valid_fb, scrub, scrub_fb, codepoint_count. */
+	private array $targets;
+
+	public function __construct( Oracles $oracles, ?array $targets = null ) {
+		$this->oracles = $oracles;
+		$this->targets = $targets ?? Targets::resolve();
+	}
+
+	/**
+	 * @return array<int, array{check: string, signature: string, detail: array}> Failures from every differential and invariant check; empty when all pass.
+	 */
+	public function run( string $input ): array {
+		$failures = array();
+
+		// Reference values come from mb, the primary oracle; without it nothing can be judged.
+		$mb_validity = $this->oracles->validity_oracles()['mb'] ?? null;
+		$mb_scrubber = $this->oracles->scrub_oracles()['mb'] ?? null;
+		if ( null === $mb_validity || null === $mb_scrubber ) {
+			return array( self::failure( 'harness-error', 'harness', array( 'reason' => 'mb oracle unavailable' ) ) );
+		}
+
+		$ref_valid = $mb_validity( $input );
+		$ref_scrub = $mb_scrubber( $input );
+
+		// Run each target once; a throwing target is reported and stored as null so later checks skip it.
+		$results = array();
+		foreach ( array( 'is_valid', 'is_valid_fb', 'scrub', 'scrub_fb' ) as $key ) {
+			try {
+				$results[ $key ] = ( $this->targets[ $key ] )( $input );
+			} catch ( \Throwable $error ) {
+				$failures[] = self::failure(
+					'target-exception',
+					$key,
+					array(
+						'target'  => $key,
+						'message' => $error->getMessage(),
+						'class'   => get_class( $error ),
+					)
+				);
+				$results[ $key ] = null;
+			}
+		}
+
+		// 1. Validity differential: targets vs mb, then every other validity oracle vs mb.
+		foreach ( array( 'is_valid', 'is_valid_fb' ) as $key ) {
+			if ( null !== $results[ $key ] && $results[ $key ] !== $ref_valid ) {
+				$failures[] = self::failure(
+					'validity-mismatch',
+					$key,
+					array(
+						'target'   => $key,
+						'got'      => $results[ $key ],
+						'expected' => $ref_valid,
+						'oracle'   => 'mb',
+					)
+				);
+			}
+		}
+
+		foreach ( $this->oracles->validity_oracles() as $name => $oracle ) {
+			if ( 'mb' === $name ) {
+				continue;
+			}
+			// A null answer is a transport failure: disable the oracle rather than report noise.
+			$oracle_valid = $oracle( $input );
+			if ( null === $oracle_valid ) {
+				$this->oracles->disable( $name, 'transport failure during case' );
+				continue;
+			}
+
+			if ( $oracle_valid !== $ref_valid ) {
+				$failures[] = self::failure(
+					'oracle-disagreement',
+					"validity:{$name}",
+					array(
+						'kind'     => 'validity',
+						'oracle'   => $name,
+						'got'      => $oracle_valid,
+						'expected' => $ref_valid,
+					)
+				);
+			}
+		}
+
+		// 2. Scrub differential: targets vs mb, then every other scrub oracle vs mb.
+		foreach ( array( 'scrub', 'scrub_fb' ) as $key ) {
+			if ( null !== $results[ $key ] && $results[ $key ] !== $ref_scrub ) {
+				$failures[] = self::failure(
+					'scrub-mismatch',
+					$key,
+					self::diff_detail( $key, $ref_scrub, $results[ $key ] )
+				);
+			}
+		}
+
+		foreach ( $this->oracles->scrub_oracles() as $name => $oracle ) {
+			if ( 'mb' === $name ) {
+				continue;
+			}
+			// Same transport-failure handling as for the validity oracles.
+			$oracle_scrub = $oracle( $input );
+			if ( null === $oracle_scrub ) {
+				$this->oracles->disable( $name, 'transport failure during case' );
+				continue;
+			}
+
+			if ( $oracle_scrub !== $ref_scrub ) {
+				$failures[] = self::failure(
+					'oracle-disagreement',
+					"scrub:{$name}",
+					self::diff_detail( $name, $ref_scrub, $oracle_scrub )
+				);
+			}
+		}
+
+		// 3. valid ⟺ scrub identity, pairing each validator with its matching scrubber.
+		foreach ( array( 'is_valid' => 'scrub', 'is_valid_fb' => 'scrub_fb' ) as $valid_key => $scrub_key ) {
+			if ( null === $results[ $valid_key ] || null === $results[ $scrub_key ] ) {
+				continue;
+			}
+
+			$identity = $results[ $scrub_key ] === $input;
+			if ( $results[ $valid_key ] !== $identity ) {
+				$failures[] = self::failure(
+					'valid-iff-scrub-identity',
+					$valid_key,
+					array(
+						'valid_target'   => $valid_key,
+						'scrub_target'   => $scrub_key,
+						'valid'          => $results[ $valid_key ],
+						'scrub_identity' => $identity,
+					)
+				);
+			}
+		}
+
+		// 4. Scrub output must be valid UTF-8 (judged by mb). 5. Scrubbing it again must change nothing.
+		foreach ( array( 'scrub', 'scrub_fb' ) as $key ) {
+			if ( null === $results[ $key ] ) {
+				continue;
+			}
+
+			$scrubbed = $results[ $key ];
+			if ( ! $mb_validity( $scrubbed ) ) {
+				$failures[] = self::failure(
+					'scrubbed-not-valid',
+					$key,
+					array(
+						'target'  => $key,
+						'scrub_preview' => self::preview( $scrubbed ),
+					)
+				);
+			}
+
+			try {
+				$twice = ( $this->targets[ $key ] )( $scrubbed );
+			} catch ( \Throwable $error ) {
+				$failures[] = self::failure(
+					'target-exception',
+					"{$key}:idempotence",
+					array(
+						'target'  => $key,
+						'message' => $error->getMessage(),
+						'class'   => get_class( $error ),
+					)
+				);
+				$twice = $scrubbed;
+			}
+
+			if ( $twice !== $scrubbed ) {
+				$failures[] = self::failure(
+					'scrub-not-idempotent',
+					$key,
+					self::diff_detail( $key, $scrubbed, $twice )
+				);
+			}
+		}
+
+		// 6. Code point count must equal mb_strlen() of the reference scrub.
+		try {
+			$count    = ( $this->targets['codepoint_count'] )( $input );
+			$expected = mb_strlen( $ref_scrub, 'UTF-8' );
+			if ( $count !== $expected ) {
+				$failures[] = self::failure(
+					'codepoint-count-mismatch',
+					'codepoint_count',
+					array(
+						'got'      => $count,
+						'expected' => $expected,
+					)
+				);
+			}
+		} catch ( \Throwable $error ) {
+			$failures[] = self::failure(
+				'target-exception',
+				'codepoint_count',
+				array(
+					'target'  => 'codepoint_count',
+					'message' => $error->getMessage(),
+					'class'   => get_class( $error ),
+				)
+			);
+		}
+
+		// 7. A chunked scan must rebuild the reference scrub.
+		$chunk_failure = $this->check_chunked_scan( $input, $ref_scrub );
+		if ( null !== $chunk_failure ) {
+			$failures[] = $chunk_failure;
+		}
+
+		return $failures;
+	}
+
+	/**
+	 * Rebuilds the scrubbed text by calling `_wp_scan_utf8()` directly with
+	 * budgets of 1–7 code points (the resumable-scan paths), emitting one
+	 * U+FFFD per invalid span. Budgets come from the input's SHA-256 bytes,
+	 * so replaying the input replays the exact chunking. Returns a failure
+	 * when the scan throws, stalls, or rebuilds different text; else null.
+	 */
+	private function check_chunked_scan( string $input, string $ref_scrub ): ?array {
+		if ( ! function_exists( '_wp_scan_utf8' ) ) {
+			return null;
+		}
+
+		$length      = strlen( $input );
+		$chunk_bytes = hash( 'sha256', $input, true );
+		$chunk_index = 0;
+		$at          = 0;
+		$out         = '';
+		$guard       = ( 2 * $length ) + 16;
+
+		while ( $at < $length ) {
+			if ( --$guard < 0 ) {
+				return self::failure(
+					'scan-no-progress',
+					'chunked-scan',
+					array(
+						'at'     => $at,
+						'length' => $length,
+					)
+				);
+			}
+			// Next budget: 1–7 code points, cycling through the 32 raw hash bytes.
+			$was_at         = $at;
+			$invalid_length = 0;
+			$max_points     = 1 + ( ord( $chunk_bytes[ $chunk_index % 32 ] ) % 7 );
+			++$chunk_index;
+
+			try {
+				_wp_scan_utf8( $input, $at, $invalid_length, null, $max_points );
+			} catch ( \Throwable $error ) {
+				return self::failure(
+					'target-exception',
+					'chunked-scan',
+					array(
+						'target'  => '_wp_scan_utf8',
+						'message' => $error->getMessage(),
+						'class'   => get_class( $error ),
+					)
+				);
+			}
+
+			$out .= substr( $input, $was_at, $at - $was_at );
+
+			if ( $invalid_length > 0 ) {
+				$out .= "\u{FFFD}";
+				$at  += $invalid_length;
+			} elseif ( $at === $was_at && $at < $length ) {
+				return self::failure(
+					'scan-no-progress',
+					'chunked-scan',
+					array(
+						'at'         => $at,
+						'length'     => $length,
+						'max_points' => $max_points,
+					)
+				);
+			}
+		}
+
+		if ( $out !== $ref_scrub ) {
+			return self::failure(
+				'chunked-scan-mismatch',
+				'chunked-scan',
+				self::diff_detail( 'chunked-scan', $ref_scrub, $out )
+			);
+		}
+
+		return null;
+	}
+
+	private static function failure( string $check, string $party, array $detail ): array {
+		// One failure record. The signature is "<check>:<party>",
+		// e.g. "validity-mismatch:is_valid".
+		$signature = $check . ':' . $party;
+
+		return compact( 'check', 'signature', 'detail' );
+	}
+
+	private static function diff_detail( string $party, string $expected, string $got ): array {
+		$offset = self::first_difference( $expected, $got );
+		// Lengths are in bytes; both hex windows are positioned around the first differing byte.
+		return array(
+			'party'           => $party,
+			'expected_length' => strlen( $expected ),
+			'got_length'      => strlen( $got ),
+			'first_diff_at'   => $offset,
+			'expected_window' => self::preview( $expected, $offset ),
+			'got_window'      => self::preview( $got, $offset ),
+		);
+	}
+
+	private static function first_difference( string $a, string $b ): int {
+		// Offset of the first differing byte; the shorter length when one string is a prefix of the other.
+		$limit = min( strlen( $a ), strlen( $b ) );
+		$index = 0;
+		while ( $index < $limit && $a[ $index ] === $b[ $index ] ) {
+			++$index;
+		}
+		return $index;
+	}
+
+	private static function preview( string $bytes, int $center = 0 ): string {
+		// Hex of up to PREVIEW_BYTES bytes, starting half a window before $center (never below offset 0).
+		return bin2hex( substr( $bytes, max( 0, $center - intdiv( self::PREVIEW_BYTES, 2 ) ), self::PREVIEW_BYTES ) );
+	}
+}
diff --git a/tools/encoding-fuzz/lib/Cli.php b/tools/encoding-fuzz/lib/Cli.php
new file mode 100644
index 0000000000000..14c5d4f671324
--- /dev/null
+++ b/tools/encoding-fuzz/lib/Cli.php
@@ -0,0 +1,112 @@
+<?php
+namespace EncodingFuzz;
+
+/**
+ * Small shared helpers for the CLI entry points.
+ */
+class Cli {
+	/**
+	 * Parses `--name value` and `--name=value` pairs, exiting with status 2 on bad input.
+	 *
+	 * @param string[]             $argv     Raw arguments; index 0 is skipped.
+	 * @param array<string, mixed> $defaults Option name => default value; an int default makes the option numeric.
+	 * @return array<string, mixed> The defaults overlaid with every parsed option.
+	 */
+	public static function parse_args( array $argv, array $defaults ): array {
+		$options = $defaults;
+		$count   = count( $argv );
+
+		for ( $i = 1; $i < $count; $i++ ) {
+			$arg = $argv[ $i ];
+			if ( 0 !== strncmp( $arg, '--', 2 ) ) {
+				fwrite( STDERR, "Unexpected argument: {$arg}\n" );
+				exit( 2 );
+			}
+			if ( false !== strpos( $arg, '=' ) ) {
+				list( $name, $value ) = explode( '=', substr( $arg, 2 ), 2 );
+			} else {
+				$name = substr( $arg, 2 );
+				if ( $i + 1 >= $count ) {
+					fwrite( STDERR, "Missing value for --{$name}\n" );
+					exit( 2 );
+				}
+				$value = $argv[ ++$i ];
+			}
+			if ( ! array_key_exists( $name, $defaults ) ) {
+				fwrite( STDERR, "Unknown option --{$name}\n" );
+				exit( 2 );
+			}
+			// A bare (int) cast turns a typo into 0, which the README documents as "run indefinitely" for runner budgets.
+			if ( is_int( $defaults[ $name ] ) && ! is_numeric( $value ) ) {
+				fwrite( STDERR, "Invalid integer for --{$name}: {$value}\n" );
+				exit( 2 );
+			}
+			$options[ $name ] = is_int( $defaults[ $name ] ) ? (int) $value : $value;
+		}
+		return $options;
+	}
+
+	/**
+	 * Turns an `--external` option value into the list of oracle names it selects.
+	 *
+	 * @return string[] Nothing for 'none', both externals for 'auto', otherwise the trimmed non-empty names.
+	 */
+	public static function resolve_externals( string $option ): array {
+		$presets = array(
+			'none' => array(),
+			'auto' => array( 'python3', 'node' ),
+		);
+		if ( isset( $presets[ $option ] ) ) {
+			return $presets[ $option ];
+		}
+
+		return array_values( array_filter( array_map( 'trim', explode( ',', $option ) ) ) );
+	}
+	/** Writes one NDJSON record to STDOUT; invalid UTF-8 is substituted so raw bytes cannot make json_encode() return false. */
+	public static function emit( array $record ): void {
+		fwrite( STDOUT, json_encode( $record, JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE ) . "\n" );
+	}
+
+	/**
+	 * Returns the commit, branch, and dirty flag for `$repo_root`; each is null when its git command fails.
+	 */
+	public static function git_metadata( string $repo_root ): array {
+		$run = static function ( array $command ) use ( $repo_root ): ?string {
+			$process = @proc_open(
+				$command,
+				array(
+					0 => array( 'file', '/dev/null', 'r' ),
+					1 => array( 'pipe', 'w' ),
+					2 => array( 'file', '/dev/null', 'a' ),
+				),
+				$pipes,
+				$repo_root
+			);
+			if ( ! is_resource( $process ) ) {
+				return null;
+			}
+			$out  = stream_get_contents( $pipes[1] );
+			fclose( $pipes[1] );
+			$code = proc_close( $process );
+			return 0 === $code ? trim( (string) $out ) : null;
+		};
+		// Each lookup is a separate git invocation; untracked files do not count as dirty.
+		$commit = $run( array( 'git', 'rev-parse', 'HEAD' ) );
+		$branch = $run( array( 'git', 'rev-parse', '--abbrev-ref', 'HEAD' ) );
+		$status = $run( array( 'git', 'status', '--porcelain', '--untracked-files=no' ) );
+
+		return array(
+			'commit' => $commit,
+			'branch' => $branch,
+			'dirty'  => null === $status ? null : '' !== $status,
+		);
+	}
+
+	public static function environment_metadata( Oracles $oracles ): array {
+		// Runtime snapshot: PHP version, OS family, and the oracle names reported by $oracles.
+		$snapshot            = array( 'php' => PHP_VERSION, 'os' => PHP_OS_FAMILY );
+		$snapshot['oracles'] = $oracles->names();
+
+		return $snapshot;
+	}
+}
diff --git a/tools/encoding-fuzz/lib/ExternalOracle.php b/tools/encoding-fuzz/lib/ExternalOracle.php
new file mode 100644
index 0000000000000..8130ad484388b
--- /dev/null
+++ b/tools/encoding-fuzz/lib/ExternalOracle.php
@@ -0,0 +1,177 @@
+<?php
+namespace EncodingFuzz;
+
+/**
+ * Client for a persistent external oracle subprocess speaking the
+ * length-prefixed binary protocol documented in `oracles/`.
+ *
+ * One subprocess handles all cases for the life of the worker, so the
+ * per-case cost is a single pipe round trip.
+ */
+class ExternalOracle {
+	public string $name;
+	private array $command;
+	/** @var resource|null Process handle from proc_open(); null before start() and after shutdown(). */
+	private $process = null;
+	/** @var resource|null Pipe feeding the child's stdin. */
+	private $stdin = null;
+	/** @var resource|null Pipe reading the child's stdout. */
+	private $stdout = null;
+	private ?string $last_error = null;
+	private ?string $memo_input = null; // Last input answered by check(); its answer is kept in $memo_result.
+	private ?array $memo_result = null;
+
+	public function __construct( string $name, array $command ) {
+		$this->name    = $name;
+		$this->command = $command;
+	}
+
+	/**
+	 * Launches the named external oracle ('python3' or 'node') and runs
+	 * its startup probe before handing it back.
+	 *
+	 * @return array{0: ?self, 1: ?string} Oracle or null, plus error message.
+	 */
+	public static function create( string $name ): array {
+		$scripts = array(
+			'python3' => 'oracle-python.py',
+			'node'    => 'oracle-node.mjs',
+		);
+
+		if ( ! isset( $scripts[ $name ] ) ) {
+			return array( null, "unknown external oracle '{$name}'" );
+		}
+
+		// Each oracle script is run by the interpreter that shares the oracle's name.
+		$command = array( $name, __DIR__ . '/../oracles/' . $scripts[ $name ] );
+		$oracle  = new self( $name, $command );
+		$failure = $oracle->start();
+
+		return null === $failure ? array( $oracle, null ) : array( null, $failure );
+	}
+
+	private function start(): ?string {
+		$descriptors = array(
+			0 => array( 'pipe', 'r' ),
+			1 => array( 'pipe', 'w' ),
+			2 => array( 'file', '/dev/null', 'a' ),
+		);
+		// The child's stderr is discarded; only the stdin/stdout pipes carry the protocol.
+		$process = @proc_open( $this->command, $descriptors, $pipes );
+		if ( ! is_resource( $process ) ) {
+			return "failed to launch {$this->name} oracle";
+		}
+
+		$this->process = $process;
+		$this->stdin   = $pipes[0];
+		$this->stdout  = $pipes[1];
+
+		// Probe with a trivial request ('ok' must come back valid and unchanged) so launch failures surface immediately.
+		$probe = $this->check( 'ok' );
+		if ( null === $probe || true !== $probe['valid'] || 'ok' !== $probe['scrubbed'] ) {
+			$detail = $this->last_error ?? 'bad probe response';
+			$this->shutdown();
+			return "{$this->name} oracle failed startup probe: {$detail}";
+		}
+		// Null signals a successful start; every string returned above is an error message.
+		return null;
+	}
+
+	public function is_alive(): bool {
+		return null !== $this->process; // Cleared by shutdown(), which fail() also calls.
+	}
+
+	public function last_error(): ?string {
+		return $this->last_error; // Reason given to the most recent fail(); null if none occurred.
+	}
+
+	/**
+	 * @return array{valid: bool, scrubbed: string}|null Null on transport failure, after which the oracle stays shut down.
+	 */
+	public function check( string $bytes ): ?array {
+		if ( null === $this->process ) {
+			return null;
+		}
+
+		// Validity and scrub oracles ask about the same input back to
+		// back; answer both from one pipe round trip.
+		if ( $bytes === $this->memo_input ) {
+			return $this->memo_result;
+		}
+		// Request frame: 4-byte big-endian length, then the raw bytes.
+		$request = pack( 'N', strlen( $bytes ) ) . $bytes;
+		if ( ! $this->write_exact( $request ) ) {
+			$this->fail( 'write failed' );
+			return null;
+		}
+		// Response frame: 1 validity byte and a 4-byte big-endian length, then the scrubbed bytes.
+		$header = $this->read_exact( 5 );
+		if ( null === $header ) {
+			$this->fail( 'short response header' );
+			return null;
+		}
+
+		$valid  = "\x00" !== $header[0];
+		$length = unpack( 'Nlength', substr( $header, 1 ) )['length'];
+
+		$scrubbed = 0 === $length ? '' : $this->read_exact( $length );
+		if ( null === $scrubbed ) {
+			$this->fail( 'short response body' );
+			return null;
+		}
+
+		$this->memo_input  = $bytes;
+		$this->memo_result = array(
+			'valid'    => $valid,
+			'scrubbed' => $scrubbed,
+		);
+
+		return $this->memo_result;
+	}
+
+	private function write_exact( string $bytes ): bool {
+		// A pipe write may accept only part of the buffer: keep sending the rest; false or 0 bytes means failure.
+		$remaining = $bytes;
+		while ( '' !== $remaining ) {
+			$count = @fwrite( $this->stdin, $remaining );
+			if ( false === $count || 0 === $count ) {
+				return false;
+			}
+			$remaining = (string) substr( $remaining, $count );
+		}
+		return true;
+	}
+
+	private function read_exact( int $length ): ?string {
+		// Reads may come back short: keep reading until $length bytes arrive; EOF or an error gives null.
+		for ( $buffer = ''; strlen( $buffer ) < $length; $buffer .= $piece ) {
+			$piece = @fread( $this->stdout, $length - strlen( $buffer ) );
+			if ( false === $piece || '' === $piece ) {
+				return null;
+			}
+		}
+
+		return $buffer;
+	}
+
+	private function fail( string $reason ): void {
+		$this->last_error = $reason;
+		$this->shutdown(); // A broken transport is never reused: later check() calls return null.
+	}
+
+	public function shutdown(): void {
+		// Close both pipes first, then terminate and reap the child; safe to call repeatedly.
+		foreach ( array( $this->stdin, $this->stdout ) as $pipe ) {
+			if ( is_resource( $pipe ) ) {
+				@fclose( $pipe );
+			}
+		}
+		if ( is_resource( $this->process ) ) {
+			@proc_terminate( $this->process );
+			@proc_close( $this->process );
+		}
+		$this->stdin   = null;
+		$this->stdout  = null;
+		$this->process = null;
+	}
+}
diff --git a/tools/encoding-fuzz/lib/Generator.php b/tools/encoding-fuzz/lib/Generator.php
new file mode 100644
index 0000000000000..eb07d7d89183c
--- /dev/null
+++ b/tools/encoding-fuzz/lib/Generator.php
@@ -0,0 +1,375 @@
+<?php
+namespace EncodingFuzz;
+
+/**
+ * Produces fuzz inputs as raw byte strings, mixing:
+ *
+ *  - uniformly random bytes, over the full byte range or ASCII only
+ *  - guaranteed-valid UTF-8 with boundary-heavy code point choices
+ *  - valid UTF-8 corrupted by targeted mutations
+ *  - splices of hand-picked valid/invalid byte atoms
+ *  - legacy encodings (ISO-8859-1-ish text, UTF-16 with/without BOM)
+ *  - long ASCII runs with multibyte or broken tails (fast-path stress)
+ *  - short motifs repeated many times
+ *
+ * Everything derives from the Prng, so `(seed, case index)` fully
+ * determines the input.
+ */
+class Generator {
+	/**
+	 * Code points sitting on the edges of the well-formed byte ranges in
+	 * Unicode Table 3-7, plus noncharacters and the replacement character.
+	 */
+	private const BOUNDARY_CODE_POINTS = array(
+		0x00, 0x01, 0x09, 0x0A, 0x0D, 0x20, 0x7E, 0x7F,            // ASCII edges.
+		0x80, 0x7FF,                                               // Two-byte edges.
+		0x800, 0xFFF, 0x1000, 0xCFFF, 0xD000, 0xD7FF,              // Three-byte lead splits.
+		0xE000, 0xFFFD,                                            // After the surrogate gap.
+		0xFDD0, 0xFDEF, 0xFFFE, 0xFFFF,                            // Noncharacters (valid UTF-8!).
+		0x10000, 0x3FFFF, 0x40000, 0xFFFFF, 0x100000, 0x10FFFF,    // Four-byte lead splits.
+		0x1FFFE, 0x1FFFF, 0x10FFFE,                                // Supplementary noncharacters.
+	);
+
+	/**
+	 * Short byte sequences that are individually valid UTF-8.
+	 */
+	private const VALID_ATOMS = array(
+		'a',
+		'hello',
+		"\x00",
+		"\x7F",
+		"\xC2\x80",         // U+0080, smallest two-byte.
+		"\xDF\xBF",         // U+07FF, largest two-byte.
+		"\xE0\xA0\x80",     // U+0800, smallest three-byte.
+		"\xED\x9F\xBF",     // U+D7FF, last before surrogates.
+		"\xEE\x80\x80",     // U+E000, first after surrogates.
+		"\xEF\xBF\xBD",     // U+FFFD, replacement character itself.
+		"\xEF\xBF\xBE",     // U+FFFE, noncharacter.
+		"\xEF\xBF\xBF",     // U+FFFF, noncharacter.
+		"\xEF\xBB\xBF",     // U+FEFF, byte order mark.
+		"\xF0\x90\x80\x80", // U+10000, smallest four-byte.
+		"\xF4\x8F\xBF\xBF", // U+10FFFF, largest code point.
+	);
+
+	/**
+	 * Short byte sequences that are individually ill-formed UTF-8,
+	 * covering every class of failure: bad leads, overlongs, surrogates,
+	 * out-of-range, lone/excess continuations, and truncations.
+	 */
+	private const INVALID_ATOMS = array(
+		"\x80",             // Lone continuation.
+		"\xBF",             // Lone continuation, upper edge.
+		"\x80\x80\x80",     // Continuation run.
+		"\xC0",             // Never-valid lead.
+		"\xC0\xAF",         // Overlong '/'.
+		"\xC1\xBF",         // Overlong, largest C1 form.
+		"\xC2",             // Truncated two-byte.
+		"\xC2\xC2\x80",     // Truncated lead then valid char.
+		"\xE0\x80\xAF",     // Overlong three-byte.
+		"\xE0\x9F\xBF",     // Overlong three-byte, upper edge.
+		"\xE1\x80",         // Truncated three-byte (valid prefix).
+		"\xE2\x8C",         // Truncated three-byte (valid prefix).
+		"\xED\xA0\x80",     // Surrogate U+D800.
+		"\xED\xBF\xBF",     // Surrogate U+DFFF.
+		"\xED\xB0\x80",     // Low surrogate half.
+		"\xEF\xBF",         // Truncated three-byte.
+		"\xF0\x80\x80\xAF", // Overlong four-byte.
+		"\xF0\x8F\xBF\xBF", // Overlong four-byte, upper edge.
+		"\xF0\x90",         // Truncated four-byte (valid prefix).
+		"\xF1\x80",         // Truncated four-byte (valid prefix).
+		"\xF1\x80\x80",     // Truncated four-byte, three valid bytes.
+		"\xF4\x8F\xBF",     // Truncated U+10FFFF.
+		"\xF4\x90\x80\x80", // First code point past U+10FFFF.
+		"\xF5\x80\x80\x80", // Never-valid lead F5.
+		"\xF8\x80\x80\x80\x80", // Old-style five-byte form.
+		"\xFC\x80\x80\x80\x80\x80", // Old-style six-byte form.
+		"\xFE",             // Never valid.
+		"\xFF",             // Never valid.
+		"\xFE\xFF",         // UTF-16BE BOM.
+		"\xFF\xFE",         // UTF-16LE BOM.
+		"a\xF1\x80\x80\xE1\x80\xC2b", // Unicode Table 3-8 example: three maximal subparts.
+	);
+
+	private Prng $prng;
+	private int $max_bytes;
+
+	public function __construct( Prng $prng, int $max_bytes = 65536 ) {
+		$this->max_bytes = $max_bytes < 1 ? 1 : $max_bytes; // The length cap is at least one byte.
+		$this->prng      = $prng;
+	}
+
+	/**
+	 * Picks one strategy by weight and runs the matching `gen_*` method.
+	 *
+	 * @return array{strategy: string, bytes: string}
+	 */
+	public function generate(): array {
+		// Relative integer weights; they sum to 100.
+		$weights = array(
+			'random-bytes'    => 14,
+			'random-ascii'    => 4,
+			'valid-utf8'      => 18,
+			'mutated-valid'   => 24,
+			'atom-splice'     => 20,
+			'latin1-text'     => 4,
+			'utf16-bytes'     => 4,
+			'ascii-fast-path' => 6,
+			'repeat-motif'    => 6,
+		);
+
+		// E.g. 'atom-splice' dispatches to gen_atom_splice().
+		$strategy = $this->prng->weighted( $weights );
+		$bytes    = $this->{ 'gen_' . strtr( $strategy, '-', '_' ) }();
+		return compact( 'strategy', 'bytes' );
+	}
+
+	private function gen_random_bytes(): string {
+		return $this->prng->bytes( $this->prng->biased_length( $this->max_bytes ) ); // Raw PRNG bytes; length drawn first, capped at max_bytes.
+	}
+
+	/** Random-length string of independently drawn bytes in the range 0x00–0x7F. */
+	private function gen_random_ascii(): string {
+		$ascii = '';
+		for ( $left = $this->prng->biased_length( $this->max_bytes ); $left > 0; --$left ) {
+			$ascii .= chr( $this->prng->int( 0, 0x7F ) );
+		}
+		return $ascii;
+	}
+
+	/**
+	 * Builds well-formed UTF-8 by appending random pieces (ASCII runs,
+	 * boundary code points, and code points drawn from fixed ranges)
+	 * until the drawn length budget is reached.
+	 *
+	 * The last piece may overshoot the budget, and the result is never
+	 * truncated afterwards.
+	 */
+	private function gen_valid_utf8(): string {
+		$budget = $this->prng->biased_length( $this->max_bytes );
+		$text   = '';
+
+		// Relative integer weights for each kind of piece.
+		$piece_weights = array(
+			'ascii-run'  => 30,
+			'boundary'   => 20,
+			'two-byte'   => 15,
+			'three-byte' => 15,
+			'four-byte'  => 10,
+			'any'        => 10,
+		);
+
+		// Inclusive code point ranges for the range-driven kinds.
+		$ranges = array(
+			'two-byte'   => array( 0x80, 0x7FF ),
+			'three-byte' => array( 0x800, 0xFFFF ),
+			'four-byte'  => array( 0x10000, 0x10FFFF ),
+			'any'        => array( 0x00, 0x10FFFF ),
+		);
+
+		while ( strlen( $text ) < $budget ) {
+			$kind = $this->prng->weighted( $piece_weights );
+
+			if ( 'ascii-run' === $kind ) {
+				for ( $left = $this->prng->int( 1, 16 ); $left > 0; --$left ) {
+					$text .= chr( $this->prng->int( 0x00, 0x7F ) );
+				}
+				continue;
+			}
+
+			if ( 'boundary' === $kind ) {
+				$code_point = $this->prng->choice( self::BOUNDARY_CODE_POINTS );
+			} else {
+				list( $low, $high ) = $ranges[ $kind ];
+				$code_point         = $this->prng->int( $low, $high );
+				// Surrogates cannot be encoded; shift them down to U+D000–U+D7FF.
+				if ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) {
+					$code_point -= 0x800;
+				}
+			}
+
+			$text .= self::encode_code_point( $code_point );
+		}
+
+		return $text;
+	}
+
+	private function gen_mutated_valid(): string { // Valid UTF-8 damaged by a handful of random byte-level edits.
+		$bytes     = $this->gen_valid_utf8();
+		$mutations = $this->prng->int( 1, 6 ); // Between one and six edits per input.
+
+		for ( $m = 0; $m < $mutations && '' !== $bytes; $m++ ) { // Stops early once nothing is left to edit.
+			$kind = $this->prng->weighted(
+				array(
+					'flip-bit'     => 20,
+					'set-byte'     => 20,
+					'delete-span'  => 15,
+					'truncate'     => 15,
+					'insert-bytes' => 15,
+					'duplicate'    => 10,
+					'swap'         => 5,
+				)
+			);
+
+			$length = strlen( $bytes );
+			$at     = $this->prng->int( 0, max( 0, $length - 1 ) ); // Offset the edit applies to; drawn for every kind.
+
+			switch ( $kind ) {
+				case 'flip-bit':
+					$bytes[ $at ] = chr( ord( $bytes[ $at ] ) ^ ( 1 << $this->prng->int( 0, 7 ) ) );
+					break;
+
+				case 'set-byte':
+					$bytes[ $at ] = chr( $this->prng->int( 0, 255 ) );
+					break;
+
+				case 'delete-span':
+					$span  = $this->prng->int( 1, min( 8, $length ) ); // Removes up to eight bytes starting at $at.
+					$bytes = substr( $bytes, 0, $at ) . substr( $bytes, $at + $span );
+					break;
+
+				case 'truncate':
+					// Tail truncation is the classic incomplete-sequence case.
+					$bytes = $this->prng->chance( 50 )
+						? substr( $bytes, 0, $at )
+						: substr( $bytes, $at );
+					break;
+
+				case 'insert-bytes':
+					$insert = $this->prng->bytes( $this->prng->int( 1, 6 ) );
+					$bytes  = substr( $bytes, 0, $at ) . $insert . substr( $bytes, $at );
+					break;
+
+				case 'duplicate':
+					$span  = $this->prng->int( 1, min( 8, $length - $at ) ); // The span never runs past the end.
+					$slice = substr( $bytes, $at, $span );
+					$bytes = substr( $bytes, 0, $at ) . $slice . $slice . substr( $bytes, $at + $span );
+					break;
+
+				case 'swap':
+					$other          = $this->prng->int( 0, $length - 1 );
+					$tmp            = $bytes[ $at ];
+					$bytes[ $at ]   = $bytes[ $other ];
+					$bytes[ $other ] = $tmp;
+					break;
+			}
+		}
+
+		return substr( $bytes, 0, $this->max_bytes ); // Edits can grow the string, so the cap is applied last.
+	}
+
+	/**
+	 * Concatenates up to 24 pieces, each a hand-picked invalid atom, a
+	 * valid atom, one printable ASCII byte, or 1–4 random bytes.
+	 */
+	private function gen_atom_splice(): string {
+		$pool_weights = array(
+			'invalid' => 45,
+			'valid'   => 35,
+			'ascii'   => 12,
+			'random'  => 8,
+		);
+
+		$spliced   = '';
+		$remaining = $this->prng->int( 1, 24 );
+
+		while ( $remaining-- > 0 && strlen( $spliced ) < $this->max_bytes ) {
+			$pool = $this->prng->weighted( $pool_weights );
+
+			if ( 'invalid' === $pool ) {
+				$spliced .= $this->prng->choice( self::INVALID_ATOMS );
+			} elseif ( 'valid' === $pool ) {
+				$spliced .= $this->prng->choice( self::VALID_ATOMS );
+			} elseif ( 'ascii' === $pool ) {
+				$spliced .= chr( $this->prng->int( 0x20, 0x7E ) );
+			} else {
+				$spliced .= $this->prng->bytes( $this->prng->int( 1, 4 ) );
+			}
+		}
+
+		return substr( $spliced, 0, $this->max_bytes );
+	}
+
+	/**
+	 * Printable ASCII with about one byte in four replaced by a 0xA0–0xFF byte.
+	 */
+	private function gen_latin1_text(): string {
+		$text = '';
+		for ( $left = $this->prng->biased_length( $this->max_bytes ); $left > 0; --$left ) {
+			list( $low, $high ) = $this->prng->chance( 25 ) ? array( 0xA0, 0xFF ) : array( 0x20, 0x7E );
+			$text              .= chr( $this->prng->int( $low, $high ) );
+		}
+		return $text;
+	}
+
+	/**
+	 * UTF-16LE/BE transcoding of up to 512 bytes of valid UTF-8, with a BOM half the time.
+	 */
+	private function gen_utf16_bytes(): string {
+		// mb_strcut() trims on a character boundary, so the source stays well-formed.
+		$text  = mb_strcut( $this->gen_valid_utf8(), 0, 512, 'UTF-8' );
+		$le    = $this->prng->chance( 50 );
+		$bytes = (string) mb_convert_encoding( $text, $le ? 'UTF-16LE' : 'UTF-16BE', 'UTF-8' );
+		$bom   = $le ? "\xFF\xFE" : "\xFE\xFF";
+		return substr( $this->prng->chance( 50 ) ? $bom . $bytes : $bytes, 0, $this->max_bytes );
+	}
+
+	/**
+	 * Long pure-ASCII run, exercising the `strspn()` fast path in
+	 * `_wp_scan_utf8()`, with a tail that lands a multibyte or broken
+	 * sequence right at the end of the buffer. Never exceeds `max_bytes`.
+	 */
+	private function gen_ascii_fast_path(): string {
+		// Keep the run within the cap even when the cap is below 1024.
+		$limit = min( 65536, $this->max_bytes );
+		$run   = str_repeat( 'a', $this->prng->int( min( 1024, $limit ), $limit ) );
+		$shape = $this->prng->int( 0, 4 );
+		// 0: pure ASCII, 1: valid tail, 2: broken tail, 3: truncated tail, 4: broken sequence between two runs.
+		$broken = in_array( $shape, array( 2, 4 ), true ) ? $this->prng->choice( self::INVALID_ATOMS ) : '';
+		$tails  = array( '', "\xE2\x9C\x8F", $broken, "\xE2\x9C", $broken );
+		$tail   = $tails[ $shape ];
+		// Shorten the ASCII so run(s) plus tail fit, keeping the tail intact whenever it fits.
+		$room = max( 0, $this->max_bytes - strlen( $tail ) );
+		if ( 4 === $shape ) {
+			$half = substr( $run, 0, intdiv( $room, 2 ) );
+			return substr( $half . $tail . $half, 0, $this->max_bytes );
+		}
+		return substr( substr( $run, 0, $room ) . $tail, 0, $this->max_bytes );
+	}
+
+	/**
+	 * Repeats one atom (valid or invalid, sometimes with 1–3 random bytes
+	 * appended), usually at most 256 times and otherwise at most 16384.
+	 */
+	private function gen_repeat_motif(): string {
+		$pool  = $this->prng->chance( 50 ) ? self::INVALID_ATOMS : self::VALID_ATOMS;
+		$motif = $this->prng->choice( $pool );
+		if ( $this->prng->chance( 30 ) ) {
+			$motif .= $this->prng->bytes( $this->prng->int( 1, 3 ) );
+		}
+		$repeats = $this->prng->int( 1, intdiv( $this->max_bytes, max( 1, strlen( $motif ) ) ) );
+		$ceiling = $this->prng->chance( 80 ) ? 256 : 16384;
+		return substr( str_repeat( $motif, max( 1, min( $repeats, $ceiling ) ) ), 0, $this->max_bytes );
+	}
+
+	/**
+	 * Encodes a code point using the one- to four-byte UTF-8 forms. The
+	 * value is not validated here (no surrogate or range check).
+	 */
+	public static function encode_code_point( int $code_point ): string {
+		// Each continuation byte is 0b10xxxxxx carrying six payload bits.
+		$tail_2 = chr( 0x80 | ( ( $code_point >> 12 ) & 0x3F ) );
+		$tail_1 = chr( 0x80 | ( ( $code_point >> 6 ) & 0x3F ) );
+		$tail_0 = chr( 0x80 | ( $code_point & 0x3F ) );
+
+		if ( $code_point >= 0x10000 ) {
+			return chr( 0xF0 | ( $code_point >> 18 ) ) . $tail_2 . $tail_1 . $tail_0;
+		}
+		if ( $code_point >= 0x800 ) {
+			return chr( 0xE0 | ( $code_point >> 12 ) ) . $tail_1 . $tail_0;
+		}
+		if ( $code_point >= 0x80 ) {
+			return chr( 0xC0 | ( $code_point >> 6 ) ) . $tail_0;
+		}
+		return chr( $code_point );
+	}
+}
diff --git a/tools/encoding-fuzz/lib/Oracles.php b/tools/encoding-fuzz/lib/Oracles.php
new file mode 100644
index 0000000000000..8dce899c84db4
--- /dev/null
+++ b/tools/encoding-fuzz/lib/Oracles.php
@@ -0,0 +1,227 @@
+<?php
+namespace EncodingFuzz;
+
+/**
+ * Known-good UTF-8 implementations used as ground truth.
+ *
+ * Validity oracles answer "is this well-formed UTF-8?".
+ * Scrub oracles answer "what does maximal-subpart replacement produce?".
+ *
+ *  - mbstring:  `mb_check_encoding()` / `mb_scrub()` (maximal subpart
+ *               since PHP 8.1.6).
+ *  - pcre:      PCRE2's strict UTF validity check (validity only).
+ *  - intl:      ICU via `UConverter::transcode()` (scrub only).
+ *  - python3:   CPython codec in a persistent subprocess.
+ *  - node:      WHATWG TextDecoder in a persistent subprocess.
+ *
+ * iconv is deliberately NOT an oracle: GNU libiconv accepts code points
+ * above U+10FFFF (e.g. F4 90 80 80), so it fails the battery.
+ *
+ * Every oracle must pass the known-answer battery before use; one that
+ * fails is disabled and reported rather than allowed to produce noise.
+ */
+class Oracles {
+	/** @var array<string, callable(string): ?bool> */
+	private array $validity = array();
+
+	/** @var array<string, callable(string): ?string> */
+	private array $scrub = array();
+
+	/** @var ExternalOracle[] */
+	private array $externals = array();
+
+	/** @var array<int, array{type: string, oracle: string, detail: string}> */
+	private array $events = array();
+
+	/**
+	 * @param string[] $external_names Subset of ['python3', 'node'].
+	 */
+	public static function build( array $external_names ): self {
+		$oracles = new self();
+
+		if ( function_exists( 'mb_check_encoding' ) && function_exists( 'mb_scrub' ) ) {
+			$oracles->validity['mb'] = static function ( string $bytes ): bool {
+				return mb_check_encoding( $bytes, 'UTF-8' );
+			};
+			$oracles->scrub['mb']    = static function ( string $bytes ): string {
+				$previous = mb_substitute_character(); // Saved so the caller's setting can be restored.
+				mb_substitute_character( 0xFFFD ); // Substitute with U+FFFD, as the battery expects.
+				$scrubbed = mb_scrub( $bytes, 'UTF-8' );
+				mb_substitute_character( $previous );
+				return $scrubbed;
+			};
+		} else {
+			$oracles->events[] = array(
+				'type'   => 'oracle-unavailable',
+				'oracle' => 'mb',
+				'detail' => 'mbstring with mb_scrub is required as the primary oracle',
+			);
+		}
+
+		// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
+		if ( false !== @preg_match( '/^./u', 'a' ) ) {
+			$oracles->validity['pcre'] = static function ( string $bytes ): bool {
+				// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
+				return false !== @preg_match( '//u', $bytes ); // Relies on preg_match() failing for subjects that are not valid UTF-8.
+			};
+		}
+
+		if ( class_exists( \UConverter::class ) ) {
+			$oracles->scrub['intl'] = static function ( string $bytes ): ?string {
+				// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
+				$scrubbed = @\UConverter::transcode( $bytes, 'UTF-8', 'UTF-8' );
+				return false === $scrubbed ? null : $scrubbed; // Null stands for "no answer".
+			};
+		}
+
+		foreach ( $external_names as $name ) {
+			list( $external, $error ) = ExternalOracle::create( $name );
+			if ( null === $external ) {
+				$oracles->events[] = array(
+					'type'   => 'oracle-unavailable',
+					'oracle' => $name,
+					'detail' => (string) $error,
+				);
+				continue; // Not registered; the event above explains why.
+			}
+
+			$oracles->externals[]         = $external;
+			$oracles->validity[ $name ]   = static function ( string $bytes ) use ( $external ): ?bool {
+				$result = $external->check( $bytes ); // One response carries both the verdict and the scrubbed text.
+				return null === $result ? null : $result['valid'];
+			};
+			$oracles->scrub[ $name ]      = static function ( string $bytes ) use ( $external ): ?string {
+				$result = $external->check( $bytes );
+				return null === $result ? null : $result['scrubbed'];
+			};
+		}
+
+		$oracles->verify_battery(); // Drops any oracle that fails a known-answer vector.
+
+		return $oracles;
+	}
+
+	/**
+	 * Known-answer vectors covering every ill-formedness class with
+	 * hand-computed maximal-subpart replacements (Unicode 16.0 §3.9 and
+	 * Table 3-8). Any oracle disagreeing with these is disabled.
+	 *
+	 * @return array<int, array{0: string, 1: bool, 2: string}> [bytes, valid, scrubbed]
+	 */
+	public static function battery(): array {
+		$r = "\u{FFFD}";
+
+		return array(
+			array( '', true, '' ),                                     // Empty input.
+			array( 'abc', true, 'abc' ),                               // Plain ASCII.
+			array( "\x00", true, "\x00" ),                             // NUL is well-formed.
+			array( "\xC3\xBC", true, "\xC3\xBC" ),                     // U+00FC, two bytes.
+			array( "\xE2\x9C\x8F", true, "\xE2\x9C\x8F" ),             // U+270F, three bytes.
+			array( "\xF0\x9F\x98\x80", true, "\xF0\x9F\x98\x80" ),     // U+1F600, four bytes.
+			array( "\xEF\xBB\xBFabc", true, "\xEF\xBB\xBFabc" ),       // BOM must be preserved.
+			array( "\xEF\xBF\xBD", true, "\xEF\xBF\xBD" ),             // U+FFFD itself.
+			array( "\xEF\xBF\xBE", true, "\xEF\xBF\xBE" ),             // Noncharacters are well-formed.
+			array( "\xED\x9F\xBF", true, "\xED\x9F\xBF" ),             // U+D7FF.
+			array( "\xEE\x80\x80", true, "\xEE\x80\x80" ),             // U+E000.
+			array( "\xF4\x8F\xBF\xBF", true, "\xF4\x8F\xBF\xBF" ),     // U+10FFFF.
+			array( "\x80", false, $r ),                                // Lone continuation.
+			array( "\xFF", false, $r ),                                // Never-valid byte.
+			array( "\xC0", false, $r ),                                // Never-valid lead.
+			array( "\xC2", false, $r ),                                // Truncated at EOF.
+			array( "\xC0\xAF", false, "{$r}{$r}" ),                    // Overlong '/'.
+			array( "\xC1\xBF", false, "{$r}{$r}" ),                    // Overlong U+007F.
+			array( "\xE0\x80\xAF", false, "{$r}{$r}{$r}" ),            // Overlong three-byte.
+			array( "\xE0\x9F\xBF", false, "{$r}{$r}{$r}" ),            // Overlong U+07FF.
+			array( "\xED\xA0\x80", false, "{$r}{$r}{$r}" ),            // Surrogate U+D800.
+			array( "\xED\xB0\x80", false, "{$r}{$r}{$r}" ),            // Surrogate U+DC00.
+			array( "\xF0\x80\x80\xAF", false, "{$r}{$r}{$r}{$r}" ),    // Overlong four-byte.
+			array( "\xF4\x90\x80\x80", false, "{$r}{$r}{$r}{$r}" ),    // Past U+10FFFF.
+			array( "\xF5\x80\x80\x80", false, "{$r}{$r}{$r}{$r}" ),    // Never-valid lead F5.
+			array( "\xE2\x8C", false, $r ),                            // Maximal subpart, two bytes.
+			array( "\xF1\x80\x80", false, $r ),                        // Maximal subpart, three bytes.
+			array( "\xF0\x90", false, $r ),                            // Truncated four-byte, two bytes.
+			array( "\xE2\x8C\xE2\x8C", false, "{$r}{$r}" ),            // Two truncated sequences in a row.
+			array( ".\xC0.", false, ".{$r}." ),                        // Invalid byte between ASCII.
+			array( "B\xFCch", false, "B{$r}ch" ),                      // Lone 0xFC byte inside ASCII text.
+			array( "abc\xE2\x9C", false, "abc{$r}" ),                  // Truncated sequence after ASCII.
+			array( "a\xF1\x80\x80\xE1\x80\xC2b", false, "a{$r}{$r}{$r}b" ), // Unicode Table 3-8.
+		);
+	}
+
+	/**
+	 * Runs every registered oracle over every battery vector and disables
+	 * any oracle whose answer is not identical to the expected one.
+	 * Later vectors are not run against an oracle once it is disabled.
+	 */
+	private function verify_battery(): void {
+		foreach ( self::battery() as $index => list( $bytes, $want_valid, $want_scrub ) ) {
+			$hex = bin2hex( $bytes );
+
+			foreach ( $this->validity as $name => $oracle ) {
+				$actual = $oracle( $bytes );
+				// Strict comparison, so a null "no answer" also counts as a failure.
+				if ( $actual === $want_valid ) {
+					continue;
+				}
+				$format = 'validity battery vector %d (%s): expected %s, got %s';
+				$this->disable( $name, sprintf( $format, $index, $hex, var_export( $want_valid, true ), var_export( $actual, true ) ) );
+			}
+
+			// An oracle disabled just above is already gone from this map too.
+			foreach ( $this->scrub as $name => $oracle ) {
+				$actual = $oracle( $bytes );
+				if ( $actual === $want_scrub ) {
+					continue;
+				}
+				$format = 'scrub battery vector %d (%s): expected %s, got %s';
+				$shown  = null === $actual ? 'null' : bin2hex( $actual );
+				$this->disable( $name, sprintf( $format, $index, $hex, bin2hex( $want_scrub ), $shown ) );
+			}
+		}
+	}
+
+	/** Removes the named oracle from both maps and records one `oracle-disabled` event. */
+	public function disable( string $name, string $detail ): void {
+		$registered = isset( $this->validity[ $name ] ) || isset( $this->scrub[ $name ] );
+		if ( $registered ) {
+			unset( $this->validity[ $name ], $this->scrub[ $name ] );
+			$this->events[] = array(
+				'type'   => 'oracle-disabled',
+				'oracle' => $name,
+				'detail' => $detail,
+			);
+		}
+	}
+
+	/** @return array<string, callable(string): ?bool> Validity oracles currently registered, keyed by name. */
+	public function validity_oracles(): array {
+		return $this->validity;
+	}
+
+	/** @return array<string, callable(string): ?string> Scrub oracles currently registered, keyed by name. */
+	public function scrub_oracles(): array {
+		return $this->scrub;
+	}
+
+	public function has_required(): bool {
+		return isset( $this->validity['mb'], $this->scrub['mb'] ); // 'mb' must still be registered in both maps.
+	}
+
+	public function names(): array {
+		return array_keys( $this->validity + $this->scrub ); // Array union: validity names first, then scrub-only names.
+	}
+
+	/** @return array<int, array{type: string, oracle: string, detail: string}> Events recorded since the last drain. */
+	public function drain_events(): array {
+		$events       = $this->events;
+		$this->events = array(); // Each event is handed out only once.
+		return $events;
+	}
+
+	/** Shuts down every external oracle and forgets the handles. */
+	public function shutdown(): void {
+		while ( array() !== $this->externals ) {
+			array_shift( $this->externals )->shutdown();
+		}
+	}
+}
diff --git a/tools/encoding-fuzz/lib/Prng.php b/tools/encoding-fuzz/lib/Prng.php
new file mode 100644
index 0000000000000..354e1f879042f
--- /dev/null
+++ b/tools/encoding-fuzz/lib/Prng.php
@@ -0,0 +1,92 @@
+<?php
+namespace EncodingFuzz;
+
+/**
+ * Deterministic, seed-addressed pseudo-random byte source.
+ *
+ * The same seed always yields the same stream, independent of PHP
+ * version or platform, so any generated input can be re-derived from
+ * `(seed, case index)` alone.
+ */
+class Prng {
+	private string $seed;
+	private int $counter   = 0;
+	private string $buffer = '';
+
+	public function __construct( string $seed ) {
+		$this->seed = $seed; // Mixed into every hashed block of the stream.
+	}
+
+	/** Returns the next $length bytes of the stream, hashing "seed:counter" blocks on demand. */
+	public function bytes( int $length ): string {
+		for ( ; strlen( $this->buffer ) < $length; ++$this->counter ) {
+			$this->buffer .= hash( 'sha256', "{$this->seed}:{$this->counter}", true );
+		}
+		$taken        = substr( $this->buffer, 0, $length );
+		$this->buffer = (string) substr( $this->buffer, $length );
+		return $taken;
+	}
+
+	public function uint32(): int {
+		list( , $value ) = unpack( 'N', $this->bytes( 4 ) ); // Big-endian; unnamed unpack() results start at index 1.
+		return (int) $value;
+	}
+
+	/**
+	 * Integer in [$min, $max] by modulo reduction; when $max <= $min, returns $min without drawing.
+	 */
+	public function int( int $min, int $max ): int {
+		$span = $max - $min + 1;
+		return $span > 1 ? $min + ( $this->uint32() % $span ) : $min;
+	}
+
+	public function chance( int $numerator, int $denominator = 100 ): bool {
+		return $numerator >= $this->int( 1, $denominator ); // True in about $numerator of every $denominator calls.
+	}
+
+	public function choice( array $values ) {
+		return $values[ $this->int( 0, count( $values ) - 1 ) ]; // Indexes by position, so $values should be a non-empty 0-based list.
+	}
+
+	/**
+	 * Picks a key with probability proportional to its weight.
+	 * @param array $weights Map of value => integer weight.
+	 */
+	public function weighted( array $weights ) {
+		$ticket = $this->int( 1, max( 1, (int) array_sum( $weights ) ) );
+		$seen   = 0;
+		foreach ( $weights as $value => $weight ) {
+			$seen += $weight;
+			if ( $ticket <= $seen ) {
+				return $value;
+			}
+		}
+		return array_key_first( $weights );
+	}
+
+	/**
+	 * Length distribution biased toward short inputs with an occasional
+	 * large outlier, capped at `$max`.
+	 *
+	 * Buckets overlap: each one draws from zero up to its own limit.
+	 */
+	public function biased_length( int $max ): int {
+		$bucket = $this->weighted(
+			array(
+				'tiny'  => 35, // 0–8 bytes.
+				'short' => 35, // 0–64 bytes.
+				'mid'   => 22, // 0–1024 bytes.
+				'large' => 8,  // 0–$max bytes.
+			)
+		);
+		// Upper limit per bucket, before capping at $max.
+		$limits = array(
+			'tiny'  => 8,
+			'short' => 64,
+			'mid'   => 1024,
+			'large' => $max,
+		);
+
+		return $this->int( 0, min( $limits[ $bucket ], $max ) );
+	}
+}
diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php
new file mode 100644
index 0000000000000..f810d9d934eda
--- /dev/null
+++ b/tools/encoding-fuzz/lib/Targets.php
@@ -0,0 +1,44 @@
+<?php
+namespace EncodingFuzz;
+
+/**
+ * Resolves the target callables under test.
+ *
+ * `ENCODING_FUZZ_FAULT` injects a deliberately broken variant so the
+ * whole pipeline — worker failure artifacts, replay, minimization — can
+ * be exercised end to end even while the real implementations are
+ * healthy. It exists only for harness validation:
+ *
+ *   ENCODING_FUZZ_FAULT=accept-c0    validator accepts the 0xC0 byte
+ *   ENCODING_FUZZ_FAULT=non-maximal  scrubber collapses adjacent U+FFFD
+ */
+class Targets {
+	/**
+	 * Maps each target key to its callable, swapping in a deliberately
+	 * broken variant when `ENCODING_FUZZ_FAULT` names one.
+	 *
+	 * @return array<string, callable>
+	 */
+	public static function resolve(): array {
+		$fault   = getenv( 'ENCODING_FUZZ_FAULT' );
+		$targets = array(
+			'is_valid'        => 'wp_is_valid_utf8',
+			'is_valid_fb'     => '_wp_is_valid_utf8_fallback',
+			'scrub'           => 'wp_scrub_utf8',
+			'scrub_fb'        => '_wp_scrub_utf8_fallback',
+			'codepoint_count' => '_wp_utf8_codepoint_count',
+		);
+
+		if ( 'accept-c0' === $fault ) {
+			$targets['is_valid_fb'] = static function ( string $bytes ): bool {
+				return str_contains( $bytes, "\xC0" ) || _wp_is_valid_utf8_fallback( $bytes );
+			};
+		} elseif ( 'non-maximal' === $fault ) {
+			$targets['scrub_fb'] = static function ( string $bytes ): string {
+				return (string) preg_replace( "/(\u{FFFD})+/u", "\u{FFFD}", _wp_scrub_utf8_fallback( $bytes ) );
+			};
+		}
+
+		return $targets;
+	}
+}
diff --git a/tools/encoding-fuzz/lib/autoload.php b/tools/encoding-fuzz/lib/autoload.php
new file mode 100644
index 0000000000000..71ecffa989b99
--- /dev/null
+++ b/tools/encoding-fuzz/lib/autoload.php
@@ -0,0 +1,16 @@
+<?php
+
+spl_autoload_register(
+	static function ( $class ) {
+		// Only classes inside the EncodingFuzz namespace are handled here.
+		$prefix = 'EncodingFuzz\\';
+		if ( 0 === strpos( $class, $prefix ) ) {
+			// Namespace separators become directory separators below this
+			// file's directory, e.g. EncodingFuzz\Prng => Prng.php.
+			$path = __DIR__ . '/' . strtr( substr( $class, strlen( $prefix ) ), '\\', '/' ) . '.php';
+			if ( is_file( $path ) ) {
+				require $path;
+			}
+		}
+	}
+);
diff --git a/tools/encoding-fuzz/lib/wp-stubs.php b/tools/encoding-fuzz/lib/wp-stubs.php
new file mode 100644
index 0000000000000..ffe4cbc64a191
--- /dev/null
+++ b/tools/encoding-fuzz/lib/wp-stubs.php
@@ -0,0 +1,16 @@
+<?php
+/**
+ * Minimal global stand-ins so `src/wp-includes/utf8.php` can load
+ * without pulling in the rest of WordPress.
+ */
+
+if ( ! function_exists( '_wp_can_use_pcre_u' ) ) {
+	/** Reports whether PCRE accepts the `u` modifier; probed once, and `$set` is ignored. */
+	function _wp_can_use_pcre_u( $set = null ): bool {
+		static $supported = null;
+
+		// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
+		$supported = $supported ?? ( false !== @preg_match( '/^./u', 'a' ) );
+		return (bool) $supported;
+	}
+}
diff --git a/tools/encoding-fuzz/minimize.php b/tools/encoding-fuzz/minimize.php
new file mode 100644
index 0000000000000..bd3b13ab60a11
--- /dev/null
+++ b/tools/encoding-fuzz/minimize.php
@@ -0,0 +1,161 @@
+<?php
+/**
+ * Shrinks a failing input while preserving its failure signature.
+ *
+ *     php tools/encoding-fuzz/minimize.php --failure artifacts/.../failure.json
+ *     php tools/encoding-fuzz/minimize.php --input artifacts/.../input.bin --signature scrub-mismatch:scrub_fb
+ *
+ * Strategy: delta-debugging style chunk removal at halving granularity,
+ * then per-byte removal, then byte canonicalization toward 'a'. The
+ * minimized artifact lands next to the original (or in --output-dir).
+ *
+ * Exit codes: 0 minimized, 1 signature did not reproduce, 2 harness error.
+ */
+
+namespace EncodingFuzz;
+
+require __DIR__ . '/lib/autoload.php';
+
+error_reporting( E_ALL );
+ini_set( 'display_errors', 'stderr' );
+ini_set( 'memory_limit', '512M' );
+
+$options = Cli::parse_args(
+	$argv,
+	array(
+		'failure'    => '',     // Path to a failure.json manifest.
+		'input'      => '',     // Path to a raw input file; used when --failure is absent.
+		'signature'  => '',     // Failure signature to preserve, e.g. scrub-mismatch:scrub_fb.
+		'external'   => 'auto', // Passed to Cli::resolve_externals() to choose external oracles.
+		'output-dir' => '',     // Where the minimized artifacts go; empty means next to the source file.
+	)
+);
+
+$input      = null;
+$signature  = $options['signature'];
+$source_dir = $options['output-dir']; // May still be empty here; filled in from the source path below.
+
+if ( '' !== $options['failure'] ) {
+	$manifest = json_decode( (string) file_get_contents( $options['failure'] ), true );
+	$encoded  = is_array( $manifest ) ? ( $manifest['input_base64'] ?? null ) : null;
+	$input    = is_string( $encoded ) ? base64_decode( $encoded, true ) : false;
+	if ( false === $input ) { // Unreadable file, malformed JSON, or an input that is not strict base64.
+		fwrite( STDERR, "Cannot read failure manifest {$options['failure']}\n" );
+		exit( 2 );
+	}
+	// An explicit --signature wins over the first signature in the manifest.
+	$signature = '' !== $signature ? $signature : ( $manifest['signatures'][0] ?? '' );
+	if ( '' === $source_dir ) {
+		$source_dir = dirname( $options['failure'] );
+	}
+} elseif ( '' !== $options['input'] ) {
+	$input = file_get_contents( $options['input'] );
+	if ( false === $input ) {
+		fwrite( STDERR, "Cannot read input file {$options['input']}\n" );
+		exit( 2 );
+	}
+	if ( '' === $source_dir ) {
+		$source_dir = dirname( $options['input'] );
+	}
+} else {
+	fwrite( STDERR, "Provide --failure or --input.\n" );
+	exit( 2 );
+}
+
+if ( '' === $signature ) {
+	fwrite( STDERR, "No signature given and none found in the manifest.\n" );
+	exit( 2 );
+}
+
+Bootstrap::load_targets();
+
+$oracles = Oracles::build( Cli::resolve_externals( $options['external'] ) );
+if ( ! $oracles->has_required() ) {
+	fwrite( STDERR, "mbstring oracle unavailable; cannot minimize.\n" );
+	exit( 2 );
+}
+
+$checks = new Checks( $oracles );
+
+$reproduces = static function ( string $candidate ) use ( $checks, $signature ): bool { // True when any reported failure carries the wanted signature.
+	foreach ( $checks->run( $candidate ) as $failure ) {
+		if ( $failure['signature'] === $signature ) {
+			return true;
+		}
+	}
+	return false;
+};
+
+if ( ! $reproduces( $input ) ) { // Nothing to minimize unless the original input fails the same way.
+	fwrite( STDERR, "Signature {$signature} does not reproduce on the given input.\n" );
+	exit( 1 );
+}
+
+$current = $input;
+$tries   = 0;
+
+// Phase 1: chunk removal at halving granularity (ddmin-style).
+$chunk = (int) ceil( strlen( $current ) / 2 ); // Start by trying to drop half of the input.
+while ( $chunk >= 1 ) {
+	$progress = false;
+
+	for ( $at = 0; $at < strlen( $current ); ) { // $at only advances when a removal is rejected.
+		$candidate = substr( $current, 0, $at ) . substr( $current, $at + $chunk ); // $current without the $chunk bytes at $at.
+		++$tries;
+
+		if ( '' !== $candidate && strlen( $candidate ) < strlen( $current ) && $reproduces( $candidate ) ) {
+			$current  = $candidate;
+			$progress = true;
+			// Re-test the same offset against the shortened input.
+		} else {
+			$at += max( 1, intdiv( $chunk, 2 ) );
+		}
+	}
+
+	if ( ! $progress && $chunk > 1 ) { // A full pass removed nothing: try smaller chunks.
+		$chunk = intdiv( $chunk, 2 );
+	} elseif ( ! $progress ) { // A single-byte pass removed nothing: phase 1 is done.
+		break;
+	}
+}
+
+// Phase 2: canonicalize bytes toward a printable 'a'.
+// Substitution never changes the length, so it is measured once.
+$size = strlen( $current );
+for ( $at = 0; $at < $size; $at++ ) {
+	if ( 'a' !== $current[ $at ] ) {
+		$candidate = substr_replace( $current, 'a', $at, 1 );
+		++$tries;
+
+		// Keep the substitution only if the failure still reproduces.
+		if ( $reproduces( $candidate ) ) {
+			$current = $candidate;
+		}
+	}
+}
+
+$out_dir = '' !== $source_dir ? $source_dir : '.';
+$summary = array(
+	'signature'      => $signature,
+	'original_size'  => strlen( $input ),
+	'minimized_size' => strlen( $current ),
+	'tries'          => $tries,
+	'input_base64'   => base64_encode( $current ),
+	'input_hex'      => strlen( $current ) <= 256 ? bin2hex( $current ) : null,
+	'environment'    => Cli::environment_metadata( $oracles ),
+	'git'            => Cli::git_metadata( Bootstrap::repo_root() ),
+);
+// Both artifacts must be written; a failed write is a harness error, not a success.
+$saved = false !== file_put_contents( "{$out_dir}/minimized.bin", $current )
+	&& false !== file_put_contents( "{$out_dir}/minimized.json", json_encode( $summary, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES ) );
+
+echo "Minimized {$signature}: " . strlen( $input ) . ' -> ' . strlen( $current ) . " bytes in {$tries} tries.\n";
+echo 'Hex: ' . bin2hex( substr( $current, 0, 128 ) ) . ( strlen( $current ) > 128 ? '…' : '' ) . "\n";
+
+$oracles->shutdown();
+if ( ! $saved ) {
+	fwrite( STDERR, "Cannot write minimized artifacts to {$out_dir}\n" );
+	exit( 2 );
+}
+echo "Artifacts: {$out_dir}/minimized.bin, {$out_dir}/minimized.json\n";
+exit( 0 );
diff --git a/tools/encoding-fuzz/oracles/oracle-node.mjs b/tools/encoding-fuzz/oracles/oracle-node.mjs
new file mode 100644
index 0000000000000..6892825e00475
--- /dev/null
+++ b/tools/encoding-fuzz/oracles/oracle-node.mjs
@@ -0,0 +1,54 @@
+#!/usr/bin/env node
+/**
+ * UTF-8 oracle server backed by the WHATWG TextDecoder.
+ *
+ * The WHATWG Encoding Standard's UTF-8 decoder implements the Unicode
+ * "maximal subpart" replacement recommendation, the same behavior
+ * WordPress targets. `ignoreBOM: true` is required: without it the
+ * decoder silently strips a leading U+FEFF, which is not part of
+ * UTF-8 validation semantics.
+ *
+ * Protocol (over stdin/stdout, binary):
+ *   request:  4-byte big-endian length N, then N payload bytes
+ *   response: 1 status byte (0x01 valid, 0x00 invalid),
+ *             4-byte big-endian length M, then M bytes of the
+ *             replacement-character-scrubbed UTF-8 text
+ */
+const strict = () => new TextDecoder('utf-8', { fatal: true, ignoreBOM: true });
+const lossy = new TextDecoder('utf-8', { ignoreBOM: true });
+const encoder = new TextEncoder();
+
+let buffer = Buffer.alloc(0);
+
+process.stdin.on('data', (chunk) => {
+	buffer = Buffer.concat([buffer, chunk]);
+
+	for (;;) {
+		if (buffer.length < 4) {
+			return;
+		}
+
+		const length = buffer.readUInt32BE(0);
+		if (buffer.length < 4 + length) {
+			return;
+		}
+
+		const payload = buffer.subarray(4, 4 + length);
+		buffer = buffer.subarray(4 + length);
+
+		let valid = 1;
+		try {
+			strict().decode(payload);
+		} catch {
+			valid = 0;
+		}
+
+		const scrubbed = Buffer.from(encoder.encode(lossy.decode(payload)));
+		const header = Buffer.alloc(5);
+		header.writeUInt8(valid, 0);
+		header.writeUInt32BE(scrubbed.length, 1);
+		process.stdout.write(Buffer.concat([header, scrubbed]));
+	}
+});
+
+process.stdin.on('end', () => process.exit(0));
diff --git a/tools/encoding-fuzz/oracles/oracle-python.py b/tools/encoding-fuzz/oracles/oracle-python.py
new file mode 100644
index 0000000000000..82b95c5cc3214
--- /dev/null
+++ b/tools/encoding-fuzz/oracles/oracle-python.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""UTF-8 oracle server backed by CPython's codec.
+
+CPython's UTF-8 decoder implements the Unicode "maximal subpart"
+replacement recommendation, the same behavior WordPress targets.
+
+Protocol (over stdin/stdout, binary):
+  request:  4-byte big-endian length N, then N payload bytes
+  response: 1 status byte (0x01 valid, 0x00 invalid),
+            4-byte big-endian length M, then M bytes of the
+            replacement-character-scrubbed UTF-8 text
+"""
+import struct
+import sys
+
+
+def read_exact(stream, n):
+    chunks = []
+    while n > 0:
+        chunk = stream.read(n)
+        if not chunk:
+            return None
+        chunks.append(chunk)
+        n -= len(chunk)
+    return b"".join(chunks)
+
+
+def main():
+    inp = sys.stdin.buffer
+    out = sys.stdout.buffer
+
+    while True:
+        header = read_exact(inp, 4)
+        if header is None:
+            return
+        (length,) = struct.unpack(">I", header)
+        data = read_exact(inp, length)
+        if data is None:
+            return
+
+        try:
+            data.decode("utf-8")
+            valid = 1
+        except UnicodeDecodeError:
+            valid = 0
+
+        scrubbed = data.decode("utf-8", errors="replace").encode("utf-8")
+        out.write(bytes([valid]) + struct.pack(">I", len(scrubbed)) + scrubbed)
+        out.flush()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/encoding-fuzz/replay.php b/tools/encoding-fuzz/replay.php
new file mode 100644
index 0000000000000..a61ccb2ae8a8d
--- /dev/null
+++ b/tools/encoding-fuzz/replay.php
@@ -0,0 +1,93 @@
+<?php
+/**
+ * Re-runs every check against one saved or re-derived input.
+ *
+ *     php tools/encoding-fuzz/replay.php --failure artifacts/.../failure.json
+ *     php tools/encoding-fuzz/replay.php --input artifacts/.../input.bin
+ *     php tools/encoding-fuzz/replay.php --seed 123 --case 45 [--max-bytes 65536]
+ *
+ * Exit codes: 0 all checks pass, 1 failures reproduced, 2 harness error.
+ */
+
+namespace EncodingFuzz;
+
+require __DIR__ . '/lib/autoload.php';
+
+error_reporting( E_ALL );
+ini_set( 'display_errors', 'stderr' );
+ini_set( 'memory_limit', '512M' );
+
+$options = Cli::parse_args(
+	$argv,
+	array(
+		'failure'   => '',
+		'input'     => '',
+		'seed'      => -1,
+		'case'      => -1,
+		'max-bytes' => 65536,
+		'external'  => 'auto',
+	)
+);
+
+$input  = null;
+$source = null;
+
+if ( '' !== $options['failure'] ) {
+	$manifest = json_decode( (string) file_get_contents( $options['failure'] ), true );
+	$encoded  = is_array( $manifest ) ? ( $manifest['input_base64'] ?? null ) : null; // Null when the manifest is unreadable or lacks the payload.
+	if ( ! is_string( $encoded ) || false === ( $input = base64_decode( $encoded, true ) ) ) { // Strict decode yields false on malformed base64.
+		fwrite( STDERR, "Cannot read failure manifest {$options['failure']}\n" );
+		exit( 2 );
+	}
+	$source = "failure manifest {$options['failure']}";
+} elseif ( '' !== $options['input'] ) {
+	$input = file_get_contents( $options['input'] );
+	if ( false === $input ) {
+		fwrite( STDERR, "Cannot read input file {$options['input']}\n" );
+		exit( 2 );
+	}
+	$source = "input file {$options['input']}";
+} elseif ( $options['seed'] >= 0 && $options['case'] >= 0 ) {
+	$prng      = new Prng( "{$options['seed']}:{$options['case']}" );
+	$generator = new Generator( $prng, $options['max-bytes'] );
+	$generated = $generator->generate();
+	$input     = $generated['bytes'];
+	$source    = "seed {$options['seed']} case {$options['case']} (strategy {$generated['strategy']})";
+} else {
+	fwrite( STDERR, "Provide --failure, --input, or --seed with --case.\n" );
+	exit( 2 );
+}
+
+Bootstrap::load_targets();
+
+$oracles = Oracles::build( Cli::resolve_externals( $options['external'] ) );
+foreach ( $oracles->drain_events() as $event ) {
+	fwrite( STDERR, "oracle event: {$event['oracle']}: {$event['detail']}\n" );
+}
+if ( ! $oracles->has_required() ) {
+	fwrite( STDERR, "mbstring oracle unavailable; cannot replay.\n" );
+	exit( 2 );
+}
+
+$checks   = new Checks( $oracles );
+$failures = $checks->run( $input );
+
+echo "Replaying {$source}\n";
+echo 'Input: ' . strlen( $input ) . " bytes, sha256 " . hash( 'sha256', $input ) . "\n";
+echo 'Hex preview: ' . bin2hex( substr( $input, 0, 64 ) ) . ( strlen( $input ) > 64 ? '…' : '' ) . "\n";
+echo 'Oracles: ' . implode( ', ', $oracles->names() ) . "\n\n";
+
+if ( array() === $failures ) {
+	echo "All checks passed.\n";
+	$oracles->shutdown();
+	exit( 0 );
+}
+
+echo count( $failures ) . " failure(s):\n";
+foreach ( $failures as $failure ) {
+	echo "- {$failure['signature']}\n";
+	echo '  ' . json_encode( $failure['detail'], JSON_UNESCAPED_SLASHES ) . "\n";
+}
+
+$oracles->shutdown();
+exit( 1 );
diff --git a/tools/encoding-fuzz/runner.php b/tools/encoding-fuzz/runner.php
new file mode 100644
index 0000000000000..5a9ead5f7ae45
--- /dev/null
+++ b/tools/encoding-fuzz/runner.php
@@ -0,0 +1,280 @@
+<?php
+/**
+ * Orchestrates parallel worker lanes with duration and case budgets.
+ *
+ *     php tools/encoding-fuzz/runner.php --lanes 4 --duration-seconds 60
+ *     php tools/encoding-fuzz/runner.php --lanes 8 --duration-seconds 0 --max-cases 0   # indefinitely
+ *
+ * Each lane runs `worker.php` batches with sequentially assigned seeds.
+ * Worker ndjson is appended to `summary.ndjson`; aggregate counters and
+ * stop reason land in `state.json`. A lane producing no output for
+ * `--stall-timeout` seconds is killed and recorded, and its seed is
+ * reported so the hang can be reproduced.
+ *
+ * Exit codes: 0 clean, 1 failures found, 2 harness error.
+ */
+
+namespace EncodingFuzz;
+
+require __DIR__ . '/lib/autoload.php';
+
+error_reporting( E_ALL );
+ini_set( 'display_errors', 'stderr' );
+
+$options = Cli::parse_args(
+	$argv,
+	array(
+		'lanes'            => 4,
+		'duration-seconds' => 60,
+		'max-cases'        => 0,
+		'cases-per-batch'  => 2000,
+		'seed-base'        => 0,
+		'max-bytes'        => 65536,
+		'external'         => 'auto',
+		'output-dir'       => '',
+		'stall-timeout'    => 120,
+	)
+);
+
+$repo_root  = Bootstrap::repo_root();
+$output_dir = $options['output-dir'];
+if ( '' === $output_dir ) {
+	$output_dir = $repo_root . '/artifacts/encoding-fuzz/run-' . gmdate( 'Ymd-His' );
+}
+if ( ! is_dir( $output_dir ) && ! mkdir( $output_dir, 0777, true ) ) {
+	fwrite( STDERR, "Cannot create output dir {$output_dir}\n" );
+	exit( 2 );
+}
+
+$seed_base = $options['seed-base'];
+if ( 0 === $seed_base ) {
+	// Time-derived so repeated runs explore new seeds by default.
+	$seed_base = (int) ( microtime( true ) * 1000 ) % 1000000000;
+}
+
+$summary_path = "{$output_dir}/summary.ndjson";
+$summary      = fopen( $summary_path, 'ab' );
+$started_at   = microtime( true );
+$deadline     = $options['duration-seconds'] > 0 ? $started_at + $options['duration-seconds'] : null;
+
+$state = array(
+	'started_at'    => gmdate( 'c' ),
+	'seed_base'     => $seed_base,
+	'options'       => $options,
+	'git'           => Cli::git_metadata( $repo_root ),
+	'cases'         => 0,
+	'failures'      => 0,
+	'valid_inputs'  => 0,
+	'bytes'         => 0,
+	'by_strategy'   => array(),
+	'failure_seeds' => array(),
+	'stalled_seeds' => array(),
+	'oracle_events' => array(),
+	'batches'       => 0,
+	'stop_reason'   => null,
+);
+
+$next_seed = $seed_base;
+$lanes     = array();
+
+$spawn_lane = static function ( int $lane_id ) use ( &$next_seed, $options, $output_dir ): array {
+	$seed    = $next_seed++;
+	$command = array(
+		PHP_BINARY,
+		__DIR__ . '/worker.php',
+		'--seed',
+		(string) $seed,
+		'--cases',
+		(string) $options['cases-per-batch'],
+		'--max-bytes',
+		(string) $options['max-bytes'],
+		'--external',
+		$options['external'],
+		'--output-dir',
+		$output_dir,
+		'--progress-every',
+		'500',
+	);
+
+	$process = proc_open(
+		$command,
+		array(
+			0 => array( 'file', '/dev/null', 'r' ),
+			1 => array( 'pipe', 'w' ),
+			2 => array( 'file', "{$output_dir}/lane-{$lane_id}-stderr.log", 'a' ),
+		),
+		$pipes
+	);
+
+	stream_set_blocking( $pipes[1], false );
+
+	return array(
+		'id'          => $lane_id,
+		'seed'        => $seed,
+		'process'     => $process,
+		'stdout'      => $pipes[1],
+		'buffer'      => '',
+		'last_output' => microtime( true ),
+	);
+};
+
+$write_state = static function () use ( &$state, $output_dir, $started_at ): void {
+	$state['elapsed_sec'] = round( microtime( true ) - $started_at, 1 );
+	file_put_contents(
+		"{$output_dir}/state.json",
+		json_encode( $state, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES )
+	);
+};
+
+$handle_line = static function ( string $line, int $lane_id ) use ( &$state, $summary ): void {
+	fwrite( $summary, $line . "\n" );
+
+	$record = json_decode( $line, true );
+	if ( ! is_array( $record ) ) {
+		return;
+	}
+
+	switch ( $record['type'] ?? '' ) {
+		case 'failure':
+			++$state['failures'];
+			$state['failure_seeds'][] = array(
+				'seed'       => $record['seed'],
+				'case'       => $record['case'],
+				'signatures' => $record['signatures'],
+				'artifact'   => $record['artifact_dir'] ?? null,
+			);
+			fwrite( STDERR, "FAILURE lane {$lane_id} seed {$record['seed']} case {$record['case']}: " . implode( ', ', $record['signatures'] ) . "\n" );
+			break;
+
+		case 'oracle-event':
+			$state['oracle_events'][] = $record;
+			fwrite( STDERR, "oracle event: {$record['oracle']}: {$record['detail']}\n" );
+			break;
+
+		case 'fatal':
+			$state['oracle_events'][] = $record;
+			fwrite( STDERR, "worker fatal: {$record['reason']}\n" );
+			break;
+
+		case 'done':
+			$stats                  = $record['stats'];
+			$state['cases']        += $stats['cases'];
+			$state['valid_inputs'] += $stats['valid_inputs'];
+			$state['bytes']        += $stats['bytes'];
+			foreach ( $stats['by_strategy'] as $strategy => $count ) {
+				$state['by_strategy'][ $strategy ] = ( $state['by_strategy'][ $strategy ] ?? 0 ) + $count;
+			}
+			break;
+	}
+};
+
+for ( $i = 0; $i < max( 1, $options['lanes'] ); $i++ ) {
+	$lanes[ $i ] = $spawn_lane( $i );
+	++$state['batches'];
+}
+
+$stop_requested = false;
+$last_state_write = 0.0;
+
+while ( array() !== $lanes ) {
+	$now = microtime( true );
+
+	if ( ! $stop_requested && null !== $deadline && $now >= $deadline ) {
+		$state['stop_reason'] = 'duration';
+		$stop_requested       = true;
+	}
+
+	if ( ! $stop_requested && $options['max-cases'] > 0 && $state['cases'] >= $options['max-cases'] ) {
+		$state['stop_reason'] = 'max-cases';
+		$stop_requested       = true;
+	}
+
+	$streams = array();
+	foreach ( $lanes as $lane_id => $lane ) {
+		$streams[ $lane_id ] = $lane['stdout'];
+	}
+
+	$read   = array_values( $streams );
+	$write  = null;
+	$except = null;
+	if ( stream_select( $read, $write, $except, 0, 250000 ) > 0 ) {
+		foreach ( $lanes as $lane_id => &$lane ) {
+			$chunk = stream_get_contents( $lane['stdout'] );
+			if ( false === $chunk || '' === $chunk ) {
+				continue;
+			}
+
+			$lane['last_output'] = microtime( true );
+			$lane['buffer']     .= $chunk;
+
+			while ( false !== ( $newline = strpos( $lane['buffer'], "\n" ) ) ) {
+				$line           = substr( $lane['buffer'], 0, $newline );
+				$lane['buffer'] = substr( $lane['buffer'], $newline + 1 );
+				if ( '' !== $line ) {
+					$handle_line( $line, $lane_id );
+				}
+			}
+		}
+		unset( $lane );
+	}
+
+	foreach ( $lanes as $lane_id => $lane ) {
+		$status  = proc_get_status( $lane['process'] );
+		$stalled = ( microtime( true ) - $lane['last_output'] ) > $options['stall-timeout'];
+
+		if ( $status['running'] && $stalled ) {
+			proc_terminate( $lane['process'], 9 );
+			$state['stalled_seeds'][] = $lane['seed'];
+			fwrite( STDERR, "STALL lane {$lane_id} seed {$lane['seed']}: no output for {$options['stall-timeout']}s, killed\n" );
+		} elseif ( $status['running'] ) {
+			continue;
+		}
+
+		// Lane finished (or was just killed): flush the buffered partial line plus any unread output, even when the final read is empty.
+		$rest = $lane['buffer'] . (string) stream_get_contents( $lane['stdout'] );
+		if ( '' !== $rest ) {
+			foreach ( explode( "\n", $rest ) as $line ) {
+				if ( '' !== $line ) {
+					$handle_line( $line, $lane_id );
+				}
+			}
+		}
+		fclose( $lane['stdout'] );
+		proc_close( $lane['process'] );
+		unset( $lanes[ $lane_id ] );
+
+		if ( ! $stop_requested ) {
+			$lanes[ $lane_id ] = $spawn_lane( $lane_id );
+			++$state['batches'];
+		}
+	}
+
+	if ( microtime( true ) - $last_state_write > 5 ) {
+		$write_state();
+		$last_state_write = microtime( true );
+	}
+}
+
+if ( null === $state['stop_reason'] ) {
+	$state['stop_reason'] = 'lanes-exited';
+}
+$state['finished_at'] = gmdate( 'c' );
+$write_state();
+fclose( $summary );
+
+$elapsed = round( microtime( true ) - $started_at, 1 );
+fwrite(
+	STDERR,
+	sprintf(
+		"Done: %d cases (%d valid inputs), %d failures, %d stalled, %s bytes in %ss. Artifacts: %s\n",
+		$state['cases'],
+		$state['valid_inputs'],
+		$state['failures'],
+		count( $state['stalled_seeds'] ),
+		number_format( $state['bytes'] ),
+		$elapsed,
+		$output_dir
+	)
+);
+
+exit( ( $state['failures'] > 0 || array() !== $state['stalled_seeds'] ) ? 1 : 0 );
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
new file mode 100644
index 0000000000000..9d64e0bb84294
--- /dev/null
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -0,0 +1,184 @@
+<?php
+/**
+ * Self-test for the fuzz harness. Verifies that:
+ *
+ *  1. Every available oracle passes the known-answer battery.
+ *  2. The real WordPress targets pass every check on the battery vectors.
+ *  3. Deliberately broken target implementations ARE caught — the
+ *     detection path is mutation-tested, so a silent harness bug cannot
+ *     masquerade as "no findings".
+ *  4. The generator is deterministic and produces the advertised mix of
+ *     valid and invalid inputs across all strategies.
+ *  5. A short real fuzz run completes.
+ *
+ * Exit codes: 0 pass, 1 fail.
+ */
+
+namespace EncodingFuzz;
+
+require __DIR__ . '/../lib/autoload.php';
+
+error_reporting( E_ALL );
+ini_set( 'display_errors', 'stderr' );
+
+Bootstrap::load_targets();
+
+$failed = 0;
+
+function check( string $label, bool $ok, string $detail = '' ): void {
+	global $failed;
+	if ( $ok ) {
+		echo "PASS {$label}\n";
+	} else {
+		++$failed;
+		echo "FAIL {$label}" . ( '' !== $detail ? ": {$detail}" : '' ) . "\n";
+	}
+}
+
+// ---------------------------------------------------------------------
+// 1. Oracle battery: every oracle that loads must survive verification.
+// ---------------------------------------------------------------------
+$oracles = Oracles::build( array( 'python3', 'node' ) );
+$events  = $oracles->drain_events();
+$names   = $oracles->names();
+
+check( 'mb oracle available', $oracles->has_required() );
+check(
+	'no oracle disabled by battery',
+	array() === array_filter( $events, static fn( $e ) => 'oracle-disabled' === $e['type'] ),
+	json_encode( $events )
+);
+check( 'at least one external oracle', in_array( 'python3', $names, true ) || in_array( 'node', $names, true ), implode( ',', $names ) );
+
+// ---------------------------------------------------------------------
+// 2. Real targets pass every check on the battery vectors.
+// ---------------------------------------------------------------------
+$checks        = new Checks( $oracles );
+$battery_fails = array();
+foreach ( Oracles::battery() as $i => $vector ) {
+	foreach ( $checks->run( $vector[0] ) as $failure ) {
+		$battery_fails[] = "vector {$i}: {$failure['signature']}";
+	}
+}
+check( 'real targets clean on battery', array() === $battery_fails, implode( '; ', $battery_fails ) );
+
+// ---------------------------------------------------------------------
+// 3. Broken implementations must be caught.
+// ---------------------------------------------------------------------
+$real_targets = array(
+	'is_valid'        => 'wp_is_valid_utf8',
+	'is_valid_fb'     => '_wp_is_valid_utf8_fallback',
+	'scrub'           => 'wp_scrub_utf8',
+	'scrub_fb'        => '_wp_scrub_utf8_fallback',
+	'codepoint_count' => '_wp_utf8_codepoint_count',
+);
+
+/**
+ * Runs the battery against a broken variant and reports which checks fired.
+ *
+ * @return string[] Distinct check names observed.
+ */
+function broken_run( Oracles $oracles, array $real, array $overrides ): array {
+	$checks = new Checks( $oracles, array_merge( $real, $overrides ) );
+	$seen   = array();
+	foreach ( Oracles::battery() as $vector ) {
+		foreach ( $checks->run( $vector[0] ) as $failure ) {
+			$seen[ $failure['check'] ] = true;
+		}
+	}
+	return array_keys( $seen );
+}
+
+// 3a. Validator that wrongly accepts a never-valid byte.
+$seen = broken_run( $oracles, $real_targets, array(
+	'is_valid_fb' => static fn( string $bytes ): bool => str_contains( $bytes, "\xC0" ) ? true : _wp_is_valid_utf8_fallback( $bytes ),
+) );
+check( 'catches validator accepting 0xC0', in_array( 'validity-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3b. Validator that wrongly rejects noncharacters (a plausible spec misreading).
+$seen = broken_run( $oracles, $real_targets, array(
+	'is_valid' => static fn( string $bytes ): bool => wp_is_valid_utf8( $bytes ) && ! wp_has_noncharacters( $bytes ),
+) );
+check( 'catches validator rejecting noncharacters', in_array( 'validity-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3c. Scrubber that collapses adjacent replacement characters (one-FFFD-per-run
+//     instead of one per maximal subpart).
+$seen = broken_run( $oracles, $real_targets, array(
+	'scrub_fb' => static fn( string $bytes ): string => (string) preg_replace( "/(\u{FFFD})+/u", "\u{FFFD}", _wp_scrub_utf8_fallback( $bytes ) ),
+) );
+check( 'catches non-maximal-subpart scrubber', in_array( 'scrub-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3d. Scrubber that passes invalid bytes through untouched.
+$seen = broken_run( $oracles, $real_targets, array(
+	'scrub_fb' => static fn( string $bytes ): string => $bytes,
+) );
+check(
+	'catches identity scrubber',
+	in_array( 'scrub-mismatch', $seen, true ) && in_array( 'scrubbed-not-valid', $seen, true ),
+	implode( ',', $seen )
+);
+
+// 3e. Scrubber that drops invalid bytes instead of replacing them.
+$seen = broken_run( $oracles, $real_targets, array(
+	'scrub' => static fn( string $bytes ): string => str_replace( "\u{FFFD}", '', wp_scrub_utf8( $bytes ) ),
+) );
+check( 'catches byte-dropping scrubber', in_array( 'scrub-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3f. Code point counter that over-counts by exactly one whenever the input is invalid.
+$seen = broken_run( $oracles, $real_targets, array(
+	'codepoint_count' => static fn( string $bytes ): int => _wp_utf8_codepoint_count( $bytes ) + ( wp_is_valid_utf8( $bytes ) ? 0 : 1 ),
+) );
+check( 'catches off-by-one code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3g. Throwing target is reported, not fatal.
+$seen = broken_run( $oracles, $real_targets, array(
+	'is_valid_fb' => static function ( string $bytes ): bool {
+		throw new \RuntimeException( 'boom' );
+	},
+) );
+check( 'reports throwing target', in_array( 'target-exception', $seen, true ), implode( ',', $seen ) );
+
+// ---------------------------------------------------------------------
+// 4. Generator determinism and mix.
+// ---------------------------------------------------------------------
+$a = ( new Generator( new Prng( '7:3' ), 65536 ) )->generate();
+$b = ( new Generator( new Prng( '7:3' ), 65536 ) )->generate();
+check( 'generator deterministic for (seed, case)', $a === $b );
+
+$strategies = array();
+$valid      = 0;
+$invalid    = 0;
+$total      = 2000;
+for ( $i = 0; $i < $total; $i++ ) {
+	$generated = ( new Generator( new Prng( "smoke:{$i}" ), 4096 ) )->generate();
+	$strategies[ $generated['strategy'] ] = true;
+	if ( mb_check_encoding( $generated['bytes'], 'UTF-8' ) ) {
+		++$valid;
+	} else {
+		++$invalid;
+	}
+}
+check( 'all 9 strategies appear', 9 === count( $strategies ), implode( ',', array_keys( $strategies ) ) );
+check(
+	"healthy valid/invalid mix ({$valid} valid, {$invalid} invalid of {$total})",
+	$valid > $total / 10 && $invalid > $total / 10
+);
+
+// ---------------------------------------------------------------------
+// 5. Short real fuzz run.
+// ---------------------------------------------------------------------
+$fuzz_failures = 0;
+for ( $i = 0; $i < 300; $i++ ) {
+	$generated = ( new Generator( new Prng( "smoke-run:{$i}" ), 8192 ) )->generate();
+	$failures  = $checks->run( $generated['bytes'] );
+	foreach ( $failures as $failure ) {
+		++$fuzz_failures;
+		echo "  finding: {$failure['signature']} on " . bin2hex( substr( $generated['bytes'], 0, 48 ) ) . "\n";
+	}
+}
+check( '300-case fuzz run clean (real findings would also surface here)', 0 === $fuzz_failures );
+
+$oracles->shutdown();
+
+echo $failed > 0 ? "\n{$failed} smoke check(s) FAILED\n" : "\nAll smoke checks passed\n";
+exit( $failed > 0 ? 1 : 0 );
diff --git a/tools/encoding-fuzz/worker.php b/tools/encoding-fuzz/worker.php
new file mode 100644
index 0000000000000..def1c7664a7e7
--- /dev/null
+++ b/tools/encoding-fuzz/worker.php
@@ -0,0 +1,173 @@
+<?php
+/**
+ * Runs fuzz cases in-process for one seed and reports ndjson to stdout.
+ *
+ *     php tools/encoding-fuzz/worker.php --seed 1 --cases 1000
+ *
+ * Every case is fully determined by `(seed, case index)`: the case PRNG
+ * is keyed on both, so any single case can be re-derived without
+ * replaying the ones before it.
+ *
+ * Exit codes: 0 all cases passed, 1 failures found, 2 harness error.
+ */
+
+namespace EncodingFuzz;
+
+require __DIR__ . '/lib/autoload.php';
+
+error_reporting( E_ALL );
+ini_set( 'display_errors', 'stderr' );
+ini_set( 'memory_limit', '512M' );
+
+$options = Cli::parse_args(
+	$argv,
+	array(
+		'seed'           => 1,
+		'cases'          => 1000,
+		'start-case'     => 0,
+		'max-bytes'      => 65536,
+		'external'       => 'auto',
+		'output-dir'     => '',
+		'progress-every' => 500,
+	)
+);
+
+Bootstrap::load_targets();
+
+$oracles = Oracles::build( Cli::resolve_externals( $options['external'] ) );
+foreach ( $oracles->drain_events() as $event ) { // The union keeps our 'type', so carry the event's own type along as 'event_type'.
+	Cli::emit( array( 'type' => 'oracle-event', 'event_type' => $event['type'] ?? null ) + $event );
+}
+
+if ( ! $oracles->has_required() ) {
+	Cli::emit(
+		array(
+			'type'   => 'fatal',
+			'reason' => 'mbstring oracle unavailable or failed the battery; cannot fuzz without a primary oracle',
+		)
+	);
+	exit( 2 );
+}
+
+$checks     = new Checks( $oracles );
+$mb_valid   = $oracles->validity_oracles()['mb'];
+$output_dir = $options['output-dir'];
+if ( '' !== $output_dir && ! is_dir( $output_dir ) && ! mkdir( $output_dir, 0777, true ) ) {
+	Cli::emit(
+		array(
+			'type'   => 'fatal',
+			'reason' => "cannot create output dir {$output_dir}",
+		)
+	);
+	exit( 2 );
+}
+
+$seed       = (string) $options['seed'];
+$start      = $options['start-case'];
+$end        = $start + $options['cases'];
+$stats      = array(
+	'cases'        => 0,
+	'failures'     => 0,
+	'valid_inputs' => 0,
+	'bytes'        => 0,
+	'by_strategy'  => array(),
+);
+$started_at = microtime( true );
+
+Cli::emit(
+	array(
+		'type'        => 'start',
+		'seed'        => $seed,
+		'start_case'  => $start,
+		'cases'       => $options['cases'],
+		'max_bytes'   => $options['max-bytes'],
+		'environment' => Cli::environment_metadata( $oracles ),
+	)
+);
+
+for ( $case = $start; $case < $end; $case++ ) {
+	$prng      = new Prng( "{$seed}:{$case}" );
+	$generator = new Generator( $prng, $options['max-bytes'] );
+	$generated = $generator->generate();
+	$input     = $generated['bytes'];
+	$strategy  = $generated['strategy'];
+
+	$failures = $checks->run( $input );
+
+	++$stats['cases'];
+	$stats['bytes']                   += strlen( $input );
+	$stats['by_strategy'][ $strategy ] = ( $stats['by_strategy'][ $strategy ] ?? 0 ) + 1;
+	if ( $mb_valid( $input ) ) {
+		++$stats['valid_inputs'];
+	}
+
+	foreach ( $oracles->drain_events() as $event ) { // The union keeps our 'type', so carry the event's own type along as 'event_type'.
+		Cli::emit( array( 'type' => 'oracle-event', 'case' => $case, 'event_type' => $event['type'] ?? null ) + $event );
+	}
+
+	if ( array() !== $failures ) {
+		$stats['failures'] += count( $failures );
+
+		$record = array(
+			'type'       => 'failure',
+			'seed'       => $seed,
+			'case'       => $case,
+			'strategy'   => $strategy,
+			'input_size' => strlen( $input ),
+			'signatures' => array_values( array_unique( array_column( $failures, 'signature' ) ) ),
+			'failures'   => $failures,
+		);
+
+		if ( strlen( $input ) <= 4096 ) {
+			$record['input_base64'] = base64_encode( $input );
+		}
+
+		if ( '' !== $output_dir ) {
+			$case_dir = "{$output_dir}/failure-seed{$seed}-case{$case}";
+			if ( ! is_dir( $case_dir ) ) {
+				mkdir( $case_dir, 0777, true );
+			}
+			file_put_contents( "{$case_dir}/input.bin", $input );
+
+			$artifact                 = $record;
+			$artifact['input_base64'] = base64_encode( $input );
+			$artifact['environment']  = Cli::environment_metadata( $oracles );
+			$artifact['git']          = Cli::git_metadata( Bootstrap::repo_root() );
+			file_put_contents(
+				"{$case_dir}/failure.json",
+				json_encode( $artifact, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES )
+			);
+			$record['artifact_dir'] = $case_dir;
+		}
+
+		Cli::emit( $record );
+	}
+
+	if ( 0 === ( $stats['cases'] % max( 1, $options['progress-every'] ) ) ) {
+		$elapsed = microtime( true ) - $started_at;
+		Cli::emit(
+			array(
+				'type'           => 'progress',
+				'seed'           => $seed,
+				'case'           => $case,
+				'cases_done'     => $stats['cases'],
+				'failures'       => $stats['failures'],
+				'cases_per_sec'  => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null,
+			)
+		);
+	}
+}
+
+$elapsed = microtime( true ) - $started_at;
+Cli::emit(
+	array(
+		'type'          => 'done',
+		'seed'          => $seed,
+		'stats'         => $stats,
+		'elapsed_sec'   => round( $elapsed, 2 ),
+		'cases_per_sec' => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null,
+	)
+);
+
+$oracles->shutdown();
+exit( $stats['failures'] > 0 ? 1 : 0 );

From b317933b64b4ef5b7ab25e148cfdd4b1f3dbf39a Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Wed, 10 Jun 2026 16:48:19 +0200
Subject: [PATCH 02/14] Add handoff docs for follow-up fuzzing and testing
 lanes.

Four self-contained work-lane documents: extending the encoding
fuzzer (utf8_encode/decode fallback differentials before PHP 9
removes the native oracles, the confirmed wp_has_noncharacters
PCRE-vs-fallback divergence on ill-formed input, exhaustive
code_point_to_utf8_bytes), an independent WP_HTML_Decoder fuzzer
against the Dom\HTMLDocument oracle, WP_Token_Map property tests
against a naive reference (building on the existing wpTokenMap.php
tests), and a one-shot divergence survey of seems_utf8 and
wp_check_invalid_utf8.
---
 handoffs/README.md                        |  19 ++++
 handoffs/extend-encoding-fuzzer.md        | 111 ++++++++++++++++++
 handoffs/html-decoder-fuzzer.md           | 130 ++++++++++++++++++++++
 handoffs/legacy-utf8-divergence-survey.md |  70 ++++++++++++
 handoffs/token-map-properties.md          |  91 +++++++++++++++
 5 files changed, 421 insertions(+)
 create mode 100644 handoffs/README.md
 create mode 100644 handoffs/extend-encoding-fuzzer.md
 create mode 100644 handoffs/html-decoder-fuzzer.md
 create mode 100644 handoffs/legacy-utf8-divergence-survey.md
 create mode 100644 handoffs/token-map-properties.md

diff --git a/handoffs/README.md b/handoffs/README.md
new file mode 100644
index 0000000000000..176ea716eef07
--- /dev/null
+++ b/handoffs/README.md
@@ -0,0 +1,19 @@
+# Fuzzing / testing work lanes
+
+Self-contained handoff documents, one per independent lane of work. Each
+can be picked up by a separate agent or contributor with no shared
+context beyond the document itself.
+
+| Lane | Doc | Shape of work |
+|------|-----|---------------|
+| Extend the UTF-8 encoding fuzzer | [extend-encoding-fuzzer.md](extend-encoding-fuzzer.md) | Add targets to an existing, working fuzzer |
+| WP_HTML_Decoder fuzzer | [html-decoder-fuzzer.md](html-decoder-fuzzer.md) | New independent fuzzer, Dom\HTMLDocument oracle |
+| WP_Token_Map property tests | [token-map-properties.md](token-map-properties.md) | PHPUnit property tests against a naive reference |
+| Legacy UTF-8 helper divergence survey | [legacy-utf8-divergence-survey.md](legacy-utf8-divergence-survey.md) | One-shot documented survey, no continuous fuzzing |
+
+Background: `tools/encoding-fuzz/` (this branch, commit `3cc3e64765`)
+is a working differential fuzzer for `wp_is_valid_utf8()` /
+`wp_scrub_utf8()` and their pure-PHP fallbacks. ~570k cases have run
+clean against five independent oracles. Its architecture (deterministic
+`(seed, case)` generation, oracle battery, worker/runner/replay/minimize,
+mutation-tested harness) is the reference pattern for the other lanes.
diff --git a/handoffs/extend-encoding-fuzzer.md b/handoffs/extend-encoding-fuzzer.md
new file mode 100644
index 0000000000000..1c6106a750d2c
--- /dev/null
+++ b/handoffs/extend-encoding-fuzzer.md
@@ -0,0 +1,111 @@
+# Handoff: extend the UTF-8 encoding fuzzer with three new targets
+
+## Status
+
+Not started. The host fuzzer (`tools/encoding-fuzz/`) is complete and
+working at commit `3cc3e64765` on branch `fuzz-encoder`; read its
+`README.md` first. ~570k cases have run clean against the current
+targets, so the infrastructure is trustworthy.
+
+## Goal
+
+Round out coverage of `src/wp-includes/compat-utf8.php` by adding:
+
+1. `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()`
+   differentials against the native `utf8_encode()` / `utf8_decode()`.
+2. `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()`
+   differential — **after resolving the semantic question below**.
+3. A one-shot exhaustive test of
+   `WP_HTML_Decoder::code_point_to_utf8_bytes()` (not fuzzing).
+
+## 1. utf8_encode / utf8_decode fallbacks
+
+**Why now:** the native functions are the only ground truth, deprecated
+since PHP 8.2 and removed in PHP 9. Fuzz the differential while the
+oracle still exists in the runtime.
+
+- Oracles: `@utf8_encode()` / `@utf8_decode()` (suppress deprecation
+  notices; on PHP 9+ skip these checks with an `oracle-unavailable`
+  event, same pattern `lib/Oracles.php` already uses).
+- Spot-probes already done (2026-06-10, PHP 8.4.21): native and
+  fallback agree on valid input, invalid maximal subparts (`?` per
+  subpart), code points > U+00FF (`?`), and round-trip text. No known
+  divergence going in.
+- Checks to add: byte equality vs native on arbitrary input (decode)
+  and on arbitrary input treated as latin1 (encode); round-trip
+  `decode(encode(s)) === s` for any byte string `s` (encode is total
+  and injective per byte); encode output is always valid UTF-8 per
+  the existing `mb` oracle.
+- Wire-up: add target entries in `lib/Targets.php`, checks in
+  `lib/Checks.php`, and broken-implementation cases in
+  `tests/harness-smoke.php` (the smoke test mutation-tests detection —
+  every new check needs a deliberately broken variant proving it fires;
+  e.g. a decode that emits one `?` per invalid *byte* instead of per
+  maximal subpart).
+
+## 2. wp_has_noncharacters — resolve semantics first
+
+**Known divergence, confirmed empirically (2026-06-10):**
+
+```php
+$probe = "\xC0\xEF\xBF\xBE"; // invalid byte, then U+FFFE
+wp_has_noncharacters( $probe );             // false — PCRE path: preg_match fails on ill-formed UTF-8
+_wp_has_noncharacters_fallback( $probe );   // true  — scan skips invalid spans, finds U+FFFE
+```
+
+The same public function answers differently depending on which
+environment branch of `src/wp-includes/utf8.php` loaded. A naive
+differential will fail on roughly its first invalid-input case. Do NOT
+just add the check and let it scream:
+
+1. Decide (or get a decision on) intended behavior for ill-formed
+   input. Options: (a) document that behavior is undefined unless the
+   input passes `wp_is_valid_utf8()` — then fuzz the differential on
+   valid inputs only, plus a fixed regression vector for that stance;
+   (b) align the implementations (likely the fallback is the *better*
+   semantic — finding real noncharacters — but the PCRE version ships
+   on most hosts). This probably warrants a Trac ticket / discussion
+   with the function author before code changes.
+2. Either way, fuzz the three-way differential on **valid** inputs
+   immediately: PCRE implementation vs fallback vs a trivial reference
+   (decode code points, check the U+FDD0–U+FDEF / U+xFFFE / U+xFFFF
+   list). The generator already emits noncharacter-dense input
+   (`BOUNDARY_CODE_POINTS` in `lib/Generator.php`).
+
+## 3. code_point_to_utf8_bytes — exhaust, don't fuzz
+
+`WP_HTML_Decoder::code_point_to_utf8_bytes()`
+(`src/wp-includes/html-api/class-wp-html-decoder.php:426`) has a domain
+of ~1.1M values. Write a standalone script (or slow-group PHPUnit test)
+asserting equality with `mb_chr( $cp, 'UTF-8' )` for every code point
+0x0–0x10FFFF, including expected behavior for surrogates and
+out-of-range values (check what the function documents; `mb_chr`
+returns `false` for surrogates — decide the comparison accordingly).
+Runs in seconds; total coverage; done forever. Note this class is
+loaded from `html-api/`, so the fuzzer bootstrap (`lib/Bootstrap.php`)
+needs to require it (it has no dependencies beyond the token map — if
+it pulls more, load only for this check).
+
+## Verification / definition of done
+
+- `php tools/encoding-fuzz/tests/harness-smoke.php` passes, including
+  new broken-variant detections for every added check.
+- A fault-injection variant per new target in `lib/Targets.php`
+  (`ENCODING_FUZZ_FAULT=...`) exercises worker → replay → minimize end
+  to end.
+- `php tools/encoding-fuzz/runner.php --lanes 4 --duration-seconds 60`
+  runs clean (or findings are triaged and documented, not silenced).
+- README.md oracle/check tables updated.
+
+## Gotchas inherited from the existing harness
+
+- All scrub/validity oracles passed a hand-computed battery; new
+  oracles must too (`Oracles::battery()` pattern). iconv is excluded
+  for accepting code points above U+10FFFF — don't re-add it.
+- Workers run checks in-process; an infinite loop in a new target will
+  trip the runner's 120s stall watchdog and record the seed. Keep that
+  property: no per-case subprocesses.
+- Everything must stay derivable from `(seed, case index)` — no
+  `random_int()`, no time-dependent generation. Per-case chunking-type
+  randomness derives from `sha256(input)` (see
+  `Checks::check_chunked_scan()`).
diff --git a/handoffs/html-decoder-fuzzer.md b/handoffs/html-decoder-fuzzer.md
new file mode 100644
index 0000000000000..1e2b3d0c6ba63
--- /dev/null
+++ b/handoffs/html-decoder-fuzzer.md
@@ -0,0 +1,130 @@
+# Handoff: independent fuzzer for WP_HTML_Decoder
+
+## Status
+
+Not started. This is a NEW fuzzer, separate from `tools/encoding-fuzz/`
+(UTF-8 functions) and from the `html-api-fuzz` branch (whole-tree
+parser comparison). Reuse the architecture of `tools/encoding-fuzz/` —
+deterministic `(seed, case)` generation, oracle startup battery,
+worker/runner/replay/minimize CLIs, mutation-tested harness smoke test —
+but as its own tool directory (suggested: `tools/html-decoder-fuzz/`).
+
+## Target
+
+`WP_HTML_Decoder` in `src/wp-includes/html-api/class-wp-html-decoder.php`:
+
+- `decode_text_node( $text )`
+- `decode_attribute( $text )`
+- `read_character_reference( $context, $text, $at, &$match_byte_length )`
+- `attribute_starts_with( $haystack, $search, $case_sensitivity )`
+
+This is security-relevant code: decoded attribute values feed
+`javascript:` URL detection via `attribute_starts_with`. Existing unit
+tests are thin (`tests/phpunit/tests/html-api/wpHtmlDecoder.php`, 4 test
+methods) — fuzzing has real headroom here.
+
+Dependency note: the named-reference path uses `WP_Token_Map` and the
+`$html5_named_character_reference` map
+(`src/wp-includes/html-api/html5-named-character-references.php`).
+A decoder fuzzer transitively exercises both.
+
+## Oracle
+
+`Dom\HTMLDocument` (lexbor, PHP 8.4+) — the same oracle the
+`html-api-fuzz` branch uses for tree comparison:
+
+- Text context: parse `<!DOCTYPE html><body><div>PAYLOAD</div>`, read
+  the div's `textContent`; compare with `decode_text_node( PAYLOAD )`.
+- Attribute context: parse `<div title="PAYLOAD">`, read
+  `getAttribute('title')`; compare with `decode_attribute( PAYLOAD )`.
+
+Do NOT use `html_entity_decode( ENT_HTML5 )` as the primary oracle: it
+does not implement the WHATWG attribute-context rules (named reference
+without semicolon followed by `=` or alphanumeric must NOT decode in
+attributes) and will drown the run in false divergences. It MAY serve
+as a third opinion on the text context only, gated by a known-answer
+battery like `Oracles::battery()` in the encoding fuzzer — verify
+empirically before trusting it, including C1-control numeric reference
+remapping (`&#x80;` → U+20AC etc.).
+
+## Confounders the harness must neutralize
+
+The oracle is a full HTML parser; the target is a pure decoder. The
+generator must avoid payload bytes the parser treats specially, or the
+comparison measures parser behavior instead of decoding:
+
+- `<`, `>`, and `&` followed by structure-breaking content: rather
+  than escaping `<` as text, restrict generated payloads to never
+  contain raw `<`; `&` is the whole point and is fine in both contexts.
+- Quote characters in the attribute payload — generate with `"`
+  excluded (or swap quote style per case), since it terminates the
+  attribute in the oracle document but not in `decode_attribute()`.
+- CR / CRLF: the HTML parser normalizes `\r` and `\r\n` to `\n` before
+  tokenization; the decoder does not. Either exclude `\r` from payloads
+  or pre-normalize before comparison — decide once, document it.
+- NUL bytes: parser replaces U+0000 with U+FFFD in some contexts /
+  drops in others; the decoder has its own documented NUL handling
+  (see existing test `test_character_reference_with_null_byte...`).
+  Probably exclude raw NUL from oracle-compared cases and cover NUL
+  via fixed regression vectors instead.
+- Invalid UTF-8 payload bytes: lexbor may scrub them before the
+  tokenizer sees them. Start with valid-UTF-8 payloads only; invalid
+  bytes inside character references (`&am\xC0p;`) are a later, careful
+  extension.
+
+## Generator: entity grammar, not byte noise
+
+Weighted mix targeting the reference-matching state machine:
+
+- Named references from the real token map: exact (`&amp;`), without
+  semicolon (`&amp`), longest-match ambiguity (`&not` vs `&notin;` —
+  the map is greedy-longest), case variants (`&AMP` vs `&amp`),
+  truncations (`&am`), nonexistent lookalikes (`&ampx;`).
+- The attribute-context discriminator: no-semicolon named reference
+  followed by `=`, by alphanumerics, by `;` later in the string —
+  decode in text, not in attribute.
+- Numeric: decimal and hex, mixed case `x`/`X`, leading zeros (many),
+  value classes: ASCII, C1 controls 0x80–0x9F (windows-1252 remap
+  table), surrogates, noncharacters, > 0x10FFFF, huge (overflow
+  arithmetic), zero, missing digits (`&#;`, `&#x;`).
+- Adjacency and boundaries: references back to back, reference at
+  string start/end, `&` at end of input, references split by the
+  string boundary at every prefix length (truncation sweep).
+- Plain text with multibyte UTF-8 around references (offset arithmetic).
+
+Each case is `(context, payload)`; derive both from the PRNG.
+
+## Checks
+
+1. Differential vs oracle in both contexts (primary).
+2. `read_character_reference()` consistency: decoding the whole string
+   by repeated `read_character_reference` + literal spans must equal
+   `decode()` output, and `$match_byte_length` must always advance.
+3. `attribute_starts_with( $haystack, $search )` agrees with
+   `str_starts_with( decode_attribute( $haystack ), $search )` for
+   ASCII search strings, both case sensitivities.
+4. Output is valid UTF-8 (reuse `mb_check_encoding`).
+5. Idempotence does NOT hold for decoding (`&amp;amp;` decodes to
+   `&amp;`) — do not add it; add instead: decoding text with no `&`
+   is identity.
+
+## Harness requirements (carry over from encoding fuzzer)
+
+- Known-answer startup battery for the oracle path (hand-computed
+  WHATWG expectations, including the C1 remap and no-semicolon
+  attribute rules) — if the local `Dom\HTMLDocument` fails it, abort
+  loudly.
+- Mutation-tested smoke test: broken decoder variants (skip C1 remap,
+  decode no-semicolon refs in attributes, off-by-one match length)
+  must be caught before the fuzzer is trusted.
+- Failure artifacts self-contained (base64 input + context + expected/
+  got), replay + signature-preserving minimizer.
+- Note `html-api-fuzz` branch precedent: its `attributes-entities`
+  generator profile and oracle handling are prior art worth reading
+  (`tools/html-api-fuzz/lib/Generator.php` on that branch).
+
+## Definition of done
+
+Smoke test green (including broken-variant detection), a 5-minute
+multi-lane run either clean or with triaged findings, README with the
+oracle-confounder decisions documented.
diff --git a/handoffs/legacy-utf8-divergence-survey.md b/handoffs/legacy-utf8-divergence-survey.md
new file mode 100644
index 0000000000000..c126f5c494abd
--- /dev/null
+++ b/handoffs/legacy-utf8-divergence-survey.md
@@ -0,0 +1,70 @@
+# Handoff: one-shot divergence survey of legacy UTF-8 helpers
+
+## Status
+
+Not started. Deliverable is a **document**, not code and not a
+continuous fuzzer.
+
+## Premise
+
+`src/wp-includes/formatting.php` contains older UTF-8 helpers that
+overlap with the new strict functions in `src/wp-includes/utf8.php`:
+
+- `seems_utf8( $str )` (formatting.php:884) — loose structural
+  heuristic, predates `wp_is_valid_utf8()`.
+- `wp_check_invalid_utf8( $text, $strip )` (formatting.php:1127) —
+  PCRE-based, charset-option dependent, with a `$strip` mode.
+
+These are *intentionally loose*; a continuous differential against
+`wp_is_valid_utf8()` would report their known sloppiness forever and
+train people to ignore the fuzzer. What's actually useful is a
+one-time, well-organized map of exactly where they diverge from the
+strict functions — as input for deprecation/migration decisions and
+docblock updates.
+
+## Method
+
+1. Reuse the generator and battery from `tools/encoding-fuzz/`
+   (`lib/Generator.php`, `Oracles::battery()`) to drive a few million
+   inputs through `seems_utf8`, `wp_check_invalid_utf8` (both `$strip`
+   modes), and `wp_is_valid_utf8` side by side. A throwaway script in
+   the same style as `tools/encoding-fuzz/worker.php` is fine; it does
+   not need to be committed.
+2. Bucket divergences by *class*, not by input: e.g. "seems_utf8
+   accepts overlong encodings", "accepts surrogates", "accepts
+   code points above U+10FFFF", "wp_check_invalid_utf8 returns ''
+   instead of stripping when X". Minimize one representative per class
+   (2–4 bytes each, by hand or with the encoding fuzzer's minimizer
+   predicate pattern).
+3. Note environment sensitivity: `wp_check_invalid_utf8` consults the
+   blog charset (`get_option( 'blog_charset' )`) — it needs either a WP
+   test bootstrap or careful stubbing; document which path was tested.
+   This is the reason these functions were excluded from the encoding
+   fuzzer in the first place.
+4. Cross-check each divergence class against the functions' docblocks
+   and original Trac tickets (`git log -L` on the functions; Trac
+   search for `seems_utf8`) to separate "documented, intentional
+   looseness" from "nobody ever decided this".
+
+## Deliverable
+
+A single markdown report (suggested:
+`handoffs/legacy-utf8-divergence-report.md`, or a Trac ticket comment)
+containing:
+
+- a divergence matrix: input class × function → accept/reject/output,
+  with minimal byte examples
+- for each class: intentional vs accidental, with evidence
+- migration guidance: for each current core caller of `seems_utf8` /
+  `wp_check_invalid_utf8` (grep the callers), whether
+  `wp_is_valid_utf8` / `wp_scrub_utf8` is a drop-in, a
+  behavior-changing replacement, or unsuitable
+- explicit recommendation per function: deprecate, document, or leave
+
+## Non-goals
+
+No code changes to formatting.php, no continuous fuzzing of these
+functions, no "fixing" divergences before the survey establishes which
+ones are load-bearing for existing content (a stricter check that
+rejects bytes previously accepted can break saved posts on upgrade —
+flag any such case prominently).
diff --git a/handoffs/token-map-properties.md b/handoffs/token-map-properties.md
new file mode 100644
index 0000000000000..cd127297b527b
--- /dev/null
+++ b/handoffs/token-map-properties.md
@@ -0,0 +1,91 @@
+# Handoff: property-based tests for WP_Token_Map
+
+## Status
+
+Not started. **This class HAS existing tests — explore them before
+acting**: `tests/phpunit/tests/wp-token-map/wpTokenMap.php` (8 test
+methods). Read that file first and map what is already covered; do not
+duplicate it. As of commit `3cc3e64765` the existing coverage includes:
+construction validation, over-long word rejection, round-trip through
+`to_array()`, round-trip through `precomputed_php_source_table()` /
+`from_precomputed_table()`, longest-match-first behavior, short words
+(shorter than the group key length), reading at an offset, and a sweep
+over all HTML5 named references. The *gap* is adversarial/generated
+token sets and randomized probes — the existing tests use a handful of
+hand-picked fixtures.
+
+## Why property tests, not a continuous fuzzer
+
+`WP_Token_Map` (`src/wp-includes/class-wp-token-map.php`) is a static
+data structure with a free, trivially-correct reference implementation:
+a linear scan over the source array. No external oracle, no subprocess,
+deterministic. That shape belongs in PHPUnit (fast, runs in CI forever)
+rather than a CPU-burning fuzz loop. The production-critical instance
+(`$html5_named_character_reference`) additionally gets exercised
+transitively by the WP_HTML_Decoder fuzzer lane.
+
+## Properties to test
+
+Against a naive reference (`contains`: `in_array` with optional
+`strcasecmp`; `read_token`: try every word sorted by length descending,
+return first prefix match):
+
+1. `contains( $word, $case_sensitivity )` ≡ reference, for every word
+   in the set, every prefix of a word, every word with one byte
+   appended/removed/changed, and random probes.
+2. `read_token( $text, $offset )` ≡ reference (token AND
+   `$matched_token_byte_length`), at every offset of generated
+   documents that embed tokens, near-tokens, and token prefixes.
+3. Greedy longest-match: when one word is a prefix of another and both
+   could match, the longer wins (generate nested-prefix families
+   deliberately: `a`, `ab`, `abc`, …).
+4. Round-trips on *generated* maps (the existing tests round-trip
+   fixtures): `from_array( to_array() )` preserves behavior;
+   `eval`'d `precomputed_php_source_table()` →
+   `from_precomputed_table()` preserves behavior. Compare behavior
+   (all probes), not just array equality.
+5. Case-insensitive mode with non-ASCII bytes: PHP's `strcasecmp` is
+   byte/locale-ASCII; verify the class and reference agree on what
+   "ascii-case-insensitive" means for bytes ≥ 0x80 (the docblock says
+   ASCII case only — pin that).
+
+## Generated token sets — where bugs would live
+
+Deterministic seeds (`mt_srand` with fixed seed, or reuse
+`tools/encoding-fuzz/lib/Prng.php`), sets of 1–200 words drawn from:
+
+- words shorter than `$key_length` (the "small words" storage path),
+  exactly `$key_length`, and up to the 256-byte limit
+- nested prefix families
+- words sharing the same `$key_length`-byte group key
+- bytes: ASCII letters both cases, digits, `;`, high bytes ≥ 0x80,
+  multibyte UTF-8 sequences, and NUL — first check what existing tests
+  cover regarding NUL; if undefined, document rather than assert
+- `$key_length` values 1 and 2 (and whatever range the class accepts)
+
+Probe documents: concatenations of set words, prefixes, near-misses,
+random bytes, at random offsets.
+
+## Practical notes
+
+- Match the existing test file's conventions (data providers, group
+  annotations). New file suggested:
+  `tests/phpunit/tests/wp-token-map/wpTokenMapProperties.php` with a
+  `@group token-map` annotation consistent with the existing file.
+- Keep runtime sane for CI: a few thousand generated probes per
+  property, fixed seed so failures are reproducible. Print the seed
+  and the serialized token set in assertion messages so a failure is
+  immediately actionable.
+- `precomputed_php_source_table()` round-trip uses `eval` — the
+  existing test already does this; follow its pattern.
+- If a property fails, minimize by hand (sets are small) and add the
+  minimal case as a fixed regression test alongside the property.
+
+## Definition of done
+
+New property test file passing under
+`vendor/bin/phpunit --group token-map` (or this repo's equivalent:
+`npm run test:php -- --group token-map`), covering the five properties
+on generated sets, with documented seeds, no duplication of the eight
+existing tests, and any discovered divergence filed/minimized rather
+than worked around.

From 31d1aa7e03d5b878b727df03bf74a4af232937cf Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Wed, 10 Jun 2026 22:31:01 +0200
Subject: [PATCH 03/14] Fuzzer: Add utf8_encode/utf8_decode fallback
 differentials.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend the encoding fuzzer with targets for _wp_utf8_encode_fallback()
and _wp_utf8_decode_fallback(), fuzzing them against mb_convert_encoding
(primary) and the deprecated native utf8_encode()/utf8_decode() pair
while it still exists, plus round-trip and output-validity invariants.

The handoff's premise that native and fallback share semantics on
invalid input was falsified during implementation: legacy utf8_decode()
groups a well-formed lead byte with its expected continuation length
into a single '?' (surrogates, beyond-U+10FFFF, 3/4-byte overlongs,
C2 C0), while WordPress deliberately follows mb_convert_encoding's
maximal-subpart semantics (the PHP 9 polyfill in compat.php prefers mb;
ticket #63863). The native decode oracle is therefore trusted on valid
input only — where it provably agrees with mb on every code point — and
the divergence is pinned by hand-computed battery vectors instead of
fuzzed.

Detection is mutation-tested: seven new broken-implementation classes
in the smoke test (cp1252-confused encoder, identity encoder, per-byte
decoder, valid-input mangler, round-trip violator, null-returning
encoder and decoder — the fallbacks are untyped, so non-string returns
are reported as target-bad-return rather than silently skipped), and
ENCODING_FUZZ_FAULT=encode-cp1252|decode-per-byte exercise the
worker → replay → minimize pipeline end to end (minimal counterexamples:
'80' and 'E7 B8').

Also records an upstream finding in the handoff: the #63863 PHPUnit
test's invalid-input coverage is vacuous (integer interpolation instead
of chr(), single-quoted escape sequences, U+E000 boundary off-by-one).
---
 handoffs/extend-encoding-fuzzer.md          |  69 +++---
 tools/encoding-fuzz/README.md               |  61 ++++--
 tools/encoding-fuzz/lib/Checks.php          | 178 ++++++++++++++++
 tools/encoding-fuzz/lib/Oracles.php         | 219 +++++++++++++++++++-
 tools/encoding-fuzz/lib/Targets.php         |  47 ++++-
 tools/encoding-fuzz/tests/harness-smoke.php |  89 ++++++--
 6 files changed, 601 insertions(+), 62 deletions(-)

diff --git a/handoffs/extend-encoding-fuzzer.md b/handoffs/extend-encoding-fuzzer.md
index 1c6106a750d2c..7fdd0f6797841 100644
--- a/handoffs/extend-encoding-fuzzer.md
+++ b/handoffs/extend-encoding-fuzzer.md
@@ -2,10 +2,10 @@
 
 ## Status
 
-Not started. The host fuzzer (`tools/encoding-fuzz/`) is complete and
-working at commit `3cc3e64765` on branch `fuzz-encoder`; read its
-`README.md` first. ~570k cases have run clean against the current
-targets, so the infrastructure is trustworthy.
+Section 1 (utf8_encode/decode) DONE; sections 2–3 in progress. The
+host fuzzer (`tools/encoding-fuzz/`) is complete and working on branch
+`fuzz-encoder`; read its `README.md` first. ~570k cases had run clean
+against the original targets before this work started.
 
 ## Goal
 
@@ -18,30 +18,43 @@ Round out coverage of `src/wp-includes/compat-utf8.php` by adding:
 3. A one-shot exhaustive test of
    `WP_HTML_Decoder::code_point_to_utf8_bytes()` (not fuzzing).
 
-## 1. utf8_encode / utf8_decode fallbacks
-
-**Why now:** the native functions are the only ground truth, deprecated
-since PHP 8.2 and removed in PHP 9. Fuzz the differential while the
-oracle still exists in the runtime.
-
-- Oracles: `@utf8_encode()` / `@utf8_decode()` (suppress deprecation
-  notices; on PHP 9+ skip these checks with an `oracle-unavailable`
-  event, same pattern `lib/Oracles.php` already uses).
-- Spot-probes already done (2026-06-10, PHP 8.4.21): native and
-  fallback agree on valid input, invalid maximal subparts (`?` per
-  subpart), code points > U+00FF (`?`), and round-trip text. No known
-  divergence going in.
-- Checks to add: byte equality vs native on arbitrary input (decode)
-  and on arbitrary input treated as latin1 (encode); round-trip
-  `decode(encode(s)) === s` for any byte string `s` (encode is total
-  and injective per byte); encode output is always valid UTF-8 per
-  the existing `mb` oracle.
-- Wire-up: add target entries in `lib/Targets.php`, checks in
-  `lib/Checks.php`, and broken-implementation cases in
-  `tests/harness-smoke.php` (the smoke test mutation-tests detection —
-  every new check needs a deliberately broken variant proving it fires;
-  e.g. a decode that emits one `?` per invalid *byte* instead of per
-  maximal subpart).
+## 1. utf8_encode / utf8_decode fallbacks — DONE, premise corrected
+
+**Implemented**, but a premise of this section was falsified during
+implementation and the oracle design adapted (2026-06-10, PHP 8.4.21):
+
+- The original claim "No known divergence going in" was wrong: the
+  earlier spot-probes missed it. Native `utf8_decode()` groups a
+  well-formed lead byte with its expected continuation length and emits
+  a single `?` for surrogates (`ED A0 80` → `?`), beyond-U+10FFFF
+  sequences (`F4 90 80 80` → `?`), 3-/4-byte overlongs, and a
+  well-formed lead before an invalid continuation (`C2 C0` → `?`),
+  where the fallback emits one `?` per maximal subpart (`???` etc.).
+- That divergence is **intentional** in WordPress: the PHP 9 polyfill
+  in `compat.php` prefers `mb_convert_encoding()` (which uses maximal
+  subparts) over the fallback, and the #63863 PHPUnit tests assert
+  mb-equivalence. So "the native functions are the only ground truth"
+  was also wrong — WP's chosen ground truth is `mb_convert_encoding()`.
+- Oracle design as built: `mb` (`mb_convert_encoding()`) is the primary
+  encode/decode oracle on arbitrary input; `native` is an encode oracle
+  on arbitrary input and a decode oracle on **valid input only**
+  (native ≡ mb on every valid code point, verified exhaustively). On
+  PHP 9+ `native` reports `oracle-unavailable` and is skipped. The
+  legacy divergence is pinned by hand-computed battery vectors.
+- Round-trip `decode(encode(s)) === s`, encode-output-validity, the
+  smoke-test mutation variants (cp1252-confused encoder, identity
+  encoder, per-byte decoder, valid-input mangler, round-trip violator,
+  null-returning targets), and the `ENCODING_FUZZ_FAULT=encode-cp1252`
+  / `decode-per-byte` end-to-end fault variants are all in place.
+
+**Upstream finding, not fixed here:** the cited core test
+`tests/phpunit/tests/formatting/deprecatedUtfEncodeDecode.php` has
+vacuous invalid-input coverage — its surrogate branch interpolates
+integers instead of `chr()` bytes (`"{$byte1}{$byte2}{$byte3}"`
+produces ASCII digits), its single-quoted `'\x95'` data is literal
+backslash text, and the `$i < 0xD800 || $i > 0xE000` boundary routes
+valid U+E000 through the broken branch. It only ever asserts
+mb-equivalence on valid input. Worth a follow-up patch on #63863.
 
 ## 2. wp_has_noncharacters — resolve semantics first
 
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index 8deee79516156..80ac3da42df19 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -4,6 +4,7 @@ Differential fuzzer for the WordPress UTF-8 functions:
 
 - `wp_is_valid_utf8()` / `_wp_is_valid_utf8_fallback()`
 - `wp_scrub_utf8()` / `_wp_scrub_utf8_fallback()`
+- `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()`
 - `_wp_utf8_codepoint_count()` and the resumable `_wp_scan_utf8()` paths (secondary)
 
 The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main
@@ -15,13 +16,39 @@ bootstrap, database, or `wp-env`.
 
 Every result is compared against independent known-good implementations:
 
-| Oracle    | Backing                              | Validity | Scrub |
-|-----------|--------------------------------------|----------|-------|
-| `mb`      | `mb_check_encoding()` / `mb_scrub()` | ✓        | ✓ (primary) |
-| `pcre`    | PCRE2 strict UTF validation          | ✓        |       |
-| `intl`    | ICU `UConverter::transcode()`        |          | ✓     |
-| `python3` | CPython codec, persistent subprocess | ✓        | ✓     |
-| `node`    | WHATWG `TextDecoder`, persistent subprocess | ✓ | ✓     |
+| Oracle    | Backing                              | Validity | Scrub | Encode | Decode |
+|-----------|--------------------------------------|----------|-------|--------|--------|
+| `mb`      | `mb_check_encoding()` / `mb_scrub()` / `mb_convert_encoding()` | ✓ | ✓ (primary) | ✓ (primary) | ✓ (primary) |
+| `pcre`    | PCRE2 strict UTF validation          | ✓        |       |        |        |
+| `intl`    | ICU `UConverter::transcode()`        |          | ✓     |        |        |
+| `python3` | CPython codec, persistent subprocess | ✓        | ✓     |        |        |
+| `node`    | WHATWG `TextDecoder`, persistent subprocess | ✓ | ✓     |        |        |
+| `native`  | deprecated `utf8_encode()` / `utf8_decode()` | |       | ✓      | ✓ (valid input only) |
+
+Encode oracles answer "what is this ISO-8859-1 text as UTF-8?"; decode
+oracles the reverse. The `native` pair exists until PHP 9 removes it; on
+PHP 9+ it is reported as `oracle-unavailable` and skipped. Its decode
+side is trusted on valid input only: on ill-formed input the legacy
+decoder groups a well-formed lead byte with its expected continuation
+length and emits a single `?` in several classes — surrogates
+(`ED A0 80` → `?` vs `???`), sequences past U+10FFFF (`F4 90 80 80`),
+three/four-byte overlongs (`E0 80 AF`), and even a well-formed lead
+before an invalid continuation (`C2 C0`) — though it agrees with
+maximal subparts elsewhere (e.g. C0/C1 overlongs and lone
+continuations). WordPress deliberately follows the maximal-subpart
+semantics of `mb_convert_encoding()` (one `?` per subpart) instead:
+the PHP 9 polyfill in `compat.php` prefers `mb_convert_encoding()`
+with `_wp_utf8_decode_fallback()` as its mbstring-less shadow
+(ticket #63863).
+
+Because native and mb decoding agree on *every* valid code point
+(verified exhaustively over U+0000–U+10FFFF), the valid-input-only
+native decode differential adds little detection power beyond `mb`; it
+exists to scream if mb and the fallback ever jointly drift from legacy
+behavior on valid text. The legacy-vs-WordPress behavior on ill-formed
+input is a documented, intentional divergence — pinned here by battery
+vectors, not fuzzed. Cataloguing the full legacy divergence surface is
+the separate `legacy-utf8-divergence-survey` work lane.
 
 All scrub oracles implement the Unicode "maximal subpart" replacement
 recommendation (Unicode 16.0 §3.9, Table 3-8), which is the documented
@@ -38,7 +65,10 @@ External oracles are auto-detected; control them with
 ## Checks
 
 Differentials: both validity targets against every validity oracle, both
-scrub targets against every scrub oracle. Oracle-vs-oracle disagreements
+scrub targets against every scrub oracle, `_wp_utf8_encode_fallback()`
+against every encode oracle (input treated as ISO-8859-1), and
+`_wp_utf8_decode_fallback()` against every decode oracle (the `native`
+decode oracle on valid input only). Oracle-vs-oracle disagreements
 are reported separately (`oracle-disagreement`) so they don't masquerade
 as WordPress bugs.
 
@@ -52,6 +82,9 @@ Internal invariants:
 - scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points`
   chunks reconstructs the same scrubbed text and always makes forward
   progress (chunk sizes derive from the input hash, so replays are exact)
+- `_wp_utf8_encode_fallback()` output is always valid UTF-8
+- `_wp_utf8_decode_fallback( _wp_utf8_encode_fallback( $s ) ) === $s`
+  for any byte string `$s` (encode is total and injective per byte)
 
 ## Inputs
 
@@ -121,16 +154,18 @@ php tools/encoding-fuzz/tests/harness-smoke.php
 ```
 
 Verifies the oracle battery, runs the real targets over the battery
-vectors, and — most importantly — mutation-tests the harness: seven
+vectors, and — most importantly — mutation-tests the harness: fourteen
 classes of deliberately broken implementations (validator accepting
 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber,
 identity scrubber, byte-dropping scrubber, off-by-one code point count,
-throwing target) must all be caught. It also asserts generator
-determinism and the valid/invalid input mix.
+throwing target, cp1252-confused encoder, identity encoder, per-byte
+decoder, valid-input-mangling decoder, round-trip-violating decoder,
+null-returning encoder, sometimes-null decoder) must all be caught. It
+also asserts generator determinism and the valid/invalid input mix.
 
 For end-to-end pipeline testing while the real implementations are
-healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal` injects a broken
-target into worker, replay, and minimize alike:
+healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte`
+injects a broken target into worker, replay, and minimize alike:
 
 ```sh
 ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/runner.php --lanes 2 --duration-seconds 5
diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php
index ad242666c1f2f..cf6d7c8be2d7b 100644
--- a/tools/encoding-fuzz/lib/Checks.php
+++ b/tools/encoding-fuzz/lib/Checks.php
@@ -20,6 +20,16 @@
  *    chunks reconstructs the same scrubbed text and always makes
  *    forward progress
  *
+ * Legacy `utf8_encode()` / `utf8_decode()` fallbacks:
+ *  - `_wp_utf8_encode_fallback()` vs every encode oracle on arbitrary
+ *    input treated as ISO-8859-1.
+ *  - `_wp_utf8_decode_fallback()` vs the mb decode oracle on arbitrary
+ *    input; the legacy native oracle is consulted on valid input only
+ *    (see the divergence note in `Oracles`).
+ *  - encode output is always valid UTF-8
+ *  - `decode(encode(s)) === s` for any byte string `s` (encode is total
+ *    and injective per byte)
+ *
  * Target callables are injectable so the harness smoke test can verify
  * that deliberately broken implementations are caught.
  */
@@ -238,6 +248,174 @@ public function run( string $input ): array {
 			$failures[] = $chunk_failure;
 		}
 
+		// 8. Legacy utf8_encode()/utf8_decode() fallback differentials.
+		foreach ( $this->check_utf8_encode_decode( $input, $ref_valid, $mb_validity ) as $failure ) {
+			$failures[] = $failure;
+		}
+
+		return $failures;
+	}
+
+	/**
+	 * Differentials and invariants for the `utf8_encode()` /
+	 * `utf8_decode()` fallback pair. The same input is exercised both as
+	 * ISO-8859-1 (encode, total over arbitrary bytes) and as UTF-8
+	 * (decode). The legacy native decode oracle is consulted on valid
+	 * input only; on ill-formed input WordPress deliberately follows
+	 * `mb_convert_encoding()` maximal-subpart semantics instead.
+	 *
+	 * @return array<int, array{check: string, signature: string, detail: array}>
+	 */
+	private function check_utf8_encode_decode( string $input, bool $ref_valid, callable $mb_validity ): array {
+		$failures       = array();
+		$encode_oracles = $this->oracles->encode_oracles();
+		$decode_oracles = $this->oracles->decode_oracles();
+
+		/*
+		 * The fallbacks are untyped, so a broken variant could return null
+		 * (or anything else) instead of throwing; treat any non-string
+		 * return as a failure rather than silently skipping every check.
+		 */
+		$results = array();
+		foreach ( array( 'utf8_encode_fb', 'utf8_decode_fb' ) as $key ) {
+			try {
+				$result = ( $this->targets[ $key ] )( $input );
+
+				if ( ! is_string( $result ) ) {
+					$failures[] = self::failure(
+						'target-bad-return',
+						$key,
+						array(
+							'target' => $key,
+							'type'   => get_debug_type( $result ),
+						)
+					);
+					$result = null;
+				}
+			} catch ( \Throwable $error ) {
+				$failures[] = self::failure(
+					'target-exception',
+					$key,
+					array(
+						'target'  => $key,
+						'message' => $error->getMessage(),
+						'class'   => get_class( $error ),
+					)
+				);
+				$result = null;
+			}
+
+			$results[ $key ] = $result;
+		}
+
+		// Differentials against the encode/decode oracles.
+		$ref_encode = isset( $encode_oracles['mb'] ) ? $encode_oracles['mb']( $input ) : null;
+		$ref_decode = isset( $decode_oracles['mb'] ) ? $decode_oracles['mb']( $input ) : null;
+
+		if ( null !== $ref_encode && null !== $results['utf8_encode_fb'] && $results['utf8_encode_fb'] !== $ref_encode ) {
+			$failures[] = self::failure(
+				'utf8-encode-mismatch',
+				'utf8_encode_fb',
+				self::diff_detail( 'utf8_encode_fb', $ref_encode, $results['utf8_encode_fb'] )
+			);
+		}
+
+		if ( null !== $ref_decode && null !== $results['utf8_decode_fb'] && $results['utf8_decode_fb'] !== $ref_decode ) {
+			$failures[] = self::failure(
+				'utf8-decode-mismatch',
+				'utf8_decode_fb',
+				self::diff_detail( 'utf8_decode_fb', $ref_decode, $results['utf8_decode_fb'] )
+			);
+		}
+
+		if ( null !== $ref_encode ) {
+			foreach ( $encode_oracles as $name => $oracle ) {
+				if ( 'mb' === $name ) {
+					continue;
+				}
+
+				$oracle_encode = $oracle( $input );
+				if ( $oracle_encode !== $ref_encode ) {
+					$failures[] = self::failure(
+						'oracle-disagreement',
+						"utf8-encode:{$name}",
+						self::diff_detail( $name, $ref_encode, $oracle_encode )
+					);
+				}
+			}
+		}
+
+		if ( null !== $ref_decode ) {
+			foreach ( $decode_oracles as $name => $oracle ) {
+				if ( 'mb' === $name ) {
+					continue;
+				}
+
+				if ( ! $ref_valid && $this->oracles->decode_oracle_is_valid_only( $name ) ) {
+					continue;
+				}
+
+				$oracle_decode = $oracle( $input );
+				if ( $oracle_decode !== $ref_decode ) {
+					$failures[] = self::failure(
+						'oracle-disagreement',
+						"utf8-decode:{$name}",
+						self::diff_detail( $name, $ref_decode, $oracle_decode )
+					);
+				}
+			}
+		}
+
+		// Encode output must be valid UTF-8 (every byte has a code point).
+		// This and the round trip below need no conversion oracle.
+		if ( null !== $results['utf8_encode_fb'] && ! $mb_validity( $results['utf8_encode_fb'] ) ) {
+			$failures[] = self::failure(
+				'utf8-encode-not-valid',
+				'utf8_encode_fb',
+				array(
+					'target'         => 'utf8_encode_fb',
+					'encode_preview' => self::preview( $results['utf8_encode_fb'] ),
+				)
+			);
+		}
+
+		// Round trip: encode is total and injective per byte, so decoding
+		// its output must restore the input exactly. A violation implicates
+		// the pair, not a single side.
+		if ( null !== $results['utf8_encode_fb'] ) {
+			try {
+				$round_trip = ( $this->targets['utf8_decode_fb'] )( $results['utf8_encode_fb'] );
+			} catch ( \Throwable $error ) {
+				$failures[] = self::failure(
+					'target-exception',
+					'utf8_decode_fb:round-trip',
+					array(
+						'target'  => 'utf8_decode_fb',
+						'message' => $error->getMessage(),
+						'class'   => get_class( $error ),
+					)
+				);
+				$round_trip = $input;
+			}
+
+			if ( ! is_string( $round_trip ) ) {
+				$failures[] = self::failure(
+					'target-bad-return',
+					'utf8_decode_fb:round-trip',
+					array(
+						'target' => 'utf8_decode_fb',
+						'type'   => get_debug_type( $round_trip ),
+					)
+				);
+			} elseif ( $round_trip !== $input ) {
+				$failures[] = self::failure(
+					'utf8-round-trip-mismatch',
+					'round-trip',
+					self::diff_detail( 'round-trip', $input, $round_trip )
+				);
+			}
+		}
+
 		return $failures;
 	}
 
diff --git a/tools/encoding-fuzz/lib/Oracles.php b/tools/encoding-fuzz/lib/Oracles.php
index 8dce899c84db4..c3c4b0900947f 100644
--- a/tools/encoding-fuzz/lib/Oracles.php
+++ b/tools/encoding-fuzz/lib/Oracles.php
@@ -6,13 +6,33 @@
  *
  * Validity oracles answer "is this well-formed UTF-8?".
  * Scrub oracles answer "what does maximal-subpart replacement produce?".
+ * Encode oracles answer "what is this ISO-8859-1 text as UTF-8?".
+ * Decode oracles answer "what is this UTF-8 text as ISO-8859-1?".
  *
  *  - mbstring:  `mb_check_encoding()` / `mb_scrub()` (maximal subpart
- *               since PHP 8.1.6).
+ *               since PHP 8.1.6), `mb_convert_encoding()` for the
+ *               ISO-8859-1 encode/decode pair.
  *  - pcre:      PCRE2's strict UTF validity check (validity only).
  *  - intl:      ICU via `UConverter::transcode()` (scrub only).
  *  - python3:   CPython codec in a persistent subprocess.
  *  - node:      WHATWG TextDecoder in a persistent subprocess.
+ *  - native:    the deprecated `utf8_encode()` / `utf8_decode()` pair,
+ *               available until its removal in PHP 9. The decode side is
+ *               trusted on VALID input only: on ill-formed input the
+ *               legacy decoder groups bytes differently from the maximal
+ *               subpart rule, consuming a well-formed lead byte together
+ *               with its expected continuation length as a single '?'
+ *               unit in several classes — surrogates (`ED A0 80` → '?'
+ *               vs '???'), sequences past U+10FFFF (`F4 90 80 80` → '?'
+ *               vs '????'), three/four-byte overlongs (`E0 80 AF`), and
+ *               even a well-formed lead before an invalid continuation
+ *               (`C2 C0` → '?' vs '??'). It does agree with maximal
+ *               subparts elsewhere (e.g. C0/C1 overlongs and lone
+ *               continuations). WordPress deliberately follows
+ *               `mb_convert_encoding()` maximal-subpart semantics
+ *               instead: the PHP 9 polyfill in `compat.php` prefers
+ *               `mb_convert_encoding()`, with the fallback as its
+ *               shadow (ticket #63863).
  *
  * iconv is deliberately NOT an oracle: GNU libiconv accepts code points
  * above U+10FFFF (e.g. F4 90 80 80), so it fails the battery.
@@ -27,6 +47,22 @@ class Oracles {
 	/** @var array<string, callable(string): ?string> */
 	private array $scrub = array();
 
+	/*
+	 * Unlike validity/scrub oracles, encode/decode oracles are all
+	 * in-process and never return null; `Checks` has no transport-failure
+	 * handling for them. An external (nullable) encode/decode oracle
+	 * would need that handling added first.
+	 */
+
+	/** @var array<string, callable(string): string> */
+	private array $encode = array();
+
+	/** @var array<string, callable(string): string> */
+	private array $decode = array();
+
+	/** @var array<string, bool> Decode oracles trusted on valid UTF-8 input only. */
+	private array $decode_valid_only = array();
+
 	/** @var ExternalOracle[] */
 	private array $externals = array();
 
@@ -58,6 +94,41 @@ public static function build( array $external_names ): self {
 			);
 		}
 
+		if ( function_exists( 'mb_convert_encoding' ) ) {
+			// Encode is total over ISO-8859-1 bytes; no substitutions can occur.
+			$oracles->encode['mb'] = static function ( string $bytes ): string {
+				return mb_convert_encoding( $bytes, 'UTF-8', 'ISO-8859-1' );
+			};
+			// Pin the legacy '?' substitute per call (like the scrub oracle
+			// pins 0xFFFD) so ambient changes to the global cannot skew results.
+			$oracles->decode['mb'] = static function ( string $bytes ): string {
+				$previous = mb_substitute_character();
+				mb_substitute_character( 0x3F );
+				$decoded = mb_convert_encoding( $bytes, 'ISO-8859-1', 'UTF-8' );
+				mb_substitute_character( $previous );
+				return $decoded;
+			};
+		}
+
+		if ( function_exists( 'utf8_encode' ) && function_exists( 'utf8_decode' ) ) {
+			$oracles->encode['native'] = static function ( string $bytes ): string {
+				// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- Deprecated since PHP 8.2.
+				return (string) @utf8_encode( $bytes );
+			};
+			$oracles->decode['native'] = static function ( string $bytes ): string {
+				// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- Deprecated since PHP 8.2.
+				return (string) @utf8_decode( $bytes );
+			};
+
+			$oracles->decode_valid_only['native'] = true;
+		} else {
+			$oracles->events[] = array(
+				'type'   => 'oracle-unavailable',
+				'oracle' => 'native',
+				'detail' => 'utf8_encode()/utf8_decode() removed (PHP 9+); legacy encode/decode differential skipped',
+			);
+		}
+
 		// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
 		if ( false !== @preg_match( '/^./u', 'a' ) ) {
 			$oracles->validity['pcre'] = static function ( string $bytes ): bool {
@@ -148,6 +219,72 @@ public static function battery(): array {
 		);
 	}
 
+	/**
+	 * Known-answer vectors for the ISO-8859-1 → UTF-8 encode oracles.
+	 *
+	 * Every byte 0x00–0xFF is a defined ISO-8859-1 code point whose UTF-8
+	 * form is hand-computable: identity below 0x80, the two-byte sequence
+	 * `C2|C3 80..BF` above.
+	 *
+	 * @return array<int, array{0: string, 1: string}> [latin1 bytes, utf8 bytes]
+	 */
+	public static function encode_battery(): array {
+		return array(
+			array( '', '' ),
+			array( 'abc', 'abc' ),
+			array( "\x00", "\x00" ),
+			array( "\x7F", "\x7F" ),
+			array( "\x80", "\xC2\x80" ),             // First two-byte mapping.
+			array( "\x9F", "\xC2\x9F" ),             // NOT a Windows-1252 smart quote.
+			array( "\xA0", "\xC2\xA0" ),
+			array( "\xBF", "\xC2\xBF" ),             // Last byte with C2 lead.
+			array( "\xC0", "\xC3\x80" ),             // First byte with C3 lead.
+			array( "\xFF", "\xC3\xBF" ),
+			array( "B\xFCch", "B\xC3\xBCch" ),
+			array( "\xC3\xBC", "\xC3\x83\xC2\xBC" ), // Already-UTF-8 input double-encodes.
+		);
+	}
+
+	/**
+	 * Known-answer vectors for the UTF-8 → ISO-8859-1 decode oracles.
+	 *
+	 * Hand-computed: code points U+00–U+FF map to their byte, anything
+	 * higher becomes '?', and each maximal subpart of an ill-formed span
+	 * becomes one '?'. The valid flag marks vectors safe for decode
+	 * oracles that are trusted on valid input only (legacy `utf8_decode()`
+	 * groups some ill-formed sequences into a single '?' unit; see the
+	 * class docblock).
+	 *
+	 * @return array<int, array{0: string, 1: bool, 2: string}> [utf8 bytes, valid, latin1 bytes]
+	 */
+	public static function decode_battery(): array {
+		return array(
+			array( '', true, '' ),
+			array( 'abc', true, 'abc' ),
+			array( "\x00", true, "\x00" ),
+			array( "\xC2\x80", true, "\x80" ),                  // U+0080, first two-byte mapping.
+			array( "\xC3\xBC", true, "\xFC" ),                  // U+00FC ü.
+			array( "\xC3\xBF", true, "\xFF" ),                  // U+00FF, last mappable.
+			array( "\xC4\x80", true, '?' ),                     // U+0100, first unmappable.
+			array( "\xE2\x9C\x8F", true, '?' ),                 // U+270F.
+			array( "\xF0\x9F\x98\x80", true, '?' ),             // U+1F600.
+			array( "\xEF\xBB\xBF", true, '?' ),                 // BOM is unmappable, not dropped.
+			array( "a\xC3\xA9b", true, "a\xE9b" ),
+			array( "\x80", false, '?' ),                        // Lone continuation.
+			array( "\xC0", false, '?' ),                        // Never-valid lead.
+			array( "\xC0\xAF", false, '??' ),                   // Overlong '/': two subparts, NOT '/'.
+			array( "\xE2\x8C", false, '?' ),                    // Two-byte maximal subpart at EOF.
+			array( "\xE2\x8Cx", false, '?x' ),                  // Subpart cut short by ASCII.
+			array( "\xF1\x80\x80", false, '?' ),                // Three-byte maximal subpart.
+			array( "\xED\xA0\x80", false, '???' ),              // Surrogate: per subpart (legacy native says '?').
+			array( "\xF4\x90\x80\x80", false, '????' ),         // Past U+10FFFF (legacy native says '?').
+			array( ".\xC0.", false, '.?.' ),
+			array( "\xC3\xBC\x80", false, "\xFC?" ),            // Invalid span right after a mappable high byte.
+			array( "\x80\xC3\xBC", false, "?\xFC" ),            // Mappable high byte right after an invalid span.
+			array( "a\xF1\x80\x80\xE1\x80\xC2b", false, 'a???b' ), // Unicode Table 3-8.
+		);
+	}
+
 	private function verify_battery(): void {
 		foreach ( self::battery() as $i => $vector ) {
 			list( $bytes, $expected_valid, $expected_scrub ) = $vector;
@@ -178,14 +315,69 @@ private function verify_battery(): void {
 				}
 			}
 		}
+
+		foreach ( self::encode_battery() as $i => $vector ) {
+			list( $bytes, $expected ) = $vector;
+
+			foreach ( $this->encode as $name => $check ) {
+				$got = $check( $bytes );
+				if ( $got !== $expected ) {
+					$this->disable( $name, sprintf(
+						'encode battery vector %d (%s): expected %s, got %s',
+						$i,
+						bin2hex( $bytes ),
+						bin2hex( $expected ),
+						null === $got ? 'null' : bin2hex( $got )
+					) );
+				}
+			}
+		}
+
+		foreach ( self::decode_battery() as $i => $vector ) {
+			list( $bytes, $input_valid, $expected ) = $vector;
+
+			foreach ( $this->decode as $name => $check ) {
+				if ( ! $input_valid && $this->decode_oracle_is_valid_only( $name ) ) {
+					continue;
+				}
+
+				$got = $check( $bytes );
+				if ( $got !== $expected ) {
+					$this->disable( $name, sprintf(
+						'decode battery vector %d (%s): expected %s, got %s',
+						$i,
+						bin2hex( $bytes ),
+						bin2hex( $expected ),
+						null === $got ? 'null' : bin2hex( $got )
+					) );
+				}
+			}
+		}
 	}
 
+	/**
+	 * Removes every role a named oracle backs. Note that disabling `mb`
+	 * therefore makes `has_required()` false and the harness refuses to
+	 * run — failing closed is preferable to fuzzing without the primary
+	 * oracle.
+	 */
 	public function disable( string $name, string $detail ): void {
-		if ( ! isset( $this->validity[ $name ] ) && ! isset( $this->scrub[ $name ] ) ) {
+		if (
+			! isset( $this->validity[ $name ] ) &&
+			! isset( $this->scrub[ $name ] ) &&
+			! isset( $this->encode[ $name ] ) &&
+			! isset( $this->decode[ $name ] )
+		) {
 			return;
 		}
 
-		unset( $this->validity[ $name ], $this->scrub[ $name ] );
+		unset(
+			$this->validity[ $name ],
+			$this->scrub[ $name ],
+			$this->encode[ $name ],
+			$this->decode[ $name ],
+			$this->decode_valid_only[ $name ]
+		);
 		$this->events[] = array(
 			'type'   => 'oracle-disabled',
 			'oracle' => $name,
@@ -203,12 +395,31 @@ public function scrub_oracles(): array {
 		return $this->scrub;
 	}
 
+	/** @return array<string, callable(string): string> */
+	public function encode_oracles(): array {
+		return $this->encode;
+	}
+
+	/** @return array<string, callable(string): string> */
+	public function decode_oracles(): array {
+		return $this->decode;
+	}
+
+	public function decode_oracle_is_valid_only( string $name ): bool {
+		return $this->decode_valid_only[ $name ] ?? false;
+	}
+
 	public function has_required(): bool {
 		return isset( $this->validity['mb'], $this->scrub['mb'] );
 	}
 
 	public function names(): array {
-		return array_values( array_unique( array_merge( array_keys( $this->validity ), array_keys( $this->scrub ) ) ) );
+		return array_values( array_unique( array_merge(
+			array_keys( $this->validity ),
+			array_keys( $this->scrub ),
+			array_keys( $this->encode ),
+			array_keys( $this->decode )
+		) ) );
 	}
 
 	/** @return array<int, array{type: string, oracle: string, detail: string}> */
diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php
index f810d9d934eda..6c4e3412e5776 100644
--- a/tools/encoding-fuzz/lib/Targets.php
+++ b/tools/encoding-fuzz/lib/Targets.php
@@ -9,8 +9,10 @@
  * be exercised end to end even while the real implementations are
  * healthy. It exists only for harness validation:
  *
- *   ENCODING_FUZZ_FAULT=accept-c0    validator accepts the 0xC0 byte
- *   ENCODING_FUZZ_FAULT=non-maximal  scrubber collapses adjacent U+FFFD
+ *   ENCODING_FUZZ_FAULT=accept-c0       validator accepts the 0xC0 byte
+ *   ENCODING_FUZZ_FAULT=non-maximal     scrubber collapses adjacent U+FFFD
+ *   ENCODING_FUZZ_FAULT=encode-cp1252   encoder maps 0x80 like Windows-1252
+ *   ENCODING_FUZZ_FAULT=decode-per-byte decoder emits '?' per invalid byte
  */
 class Targets {
 	/**
@@ -23,6 +25,8 @@ public static function resolve(): array {
 			'scrub'           => 'wp_scrub_utf8',
 			'scrub_fb'        => '_wp_scrub_utf8_fallback',
 			'codepoint_count' => '_wp_utf8_codepoint_count',
+			'utf8_encode_fb'  => '_wp_utf8_encode_fallback',
+			'utf8_decode_fb'  => '_wp_utf8_decode_fallback',
 		);
 
 		switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) {
@@ -37,8 +41,47 @@ public static function resolve(): array {
 					return (string) preg_replace( "/(\u{FFFD})+/u", "\u{FFFD}", _wp_scrub_utf8_fallback( $bytes ) );
 				};
 				break;
+
+			case 'encode-cp1252':
+				// 0x80 is U+0080 in ISO-8859-1 but '€' in Windows-1252; a
+				// classic confusion of the two encodings.
+				$targets['utf8_encode_fb'] = static function ( string $bytes ): string {
+					return str_replace( "\xC2\x80", "\xE2\x82\xAC", _wp_utf8_encode_fallback( $bytes ) );
+				};
+				break;
+
+			case 'decode-per-byte':
+				$targets['utf8_decode_fb'] = self::decode_per_invalid_byte( ... );
+				break;
 		}
 
 		return $targets;
 	}
+
+	/**
+	 * Deliberately broken decoder: emits one '?' for every byte of an
+	 * invalid span instead of one per maximal subpart, so multi-byte
+	 * subparts like `E2 8C` produce '??' instead of '?'.
+	 */
+	public static function decode_per_invalid_byte( string $bytes ): string {
+		$at             = 0;
+		$was_at         = 0;
+		$invalid_length = 0;
+		$end            = strlen( $bytes );
+		$out            = '';
+
+		while ( $at < $end ) {
+			_wp_scan_utf8( $bytes, $at, $invalid_length );
+			$out .= _wp_utf8_decode_fallback( substr( $bytes, $was_at, $at - $was_at ) );
+
+			if ( $invalid_length > 0 ) {
+				$out .= str_repeat( '?', $invalid_length );
+				$at  += $invalid_length;
+			}
+
+			$was_at = $at;
+		}
+
+		return $out;
+	}
 }
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
index 9d64e0bb84294..59b89327be408 100644
--- a/tools/encoding-fuzz/tests/harness-smoke.php
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -53,10 +53,15 @@ function check( string $label, bool $ok, string $detail = '' ): void {
 // ---------------------------------------------------------------------
 // 2. Real targets pass every check on the battery vectors.
 // ---------------------------------------------------------------------
-$checks        = new Checks( $oracles );
-$battery_fails = array();
-foreach ( Oracles::battery() as $i => $vector ) {
-	foreach ( $checks->run( $vector[0] ) as $failure ) {
+$checks          = new Checks( $oracles );
+$battery_fails   = array();
+$battery_vectors = array_merge(
+	array_column( Oracles::battery(), 0 ),
+	array_column( Oracles::encode_battery(), 0 ),
+	array_column( Oracles::decode_battery(), 0 )
+);
+foreach ( $battery_vectors as $i => $bytes ) {
+	foreach ( $checks->run( $bytes ) as $failure ) {
 		$battery_fails[] = "vector {$i}: {$failure['signature']}";
 	}
 }
@@ -71,18 +76,21 @@ function check( string $label, bool $ok, string $detail = '' ): void {
 	'scrub'           => 'wp_scrub_utf8',
 	'scrub_fb'        => '_wp_scrub_utf8_fallback',
 	'codepoint_count' => '_wp_utf8_codepoint_count',
+	'utf8_encode_fb'  => '_wp_utf8_encode_fallback',
+	'utf8_decode_fb'  => '_wp_utf8_decode_fallback',
 );
 
 /**
- * Runs the battery against a broken variant and reports which checks fired.
+ * Runs every battery vector against a broken variant and reports which
+ * checks fired.
  *
  * @return string[] Distinct check names observed.
  */
-function broken_run( Oracles $oracles, array $real, array $overrides ): array {
+function broken_run( Oracles $oracles, array $real, array $vectors, array $overrides ): array {
 	$checks = new Checks( $oracles, array_merge( $real, $overrides ) );
 	$seen   = array();
-	foreach ( Oracles::battery() as $vector ) {
-		foreach ( $checks->run( $vector[0] ) as $failure ) {
+	foreach ( $vectors as $bytes ) {
+		foreach ( $checks->run( $bytes ) as $failure ) {
 			$seen[ $failure['check'] ] = true;
 		}
 	}
@@ -90,26 +98,26 @@ function broken_run( Oracles $oracles, array $real, array $overrides ): array {
 }
 
 // 3a. Validator that wrongly accepts a never-valid byte.
-$seen = broken_run( $oracles, $real_targets, array(
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'is_valid_fb' => static fn( string $bytes ): bool => str_contains( $bytes, "\xC0" ) ? true : _wp_is_valid_utf8_fallback( $bytes ),
 ) );
 check( 'catches validator accepting 0xC0', in_array( 'validity-mismatch', $seen, true ), implode( ',', $seen ) );
 
 // 3b. Validator that wrongly rejects noncharacters (a plausible spec misreading).
-$seen = broken_run( $oracles, $real_targets, array(
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'is_valid' => static fn( string $bytes ): bool => wp_is_valid_utf8( $bytes ) && ! wp_has_noncharacters( $bytes ),
 ) );
 check( 'catches validator rejecting noncharacters', in_array( 'validity-mismatch', $seen, true ), implode( ',', $seen ) );
 
 // 3c. Scrubber that collapses adjacent replacement characters (one-FFFD-per-run
 //     instead of one per maximal subpart).
-$seen = broken_run( $oracles, $real_targets, array(
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'scrub_fb' => static fn( string $bytes ): string => (string) preg_replace( "/(\u{FFFD})+/u", "\u{FFFD}", _wp_scrub_utf8_fallback( $bytes ) ),
 ) );
 check( 'catches non-maximal-subpart scrubber', in_array( 'scrub-mismatch', $seen, true ), implode( ',', $seen ) );
 
 // 3d. Scrubber that passes invalid bytes through untouched.
-$seen = broken_run( $oracles, $real_targets, array(
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'scrub_fb' => static fn( string $bytes ): string => $bytes,
 ) );
 check(
@@ -119,25 +127,76 @@ function broken_run( Oracles $oracles, array $real, array $overrides ): array {
 );
 
 // 3e. Scrubber that drops invalid bytes instead of replacing them.
-$seen = broken_run( $oracles, $real_targets, array(
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'scrub' => static fn( string $bytes ): string => str_replace( "\u{FFFD}", '', wp_scrub_utf8( $bytes ) ),
 ) );
 check( 'catches byte-dropping scrubber', in_array( 'scrub-mismatch', $seen, true ), implode( ',', $seen ) );
 
 // 3f. Code point counter that counts invalid bytes individually.
-$seen = broken_run( $oracles, $real_targets, array(
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'codepoint_count' => static fn( string $bytes ): int => _wp_utf8_codepoint_count( $bytes ) + ( wp_is_valid_utf8( $bytes ) ? 0 : 1 ),
 ) );
 check( 'catches off-by-one code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) );
 
 // 3g. Throwing target is reported, not fatal.
-$seen = broken_run( $oracles, $real_targets, array(
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'is_valid_fb' => static function ( string $bytes ): bool {
 		throw new \RuntimeException( 'boom' );
 	},
 ) );
 check( 'reports throwing target', in_array( 'target-exception', $seen, true ), implode( ',', $seen ) );
 
+// 3h. Encoder that confuses ISO-8859-1 with Windows-1252 (0x80 becomes '€').
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'utf8_encode_fb' => static fn( string $bytes ): string => str_replace( "\xC2\x80", "\xE2\x82\xAC", _wp_utf8_encode_fallback( $bytes ) ),
+) );
+check( 'catches cp1252-confused encoder', in_array( 'utf8-encode-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3i. Encoder that passes high bytes through raw (invalid UTF-8 output).
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'utf8_encode_fb' => static fn( string $bytes ): string => $bytes,
+) );
+check(
+	'catches identity encoder',
+	in_array( 'utf8-encode-mismatch', $seen, true ) && in_array( 'utf8-encode-not-valid', $seen, true ),
+	implode( ',', $seen )
+);
+
+// 3j. Decoder that emits one '?' per invalid byte instead of per maximal
+//     subpart (`E2 8C` becomes '??' instead of '?').
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'utf8_decode_fb' => Targets::decode_per_invalid_byte( ... ),
+) );
+check( 'catches per-byte decoder', in_array( 'utf8-decode-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3k. Decoder that mangles a mappable code point on fully valid input.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'utf8_decode_fb' => static fn( string $bytes ): string => str_replace( "\xFC", "\xFD", _wp_utf8_decode_fallback( $bytes ) ),
+) );
+check( 'catches decoder mangling valid input', in_array( 'utf8-decode-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3l. Decoder that drops U+0080 entirely; the encode→decode round trip
+//     must restore every input byte string exactly.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'utf8_decode_fb' => static fn( string $bytes ): string => str_replace( "\x80", '', _wp_utf8_decode_fallback( $bytes ) ),
+) );
+check( 'catches round-trip violation', in_array( 'utf8-round-trip-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3m. Encoder that returns null (the fallbacks are untyped, so a broken
+//     variant can return non-strings without throwing); must be reported,
+//     not silently skipped by every encode-side check.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'utf8_encode_fb' => static fn( string $bytes ) => null,
+) );
+check( 'catches null-returning encoder', in_array( 'target-bad-return', $seen, true ), implode( ',', $seen ) );
+
+// 3n. Decoder that returns null only for some inputs; must be reported
+//     from both the direct call and the round-trip path without crashing.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'utf8_decode_fb' => static fn( string $bytes ) => str_contains( $bytes, "\x80" ) ? null : _wp_utf8_decode_fallback( $bytes ),
+) );
+check( 'catches sometimes-null decoder', in_array( 'target-bad-return', $seen, true ), implode( ',', $seen ) );
+
 // ---------------------------------------------------------------------
 // 4. Generator determinism and mix.
 // ---------------------------------------------------------------------

From 73653ef12338b2553213127b36ae03fde2b241aa Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Wed, 10 Jun 2026 22:57:59 +0200
Subject: [PATCH 04/14] Fuzzer: Add wp_has_noncharacters three-way differential
 on valid input.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fuzz wp_has_noncharacters() (PCRE branch) and
_wp_has_noncharacters_fallback() against a trivial mb_str_split/mb_ord
reference oracle, on valid input only. On ill-formed input the public
function's answer depends on which environment branch of utf8.php
loaded — the PCRE branch returns false whenever preg_match fails while
the fallback skips invalid spans and reports noncharacters around them
("\xC0\xEF\xBF\xBE": PCRE false, fallback true). Per the handoff's
option (a), the fuzzer treats behavior as undefined unless
wp_is_valid_utf8() accepts the input, and pins the divergence with a
fixed regression vector in the smoke test; whether core aligns the
implementations or documents the stance remains an open question for
the function author.

The reference oracle's battery covers the boundaries and interior of
the U+FDD0–U+FDEF block and the final two code points of EVERY plane
with their lower neighbors — the PCRE implementation enumerates each
plane as a separate hand-typed escape, so a single-plane typo is the
realistic bug class and now has deterministic coverage. The oracle
throws on ill-formed input rather than silently coercing mb_ord(false).
BOUNDARY_CODE_POINTS gains block-interior, adjacent-negative, and
mid-plane code points (seed re-derivation of older findings is
invalidated; documented in the README — artifact replays are
unaffected).

Mutation variants: blind detector, U+FDD0-block miss, over-eager
detector (shared between the smoke test and the new
ENCODING_FUZZ_FAULT=nonchars-miss-fdd0|nonchars-overeager fault modes,
one per target; both fault modes verified through worker, replay, and
minimize).
Worker environment metadata now records pcre_u (which utf8.php branch
loaded) and the active fault name so injected artifacts can never be
mistaken for real findings.
---
 handoffs/extend-encoding-fuzzer.md          |  47 ++++----
 tools/encoding-fuzz/README.md               |  64 ++++++++---
 tools/encoding-fuzz/lib/Checks.php          |  95 ++++++++++++++++
 tools/encoding-fuzz/lib/Cli.php             |   5 +
 tools/encoding-fuzz/lib/Generator.php       |   6 +-
 tools/encoding-fuzz/lib/Oracles.php         | 113 +++++++++++++++++++-
 tools/encoding-fuzz/lib/Targets.php         |  38 ++++++-
 tools/encoding-fuzz/tests/harness-smoke.php |  47 +++++++-
 8 files changed, 371 insertions(+), 44 deletions(-)

diff --git a/handoffs/extend-encoding-fuzzer.md b/handoffs/extend-encoding-fuzzer.md
index 7fdd0f6797841..29e7effe5ef82 100644
--- a/handoffs/extend-encoding-fuzzer.md
+++ b/handoffs/extend-encoding-fuzzer.md
@@ -2,7 +2,8 @@
 
 ## Status
 
-Sections 1 (utf8_encode/decode) DONE; sections 2–3 in progress. The
+Sections 1 (utf8_encode/decode) and 2 (wp_has_noncharacters) DONE;
+section 3 in progress. The
 host fuzzer (`tools/encoding-fuzz/`) is complete and working on branch
 `fuzz-encoder`; read its `README.md` first. ~570k cases had run clean
 against the original targets before this work started.
@@ -56,7 +57,7 @@ backslash text, and the `$i < 0xD800 || $i > 0xE000` boundary routes
 valid U+E000 through the broken branch. It only ever asserts
 mb-equivalence on valid input. Worth a follow-up patch on #63863.
 
-## 2. wp_has_noncharacters — resolve semantics first
+## 2. wp_has_noncharacters — DONE via option (a); core decision still open
 
 **Known divergence, confirmed empirically (2026-06-10):**
 
@@ -66,24 +67,30 @@ wp_has_noncharacters( $probe );             // false — PCRE path: preg_match f
 _wp_has_noncharacters_fallback( $probe );   // true  — scan skips invalid spans, finds U+FFFE
 ```
 
-The same public function answers differently depending on which
-environment branch of `src/wp-includes/utf8.php` loaded. A naive
-differential will fail on roughly its first invalid-input case. Do NOT
-just add the check and let it scream:
-
-1. Decide (or get a decision on) intended behavior for ill-formed
-   input. Options: (a) document that behavior is undefined unless
-   `wp_is_valid_utf8()` — then fuzz the differential on valid inputs
-   only, plus a fixed regression vector for the documented stance;
-   (b) align the implementations (likely the fallback is the *better*
-   semantic — finding real noncharacters — but the PCRE version ships
-   on most hosts). This probably warrants a Trac ticket / discussion
-   with the function author before code changes.
-2. Either way, fuzz the three-way differential on **valid** inputs
-   immediately: PCRE implementation vs fallback vs a trivial reference
-   (decode code points, check the U+FDD0–U+FDEF / U+xFFFE / U+xFFFF
-   list). The generator already emits noncharacter-dense input
-   (`BOUNDARY_CODE_POINTS` in `lib/Generator.php`).
+**Implemented as option (a):** the fuzzer treats behavior as undefined
+unless `wp_is_valid_utf8()` and runs the three-way differential —
+`wp_has_noncharacters()` (PCRE branch) vs
+`_wp_has_noncharacters_fallback()` vs a trivial `mb_str_split()` /
+`mb_ord()` reference (battery-verified at block boundaries, block
+interior, and the final two code points of every plane with their
+neighbors — the PCRE class enumerates each plane by hand, so per-plane
+vectors are the point) — on **valid inputs only**. The probe above is
+pinned as a fixed regression vector in the smoke test, so any semantic
+change to either branch surfaces immediately. `BOUNDARY_CODE_POINTS`
+in `lib/Generator.php` gained adjacent NON-noncharacters, a block
+interior point, and mid-plane finals. Mutation variants: blind
+detector, U+FDD0-block miss, over-eager detector; fault injection:
+`ENCODING_FUZZ_FAULT=nonchars-miss-fdd0|nonchars-overeager` (one per
+target).
+
+**Still open upstream (option b path):** whether core should align the
+implementations or document the undefined-on-invalid stance in the
+`wp_has_noncharacters()` docblock. That needs a decision from the
+function author (Trac discussion). Note for whoever picks that up: if
+core aligns on PCRE semantics (false on any ill-formed input), the mb
+reference oracle and its battery must be extended for ill-formed input
+too — removing the valid-only gate alone is NOT sufficient, since the
+reference throws on ill-formed input by design.
 
 ## 3. code_point_to_utf8_bytes — exhaust, don't fuzz
 
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index 80ac3da42df19..23b9101ca51f4 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -5,6 +5,7 @@ Differential fuzzer for the WordPress UTF-8 functions:
 - `wp_is_valid_utf8()` / `_wp_is_valid_utf8_fallback()`
 - `wp_scrub_utf8()` / `_wp_scrub_utf8_fallback()`
 - `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()`
+- `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` (valid input only)
 - `_wp_utf8_codepoint_count()` and the resumable `_wp_scan_utf8()` paths (secondary)
 
 The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main
@@ -16,14 +17,14 @@ bootstrap, database, or `wp-env`.
 
 Every result is compared against independent known-good implementations:
 
-| Oracle    | Backing                              | Validity | Scrub | Encode | Decode |
-|-----------|--------------------------------------|----------|-------|--------|--------|
-| `mb`      | `mb_check_encoding()` / `mb_scrub()` / `mb_convert_encoding()` | ✓ | ✓ (primary) | ✓ (primary) | ✓ (primary) |
-| `pcre`    | PCRE2 strict UTF validation          | ✓        |       |        |        |
-| `intl`    | ICU `UConverter::transcode()`        |          | ✓     |        |        |
-| `python3` | CPython codec, persistent subprocess | ✓        | ✓     |        |        |
-| `node`    | WHATWG `TextDecoder`, persistent subprocess | ✓ | ✓     |        |        |
-| `native`  | deprecated `utf8_encode()` / `utf8_decode()` | |       | ✓      | ✓ (valid input only) |
+| Oracle    | Backing                              | Validity | Scrub | Encode | Decode | Nonchars |
+|-----------|--------------------------------------|----------|-------|--------|--------|----------|
+| `mb`      | `mb_check_encoding()` / `mb_scrub()` / `mb_convert_encoding()` / `mb_str_split()`+`mb_ord()` | ✓ | ✓ (primary) | ✓ (primary) | ✓ (primary) | ✓ (valid input only) |
+| `pcre`    | PCRE2 strict UTF validation          | ✓        |       |        |        |          |
+| `intl`    | ICU `UConverter::transcode()`        |          | ✓     |        |        |          |
+| `python3` | CPython codec, persistent subprocess | ✓        | ✓     |        |        |          |
+| `node`    | WHATWG `TextDecoder`, persistent subprocess | ✓ | ✓     |        |        |          |
+| `native`  | deprecated `utf8_encode()` / `utf8_decode()` | |       | ✓      | ✓ (valid input only) |  |
 
 Encode oracles answer "what is this ISO-8859-1 text as UTF-8?"; decode
 oracles the reverse. The `native` pair exists until PHP 9 removes it; on
@@ -41,6 +42,15 @@ the PHP 9 polyfill in `compat.php` prefers `mb_convert_encoding()`
 with `_wp_utf8_decode_fallback()` as its mbstring-less shadow
 (ticket #63863).
 
+The `mb` noncharacter oracle (a trivial decode-and-test over
+`mb_str_split()` / `mb_ord()`) backs the `wp_has_noncharacters()`
+differential. Like every oracle it must pass a hand-derived battery,
+which covers the boundaries and interior of the U+FDD0–U+FDEF block
+and the final two code points of every plane with their neighbors —
+the PCRE implementation under test enumerates each plane as a separate
+hand-typed escape, so per-plane coverage is the point. It is defined
+on valid input only — see the noncharacter policy under Checks.
+
 Because native and mb decoding agree on *every* valid code point
 (verified exhaustively over U+0000–U+10FFFF), the valid-input-only
 native decode differential adds little detection power beyond `mb`; it
@@ -72,6 +82,23 @@ decode oracle on valid input only). Oracle-vs-oracle disagreements
 are reported separately (`oracle-disagreement`) so they don't masquerade
 as WordPress bugs.
 
+Noncharacter detection is a three-way differential on **valid input
+only**: `wp_has_noncharacters()` (the PCRE branch on hosts with
+PCRE-u; without PCRE-u the public function aliases the fallback and
+the differential degenerates to two distinct implementations — the
+worker records which branch loaded as `pcre_u` in its environment
+metadata), `_wp_has_noncharacters_fallback()`, and the trivial mb
+reference must agree. On ill-formed input the public function's answer
+depends on which environment branch of `utf8.php` loaded — the PCRE
+branch returns false for any ill-formed input because `preg_match`
+fails, while the fallback skips invalid spans and reports the
+noncharacters around them (`"\xC0\xEF\xBF\xBE"`: PCRE false, fallback
+true). The fuzzer's stance is that behavior is undefined unless
+`wp_is_valid_utf8()`; the divergence itself is pinned by a fixed
+regression vector in the smoke test, and aligning the implementations
+(or documenting the stance in core) is an open upstream question for
+the function author.
+
 Internal invariants:
 
 - valid ⟺ scrub returns the input unchanged
@@ -88,7 +115,11 @@ Internal invariants:
 
 ## Inputs
 
-Each case is fully determined by `(seed, case index)`. The generator
+Each case is fully determined by `(seed, case index)` **for a given
+generator version**: changing the generator (e.g. its boundary code
+point list) invalidates `--seed`/`--case` re-derivation of older
+findings. Failure artifacts embed the input bytes, so `--failure` and
+`--input` replays remain valid across versions. The generator
 mixes nine strategies: uniformly random bytes, random ASCII,
 boundary-heavy valid UTF-8 (encoding-length edges, surrogate-gap edges,
 noncharacters, BOM, U+10FFFF), mutated valid UTF-8 (bit flips,
@@ -154,18 +185,23 @@ php tools/encoding-fuzz/tests/harness-smoke.php
 ```
 
 Verifies the oracle battery, runs the real targets over the battery
-vectors, and — most importantly — mutation-tests the harness: fourteen
+vectors, and — most importantly — mutation-tests the harness: seventeen
 classes of deliberately broken implementations (validator accepting
 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber,
 identity scrubber, byte-dropping scrubber, off-by-one code point count,
 throwing target, cp1252-confused encoder, identity encoder, per-byte
 decoder, valid-input-mangling decoder, round-trip-violating decoder,
-null-returning encoder, sometimes-null decoder) must all be caught. It
-also asserts generator determinism and the valid/invalid input mix.
+null-returning encoder, sometimes-null decoder, blind noncharacter
+detector, U+FDD0-block-missing detector, over-eager noncharacter
+detector) must all be caught. It also asserts generator determinism,
+the valid/invalid input mix, and the documented
+`wp_has_noncharacters()` divergence stance on ill-formed input.
 
 For end-to-end pipeline testing while the real implementations are
-healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte`
-injects a broken target into worker, replay, and minimize alike:
+healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager`
+injects a broken target into worker, replay, and minimize alike.
+Fault-injected artifacts record the fault name in their environment
+metadata so they cannot be mistaken for real findings:
 
 ```sh
 ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/runner.php --lanes 2 --duration-seconds 5
diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php
index cf6d7c8be2d7b..14bffa042acdd 100644
--- a/tools/encoding-fuzz/lib/Checks.php
+++ b/tools/encoding-fuzz/lib/Checks.php
@@ -20,6 +20,12 @@
  *    chunks reconstructs the same scrubbed text and always makes
  *    forward progress
  *
+ * Noncharacter detection (VALID input only — the public function's
+ * answer on ill-formed input depends on which environment branch of
+ * `utf8.php` loaded, a documented divergence pinned by the smoke test):
+ *  - `wp_has_noncharacters()` and `_wp_has_noncharacters_fallback()` vs
+ *    a trivial decode-and-test reference.
+ *
  * Legacy `utf8_encode()` / `utf8_decode()` fallbacks:
  *  - `_wp_utf8_encode_fallback()` vs every encode oracle on arbitrary
  *    input treated as ISO-8859-1.
@@ -253,6 +259,95 @@ public function run( string $input ): array {
 			$failures[] = $failure;
 		}
 
+		// 9. Noncharacter detection, on valid input only.
+		foreach ( $this->check_noncharacters( $input, $ref_valid ) as $failure ) {
+			$failures[] = $failure;
+		}
+
+		return $failures;
+	}
+
+	/**
+	 * Three-way differential for noncharacter detection on VALID input:
+	 * the public `wp_has_noncharacters()` (the PCRE branch on hosts with
+	 * PCRE-u; otherwise it aliases the fallback and this degenerates to
+	 * two distinct implementations), the `_wp_scan_utf8()`-based
+	 * fallback, and the trivial mb reference must all agree.
+	 *
+	 * Ill-formed input is deliberately skipped: the PCRE branch answers
+	 * false on any ill-formed input (`preg_match` fails) while the
+	 * fallback skips invalid spans and reports noncharacters around
+	 * them, so the same public function answers differently depending
+	 * on which environment branch loaded. That stance — behavior is
+	 * undefined unless `wp_is_valid_utf8()` — is pinned by a fixed
+	 * regression vector in the smoke test, not fuzzed.
+	 *
+	 * @return array<int, array{check: string, signature: string, detail: array}>
+	 */
+	private function check_noncharacters( string $input, bool $ref_valid ): array {
+		if ( ! $ref_valid ) {
+			return array();
+		}
+
+		$oracles = $this->oracles->noncharacter_oracles();
+		if ( ! isset( $oracles['mb'] ) ) {
+			return array();
+		}
+
+		$failures = array();
+		$expected = $oracles['mb']( $input );
+
+		foreach ( $oracles as $name => $oracle ) {
+			if ( 'mb' === $name ) {
+				continue;
+			}
+
+			$oracle_result = $oracle( $input );
+			if ( $oracle_result !== $expected ) {
+				$failures[] = self::failure(
+					'oracle-disagreement',
+					"noncharacters:{$name}",
+					array(
+						'kind'     => 'noncharacters',
+						'oracle'   => $name,
+						'got'      => $oracle_result,
+						'expected' => $expected,
+					)
+				);
+			}
+		}
+
+		foreach ( array( 'has_nonchars', 'has_nonchars_fb' ) as $key ) {
+			try {
+				$result = ( $this->targets[ $key ] )( $input );
+			} catch ( \Throwable $error ) {
+				$failures[] = self::failure(
+					'target-exception',
+					$key,
+					array(
+						'target'  => $key,
+						'message' => $error->getMessage(),
+						'class'   => get_class( $error ),
+					)
+				);
+				continue;
+			}
+
+			if ( $result !== $expected ) {
+				$failures[] = self::failure(
+					'noncharacters-mismatch',
+					$key,
+					array(
+						'target'        => $key,
+						'got'           => $result,
+						'expected'      => $expected,
+						'oracle'        => 'mb',
+						'input_preview' => self::preview( $input ),
+					)
+				);
+			}
+		}
+
 		return $failures;
 	}
 
diff --git a/tools/encoding-fuzz/lib/Cli.php b/tools/encoding-fuzz/lib/Cli.php
index 14c5d4f671324..3ddd47679b5ab 100644
--- a/tools/encoding-fuzz/lib/Cli.php
+++ b/tools/encoding-fuzz/lib/Cli.php
@@ -107,6 +107,11 @@ public static function environment_metadata( Oracles $oracles ): array {
 			'php'     => PHP_VERSION,
 			'os'      => PHP_OS_FAMILY,
 			'oracles' => $oracles->names(),
+			// Which environment branch of utf8.php loaded (PCRE vs fallback).
+			'pcre_u'  => function_exists( '_wp_can_use_pcre_u' ) ? _wp_can_use_pcre_u() : null,
+			// Mark fault-injected artifacts so they can never be mistaken
+			// for real findings.
+			'fault'   => getenv( 'ENCODING_FUZZ_FAULT' ) ?: null,
 		);
 	}
 }
diff --git a/tools/encoding-fuzz/lib/Generator.php b/tools/encoding-fuzz/lib/Generator.php
index eb07d7d89183c..bcd4a935d5b31 100644
--- a/tools/encoding-fuzz/lib/Generator.php
+++ b/tools/encoding-fuzz/lib/Generator.php
@@ -25,9 +25,11 @@ class Generator {
 		0x80, 0x7FF,                                               // Two-byte edges.
 		0x800, 0xFFF, 0x1000, 0xCFFF, 0xD000, 0xD7FF,              // Three-byte lead splits.
 		0xE000, 0xFFFD,                                            // After the surrogate gap.
-		0xFDD0, 0xFDEF, 0xFFFE, 0xFFFF,                            // Noncharacters (valid UTF-8!).
+		0xFDD0, 0xFDDA, 0xFDEF, 0xFFFE, 0xFFFF,                    // Noncharacters (valid UTF-8!), incl. block interior.
+		0xFDCF, 0xFDF0,                                            // Adjacent NON-noncharacters.
 		0x10000, 0x3FFFF, 0x40000, 0xFFFFF, 0x100000, 0x10FFFF,    // Four-byte lead splits.
-		0x1FFFE, 0x1FFFF, 0x10FFFE,                                // Supplementary noncharacters.
+		0x1FFFD, 0x1FFFE, 0x1FFFF, 0x5FFFE, 0x8FFFF, 0x10FFFE,     // Supplementary noncharacters, mid planes, neighbors.
+		0x10FFFD,
 	);
 
 	/**
diff --git a/tools/encoding-fuzz/lib/Oracles.php b/tools/encoding-fuzz/lib/Oracles.php
index c3c4b0900947f..88a5df3b1ae41 100644
--- a/tools/encoding-fuzz/lib/Oracles.php
+++ b/tools/encoding-fuzz/lib/Oracles.php
@@ -8,6 +8,9 @@
  * Scrub oracles answer "what does maximal-subpart replacement produce?".
  * Encode oracles answer "what is this ISO-8859-1 text as UTF-8?".
  * Decode oracles answer "what is this UTF-8 text as ISO-8859-1?".
+ * Noncharacter oracles answer "does this VALID UTF-8 text contain a
+ * Unicode noncharacter?" (U+FDD0–U+FDEF, or any code point whose low
+ * sixteen bits are FFFE or FFFF). They are defined on valid input only.
  *
  *  - mbstring:  `mb_check_encoding()` / `mb_scrub()` (maximal subpart
  *               since PHP 8.1.6), `mb_convert_encoding()` for the
@@ -63,6 +66,9 @@ class Oracles {
 	/** @var array<string, bool> Decode oracles trusted on valid UTF-8 input only. */
 	private array $decode_valid_only = array();
 
+	/** @var array<string, callable(string): bool> Defined on valid UTF-8 input only. */
+	private array $noncharacters = array();
+
 	/** @var ExternalOracle[] */
 	private array $externals = array();
 
@@ -110,6 +116,36 @@ public static function build( array $external_names ): self {
 			};
 		}
 
+		if ( function_exists( 'mb_str_split' ) && function_exists( 'mb_ord' ) ) {
+			/*
+			 * Trivial decode-and-test reference for noncharacter detection,
+			 * independent of both implementations under test (the PCRE
+			 * character-class regex and the `_wp_scan_utf8()`-based scan).
+			 * Callers must pass valid UTF-8.
+			 */
+			$oracles->noncharacters['mb'] = static function ( string $valid_utf8 ): bool {
+				foreach ( mb_str_split( $valid_utf8, 1, 'UTF-8' ) as $character ) {
+					$code_point = mb_ord( $character, 'UTF-8' );
+
+					// Fail loudly on contract violations: on ill-formed
+					// input `mb_ord()` returns false, which would otherwise
+					// coerce into "not a noncharacter" and silently mimic
+					// the fallback's skip-invalid-spans semantics.
+					if ( ! is_int( $code_point ) ) {
+						throw new \LogicException( 'noncharacter oracle requires valid UTF-8 input' );
+					}
+
+					if (
+						( $code_point >= 0xFDD0 && $code_point <= 0xFDEF ) ||
+						0xFFFE === ( $code_point & 0xFFFE )
+					) {
+						return true;
+					}
+				}
+				return false;
+			};
+		}
+
 		if ( function_exists( 'utf8_encode' ) && function_exists( 'utf8_decode' ) ) {
 			$oracles->encode['native'] = static function ( string $bytes ): string {
 				// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- Deprecated since PHP 8.2.
@@ -285,6 +321,52 @@ public static function decode_battery(): array {
 		);
 	}
 
+	/**
+	 * Known-answer vectors for the noncharacter oracles. All inputs are
+	 * valid UTF-8 (the question is only defined there) and cover the
+	 * boundaries AND interior of the U+FDD0–U+FDEF block plus the final
+	 * two code points of EVERY plane with their U+xFFFD neighbors — the
+	 * PCRE implementation under test enumerates each plane as a separate
+	 * hand-typed escape, exactly where a single-plane typo would hide.
+	 *
+	 * Expectations are hand-derived from the Unicode definition; bytes
+	 * for the looped vectors come from the pure-arithmetic
+	 * `Generator::encode_code_point()` (itself exhaustively verified
+	 * against `mb_chr()` by `tests/code-point-to-utf8-exhaustive.php`),
+	 * keeping the encoding independent of the mbstring-backed oracle.
+	 *
+	 * @return array<int, array{0: string, 1: bool}> [valid utf8 bytes, has noncharacters]
+	 */
+	public static function noncharacter_battery(): array {
+		$vectors = array(
+			array( '', false ),
+			array( 'abc', false ),
+			array( "\u{FDCF}", false ),       // Last code point before the contiguous block.
+			array( "\u{FDD0}", true ),        // First of the contiguous block.
+			array( "\u{FDDA}", true ),        // Interior of the block: a lookup-table bug
+			array( "\u{FDE5}", true ),        // is not necessarily a boundary bug.
+			array( "\u{FDEF}", true ),        // Last of the contiguous block.
+			array( "\u{FDF0}", false ),       // First code point after the block.
+			array( "\u{FEFF}", false ),       // BOM is not a noncharacter.
+			array( "\u{FFFD}", false ),       // Replacement character is not a noncharacter.
+			array( "\u{ABCD}", false ),       // Arbitrary interior scalar.
+			array( "a\u{FFFE}b", true ),      // Embedded in surrounding text.
+			array( "ascii only", false ),
+		);
+
+		// Both plane-final noncharacters and their lower neighbor, for
+		// all seventeen planes (0–16).
+		for ( $plane = 0; $plane <= 0x10; $plane++ ) {
+			$final = ( $plane << 16 ) | 0xFFFF;
+
+			$vectors[] = array( Generator::encode_code_point( $final - 2 ), false );
+			$vectors[] = array( Generator::encode_code_point( $final - 1 ), true );
+			$vectors[] = array( Generator::encode_code_point( $final ), true );
+		}
+
+		return $vectors;
+	}
+
 	private function verify_battery(): void {
 		foreach ( self::battery() as $i => $vector ) {
 			list( $bytes, $expected_valid, $expected_scrub ) = $vector;
@@ -333,6 +415,23 @@ private function verify_battery(): void {
 			}
 		}
 
+		foreach ( self::noncharacter_battery() as $i => $vector ) {
+			list( $bytes, $expected ) = $vector;
+
+			foreach ( $this->noncharacters as $name => $check ) {
+				$got = $check( $bytes );
+				if ( $got !== $expected ) {
+					$this->disable( $name, sprintf(
+						'noncharacter battery vector %d (%s): expected %s, got %s',
+						$i,
+						bin2hex( $bytes ),
+						var_export( $expected, true ),
+						var_export( $got, true )
+					) );
+				}
+			}
+		}
+
 		foreach ( self::decode_battery() as $i => $vector ) {
 			list( $bytes, $input_valid, $expected ) = $vector;
 
@@ -366,7 +465,8 @@ public function disable( string $name, string $detail ): void {
 			! isset( $this->validity[ $name ] ) &&
 			! isset( $this->scrub[ $name ] ) &&
 			! isset( $this->encode[ $name ] ) &&
-			! isset( $this->decode[ $name ] )
+			! isset( $this->decode[ $name ] ) &&
+			! isset( $this->noncharacters[ $name ] )
 		) {
 			return;
 		}
@@ -376,7 +476,8 @@ public function disable( string $name, string $detail ): void {
 			$this->scrub[ $name ],
 			$this->encode[ $name ],
 			$this->decode[ $name ],
-			$this->decode_valid_only[ $name ]
+			$this->decode_valid_only[ $name ],
+			$this->noncharacters[ $name ]
 		);
 		$this->events[] = array(
 			'type'   => 'oracle-disabled',
@@ -409,6 +510,11 @@ public function decode_oracle_is_valid_only( string $name ): bool {
 		return $this->decode_valid_only[ $name ] ?? false;
 	}
 
+	/** @return array<string, callable(string): bool> Defined on valid UTF-8 input only. */
+	public function noncharacter_oracles(): array {
+		return $this->noncharacters;
+	}
+
 	public function has_required(): bool {
 		return isset( $this->validity['mb'], $this->scrub['mb'] );
 	}
@@ -418,7 +524,8 @@ public function names(): array {
 			array_keys( $this->validity ),
 			array_keys( $this->scrub ),
 			array_keys( $this->encode ),
-			array_keys( $this->decode )
+			array_keys( $this->decode ),
+			array_keys( $this->noncharacters )
 		) ) );
 	}
 
diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php
index 6c4e3412e5776..19f6ce3d45bc9 100644
--- a/tools/encoding-fuzz/lib/Targets.php
+++ b/tools/encoding-fuzz/lib/Targets.php
@@ -9,10 +9,12 @@
  * be exercised end to end even while the real implementations are
  * healthy. It exists only for harness validation:
  *
- *   ENCODING_FUZZ_FAULT=accept-c0       validator accepts the 0xC0 byte
- *   ENCODING_FUZZ_FAULT=non-maximal     scrubber collapses adjacent U+FFFD
- *   ENCODING_FUZZ_FAULT=encode-cp1252   encoder maps 0x80 like Windows-1252
- *   ENCODING_FUZZ_FAULT=decode-per-byte decoder emits '?' per invalid byte
+ *   ENCODING_FUZZ_FAULT=accept-c0          validator accepts the 0xC0 byte
+ *   ENCODING_FUZZ_FAULT=non-maximal        scrubber collapses adjacent U+FFFD
+ *   ENCODING_FUZZ_FAULT=encode-cp1252      encoder maps 0x80 like Windows-1252
+ *   ENCODING_FUZZ_FAULT=decode-per-byte    decoder emits '?' per invalid byte
+ *   ENCODING_FUZZ_FAULT=nonchars-miss-fdd0 fallback detector misses U+FDD0–U+FDEF
+ *   ENCODING_FUZZ_FAULT=nonchars-overeager public detector also flags U+FDCF
  */
 class Targets {
 	/**
@@ -27,6 +29,8 @@ public static function resolve(): array {
 			'codepoint_count' => '_wp_utf8_codepoint_count',
 			'utf8_encode_fb'  => '_wp_utf8_encode_fallback',
 			'utf8_decode_fb'  => '_wp_utf8_decode_fallback',
+			'has_nonchars'    => 'wp_has_noncharacters',
+			'has_nonchars_fb' => '_wp_has_noncharacters_fallback',
 		);
 
 		switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) {
@@ -53,11 +57,37 @@ public static function resolve(): array {
 			case 'decode-per-byte':
 				$targets['utf8_decode_fb'] = self::decode_per_invalid_byte( ... );
 				break;
+
+			case 'nonchars-miss-fdd0':
+				$targets['has_nonchars_fb'] = self::nonchars_missing_fdd0_block( ... );
+				break;
+
+			case 'nonchars-overeager':
+				$targets['has_nonchars'] = self::nonchars_overeager( ... );
+				break;
 		}
 
 		return $targets;
 	}
 
+	/**
+	 * Deliberately broken detector: finds only the plane-final
+	 * noncharacters, missing the contiguous U+FDD0–U+FDEF block — a
+	 * plausible spec misreading.
+	 */
+	public static function nonchars_missing_fdd0_block( string $text ): bool {
+		$stripped = (string) preg_replace( '/[\x{FDD0}-\x{FDEF}]/u', '', $text );
+		return _wp_has_noncharacters_fallback( $stripped );
+	}
+
+	/**
+	 * Deliberately broken detector: also flags U+FDCF, the code point
+	 * just below the contiguous noncharacter block.
+	 */
+	public static function nonchars_overeager( string $text ): bool {
+		return wp_has_noncharacters( $text ) || str_contains( $text, "\u{FDCF}" );
+	}
+
 	/**
 	 * Deliberately broken decoder: emits one '?' for every byte of an
 	 * invalid span instead of one per maximal subpart, so multi-byte
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
index 59b89327be408..965c1743b03ae 100644
--- a/tools/encoding-fuzz/tests/harness-smoke.php
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -58,7 +58,8 @@ function check( string $label, bool $ok, string $detail = '' ): void {
 $battery_vectors = array_merge(
 	array_column( Oracles::battery(), 0 ),
 	array_column( Oracles::encode_battery(), 0 ),
-	array_column( Oracles::decode_battery(), 0 )
+	array_column( Oracles::decode_battery(), 0 ),
+	array_column( Oracles::noncharacter_battery(), 0 )
 );
 foreach ( $battery_vectors as $i => $bytes ) {
 	foreach ( $checks->run( $bytes ) as $failure ) {
@@ -67,6 +68,29 @@ function check( string $label, bool $ok, string $detail = '' ): void {
 }
 check( 'real targets clean on battery', array() === $battery_fails, implode( '; ', $battery_fails ) );
 
+/*
+ * Documented stance: `wp_has_noncharacters()` is undefined on ill-formed
+ * input. On hosts with PCRE-u the public function answers false on ANY
+ * ill-formed input (`preg_match` fails) while the fallback skips invalid
+ * spans and reports the noncharacters around them. This regression
+ * vector pins the divergence; if it ever changes, the semantics were
+ * touched and the valid-input-only fuzzing policy must be revisited.
+ */
+$nonchar_probe = "\xC0\xEF\xBF\xBE"; // Invalid byte, then U+FFFE.
+if ( _wp_can_use_pcre_u() ) {
+	check(
+		'documented wp_has_noncharacters divergence on ill-formed input unchanged',
+		false === wp_has_noncharacters( $nonchar_probe ) && true === _wp_has_noncharacters_fallback( $nonchar_probe ),
+		sprintf(
+			'public: %s, fallback: %s',
+			var_export( wp_has_noncharacters( $nonchar_probe ), true ),
+			var_export( _wp_has_noncharacters_fallback( $nonchar_probe ), true )
+		)
+	);
+} else {
+	echo "SKIP documented wp_has_noncharacters divergence (no PCRE-u: public function aliases the fallback)\n";
+}
+
 // ---------------------------------------------------------------------
 // 3. Broken implementations must be caught.
 // ---------------------------------------------------------------------
@@ -78,6 +102,8 @@ function check( string $label, bool $ok, string $detail = '' ): void {
 	'codepoint_count' => '_wp_utf8_codepoint_count',
 	'utf8_encode_fb'  => '_wp_utf8_encode_fallback',
 	'utf8_decode_fb'  => '_wp_utf8_decode_fallback',
+	'has_nonchars'    => 'wp_has_noncharacters',
+	'has_nonchars_fb' => '_wp_has_noncharacters_fallback',
 );
 
 /**
@@ -197,6 +223,25 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 ) );
 check( 'catches sometimes-null decoder', in_array( 'target-bad-return', $seen, true ), implode( ',', $seen ) );
 
+// 3o. Noncharacter detector that never finds anything.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'has_nonchars_fb' => static fn( string $text ): bool => false,
+) );
+check( 'catches blind noncharacter detector', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3p. Detector that misses the contiguous U+FDD0–U+FDEF block (the
+//     plane-final pairs alone are a plausible spec misreading).
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'has_nonchars_fb' => Targets::nonchars_missing_fdd0_block( ... ),
+) );
+check( 'catches detector missing U+FDD0 block', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3q. Over-eager detector that flags U+FDCF, just below the block.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'has_nonchars' => Targets::nonchars_overeager( ... ),
+) );
+check( 'catches over-eager noncharacter detector', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) );
+
 // ---------------------------------------------------------------------
 // 4. Generator determinism and mix.
 // ---------------------------------------------------------------------

From a7df034e4b357a4a2a4a7f380d7e09961601152d Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Wed, 10 Jun 2026 23:14:14 +0200
Subject: [PATCH 05/14] Fuzzer: Exhaustively test
 WP_HTML_Decoder::code_point_to_utf8_bytes().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function's domain (~1.1M code points) is small enough to test
completely instead of fuzzing. The new standalone script checks every
code point 0x0-0x10FFFF plus out-of-range probes against the fuzzer's
pure-arithmetic encoder (the independent oracle), with an explicit
mb_chr( $cp, 'UTF-8' ) consistency cross-check; surrogates and
out-of-range values must yield U+FFFD. Runs in ~0.4s. The harness
smoke test executes it and proves its detection fires via the
script-local ENCODING_FUZZ_FAULT=codepoint-surrogate-qmark variant.

Documents an upstream regression (pinned as a labeled KNOWN ISSUE
check so the stance cannot silently go stale): since [62424] (#65342,
unreleased) the implementation calls mb_chr() without an explicit
encoding, inheriting mb_internal_encoding() — which WordPress sets
from blog_charset — so non-UTF-8 sites get raw legacy bytes for
mappable code points while invalid ones still yield UTF-8 U+FFFD,
contradicting the docblock and mixing encodings with the named
character reference path. The 6.6.0 original was pure arithmetic and
always emitted UTF-8; [62424] also changed the output for code point 0
from U+FFFD to NUL. One-line upstream fix: mb_chr( $code_point, 'UTF-8' ).

Closes out the extend-encoding-fuzzer handoff: all three sections
done, definition of done verified and recorded in the handoff doc.
---
 handoffs/extend-encoding-fuzzer.md            |  94 ++++++++---
 tools/encoding-fuzz/README.md                 |  26 +++
 .../tests/code-point-to-utf8-exhaustive.php   | 159 ++++++++++++++++++
 tools/encoding-fuzz/tests/harness-smoke.php   |  12 ++
 4 files changed, 266 insertions(+), 25 deletions(-)
 create mode 100644 tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php

diff --git a/handoffs/extend-encoding-fuzzer.md b/handoffs/extend-encoding-fuzzer.md
index 29e7effe5ef82..8940e08c60098 100644
--- a/handoffs/extend-encoding-fuzzer.md
+++ b/handoffs/extend-encoding-fuzzer.md
@@ -2,15 +2,15 @@
 
 ## Status
 
-Sections 1 (utf8_encode/decode) and 2 (wp_has_noncharacters) DONE;
-section 3 in progress. The
-host fuzzer (`tools/encoding-fuzz/`) is complete and working on branch
-`fuzz-encoder`; read its `README.md` first. ~570k cases had run clean
-against the original targets before this work started.
+All three sections DONE. The host fuzzer (`tools/encoding-fuzz/`) is
+complete and working on branch `fuzz-encoder`; read its `README.md`
+first. ~570k cases had run clean against the original targets before
+this work started.
 
 ## Goal
 
-Round out coverage of `src/wp-includes/compat-utf8.php` by adding:
+Round out coverage of `src/wp-includes/compat-utf8.php` (plus one
+html-api encoder) by adding:
 
 1. `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()`
    differentials against the native `utf8_encode()` / `utf8_decode()`.
@@ -92,30 +92,74 @@ reference oracle and its battery must be extended for ill-formed input
 too — removing the valid-only gate alone is NOT sufficient, since the
 reference throws on ill-formed input by design.
 
-## 3. code_point_to_utf8_bytes — exhaust, don't fuzz
-
-`WP_HTML_Decoder::code_point_to_utf8_bytes()`
-(`src/wp-includes/html-api/class-wp-html-decoder.php:426`) has a domain
-of ~1.1M values. Write a standalone script (or slow-group PHPUnit test)
-asserting equality with `mb_chr( $cp, 'UTF-8' )` for every code point
-0x0–0x10FFFF, including expected behavior for surrogates and
-out-of-range values (check what the function documents; `mb_chr`
-returns `false` for surrogates — decide the comparison accordingly).
-Runs in seconds; total coverage; done forever. Note this class is
-loaded from `html-api/`, so the fuzzer bootstrap (`lib/Bootstrap.php`)
-needs to require it (it has no dependencies beyond the token map — if
-it pulls more, load only for this check).
+## 3. code_point_to_utf8_bytes — DONE; upstream finding documented
+
+Implemented as `tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php`
+(standalone, not wired into `Bootstrap.php` — the class is required
+only by this script, which parses cleanly with no other dependencies;
+loading html-api code into every fuzz worker would buy nothing).
+Every code point 0x0–0x10FFFF plus out-of-range probes, compared
+against the pure-arithmetic `Generator::encode_code_point()` (the
+independent oracle) with an additional `mb_chr( $cp, 'UTF-8' )`
+consistency cross-check (the implementation is itself mb_chr-backed;
+the cross-check would expose a bug shared between implementation and
+arithmetic encoder). Surrogates and out-of-range values yield U+FFFD
+as documented. Runs in ~0.4s, passes on PHP 8.4.21. The harness smoke
+test executes it and proves its detection fires via the
+`ENCODING_FUZZ_FAULT=codepoint-surrogate-qmark` broken variant.
+
+**Upstream finding (real bug — an unreleased trunk REGRESSION, not
+fixed here):** the implementation is `mb_chr( $code_point )` with NO
+explicit encoding, so it inherits `mb_internal_encoding()` — which
+WordPress sets from `blog_charset` (`wp_set_internal_encoding()`,
+`src/wp-includes/load.php`). On a non-UTF-8 site it returns raw legacy
+bytes for mappable code points (e.g. `"\xE9"` for U+00E9 under
+ISO-8859-1) while still returning UTF-8 U+FFFD for invalid ones,
+contradicting its docblock. Aggravating facts for the upstream report:
+
+- Introduced by [62424] (#65342, `@since 7.1.0`, unreleased): the
+  6.6.0 original was a pure-arithmetic encoder that always emitted
+  UTF-8 regardless of mbstring state. Fix-before-release territory.
+- WP's own `_mb_chr()` polyfill in `compat.php` documents
+  `@param "UTF-8"|null $encoding Must be 'UTF-8' or null` and treats
+  null as UTF-8 — so mbstring-less hosts always emit UTF-8 while
+  mbstring hosts follow `blog_charset`. Same WordPress, divergent
+  output by extension presence.
+- Named character references decode through the UTF-8 token map
+  regardless: on a latin1 site `&eacute;` → UTF-8 `C3 A9` but
+  `&#233;` → latin1 `E9` in the same decoded string. There is no
+  intentional-behavior steelman; output is mixed-encoding either way.
+- The same commit silently changed `code_point_to_utf8_bytes( 0 )`
+  from `U+FFFD` to `"\0"` (the old guard was `$code_point <= 0`).
+  Callers are unaffected (`&#0;` is intercepted earlier) and the new
+  behavior matches the docblock, but it belongs in the same report.
+
+One-line fix: `mb_chr( $code_point, 'UTF-8' )`. The script pins the
+current buggy behavior as a labeled KNOWN ISSUE check so the stance
+cannot silently go stale; update or remove the pin when fixed.
 
 ## Verification / definition of done
 
+All verified 2026-06-10 on PHP 8.4.21:
+
 - `php tools/encoding-fuzz/tests/harness-smoke.php` passes, including
-  new broken-variant detections for every added check.
-- A fault-injection variant per new target in `lib/Targets.php`
-  (`ENCODING_FUZZ_FAULT=...`) exercises worker → replay → minimize end
-  to end.
+  broken-variant detections for every added check (seventeen mutation
+  classes plus the exhaustive script's surrogate fault).
+- Fault-injection variants per new target
+  (`ENCODING_FUZZ_FAULT=encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager`)
+  exercised worker → replay → minimize end to end; artifacts now record
+  the fault name and `pcre_u` in environment metadata. The script-local
+  `codepoint-surrogate-qmark` fault is proven via the smoke test's
+  subprocess run (the exhaustive script never enters the worker
+  pipeline).
 - `php tools/encoding-fuzz/runner.php --lanes 4 --duration-seconds 60`
-  runs clean (or findings are triaged and documented, not silenced).
-- README.md oracle/check tables updated.
+  ran clean (32,000 cases, 0 failures, 0 stalled, final tree).
+  Findings that were triaged and documented rather than silenced:
+  the legacy `utf8_decode()` divergence (§1), the
+  `wp_has_noncharacters()` ill-formed-input divergence (§2), the
+  `code_point_to_utf8_bytes()` internal-encoding regression, and
+  the #63863 test bug (§§1, 3).
+- README.md oracle/check tables updated (Encode/Decode/Nonchars).
 
 ## Gotchas inherited from the existing harness
 
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index 23b9101ca51f4..28932785a653b 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -210,3 +210,29 @@ ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/minimize.php --failure .
 
 (The `non-maximal` fault minimizes to the two bytes `E0 F4`: two
 adjacent maximal subparts whose replacement characters get collapsed.)
+
+## One-Shot Exhaustive Tests
+
+```sh
+php tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php
+```
+
+`WP_HTML_Decoder::code_point_to_utf8_bytes()` has a domain small
+enough (~1.1M code points) to test completely instead of fuzzing: every
+code point 0x0–0x10FFFF plus out-of-range probes. The independent
+oracle is the fuzzer's pure-arithmetic `Generator::encode_code_point()`;
+a second comparison against `mb_chr( $cp, 'UTF-8' )` is a consistency
+cross-check (the implementation is itself mb_chr-backed) that would
+expose a bug shared between the implementation and the arithmetic
+encoder. Surrogates and out-of-range values must yield U+FFFD. Runs in
+under a second; exit codes `0`/`1`/`2` like everything else. The smoke
+test runs it and proves its detection fires via
+`ENCODING_FUZZ_FAULT=codepoint-surrogate-qmark`.
+
+The script also pins a known upstream issue: since [62424] (#65342,
+unreleased) the implementation calls `mb_chr()` without an explicit
+encoding, so under a non-UTF-8 `mb_internal_encoding()` (WordPress
+sets it from `blog_charset`) it returns raw legacy bytes for mappable
+code points while still returning UTF-8 U+FFFD for invalid ones —
+contradicting its docblock. The pin fails when the upstream behavior
+changes, so the documented stance cannot silently go stale.
diff --git a/tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php b/tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php
new file mode 100644
index 0000000000000..0ce48190c5f2e
--- /dev/null
+++ b/tools/encoding-fuzz/tests/code-point-to-utf8-exhaustive.php
@@ -0,0 +1,159 @@
+<?php
+/**
+ * One-shot exhaustive test of `WP_HTML_Decoder::code_point_to_utf8_bytes()`.
+ *
+ * The function's domain is small enough (code points 0x0–0x10FFFF plus a
+ * handful of out-of-range probes) to test completely instead of fuzzing:
+ * total coverage in a few seconds, done forever.
+ *
+ * Documented contract (see the method docblock): a Unicode scalar value
+ * encodes to its UTF-8 byte sequence; surrogates and out-of-range values
+ * yield the replacement character U+FFFD.
+ *
+ * The genuinely independent oracle is `Generator::encode_code_point()`,
+ * the fuzzer's pure-arithmetic UTF-8 encoder (no mbstring involvement).
+ * A second comparison against `mb_chr( $cp, 'UTF-8' )` is a consistency
+ * cross-check, NOT an independent oracle — the implementation is itself
+ * mb_chr-backed — but it guards the arithmetic oracle and would expose
+ * a bug shared between the implementation and the arithmetic encoder.
+ *
+ * `ENCODING_FUZZ_FAULT=codepoint-surrogate-qmark` injects a broken
+ * variant (surrogates yield '?' instead of U+FFFD) so the harness smoke
+ * test can prove this script's detection actually fires.
+ *
+ * Known caveat, asserted below: the implementation calls `mb_chr()`
+ * WITHOUT an explicit encoding, so it inherits `mb_internal_encoding()`.
+ * WordPress sets that from `blog_charset`, so on a non-UTF-8 site the
+ * method can return non-UTF-8 bytes (e.g. `"\xE9"` for U+00E9 under
+ * ISO-8859-1) despite its documented contract. This script pins the
+ * internal encoding to UTF-8 for the exhaustive sweep, then demonstrates
+ * the sensitivity as a separate documented finding.
+ *
+ * Exit codes: 0 pass, 1 findings, 2 harness error.
+ */
+
+namespace EncodingFuzz;
+
+require __DIR__ . '/../lib/autoload.php';
+
+error_reporting( E_ALL );
+ini_set( 'display_errors', 'stderr' );
+
+require Bootstrap::repo_root() . '/src/wp-includes/html-api/class-wp-html-decoder.php';
+
+$failed = 0;
+
+function check( string $label, bool $ok, string $detail = '' ): void {
+	global $failed;
+	if ( $ok ) {
+		echo "PASS {$label}\n";
+	} else {
+		++$failed;
+		echo "FAIL {$label}" . ( '' !== $detail ? ": {$detail}" : '' ) . "\n";
+	}
+}
+
+if ( ! function_exists( 'mb_chr' ) ) {
+	fwrite( STDERR, "mbstring is required (both by this test's cross-check and by the implementation itself).\n" );
+	exit( 2 );
+}
+
+$previous_encoding = mb_internal_encoding();
+mb_internal_encoding( 'UTF-8' );
+
+// Deliberately broken variant so the smoke test can prove detection fires.
+$encode = static fn( int $cp ): string => \WP_HTML_Decoder::code_point_to_utf8_bytes( $cp );
+if ( 'codepoint-surrogate-qmark' === getenv( 'ENCODING_FUZZ_FAULT' ) ) {
+	$encode = static fn( int $cp ): string => ( $cp >= 0xD800 && $cp <= 0xDFFF )
+		? '?'
+		: \WP_HTML_Decoder::code_point_to_utf8_bytes( $cp );
+}
+
+// ---------------------------------------------------------------------
+// 1. Exhaustive sweep over the entire code point domain.
+// ---------------------------------------------------------------------
+$replacement      = "\u{FFFD}";
+$mismatches       = array();
+$mismatch_count   = 0;
+$oracle_conflicts = array();
+$conflict_count   = 0;
+
+for ( $cp = 0; $cp <= 0x10FFFF; $cp++ ) {
+	$is_surrogate = $cp >= 0xD800 && $cp <= 0xDFFF;
+	$expected     = $is_surrogate ? $replacement : Generator::encode_code_point( $cp );
+	$got          = $encode( $cp );
+
+	if ( $got !== $expected ) {
+		++$mismatch_count;
+		if ( count( $mismatches ) < 10 ) {
+			$mismatches[] = sprintf( 'U+%04X: expected %s, got %s', $cp, bin2hex( $expected ), bin2hex( $got ) );
+		}
+	}
+
+	// Cross-check the arithmetic oracle against mb_chr: `mb_chr()` returns
+	// false exactly for surrogates, and the arithmetic encoder must match
+	// it everywhere else (this would expose a bug shared between the
+	// mb_chr-backed implementation and the arithmetic encoder).
+	$mb = mb_chr( $cp, 'UTF-8' );
+	if ( $is_surrogate ? false !== $mb : $mb !== $expected ) {
+		++$conflict_count;
+		if ( count( $oracle_conflicts ) < 10 ) {
+			$oracle_conflicts[] = sprintf(
+				'U+%04X: arithmetic %s, mb_chr %s',
+				$cp,
+				bin2hex( $expected ),
+				is_string( $mb ) ? bin2hex( $mb ) : var_export( $mb, true )
+			);
+		}
+	}
+}
+
+check(
+	'all 1,114,112 code points encode correctly (surrogates → U+FFFD)',
+	0 === $mismatch_count,
+	"{$mismatch_count} mismatches, first " . count( $mismatches ) . ': ' . implode( '; ', $mismatches )
+);
+check(
+	'arithmetic oracle and mb_chr agree on the whole domain',
+	0 === $conflict_count,
+	"{$conflict_count} conflicts, first " . count( $oracle_conflicts ) . ': ' . implode( '; ', $oracle_conflicts )
+);
+
+// ---------------------------------------------------------------------
+// 2. Out-of-range values must yield the replacement character.
+// ---------------------------------------------------------------------
+$out_of_range_fails = array();
+foreach ( array( -1, -0xE9, PHP_INT_MIN, 0x110000, 0x7FFFFFFF, PHP_INT_MAX ) as $cp ) {
+	$got = $encode( $cp );
+	if ( $replacement !== $got ) {
+		$out_of_range_fails[] = sprintf( '%d: got %s', $cp, bin2hex( $got ) );
+	}
+}
+check( 'out-of-range values yield U+FFFD', array() === $out_of_range_fails, implode( '; ', $out_of_range_fails ) );
+
+// ---------------------------------------------------------------------
+// 3. Documented finding: sensitivity to `mb_internal_encoding()`.
+//
+// Does not assert the documented contract: it pins the CURRENT (arguably
+// buggy) behavior and FAILS once that changes. Under a non-UTF-8 internal
+// encoding the method returns non-UTF-8 bytes, contradicting its
+// docblock. Fix would be `mb_chr( $code_point, 'UTF-8' )`.
+// ---------------------------------------------------------------------
+mb_internal_encoding( 'ISO-8859-1' );
+$latin1_e9   = \WP_HTML_Decoder::code_point_to_utf8_bytes( 0xE9 );
+$latin1_d800 = \WP_HTML_Decoder::code_point_to_utf8_bytes( 0xD800 );
+mb_internal_encoding( 'UTF-8' );
+
+check(
+	'KNOWN ISSUE pin: mb_internal_encoding sensitivity unchanged (a FAIL here means upstream behavior changed — update or remove this pin)',
+	"\xE9" === $latin1_e9 && $replacement === $latin1_d800,
+	sprintf( 'U+00E9 → %s, U+D800 → %s', bin2hex( $latin1_e9 ), bin2hex( $latin1_d800 ) )
+);
+echo "NOTE  code_point_to_utf8_bytes() inherits mb_internal_encoding(); under ISO-8859-1 it returns raw latin1 bytes\n";
+echo "NOTE  for mappable code points while still returning UTF-8 U+FFFD for invalid ones. WordPress sets the internal\n";
+echo "NOTE  encoding from blog_charset, so non-UTF-8 sites are affected. Suggested fix: mb_chr( \$code_point, 'UTF-8' ).\n";
+
+mb_internal_encoding( $previous_encoding );
+
+echo $failed > 0 ? "\n{$failed} check(s) FAILED\n" : "\nAll checks passed\n";
+exit( $failed > 0 ? 1 : 0 );
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
index 965c1743b03ae..e650738be9ed3 100644
--- a/tools/encoding-fuzz/tests/harness-smoke.php
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -282,6 +282,18 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 }
 check( '300-case fuzz run clean (real findings would also surface here)', 0 === $fuzz_failures );
 
+// ---------------------------------------------------------------------
+// 6. One-shot exhaustive companion test: must pass, and its detection
+//    must provably fire (same mutation-testing rule as everything else).
+// ---------------------------------------------------------------------
+$exhaustive = escapeshellarg( PHP_BINARY ) . ' ' . escapeshellarg( __DIR__ . '/code-point-to-utf8-exhaustive.php' );
+
+exec( "{$exhaustive} 2>&1", $exh_output, $exh_code );
+check( 'code-point-to-utf8 exhaustive test passes', 0 === $exh_code, implode( ' | ', array_slice( $exh_output, -3 ) ) );
+
+exec( "ENCODING_FUZZ_FAULT=codepoint-surrogate-qmark {$exhaustive} 2>&1", $exh_fault_output, $exh_fault_code );
+check( 'exhaustive test catches broken surrogate handling', 1 === $exh_fault_code, "exit {$exh_fault_code}" );
+
 $oracles->shutdown();
 
 echo $failed > 0 ? "\n{$failed} smoke check(s) FAILED\n" : "\nAll smoke checks passed\n";

From 9d15731f8f45eb4fc62b1281ddf891135d83d3dd Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Thu, 11 Jun 2026 12:59:59 +0200
Subject: [PATCH 06/14] Expand encoding fuzzer for mb chr ord

---
 tools/encoding-fuzz/README.md               |  15 +-
 tools/encoding-fuzz/lib/Bootstrap.php       |  73 ++++-
 tools/encoding-fuzz/lib/Checks.php          | 339 ++++++++++++++++++++
 tools/encoding-fuzz/lib/Oracles.php         |  10 +-
 tools/encoding-fuzz/lib/Targets.php         |   2 +
 tools/encoding-fuzz/tests/harness-smoke.php |  14 +
 6 files changed, 441 insertions(+), 12 deletions(-)

diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index 28932785a653b..0d196f2a1241f 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -6,6 +6,7 @@ Differential fuzzer for the WordPress UTF-8 functions:
 - `wp_scrub_utf8()` / `_wp_scrub_utf8_fallback()`
 - `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()`
 - `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` (valid input only)
+- `_mb_chr()` / `_mb_ord()`
 - `_wp_utf8_codepoint_count()` and the resumable `_wp_scan_utf8()` paths (secondary)
 
 The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main
@@ -112,6 +113,13 @@ Internal invariants:
 - `_wp_utf8_encode_fallback()` output is always valid UTF-8
 - `_wp_utf8_decode_fallback( _wp_utf8_encode_fallback( $s ) ) === $s`
   for any byte string `$s` (encode is total and injective per byte)
+- `_mb_chr()` matches the fuzzer's independent arithmetic UTF-8 encoder
+  for valid scalar values and returns false for invalid code points
+- `_mb_ord()` matches an independent first-code-point decoder on arbitrary
+  byte strings and returns false when the first code point is ill-formed
+- `_mb_ord( _mb_chr( $cp ) ) === $cp` for valid scalar values, and
+  `_mb_chr( _mb_ord( $s ) )` reconstructs the first UTF-8 character in
+  `$s` when it is well-formed
 
 ## Inputs
 
@@ -185,7 +193,7 @@ php tools/encoding-fuzz/tests/harness-smoke.php
 ```
 
 Verifies the oracle battery, runs the real targets over the battery
-vectors, and — most importantly — mutation-tests the harness: seventeen
+vectors, and — most importantly — mutation-tests the harness: nineteen
 classes of deliberately broken implementations (validator accepting
 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber,
 identity scrubber, byte-dropping scrubber, off-by-one code point count,
@@ -193,8 +201,9 @@ throwing target, cp1252-confused encoder, identity encoder, per-byte
 decoder, valid-input-mangling decoder, round-trip-violating decoder,
 null-returning encoder, sometimes-null decoder, blind noncharacter
 detector, U+FDD0-block-missing detector, over-eager noncharacter
-detector) must all be caught. It also asserts generator determinism,
-the valid/invalid input mix, and the documented
+detector, cp1252-confused `_mb_chr()`, invalid-accepting `_mb_ord()`)
+must all be caught. It also asserts generator determinism, the
+valid/invalid input mix, and the documented
 `wp_has_noncharacters()` divergence stance on ill-formed input.
 
 For end-to-end pipeline testing while the real implementations are
diff --git a/tools/encoding-fuzz/lib/Bootstrap.php b/tools/encoding-fuzz/lib/Bootstrap.php
index e92921dcf272d..be54f7aa5ba4f 100644
--- a/tools/encoding-fuzz/lib/Bootstrap.php
+++ b/tools/encoding-fuzz/lib/Bootstrap.php
@@ -4,10 +4,9 @@
 /**
  * Loads the WordPress UTF-8 functions under test into a bare PHP process.
  *
- * Only `compat-utf8.php` and `utf8.php` are loaded. `utf8.php` calls
- * `_wp_can_use_pcre_u()` at load time, which normally lives in
- * `compat.php`; a minimal stand-in from `wp-stubs.php` covers it so the
- * rest of WordPress stays out of the fuzzer process.
+ * Only the UTF-8 files under test (plus `wp-stubs.php`) are loaded.
+ * `_mb_chr()` and `_mb_ord()` live in `compat.php`, so their definitions
+ * are extracted from that file and eval'd without loading the rest of it.
  */
 class Bootstrap {
 	public static function repo_root(): string {
@@ -19,10 +18,10 @@ public static function load_targets(): void {
 			return;
 		}
 
-		require_once __DIR__ . '/wp-stubs.php';
-
 		$root = self::repo_root();
+		require_once __DIR__ . '/wp-stubs.php';
 		require_once $root . '/src/wp-includes/compat-utf8.php';
+		self::load_compat_functions( $root . '/src/wp-includes/compat.php', array( '_mb_chr', '_mb_ord' ) );
 		require_once $root . '/src/wp-includes/utf8.php';
 
 		/*
@@ -33,4 +32,66 @@ public static function load_targets(): void {
 			mb_substitute_character( 0xFFFD );
 		}
 	}
+
+	/**
+	 * Loads selected top-level function definitions from `compat.php`.
+	 *
+	 * The full file has unrelated bootstrap assumptions (for example,
+	 * sodium and deprecation helpers). The fuzzer only needs these
+	 * private UTF-8 polyfills, and evaluating the source definitions keeps
+	 * the tested code tied to WordPress without widening the harness.
+	 *
+	 * @param string   $path      Source file path.
+	 * @param string[] $functions Function names to load.
+	 */
+	private static function load_compat_functions( string $path, array $functions ): void {
+		$source = file_get_contents( $path );
+		if ( false === $source ) {
+			throw new \RuntimeException( "Unable to read {$path}" );
+		}
+
+		foreach ( $functions as $function_name ) {
+			if ( function_exists( $function_name ) ) {
+				continue;
+			}
+
+			eval( self::extract_function_definition( $source, $function_name ) );
+		}
+	}
+
+	private static function extract_function_definition( string $source, string $function_name ): string {
+		$pattern = '/function\s+' . preg_quote( $function_name, '/' ) . '\s*\(/';
+		if ( 1 !== preg_match( $pattern, $source, $match, PREG_OFFSET_CAPTURE ) ) {
+			throw new \RuntimeException( "Unable to find function {$function_name}" );
+		}
+		// Returns the definition's source text, from `function` through its closing brace.
+		$tokens    = token_get_all( '<?php ' . substr( $source, $match[0][1] ) );
+		$code      = '';
+		$depth     = 0;
+		$in_body   = false;
+		$skip_open = true;
+
+		foreach ( $tokens as $token ) {
+			$text = is_array( $token ) ? $token[1] : $token;
+			// The first token is the synthetic open tag that was prepended for the tokenizer.
+			if ( $skip_open ) {
+				$skip_open = false;
+				continue;
+			}
+
+			$code .= $text;
+			// Count structural braces only: string content equal to `{`/`}` is an array token; `{$`/`${` are closed by a bare `}`.
+			if ( '{' === $token || ( is_array( $token ) && ( T_CURLY_OPEN === $token[0] || T_DOLLAR_OPEN_CURLY_BRACES === $token[0] ) ) ) {
+				++$depth;
+				$in_body = true;
+			} elseif ( '}' === $token && $in_body ) {
+				--$depth;
+				if ( 0 === $depth ) {
+					return $code;
+				}
+			}
+		}
+
+		throw new \RuntimeException( "Unable to close function body for {$function_name}" );
+	}
 }
diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php
index 14bffa042acdd..6b8bb94b07cae 100644
--- a/tools/encoding-fuzz/lib/Checks.php
+++ b/tools/encoding-fuzz/lib/Checks.php
@@ -36,12 +36,45 @@
  *  - `decode(encode(s)) === s` for any byte string `s` (encode is total
  *    and injective per byte)
  *
+ * Character/code point polyfills:
+ *  - `_mb_chr()` against the independent arithmetic UTF-8 encoder for
+ *    valid scalar values, and false for invalid code points.
+ *  - `_mb_ord()` against an independent first-code-point decoder on
+ *    arbitrary byte strings.
+ *  - `_mb_ord( _mb_chr( cp ) ) === cp` and
+ *    `_mb_chr( _mb_ord( s ) ) === first UTF-8 character in s` where
+ *    those expressions are defined.
+ *
  * Target callables are injectable so the harness smoke test can verify
  * that deliberately broken implementations are caught.
  */
 class Checks {
 	public const PREVIEW_BYTES = 48;
 
+	private const MB_CHR_CODE_POINT_PROBES = array(
+		-1,
+		0x00,
+		0x01,
+		0x7F,
+		0x80,
+		0x7FF,
+		0x800,
+		0xD7FF,
+		0xD800,
+		0xDFFF,
+		0xE000,
+		0xFDCF,
+		0xFDD0,
+		0xFDEF,
+		0xFDF0,
+		0xFFFD,
+		0xFFFE,
+		0xFFFF,
+		0x10000,
+		0x10FFFF,
+		0x110000,
+	);
+
 	private Oracles $oracles;
 
 	/** @var array<string, callable> */
@@ -264,6 +297,213 @@ public function run( string $input ): array {
 			$failures[] = $failure;
 		}
 
+		// 10. mb_chr()/mb_ord() polyfill differentials and isomorphisms.
+		foreach ( $this->check_mb_chr_ord( $input ) as $failure ) {
+			$failures[] = $failure;
+		}
+
+		return $failures;
+	}
+
+	/**
+	 * Tests `_mb_chr()` and `_mb_ord()` as partial inverses. The oracle for
+	 * `_mb_chr()` is the fuzzer's arithmetic UTF-8 encoder; the oracle for
+	 * `_mb_ord()` is an independent decoder for the first code point only.
+	 *
+	 * @return array<int, array{check: string, signature: string, detail: array}>
+	 */
+	private function check_mb_chr_ord( string $input ): array {
+		$failures = array();
+
+		if ( ! isset( $this->targets['mb_chr'], $this->targets['mb_ord'] ) ) {
+			return $failures;
+		}
+
+		list( $expected_ord, $prefix_length ) = self::first_code_point_or_false( $input );
+
+		try {
+			$actual_ord = ( $this->targets['mb_ord'] )( $input );
+		} catch ( \Throwable $error ) {
+			$failures[] = self::failure(
+				'target-exception',
+				'mb_ord',
+				array(
+					'target'  => 'mb_ord',
+					'message' => $error->getMessage(),
+					'class'   => get_class( $error ),
+				)
+			);
+			$actual_ord = false;
+		}
+
+		if ( ! is_int( $actual_ord ) && false !== $actual_ord ) {
+			$failures[] = self::failure(
+				'mb-ord-bad-return',
+				'mb_ord',
+				array(
+					'type' => get_debug_type( $actual_ord ),
+				)
+			);
+		} elseif ( $actual_ord !== $expected_ord ) {
+			$failures[] = self::failure(
+				'mb-ord-mismatch',
+				'mb_ord',
+				array(
+					'got'           => $actual_ord,
+					'expected'      => $expected_ord,
+					'input_preview' => self::preview( $input ),
+				)
+			);
+		}
+
+		if ( is_int( $expected_ord ) ) {
+			try {
+				$round_trip_chr = ( $this->targets['mb_chr'] )( $expected_ord );
+			} catch ( \Throwable $error ) {
+				$failures[] = self::failure(
+					'target-exception',
+					'mb_chr:from-ord',
+					array(
+						'target'  => 'mb_chr',
+						'message' => $error->getMessage(),
+						'class'   => get_class( $error ),
+					)
+				);
+				$round_trip_chr = false;
+			}
+
+			$expected_prefix = substr( $input, 0, $prefix_length );
+			if ( $round_trip_chr !== $expected_prefix ) {
+				$failures[] = self::failure(
+					'mb-ord-chr-isomorphism',
+					'mb_ord:mb_chr',
+					array(
+						'code_point'      => $expected_ord,
+						'expected_prefix' => self::preview( $expected_prefix ),
+						'got'             => is_string( $round_trip_chr ) ? self::preview( $round_trip_chr ) : $round_trip_chr,
+					)
+				);
+			}
+		}
+
+		foreach ( self::mb_chr_code_point_probes( $input ) as $code_point ) {
+			$expected_chr = self::expected_mb_chr( $code_point );
+
+			try {
+				$actual_chr = ( $this->targets['mb_chr'] )( $code_point );
+			} catch ( \Throwable $error ) {
+				$failures[] = self::failure(
+					'target-exception',
+					'mb_chr',
+					array(
+						'target'     => 'mb_chr',
+						'code_point' => $code_point,
+						'message'    => $error->getMessage(),
+						'class'      => get_class( $error ),
+					)
+				);
+				$actual_chr = false;
+			}
+
+			if ( ! is_string( $actual_chr ) && false !== $actual_chr ) {
+				$failures[] = self::failure(
+					'mb-chr-bad-return',
+					'mb_chr',
+					array(
+						'code_point' => $code_point,
+						'type'       => get_debug_type( $actual_chr ),
+					)
+				);
+				continue;
+			}
+
+			if ( $actual_chr !== $expected_chr ) {
+				$failures[] = self::failure(
+					'mb-chr-mismatch',
+					'mb_chr',
+					array(
+						'code_point' => $code_point,
+						'expected'   => is_string( $expected_chr ) ? self::preview( $expected_chr ) : $expected_chr,
+						'got'        => is_string( $actual_chr ) ? self::preview( $actual_chr ) : $actual_chr,
+					)
+				);
+				continue;
+			}
+
+			if ( is_string( $actual_chr ) ) {
+				try {
+					$round_trip_ord = ( $this->targets['mb_ord'] )( $actual_chr );
+				} catch ( \Throwable $error ) {
+					$failures[] = self::failure(
+						'target-exception',
+						'mb_ord:from-chr',
+						array(
+							'target'     => 'mb_ord',
+							'code_point' => $code_point,
+							'message'    => $error->getMessage(),
+							'class'      => get_class( $error ),
+						)
+					);
+					$round_trip_ord = false;
+				}
+
+				if ( $round_trip_ord !== $code_point ) {
+					$failures[] = self::failure(
+						'mb-chr-ord-isomorphism',
+						'mb_chr:mb_ord',
+						array(
+							'code_point' => $code_point,
+							'got'        => $round_trip_ord,
+						)
+					);
+				}
+			}
+		}
+
+		$contract_probes = array(
+			array( 'mb_chr', array( 0x41, 'UTF-8' ), 'A' ),
+			array( 'mb_chr', array( 0x41, 'latin1' ), false ),
+			array( 'mb_chr', array( 0x41, 'utf8' ), false ),
+			array( 'mb_chr', array( '65' ), false ),
+			array( 'mb_ord', array( 'A', 'UTF-8' ), 0x41 ),
+			array( 'mb_ord', array( 'A', 'latin1' ), false ),
+			array( 'mb_ord', array( 'A', 'utf8' ), false ),
+			array( 'mb_ord', array( '' ), false ),
+			array( 'mb_ord', array( 0x41 ), false ),
+		);
+
+		foreach ( $contract_probes as $probe ) {
+			list( $target, $args, $expected ) = $probe;
+
+			try {
+				$actual = ( $this->targets[ $target ] )( ...$args );
+			} catch ( \Throwable $error ) {
+				$failures[] = self::failure(
+					'target-exception',
+					"{$target}:contract",
+					array(
+						'target'  => $target,
+						'args'    => array_map( static fn( $arg ) => is_string( $arg ) ? self::preview( $arg ) : $arg, $args ),
+						'message' => $error->getMessage(),
+						'class'   => get_class( $error ),
+					)
+				);
+				continue;
+			}
+
+			if ( $actual !== $expected ) {
+				$failures[] = self::failure(
+					"{$target}-contract-mismatch",
+					$target,
+					array(
+						'args'     => array_map( static fn( $arg ) => is_string( $arg ) ? self::preview( $arg ) : $arg, $args ),
+						'expected' => is_string( $expected ) ? self::preview( $expected ) : $expected,
+						'got'      => is_string( $actual ) ? self::preview( $actual ) : $actual,
+					)
+				);
+			}
+		}
+
 		return $failures;
 	}
 
@@ -593,6 +833,105 @@ private function check_chunked_scan( string $input, string $ref_scrub ): ?array
 		return null;
 	}
 
+	private static function expected_mb_chr( int $code_point ) {
+		if (
+			$code_point < 0 ||
+			( $code_point >= 0xD800 && $code_point <= 0xDFFF ) ||
+			$code_point > 0x10FFFF
+		) {
+			return false;
+		}
+
+		return Generator::encode_code_point( $code_point );
+	}
+
+	private static function mb_chr_code_point_probes( string $input ): array {
+		$probes = self::MB_CHR_CODE_POINT_PROBES;
+		$hash   = hash( 'sha256', $input, true );
+
+		for ( $i = 0; $i < 4; $i++ ) {
+			$raw      = unpack( 'N', substr( $hash, 4 * $i, 4 ) )[1];
+			$probes[] = ( $raw % 0x120000 ) - 0x800;
+		}
+
+		return array_values( array_unique( $probes ) );
+	}
+
+	/**
+	 * @return array{0: int|false, 1: int} First code point and byte length.
+	 */
+	private static function first_code_point_or_false( string $bytes ): array {
+		$length = strlen( $bytes );
+		if ( 0 === $length ) {
+			return array( false, 0 );
+		}
+
+		$b1 = ord( $bytes[0] );
+		if ( $b1 <= 0x7F ) {
+			return array( $b1, 1 );
+		}
+
+		if ( $length < 2 ) {
+			return array( false, 0 );
+		}
+
+		$b2 = ord( $bytes[1] );
+		if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
+			return array(
+				( ( $b1 & 0x1F ) << 6 ) | ( $b2 & 0x3F ),
+				2,
+			);
+		}
+
+		if ( $length < 3 ) {
+			return array( false, 0 );
+		}
+
+		$b3 = ord( $bytes[2] );
+		if (
+			$b3 >= 0x80 &&
+			$b3 <= 0xBF &&
+			(
+				( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
+				( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+				( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
+				( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
+			)
+		) {
+			return array(
+				( ( $b1 & 0x0F ) << 12 ) | ( ( $b2 & 0x3F ) << 6 ) | ( $b3 & 0x3F ),
+				3,
+			);
+		}
+
+		if ( $length < 4 ) {
+			return array( false, 0 );
+		}
+
+		$b4 = ord( $bytes[3] );
+		if (
+			$b3 >= 0x80 &&
+			$b3 <= 0xBF &&
+			$b4 >= 0x80 &&
+			$b4 <= 0xBF &&
+			(
+				( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
+				( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+				( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
+			)
+		) {
+			return array(
+				( ( $b1 & 0x07 ) << 18 ) |
+				( ( $b2 & 0x3F ) << 12 ) |
+				( ( $b3 & 0x3F ) << 6 ) |
+				( $b4 & 0x3F ),
+				4,
+			);
+		}
+
+		return array( false, 0 );
+	}
+
 	private static function failure( string $check, string $party, array $detail ): array {
 		return array(
 			'check'     => $check,
diff --git a/tools/encoding-fuzz/lib/Oracles.php b/tools/encoding-fuzz/lib/Oracles.php
index 88a5df3b1ae41..27c8822c07b79 100644
--- a/tools/encoding-fuzz/lib/Oracles.php
+++ b/tools/encoding-fuzz/lib/Oracles.php
@@ -116,16 +116,20 @@ public static function build( array $external_names ): self {
 			};
 		}
 
-		if ( function_exists( 'mb_str_split' ) && function_exists( 'mb_ord' ) ) {
+		$mb_ord = function_exists( 'mb_ord' )
+			? 'mb_ord'
+			: ( function_exists( '_mb_ord' ) ? '_mb_ord' : null );
+
+		if ( function_exists( 'mb_str_split' ) && null !== $mb_ord ) {
 			/*
 			 * Trivial decode-and-test reference for noncharacter detection,
 			 * independent of both implementations under test (the PCRE
 			 * character-class regex and the `_wp_scan_utf8()`-based scan).
 			 * Callers must pass valid UTF-8.
 			 */
-			$oracles->noncharacters['mb'] = static function ( string $valid_utf8 ): bool {
+			$oracles->noncharacters['mb'] = static function ( string $valid_utf8 ) use ( $mb_ord ): bool {
 				foreach ( mb_str_split( $valid_utf8, 1, 'UTF-8' ) as $character ) {
-					$code_point = mb_ord( $character, 'UTF-8' );
+					$code_point = $mb_ord( $character, 'UTF-8' );
 
 					// Fail loudly on contract violations: on ill-formed
 					// input `mb_ord()` returns false, which would otherwise
diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php
index 19f6ce3d45bc9..29d5d415be34a 100644
--- a/tools/encoding-fuzz/lib/Targets.php
+++ b/tools/encoding-fuzz/lib/Targets.php
@@ -31,6 +31,8 @@ public static function resolve(): array {
 			'utf8_decode_fb'  => '_wp_utf8_decode_fallback',
 			'has_nonchars'    => 'wp_has_noncharacters',
 			'has_nonchars_fb' => '_wp_has_noncharacters_fallback',
+			'mb_chr'          => '_mb_chr',
+			'mb_ord'          => '_mb_ord',
 		);
 
 		switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) {
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
index e650738be9ed3..9cd6e1374ae61 100644
--- a/tools/encoding-fuzz/tests/harness-smoke.php
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -104,6 +104,8 @@ function check( string $label, bool $ok, string $detail = '' ): void {
 	'utf8_decode_fb'  => '_wp_utf8_decode_fallback',
 	'has_nonchars'    => 'wp_has_noncharacters',
 	'has_nonchars_fb' => '_wp_has_noncharacters_fallback',
+	'mb_chr'          => '_mb_chr',
+	'mb_ord'          => '_mb_ord',
 );
 
 /**
@@ -242,6 +244,18 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 ) );
 check( 'catches over-eager noncharacter detector', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) );
 
+// 3r. Character encoder that confuses U+0080 with Windows-1252's euro sign.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'mb_chr' => static fn( int $code_point ) => 0x80 === $code_point ? "\xE2\x82\xAC" : _mb_chr( $code_point ),
+) );
+check( 'catches cp1252-confused _mb_chr', in_array( 'mb-chr-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3s. Character decoder that accepts an invalid leading C0 byte.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'mb_ord' => static fn( string $bytes ) => str_starts_with( $bytes, "\xC0" ) ? 0 : _mb_ord( $bytes ),
+) );
+check( 'catches invalid-accepting _mb_ord', in_array( 'mb-ord-mismatch', $seen, true ), implode( ',', $seen ) );
+
 // ---------------------------------------------------------------------
 // 4. Generator determinism and mix.
 // ---------------------------------------------------------------------

From 6ea247f9da3414d65663320f2292090417f71258 Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:14:15 +0200
Subject: [PATCH 07/14] Add UTF-8 codepoint span fuzzing

---
 progress-handoff-xZOoEn.md                  |  33 ++
 tools/encoding-fuzz/README.md               |  22 +-
 tools/encoding-fuzz/lib/Checks.php          | 319 +++++++++++++++++++-
 tools/encoding-fuzz/lib/Targets.php         |  82 +++++
 tools/encoding-fuzz/tests/harness-smoke.php |  27 ++
 5 files changed, 474 insertions(+), 9 deletions(-)
 create mode 100644 progress-handoff-xZOoEn.md

diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md
new file mode 100644
index 0000000000000..4ace0ce8d0896
--- /dev/null
+++ b/progress-handoff-xZOoEn.md
@@ -0,0 +1,33 @@
+# Progress for handoff-xZOoEn
+
+Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn.md`
+
+## 2026-06-11
+
+### Current status
+
+- Confirmed step 0 (`_mb_chr()` / `_mb_ord()` coverage) is already committed as `9d15731f8f`.
+- Worktree was clean before starting follow-up work.
+- Next active slice: step 1, direct `_wp_utf8_codepoint_span()` coverage.
+
+### Step 1: `_wp_utf8_codepoint_span()` coverage
+
+- Status: done; included in the step 1 commit.
+- Scope:
+  - Add `_wp_utf8_codepoint_span()` target wiring.
+  - Add span properties for scrubbed valid text and arbitrary input.
+  - Start nonzero-offset checks only at known code point or maximal-subpart boundaries.
+  - Add mutation tests for off-by-one span length, invalid subpart byte-counting, incorrect `found_code_points`, and stale `found_code_points` on empty spans.
+- Verification:
+  - `php -l tools/encoding-fuzz/lib/Checks.php`
+  - `php -l tools/encoding-fuzz/lib/Targets.php`
+  - `php -l tools/encoding-fuzz/lib/Bootstrap.php`
+  - `php -l tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none`
+  - `git diff --check`
+- Review gate: satisfied by 3 adversarial reviewers.
+  - Reviewer 1: satisfied after checking the independent maximal-subpart span reference and the stale-count update.
+  - Reviewer 2: satisfied after checking mutation adequacy, replay/minimize fault behavior, and the README clarification.
+  - Reviewer 3: initially found the stale `found_code_points` gap; satisfied after the sentinel and stale-count mutation were added.
+- Commit: this step commit.
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index 0d196f2a1241f..f035e7147dbcf 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -7,7 +7,8 @@ Differential fuzzer for the WordPress UTF-8 functions:
 - `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()`
 - `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` (valid input only)
 - `_mb_chr()` / `_mb_ord()`
-- `_wp_utf8_codepoint_count()` and the resumable `_wp_scan_utf8()` paths (secondary)
+- `_wp_utf8_codepoint_count()`, `_wp_utf8_codepoint_span()`, and the
+  resumable `_wp_scan_utf8()` paths (secondary)
 
 The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main
 fuzz surface; the mbstring-backed public functions are checked alongside
@@ -107,6 +108,12 @@ Internal invariants:
 - scrub is idempotent
 - `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed text
   (each maximal subpart counts as one code point)
+- `_wp_utf8_codepoint_span()` reports the original byte span occupied by a
+  requested number of code points; on scrubbed valid text it matches
+  `strlen( mb_substr( ... ) )`, and on arbitrary input an independent
+  maximal-subpart parser checks that invalid subparts count as one code
+  point. Nonzero starts are probed only at known code point or
+  maximal-subpart boundaries.
 - scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points`
   chunks reconstructs the same scrubbed text and always makes forward
   progress (chunk sizes derive from the input hash, so replays are exact)
@@ -193,7 +200,7 @@ php tools/encoding-fuzz/tests/harness-smoke.php
 ```
 
 Verifies the oracle battery, runs the real targets over the battery
-vectors, and — most importantly — mutation-tests the harness: nineteen
+vectors, and — most importantly — mutation-tests the harness: twenty-three
 classes of deliberately broken implementations (validator accepting
 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber,
 identity scrubber, byte-dropping scrubber, off-by-one code point count,
@@ -201,16 +208,21 @@ throwing target, cp1252-confused encoder, identity encoder, per-byte
 decoder, valid-input-mangling decoder, round-trip-violating decoder,
 null-returning encoder, sometimes-null decoder, blind noncharacter
 detector, U+FDD0-block-missing detector, over-eager noncharacter
-detector, cp1252-confused `_mb_chr()`, invalid-accepting `_mb_ord()`)
+detector, cp1252-confused `_mb_chr()`, invalid-accepting `_mb_ord()`,
+off-by-one code point span, invalid-subpart byte-counted span, and
+wrong or stale `found_code_points` span)
 must all be caught. It also asserts generator determinism, the
 valid/invalid input mix, and the documented
 `wp_has_noncharacters()` divergence stance on ill-formed input.
 
 For end-to-end pipeline testing while the real implementations are
-healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager`
+healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale`
 injects a broken target into worker, replay, and minimize alike.
 Fault-injected artifacts record the fault name in their environment
-metadata so they cannot be mistaken for real findings:
+metadata so they cannot be mistaken for real findings. Replaying or
+minimizing a fault-injected artifact requires setting the same
+`ENCODING_FUZZ_FAULT`; replay without it checks the healthy targets
+against the saved input:
 
 ```sh
 ENCODING_FUZZ_FAULT=non-maximal php tools/encoding-fuzz/runner.php --lanes 2 --duration-seconds 5
diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php
index 6b8bb94b07cae..061a9b0f74f56 100644
--- a/tools/encoding-fuzz/lib/Checks.php
+++ b/tools/encoding-fuzz/lib/Checks.php
@@ -16,6 +16,10 @@
  *  - scrub is idempotent
  *  - `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed
  *    text (each maximal subpart counts as one code point)
+ *  - `_wp_utf8_codepoint_span()` reports the original byte span for a
+ *    requested number of code points, with invalid maximal subparts
+ *    counted as one code point and `found_code_points` reporting the
+ *    smaller of the requested and available counts
  *  - scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points`
  *    chunks reconstructs the same scrubbed text and always makes
  *    forward progress
@@ -281,23 +285,28 @@ public function run( string $input ): array {
 			);
 		}
 
-		// 7. Chunked scan reconstruction.
+		// 7. Code point span agrees with valid-text and maximal-subpart references.
+		foreach ( $this->check_codepoint_span( $input, $ref_scrub ) as $failure ) {
+			$failures[] = $failure;
+		}
+
+		// 8. Chunked scan reconstruction.
 		$chunk_failure = $this->check_chunked_scan( $input, $ref_scrub );
 		if ( null !== $chunk_failure ) {
 			$failures[] = $chunk_failure;
 		}
 
-		// 8. Legacy utf8_encode()/utf8_decode() fallback differentials.
+		// 9. Legacy utf8_encode()/utf8_decode() fallback differentials.
 		foreach ( $this->check_utf8_encode_decode( $input, $ref_valid, $mb_validity ) as $failure ) {
 			$failures[] = $failure;
 		}
 
-		// 9. Noncharacter detection, on valid input only.
+		// 10. Noncharacter detection, on valid input only.
 		foreach ( $this->check_noncharacters( $input, $ref_valid ) as $failure ) {
 			$failures[] = $failure;
 		}
 
-		// 10. mb_chr()/mb_ord() polyfill differentials and isomorphisms.
+		// 11. mb_chr()/mb_ord() polyfill differentials and isomorphisms.
 		foreach ( $this->check_mb_chr_ord( $input ) as $failure ) {
 			$failures[] = $failure;
 		}
@@ -305,6 +314,165 @@ public function run( string $input ): array {
 		return $failures;
 	}
 
+	/**
+	 * Tests `_wp_utf8_codepoint_span()` from known boundaries only.
+	 *
+	 * Starts inside a continuation byte or inside an invalid maximal subpart
+	 * are deliberately outside this property: `_mb_substr()` reaches this
+	 * helper by first computing a boundary with the same maximal-subpart model.
+	 *
+	 * @return array<int, array{check: string, signature: string, detail: array}>
+	 */
+	private function check_codepoint_span( string $input, string $ref_scrub ): array {
+		if ( ! isset( $this->targets['codepoint_span'] ) ) { // Target not wired in: nothing to check.
+			return array();
+		}
+
+		$failures = array();
+
+		list( $offsets, $reference_scrub ) = self::reference_utf8_offsets_and_scrub( $input );
+		if ( $reference_scrub !== $ref_scrub ) { // The boundary table cannot be trusted; report that and skip every probe.
+			return array(
+				self::failure(
+					'span-reference-disagreement',
+					'maximal-subpart-reference',
+					self::diff_detail( 'maximal-subpart-reference', $ref_scrub, $reference_scrub )
+				),
+			);
+		}
+
+		$segment_count = count( $offsets ) - 1; // $offsets also holds the end-of-input boundary.
+		foreach ( self::span_probe_indices( $segment_count, $input ) as $segment_index ) {
+			$byte_offset = $offsets[ $segment_index ];
+			$available   = $segment_count - $segment_index;
+
+			foreach ( self::span_probe_counts( $available, $input . ":{$segment_index}" ) as $max_code_points ) {
+				$expected_found = min( $max_code_points, $available ); // Never more than the segments that remain.
+				$expected_span  = $offsets[ $segment_index + $expected_found ] - $byte_offset;
+
+				$failure = $this->assert_codepoint_span(
+					$input,
+					$byte_offset,
+					$max_code_points,
+					$expected_span,
+					$expected_found,
+					'arbitrary-boundary'
+				);
+
+				if ( null !== $failure ) {
+					$failures[] = $failure;
+				}
+			}
+		}
+
+		$scrubbed_code_points = mb_strlen( $ref_scrub, 'UTF-8' ); // Second pass: probe the scrubbed text, with mb_substr() as the reference.
+		foreach ( self::span_probe_indices( $scrubbed_code_points, $ref_scrub ) as $start_code_point ) {
+			$byte_offset = strlen( mb_substr( $ref_scrub, 0, $start_code_point, 'UTF-8' ) ); // Byte length of the first $start_code_point code points.
+			$available   = $scrubbed_code_points - $start_code_point;
+
+			foreach ( self::span_probe_counts( $available, $ref_scrub . ":scrubbed:{$start_code_point}" ) as $max_code_points ) {
+				$expected_found = min( $max_code_points, $available );
+				$expected_span  = strlen( mb_substr( $ref_scrub, $start_code_point, $max_code_points, 'UTF-8' ) );
+
+				$failure = $this->assert_codepoint_span(
+					$ref_scrub,
+					$byte_offset,
+					$max_code_points,
+					$expected_span,
+					$expected_found,
+					'scrubbed-mb-substr'
+				);
+
+				if ( null !== $failure ) {
+					$failures[] = $failure;
+				}
+			}
+		}
+
+		return $failures;
+	}
+
+	private function assert_codepoint_span( string $input, int $byte_offset, int $max_code_points, int $expected_span, int $expected_found, string $property ): ?array {
+		$found_code_points = -1; // Sentinel: callers pass a non-negative expected count, so an unwritten out-parameter is detected.
+
+		try {
+			$actual_span = ( $this->targets['codepoint_span'] )( $input, $byte_offset, $max_code_points, $found_code_points );
+		} catch ( \Throwable $error ) { // Anything the target throws is reported as a failure, not propagated.
+			return self::failure(
+				'target-exception',
+				'codepoint_span',
+				array(
+					'target'          => 'codepoint_span',
+					'property'        => $property,
+					'byte_offset'     => $byte_offset,
+					'max_code_points' => $max_code_points,
+					'message'         => $error->getMessage(),
+					'class'           => get_class( $error ),
+				)
+			);
+		}
+
+		if ( ! is_int( $actual_span ) ) { // The returned span must be an integer.
+			return self::failure(
+				'codepoint-span-bad-return',
+				'codepoint_span',
+				array(
+					'property'        => $property,
+					'byte_offset'     => $byte_offset,
+					'max_code_points' => $max_code_points,
+					'type'            => get_debug_type( $actual_span ),
+				)
+			);
+		}
+
+		if ( ! is_int( $found_code_points ) ) { // The by-reference count must still be an integer after the call.
+			return self::failure(
+				'codepoint-span-found-bad-return',
+				'codepoint_span',
+				array(
+					'property'        => $property,
+					'byte_offset'     => $byte_offset,
+					'max_code_points' => $max_code_points,
+					'type'            => get_debug_type( $found_code_points ),
+				)
+			);
+		}
+
+		if ( $actual_span !== $expected_span ) { // Checked before the found count: a span mismatch returns first.
+			return self::failure(
+				'codepoint-span-mismatch',
+				'codepoint_span',
+				array(
+					'property'          => $property,
+					'byte_offset'       => $byte_offset,
+					'max_code_points'   => $max_code_points,
+					'got'               => $actual_span,
+					'expected'          => $expected_span,
+					'found_code_points' => $found_code_points,
+					'input_preview'     => self::preview( $input, $byte_offset ),
+				)
+			);
+		}
+
+		if ( $found_code_points !== $expected_found ) { // Reached only when the byte span itself was right.
+			return self::failure(
+				'codepoint-span-found-mismatch',
+				'codepoint_span',
+				array(
+					'property'        => $property,
+					'byte_offset'     => $byte_offset,
+					'max_code_points' => $max_code_points,
+					'got'             => $found_code_points,
+					'expected'        => $expected_found,
+					'span'            => $actual_span,
+					'input_preview'   => self::preview( $input, $byte_offset ),
+				)
+			);
+		}
+
+		return null;
+	}
+
 	/**
 	 * Tests `_mb_chr()` and `_mb_ord()` as partial inverses. The oracle for
 	 * `_mb_chr()` is the fuzzer's arithmetic UTF-8 encoder; the oracle for
@@ -857,6 +1025,149 @@ private static function mb_chr_code_point_probes( string $input ): array {
 		return array_values( array_unique( $probes ) );
 	}
 
+	/**
+	 * Splits `$bytes` with the independent maximal-subpart parser and records
+	 * each boundary: `$offsets[$i]` is the byte offset before logical code point `$i`.
+	 *
+	 * @return array{0: int[], 1: string} Boundary offsets and scrubbed text.
+	 */
+	private static function reference_utf8_offsets_and_scrub( string $bytes ): array {
+		$total      = strlen( $bytes );
+		$boundaries = array( 0 );
+		$scrubbed   = '';
+		$cursor     = 0;
+
+		while ( $cursor < $total ) {
+			list( $size, $is_valid ) = self::reference_utf8_segment( $bytes, $cursor );
+			$scrubbed               .= $is_valid ? substr( $bytes, $cursor, $size ) : "\u{FFFD}";
+			$cursor                 += $size;
+			$boundaries[]            = $cursor;
+		}
+
+		return array( $boundaries, $scrubbed );
+	}
+
+	/**
+	 * @return array{0: int, 1: bool} Byte length and whether the segment is well-formed.
+	 */
+	private static function reference_utf8_segment( string $bytes, int $at ): array {
+		$remaining = strlen( $bytes ) - $at;
+		$b1        = ord( $bytes[ $at ] ); // Reads the byte at $at directly, so $at must lie inside $bytes.
+
+		if ( $b1 <= 0x7F ) {
+			return array( 1, true );
+		}
+
+		$b2 = $remaining >= 2 ? ord( $bytes[ $at + 1 ] ) : null; // null marks a byte past the end of input.
+		if ( $b1 >= 0xC2 && $b1 <= 0xDF ) {
+			return self::is_continuation( $b2 ) ? array( 2, true ) : array( 1, false ); // Without its continuation the lead is a one-byte invalid segment.
+		}
+
+		$b3 = $remaining >= 3 ? ord( $bytes[ $at + 2 ] ) : null;
+		if (
+			self::is_continuation( $b3 ) &&
+			(
+				( 0xE0 === $b1 && null !== $b2 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
+				( $b1 >= 0xE1 && $b1 <= 0xEC && self::is_continuation( $b2 ) ) ||
+				( 0xED === $b1 && null !== $b2 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
+				( $b1 >= 0xEE && $b1 <= 0xEF && self::is_continuation( $b2 ) )
+			)
+		) {
+			return array( 3, true );
+		}
+
+		$b4 = $remaining >= 4 ? ord( $bytes[ $at + 3 ] ) : null;
+		if (
+			self::is_continuation( $b3 ) &&
+			self::is_continuation( $b4 ) &&
+			(
+				( 0xF0 === $b1 && null !== $b2 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
+				( $b1 >= 0xF1 && $b1 <= 0xF3 && self::is_continuation( $b2 ) ) ||
+				( 0xF4 === $b1 && null !== $b2 && $b2 >= 0x80 && $b2 <= 0x8F )
+			)
+		) {
+			return array( 4, true );
+		}
+
+		if ( $b1 >= 0xE0 && $b1 <= 0xEF && self::is_valid_three_byte_second( $b1, $b2 ) ) { // Incomplete three-byte sequence: lead plus valid second byte form one invalid segment.
+			return array( min( $remaining, 2 ), false ); // A non-null second byte already implies two bytes remain; min() is only a guard.
+		}
+
+		if ( $b1 >= 0xF0 && $b1 <= 0xF4 && self::is_valid_four_byte_second( $b1, $b2 ) ) { // Incomplete four-byte sequence: keep the longest valid prefix together.
+			return array( min( $remaining, self::is_continuation( $b3 ) ? 3 : 2 ), false );
+		}
+
+		return array( 1, false );
+	}
+
+	private static function is_continuation( ?int $byte ): bool {
+		return ! ( null === $byte || $byte < 0x80 || $byte > 0xBF );
+	}
+
+	private static function is_valid_three_byte_second( int $b1, ?int $b2 ): bool {
+		if ( ! self::is_continuation( $b2 ) || $b1 < 0xE0 || $b1 > 0xEF ) {
+			return false; // Every accepted second byte lies within 0x80..0xBF.
+		}
+		$low  = 0xE0 === $b1 ? 0xA0 : 0x80; // Lead 0xE0 narrows the range from below.
+		$high = 0xED === $b1 ? 0x9F : 0xBF; // Lead 0xED narrows the range from above.
+		return $b2 >= $low && $b2 <= $high;
+	}
+
+	private static function is_valid_four_byte_second( int $b1, ?int $b2 ): bool {
+		if ( ! self::is_continuation( $b2 ) || $b1 < 0xF0 || $b1 > 0xF4 ) {
+			return false; // Every accepted second byte lies within 0x80..0xBF.
+		}
+		// Lead 0xF0 raises the lower bound; lead 0xF4 lowers the upper bound.
+		return $b2 >= ( 0xF0 === $b1 ? 0x90 : 0x80 ) && $b2 <= ( 0xF4 === $b1 ? 0x8F : 0xBF );
+	}
+
+	/**
+	 * @return int[] Sorted, de-duplicated indices in 0..$code_points to use as start boundaries; derived only from the arguments.
+	 */
+	private static function span_probe_indices( int $code_points, string $salt ): array {
+		$indices = array(
+			0,
+			min( 1, $code_points ),
+			min( 2, $code_points ),
+			intdiv( $code_points, 2 ),
+			max( 0, $code_points - 1 ),
+			$code_points,
+		);
+
+		// Four 31-bit words of the salt hash: a single hash byte could never probe past index 255.
+		foreach ( unpack( 'N4', hash( 'sha256', $salt, true ) ) as $word ) {
+			$indices[] = ( $word & 0x7FFFFFFF ) % ( $code_points + 1 );
+		}
+
+		sort( $indices );
+		return array_values( array_unique( $indices ) );
+	}
+
+	/**
+	 * @return int[] Sorted, de-duplicated requested counts in 0..$available + 1 to probe from a start boundary; derived only from the arguments.
+	 */
+	private static function span_probe_counts( int $available, string $salt ): array {
+		$counts = array(
+			0,
+			1,
+			2,
+			3,
+			min( 7, $available ),
+			intdiv( $available, 2 ),
+			max( 0, $available - 1 ),
+			$available,
+			$available + 1,
+		);
+
+		// Four 31-bit words of the salt hash: a single hash byte could never request more than 255 code points.
+		foreach ( unpack( 'N4', hash( 'sha256', $salt, true ) ) as $word ) {
+			$counts[] = ( $word & 0x7FFFFFFF ) % ( $available + 2 );
+		}
+
+		sort( $counts );
+		return array_values( array_unique( $counts ) );
+	}
+
 	/**
 	 * @return array{0: int|false, 1: int} First code point and byte length.
 	 */
diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php
index 29d5d415be34a..7f5b1e27c821a 100644
--- a/tools/encoding-fuzz/lib/Targets.php
+++ b/tools/encoding-fuzz/lib/Targets.php
@@ -15,6 +15,10 @@
  *   ENCODING_FUZZ_FAULT=decode-per-byte    decoder emits '?' per invalid byte
  *   ENCODING_FUZZ_FAULT=nonchars-miss-fdd0 fallback detector misses U+FDD0–U+FDEF
  *   ENCODING_FUZZ_FAULT=nonchars-overeager public detector also flags U+FDCF
+ *   ENCODING_FUZZ_FAULT=span-off-by-one    code point span reports one extra byte
+ *   ENCODING_FUZZ_FAULT=span-invalid-bytes code point span counts invalid bytes individually
+ *   ENCODING_FUZZ_FAULT=span-found-max     code point span over-reports found_code_points
+ *   ENCODING_FUZZ_FAULT=span-found-stale   code point span leaves found_code_points stale
  */
 class Targets {
 	/**
@@ -33,6 +37,7 @@ public static function resolve(): array {
 			'has_nonchars_fb' => '_wp_has_noncharacters_fallback',
 			'mb_chr'          => '_mb_chr',
 			'mb_ord'          => '_mb_ord',
+			'codepoint_span'  => '_wp_utf8_codepoint_span',
 		);
 
 		switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) {
@@ -67,6 +72,22 @@ public static function resolve(): array {
 			case 'nonchars-overeager':
 				$targets['has_nonchars'] = self::nonchars_overeager( ... );
 				break;
+
+			case 'span-off-by-one':
+				$targets['codepoint_span'] = self::codepoint_span_off_by_one( ... );
+				break;
+
+			case 'span-invalid-bytes':
+				$targets['codepoint_span'] = self::codepoint_span_counts_invalid_bytes( ... );
+				break;
+
+			case 'span-found-max':
+				$targets['codepoint_span'] = self::codepoint_span_found_max( ... );
+				break;
+
+			case 'span-found-stale':
+				$targets['codepoint_span'] = self::codepoint_span_stale_empty_found( ... );
+				break;
 		}
 
 		return $targets;
@@ -116,4 +137,65 @@ public static function decode_per_invalid_byte( string $bytes ): string {
 
 		return $out;
 	}
+
+	/**
+	 * Deliberately broken span finder: the found count is passed through
+	 * untouched, but every non-empty span is reported one byte too long.
+	 */
+	public static function codepoint_span_off_by_one( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
+		$span = _wp_utf8_codepoint_span( $text, $byte_offset, $max_code_points, $found_code_points );
+		return $span + (int) ( $span > 0 );
+	}
+
+	/**
+	 * Deliberately broken span finder: treats each byte of an invalid maximal
+	 * subpart as its own code point, so a two-byte truncated sequence can be
+	 * split in half.
+	 */
+	public static function codepoint_span_counts_invalid_bytes( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
+		$was_at            = $byte_offset;
+		$invalid_length    = 0;
+		$end               = strlen( $text );
+		$found_code_points = 0;
+
+		while ( $byte_offset < $end && $found_code_points < $max_code_points ) {
+			$needed      = $max_code_points - $found_code_points;
+			$chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed ); // NOTE(review): relies on this call advancing $byte_offset and setting $invalid_length by reference — confirm.
+
+			$found_code_points += $chunk_count;
+
+			if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) { // The injected fault: an invalid subpart is consumed one byte per code point.
+				$bytes_to_take       = min( $invalid_length, $max_code_points - $found_code_points ); // May stop partway through the subpart.
+				$found_code_points  += $bytes_to_take;
+				$byte_offset        += $bytes_to_take;
+			}
+		}
+
+		return $byte_offset - $was_at;
+	}
+
+	/**
+	 * Deliberately broken span finder: the byte span is correct, but the
+	 * found count is always overwritten with the requested maximum.
+	 */
+	public static function codepoint_span_found_max( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
+		$byte_span         = _wp_utf8_codepoint_span( $text, $byte_offset, $max_code_points, $found_code_points );
+		$found_code_points = $max_code_points;
+		return $byte_span;
+	}
+
+	/**
+	 * Deliberately broken span finder: when no bytes are spanned, the
+	 * caller's by-reference value is put back to what it was on entry.
+	 */
+	public static function codepoint_span_stale_empty_found( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
+		$before = $found_code_points;
+		$span   = _wp_utf8_codepoint_span( $text, $byte_offset, $max_code_points, $found_code_points );
+
+		// An empty span undoes whatever the real implementation wrote;
+		// a non-empty span keeps the freshly reported count.
+		$found_code_points = 0 === $span ? $before : $found_code_points;
+
+		return $span;
+	}
 }
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
index 9cd6e1374ae61..883f88497e19a 100644
--- a/tools/encoding-fuzz/tests/harness-smoke.php
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -106,6 +106,7 @@ function check( string $label, bool $ok, string $detail = '' ): void {
 	'has_nonchars_fb' => '_wp_has_noncharacters_fallback',
 	'mb_chr'          => '_mb_chr',
 	'mb_ord'          => '_mb_ord',
+	'codepoint_span'  => '_wp_utf8_codepoint_span',
 );
 
 /**
@@ -256,6 +257,32 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 ) );
 check( 'catches invalid-accepting _mb_ord', in_array( 'mb-ord-mismatch', $seen, true ), implode( ',', $seen ) );
 
+// 3t. Code point span that reports one extra byte.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'codepoint_span' => Targets::codepoint_span_off_by_one( ... ),
+) );
+check( 'catches off-by-one code point span', in_array( 'codepoint-span-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3u. Code point span that treats invalid maximal subparts as one code
+//     point per byte instead of one code point per maximal subpart.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'codepoint_span' => Targets::codepoint_span_counts_invalid_bytes( ... ),
+) );
+check( 'catches byte-counted invalid code point span', in_array( 'codepoint-span-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3v. Code point span that returns the right byte span but corrupts the
+//     by-reference found count.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'codepoint_span' => Targets::codepoint_span_found_max( ... ),
+) );
+check( 'catches wrong code point span found count', in_array( 'codepoint-span-found-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3w. Code point span that leaves found_code_points stale on empty spans.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'codepoint_span' => Targets::codepoint_span_stale_empty_found( ... ),
+) );
+check( 'catches stale empty code point span found count', in_array( 'codepoint-span-found-mismatch', $seen, true ), implode( ',', $seen ) );
+
 // ---------------------------------------------------------------------
 // 4. Generator determinism and mix.
 // ---------------------------------------------------------------------

From 1f875a1f2181fca08a770e3e088a465a9eda363c Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:22:08 +0200
Subject: [PATCH 08/14] Add mb substr fuzzing

---
 progress-handoff-xZOoEn.md                  |  24 +++
 tools/encoding-fuzz/README.md               |  18 +-
 tools/encoding-fuzz/lib/Bootstrap.php       |   4 +-
 tools/encoding-fuzz/lib/Checks.php          | 222 +++++++++++++++++++-
 tools/encoding-fuzz/lib/Targets.php         |  55 +++++
 tools/encoding-fuzz/lib/wp-stubs.php        |   6 +
 tools/encoding-fuzz/tests/harness-smoke.php |  25 +++
 7 files changed, 346 insertions(+), 8 deletions(-)

diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md
index 4ace0ce8d0896..4d5fe5e56511b 100644
--- a/progress-handoff-xZOoEn.md
+++ b/progress-handoff-xZOoEn.md
@@ -31,3 +31,27 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn
   - Reviewer 2: satisfied after checking mutation adequacy, replay/minimize fault behavior, and the README clarification.
   - Reviewer 3: initially found the stale `found_code_points` gap; satisfied after the sentinel and stale-count mutation were added.
 - Commit: this step commit.
+
+### Step 2: `_mb_substr()` property coverage
+
+- Status: done; included in the step 2 commit.
+- Prior step commit: `6ea247f9da`.
+- Scope:
+  - Load `_mb_substr()` and its `_is_utf8_charset()` dependency into the fuzzer bootstrap.
+  - Add UTF-8 substring properties over valid and arbitrary input.
+  - Pin current invalid-input semantics: invalid maximal subparts count as one code point, but the returned substring preserves the original bytes rather than returning scrubbed text.
+  - Add explicit non-UTF-8 encoding fallback checks against byte-level `substr()`.
+  - Add mutation tests for byte-offset slicing, scrubbed-input slicing, negative length handling, and non-UTF-8 fallback drift.
+- Verification:
+  - `php -l tools/encoding-fuzz/lib/Checks.php`
+  - `php -l tools/encoding-fuzz/lib/Targets.php`
+  - `php -l tools/encoding-fuzz/lib/Bootstrap.php`
+  - `php -l tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none`
+  - `git diff --check`
+- Review gate: satisfied by 3 adversarial reviewers.
+  - Reviewer 1: satisfied after checking invalid-input expected substrings, negative start/length semantics, and valid native `mb_substr()` comparison.
+  - Reviewer 2: satisfied after checking mutation adequacy and faulted worker/replay/minimize behavior.
+  - Reviewer 3: satisfied after checking bootstrap/stub wiring, edge coverage, performance, and docs/progress accuracy.
+- Commit: this step commit.
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index f035e7147dbcf..9f65ee966eaa9 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -7,13 +7,15 @@ Differential fuzzer for the WordPress UTF-8 functions:
 - `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()`
 - `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` (valid input only)
 - `_mb_chr()` / `_mb_ord()`
+- `_mb_substr()`
 - `_wp_utf8_codepoint_count()`, `_wp_utf8_codepoint_span()`, and the
   resumable `_wp_scan_utf8()` paths (secondary)
 
 The pure-PHP fallbacks in `src/wp-includes/compat-utf8.php` are the main
 fuzz surface; the mbstring-backed public functions are checked alongside
-them. Only `compat-utf8.php` and `utf8.php` are loaded — no WordPress
-bootstrap, database, or `wp-env`.
+them. The harness loads `compat-utf8.php`, `utf8.php`, and selected private
+UTF-8 helpers extracted from `compat.php` — no WordPress bootstrap,
+database, or `wp-env`.
 
 ## Oracles
 
@@ -127,6 +129,10 @@ Internal invariants:
 - `_mb_ord( _mb_chr( $cp ) ) === $cp` for valid scalar values, and
   `_mb_chr( _mb_ord( $s ) )` reconstructs the first UTF-8 character in
   `$s` when it is well-formed
+- `_mb_substr()` in UTF-8 mode preserves original bytes while treating each
+  invalid maximal subpart as one code point; on valid input it also agrees
+  with native `mb_substr()`, and explicit non-UTF-8 encodings fall back to
+  byte-level `substr()` semantics
 
 ## Inputs
 
@@ -200,7 +206,7 @@ php tools/encoding-fuzz/tests/harness-smoke.php
 ```
 
 Verifies the oracle battery, runs the real targets over the battery
-vectors, and — most importantly — mutation-tests the harness: twenty-three
+vectors, and — most importantly — mutation-tests the harness: twenty-seven
 classes of deliberately broken implementations (validator accepting
 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber,
 identity scrubber, byte-dropping scrubber, off-by-one code point count,
@@ -210,13 +216,15 @@ null-returning encoder, sometimes-null decoder, blind noncharacter
 detector, U+FDD0-block-missing detector, over-eager noncharacter
 detector, cp1252-confused `_mb_chr()`, invalid-accepting `_mb_ord()`,
 off-by-one code point span, invalid-subpart byte-counted span, and
-wrong or stale `found_code_points` span)
+wrong or stale `found_code_points` span, byte-offset `_mb_substr()`,
+scrubbed-input `_mb_substr()`, negative-length `_mb_substr()`, and
+non-UTF-8 fallback drift)
 must all be caught. It also asserts generator determinism, the
 valid/invalid input mix, and the documented
 `wp_has_noncharacters()` divergence stance on ill-formed input.
 
 For end-to-end pipeline testing while the real implementations are
-healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale`
+healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8`
 injects a broken target into worker, replay, and minimize alike.
 Fault-injected artifacts record the fault name in their environment
 metadata so they cannot be mistaken for real findings. Replaying or
diff --git a/tools/encoding-fuzz/lib/Bootstrap.php b/tools/encoding-fuzz/lib/Bootstrap.php
index be54f7aa5ba4f..de065a39d1ebc 100644
--- a/tools/encoding-fuzz/lib/Bootstrap.php
+++ b/tools/encoding-fuzz/lib/Bootstrap.php
@@ -4,7 +4,7 @@
 /**
  * Loads the WordPress UTF-8 functions under test into a bare PHP process.
  *
- * Only the UTF-8 files under test are loaded. `_mb_chr()` and `_mb_ord()`
+ * Only the UTF-8 files under test are loaded. A few private UTF-8 helpers
  * live in `compat.php`, so their function bodies are extracted from that
  * source file without loading the rest of WordPress compatibility glue.
  */
@@ -21,7 +21,7 @@ public static function load_targets(): void {
 		$root = self::repo_root();
 		require_once __DIR__ . '/wp-stubs.php';
 		require_once $root . '/src/wp-includes/compat-utf8.php';
-		self::load_compat_functions( $root . '/src/wp-includes/compat.php', array( '_mb_chr', '_mb_ord' ) );
+		self::load_compat_functions( $root . '/src/wp-includes/compat.php', array( '_is_utf8_charset', '_mb_chr', '_mb_ord', '_mb_substr' ) );
 		require_once $root . '/src/wp-includes/utf8.php';
 
 		/*
diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php
index 061a9b0f74f56..387f871f8d796 100644
--- a/tools/encoding-fuzz/lib/Checks.php
+++ b/tools/encoding-fuzz/lib/Checks.php
@@ -48,6 +48,10 @@
  *  - `_mb_ord( _mb_chr( cp ) ) === cp` and
  *    `_mb_chr( _mb_ord( s ) ) === first UTF-8 character in s` where
  *    those expressions are defined.
+ *  - `_mb_substr()` preserves original bytes while using UTF-8
+ *    code-point/maximal-subpart offsets; on valid input it agrees with
+ *    native `mb_substr()`, and for non-UTF-8 encodings it agrees with
+ *    byte-level `substr()`.
  *
  * Target callables are injectable so the harness smoke test can verify
  * that deliberately broken implementations are caught.
@@ -306,7 +310,12 @@ public function run( string $input ): array {
 			$failures[] = $failure;
 		}
 
-		// 11. mb_chr()/mb_ord() polyfill differentials and isomorphisms.
+		// 11. _mb_substr() UTF-8 and byte-fallback properties.
+		foreach ( $this->check_mb_substr( $input, $ref_valid, $ref_scrub ) as $failure ) {
+			$failures[] = $failure;
+		}
+
+		// 12. mb_chr()/mb_ord() polyfill differentials and isomorphisms.
 		foreach ( $this->check_mb_chr_ord( $input ) as $failure ) {
 			$failures[] = $failure;
 		}
@@ -473,6 +482,145 @@ private function assert_codepoint_span( string $input, int $byte_offset, int $ma
 		return null;
 	}
 
+	/**
+	 * Tests `_mb_substr()` against the semantics currently implemented by
+	 * `compat.php`: UTF-8 mode computes character offsets by treating each
+	 * invalid maximal subpart as one code point, then returns the original
+	 * bytes in the selected range. It does not slice scrubbed text.
+	 *
+	 * @return array<int, array{check: string, signature: string, detail: array}>
+	 */
+	private function check_mb_substr( string $input, bool $ref_valid, string $ref_scrub ): array {
+		if ( ! isset( $this->targets['mb_substr'] ) ) {
+			return array();
+		}
+
+		$failures = array();
+
+		list( $offsets, $reference_scrub ) = self::reference_utf8_offsets_and_scrub( $input );
+		if ( $reference_scrub !== $ref_scrub ) {
+			return array(
+				self::failure(
+					'substr-reference-disagreement',
+					'maximal-subpart-reference',
+					self::diff_detail( 'maximal-subpart-reference', $ref_scrub, $reference_scrub )
+				),
+			);
+		}
+
+		$code_points = count( $offsets ) - 1;
+		$encodings   = array( 'UTF-8', 'utf8', null );
+		foreach ( self::mb_substr_probes( $code_points, $input . ':utf8' ) as $i => $probe ) {
+			list( $start, $length ) = $probe;
+			$encoding              = $encodings[ $i % count( $encodings ) ];
+			$expected              = self::expected_mb_substr_from_offsets( $input, $offsets, $start, $length );
+
+			$failure = $this->assert_mb_substr(
+				$input,
+				$start,
+				$length,
+				$encoding,
+				$expected,
+				'utf8-maximal-subpart'
+			);
+
+			if ( null !== $failure ) {
+				$failures[] = $failure;
+			}
+		}
+
+		if ( $ref_valid && function_exists( 'mb_substr' ) ) {
+			foreach ( self::mb_substr_probes( $code_points, $input . ':native' ) as $probe ) {
+				list( $start, $length ) = $probe;
+				$expected              = mb_substr( $input, $start, $length, 'UTF-8' );
+
+				$failure = $this->assert_mb_substr(
+					$input,
+					$start,
+					$length,
+					'UTF-8',
+					$expected,
+					'valid-native-mb-substr'
+				);
+
+				if ( null !== $failure ) {
+					$failures[] = $failure;
+				}
+			}
+		}
+
+		$byte_encodings = array( 'ISO-8859-1', 'latin1', 'Windows-1252', 'UTF 8' );
+		foreach ( array_slice( self::mb_substr_probes( strlen( $input ), $input . ':bytes' ), 0, 18 ) as $i => $probe ) {
+			list( $start, $length ) = $probe;
+			$encoding              = $byte_encodings[ $i % count( $byte_encodings ) ];
+			$expected              = is_null( $length ) ? substr( $input, $start ) : substr( $input, $start, $length );
+
+			$failure = $this->assert_mb_substr(
+				$input,
+				$start,
+				$length,
+				$encoding,
+				$expected,
+				'non-utf8-byte-substr'
+			);
+
+			if ( null !== $failure ) {
+				$failures[] = $failure;
+			}
+		}
+
+		return $failures;
+	}
+
+	private function assert_mb_substr( string $input, int $start, ?int $length, ?string $encoding, string $expected, string $property ): ?array {
+		try {
+			$actual = ( $this->targets['mb_substr'] )( $input, $start, $length, $encoding );
+		} catch ( \Throwable $error ) {
+			return self::failure(
+				'target-exception',
+				'mb_substr',
+				array(
+					'target'   => 'mb_substr',
+					'property' => $property,
+					'start'    => $start,
+					'length'   => $length,
+					'encoding' => $encoding,
+					'message'  => $error->getMessage(),
+					'class'    => get_class( $error ),
+				)
+			);
+		}
+
+		if ( ! is_string( $actual ) ) {
+			return self::failure(
+				'mb-substr-bad-return',
+				'mb_substr',
+				array(
+					'property' => $property,
+					'start'    => $start,
+					'length'   => $length,
+					'encoding' => $encoding,
+					'type'     => get_debug_type( $actual ),
+				)
+			);
+		}
+
+		if ( $actual !== $expected ) {
+			return self::failure(
+				'mb-substr-mismatch',
+				'mb_substr',
+				array(
+					'property' => $property,
+					'start'    => $start,
+					'length'   => $length,
+					'encoding' => $encoding,
+				) + self::diff_detail( 'mb_substr', $expected, $actual )
+			);
+		}
+
+		return null;
+	}
+
 	/**
 	 * Tests `_mb_chr()` and `_mb_ord()` as partial inverses. The oracle for
 	 * `_mb_chr()` is the fuzzer's arithmetic UTF-8 encoder; the oracle for
@@ -1168,6 +1316,78 @@ private static function span_probe_counts( int $available, string $salt ): array
 		return array_values( array_unique( $counts ) );
 	}
 
+	/**
+	 * @return array<int, array{0: int, 1: int|null}> Start/length probes.
+	 */
+	private static function mb_substr_probes( int $code_points, string $salt ): array {
+		$mid  = intdiv( $code_points, 2 );
+		$last = max( 0, $code_points - 1 );
+
+		$probes = array(
+			array( 0, null ),
+			array( 0, 0 ),
+			array( 0, 1 ),
+			array( 1, null ),
+			array( 1, 1 ),
+			array( 2, 3 ),
+			array( $mid, 1 ),
+			array( $last, 1 ),
+			array( $code_points, 1 ),
+			array( $code_points + 1, 1 ),
+			array( -1, null ),
+			array( -1, 1 ),
+			array( -2, 1 ),
+			array( -$code_points, 2 ),
+			array( -( $code_points + 1 ), 2 ),
+			array( 0, -1 ),
+			array( 1, -1 ),
+			array( $mid, -1 ),
+			array( -2, -1 ),
+			array( 0, -$code_points ),
+			array( 1, -( $code_points + 1 ) ),
+		);
+
+		$range = max( 3, $code_points + 3 );
+		$hash  = hash( 'sha256', $salt, true );
+		for ( $i = 0; $i < 4; $i++ ) {
+			$start  = ( ord( $hash[ $i ] ) % ( ( 2 * $range ) + 1 ) ) - $range;
+			$length = ( ord( $hash[ $i + 4 ] ) % ( ( 2 * $range ) + 2 ) ) - $range;
+			if ( 0 === ord( $hash[ $i + 8 ] ) % 5 ) {
+				$length = null;
+			}
+
+			$probes[] = array( $start, $length );
+		}
+
+		$unique = array();
+		foreach ( $probes as $probe ) {
+			$unique[ json_encode( $probe ) ] = $probe;
+		}
+
+		return array_values( $unique );
+	}
+
+	/**
+	 * @param int[] $offsets Boundary offsets from `reference_utf8_offsets_and_scrub()`.
+	 */
+	private static function expected_mb_substr_from_offsets( string $input, array $offsets, int $start, ?int $length ): string {
+		$total            = count( $offsets ) - 1;
+		$normalized_start = $start < 0 ? max( 0, $total + $start ) : $start;
+		$start_index      = min( $normalized_start, $total );
+		$start_offset     = $offsets[ $start_index ];
+
+		if ( null === $length ) {
+			return substr( $input, $start_offset );
+		}
+
+		$normalized_length = $length < 0
+			? max( 0, $total - $normalized_start + $length )
+			: $length;
+		$end_index         = min( $start_index + $normalized_length, $total );
+
+		return substr( $input, $start_offset, $offsets[ $end_index ] - $start_offset );
+	}
+
 	/**
 	 * @return array{0: int|false, 1: int} First code point and byte length.
 	 */
diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php
index 7f5b1e27c821a..80a4cfcfa3800 100644
--- a/tools/encoding-fuzz/lib/Targets.php
+++ b/tools/encoding-fuzz/lib/Targets.php
@@ -19,6 +19,10 @@
  *   ENCODING_FUZZ_FAULT=span-invalid-bytes code point span counts invalid bytes individually
  *   ENCODING_FUZZ_FAULT=span-found-max     code point span over-reports found_code_points
  *   ENCODING_FUZZ_FAULT=span-found-stale   code point span leaves found_code_points stale
+ *   ENCODING_FUZZ_FAULT=substr-byte-level   substr treats UTF-8 offsets as byte offsets
+ *   ENCODING_FUZZ_FAULT=substr-scrub        substr slices scrubbed invalid input
+ *   ENCODING_FUZZ_FAULT=substr-no-neg-len   substr ignores negative lengths
+ *   ENCODING_FUZZ_FAULT=substr-force-utf8   substr ignores non-UTF-8 byte fallback
  */
 class Targets {
 	/**
@@ -38,6 +42,7 @@ public static function resolve(): array {
 			'mb_chr'          => '_mb_chr',
 			'mb_ord'          => '_mb_ord',
 			'codepoint_span'  => '_wp_utf8_codepoint_span',
+			'mb_substr'       => '_mb_substr',
 		);
 
 		switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) {
@@ -88,6 +93,22 @@ public static function resolve(): array {
 			case 'span-found-stale':
 				$targets['codepoint_span'] = self::codepoint_span_stale_empty_found( ... );
 				break;
+
+			case 'substr-byte-level':
+				$targets['mb_substr'] = self::mb_substr_byte_level( ... );
+				break;
+
+			case 'substr-scrub':
+				$targets['mb_substr'] = self::mb_substr_scrub_invalid( ... );
+				break;
+
+			case 'substr-no-neg-len':
+				$targets['mb_substr'] = self::mb_substr_no_negative_length( ... );
+				break;
+
+			case 'substr-force-utf8':
+				$targets['mb_substr'] = self::mb_substr_force_utf8( ... );
+				break;
 		}
 
 		return $targets;
@@ -198,4 +219,38 @@ public static function codepoint_span_stale_empty_found( string $text, int $byte
 
 		return $span;
 	}
+
+	/**
+	 * Deliberately broken substring: treats character offsets as byte offsets.
+	 */
+	public static function mb_substr_byte_level( $str, $start, $length = null, $encoding = null ) {
+		return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
+	}
+
+	/**
+	 * Deliberately broken substring: slices scrubbed UTF-8, masking that
+	 * `_mb_substr()` is expected to preserve original invalid bytes.
+	 */
+	public static function mb_substr_scrub_invalid( $str, $start, $length = null, $encoding = null ) {
+		if ( _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) ) {
+			$str = wp_scrub_utf8( $str );
+		}
+
+		return _mb_substr( $str, $start, $length, $encoding );
+	}
+
+	/**
+	 * Deliberately broken substring: handles negative lengths as "to the end".
+	 */
+	public static function mb_substr_no_negative_length( $str, $start, $length = null, $encoding = null ) {
+		return _mb_substr( $str, $start, is_int( $length ) && $length < 0 ? null : $length, $encoding );
+	}
+
+	/**
+	 * Deliberately broken substring: runs the UTF-8 path even for explicit
+	 * non-UTF-8 encodings, instead of falling back to byte-level `substr()`.
+	 */
+	public static function mb_substr_force_utf8( $str, $start, $length = null, $encoding = null ) {
+		return _mb_substr( $str, $start, $length, _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) ? $encoding : 'UTF-8' );
+	}
 }
diff --git a/tools/encoding-fuzz/lib/wp-stubs.php b/tools/encoding-fuzz/lib/wp-stubs.php
index ffe4cbc64a191..f86bd4b367332 100644
--- a/tools/encoding-fuzz/lib/wp-stubs.php
+++ b/tools/encoding-fuzz/lib/wp-stubs.php
@@ -14,3 +14,9 @@ function _wp_can_use_pcre_u( $set = null ): bool {
 		return (bool) $utf8_pcre;
 	}
 }
+
+if ( ! function_exists( 'get_option' ) ) {
+	function get_option( $option, $default_value = false ) {
+		return 'blog_charset' === $option ? 'UTF-8' : $default_value;
+	}
+}
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
index 883f88497e19a..4977d1b5679e1 100644
--- a/tools/encoding-fuzz/tests/harness-smoke.php
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -107,6 +107,7 @@ function check( string $label, bool $ok, string $detail = '' ): void {
 	'mb_chr'          => '_mb_chr',
 	'mb_ord'          => '_mb_ord',
 	'codepoint_span'  => '_wp_utf8_codepoint_span',
+	'mb_substr'       => '_mb_substr',
 );
 
 /**
@@ -283,6 +284,30 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 ) );
 check( 'catches stale empty code point span found count', in_array( 'codepoint-span-found-mismatch', $seen, true ), implode( ',', $seen ) );
 
+// 3x. UTF-8 substring that treats character offsets as byte offsets.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'mb_substr' => Targets::mb_substr_byte_level( ... ),
+) );
+check( 'catches byte-offset _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3y. UTF-8 substring that slices scrubbed text, losing original invalid bytes.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'mb_substr' => Targets::mb_substr_scrub_invalid( ... ),
+) );
+check( 'catches scrubbed-input _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3z. UTF-8 substring that ignores negative length semantics.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'mb_substr' => Targets::mb_substr_no_negative_length( ... ),
+) );
+check( 'catches negative-length _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3aa. Non-UTF-8 substring must fall back to byte-level substr().
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'mb_substr' => Targets::mb_substr_force_utf8( ... ),
+) );
+check( 'catches non-UTF-8 _mb_substr fallback drift', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) );
+
 // ---------------------------------------------------------------------
 // 4. Generator determinism and mix.
 // ---------------------------------------------------------------------

From a0f6820eb1adce18e5a7df56988b119881f44911 Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:32:40 +0200
Subject: [PATCH 09/14] Add bounded codepoint count fuzzing

---
 progress-handoff-xZOoEn.md                  |  22 ++
 tools/encoding-fuzz/README.md               |  14 +-
 tools/encoding-fuzz/lib/Checks.php          | 226 +++++++++++++++++---
 tools/encoding-fuzz/lib/Targets.php         |  62 ++++++
 tools/encoding-fuzz/tests/harness-smoke.php |  64 ++++--
 5 files changed, 334 insertions(+), 54 deletions(-)

diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md
index 4d5fe5e56511b..aaf355609eada 100644
--- a/progress-handoff-xZOoEn.md
+++ b/progress-handoff-xZOoEn.md
@@ -55,3 +55,25 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn
   - Reviewer 2: satisfied after checking mutation adequacy and faulted worker/replay/minimize behavior.
   - Reviewer 3: satisfied after checking bootstrap/stub wiring, edge coverage, performance, and docs/progress accuracy.
 - Commit: this step commit.
+
+### Step 3: bounded `_wp_utf8_codepoint_count()` coverage
+
+- Status: done; included in the step 3 commit.
+- Prior step commit: `1f875a1f21`.
+- Scope:
+  - Add bounded `_wp_utf8_codepoint_count()` probes for negative offsets, zero lengths, oversized lengths, nonzero byte offsets, and ranges ending before/at/after code point boundaries.
+  - Pin current byte-window semantics: a range ending inside a valid multibyte character or invalid maximal subpart counts the truncated prefix as one invalid subpart.
+  - Add mutation tests for invalid-byte counting, range-end off-by-one behavior, and ignored byte offsets.
+- Verification:
+  - `php -l tools/encoding-fuzz/lib/Checks.php`
+  - `php -l tools/encoding-fuzz/lib/Targets.php`
+  - `php -l tools/encoding-fuzz/lib/Bootstrap.php`
+  - `php -l tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none`
+  - `git diff --check`
+- Review gate: satisfied by 3 adversarial reviewers.
+  - Reviewer 1: satisfied after checking the bounded-window model, negative offsets, truncation semantics, and reference independence.
+  - Reviewer 2: satisfied after checking the new mutation modes through worker/replay/minimize.
+  - Reviewer 3: satisfied after checking probe coverage, performance, docs/progress accuracy, and the smoke comment cleanup.
+- Commit: this step commit.
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index 9f65ee966eaa9..606a50ac70152 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -108,8 +108,10 @@ Internal invariants:
 - valid ⟺ scrub returns the input unchanged
 - scrub output is always valid UTF-8
 - scrub is idempotent
-- `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed text
-  (each maximal subpart counts as one code point)
+- `_wp_utf8_codepoint_count()` equals an independent maximal-subpart count
+  for whole strings and bounded byte windows; a byte window ending inside a
+  multibyte character or invalid maximal subpart counts its truncated prefix
+  as one invalid subpart
 - `_wp_utf8_codepoint_span()` reports the original byte span occupied by a
   requested number of code points; on scrubbed valid text it matches
   `strlen( mb_substr( ... ) )`, and on arbitrary input an independent
@@ -206,11 +208,13 @@ php tools/encoding-fuzz/tests/harness-smoke.php
 ```
 
 Verifies the oracle battery, runs the real targets over the battery
-vectors, and — most importantly — mutation-tests the harness: twenty-seven
+vectors, and — most importantly — mutation-tests the harness: thirty
 classes of deliberately broken implementations (validator accepting
 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber,
 identity scrubber, byte-dropping scrubber, off-by-one code point count,
-throwing target, cp1252-confused encoder, identity encoder, per-byte
+invalid-byte-counting code point count, range-end off-by-one code point
+count, byte-offset-ignoring code point count, throwing target,
+cp1252-confused encoder, identity encoder, per-byte
 decoder, valid-input-mangling decoder, round-trip-violating decoder,
 null-returning encoder, sometimes-null decoder, blind noncharacter
 detector, U+FDD0-block-missing detector, over-eager noncharacter
@@ -224,7 +228,7 @@ valid/invalid input mix, and the documented
 `wp_has_noncharacters()` divergence stance on ill-formed input.
 
 For end-to-end pipeline testing while the real implementations are
-healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8`
+healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8|count-invalid-bytes|count-range-minus1|count-ignore-offset`
 injects a broken target into worker, replay, and minimize alike.
 Fault-injected artifacts record the fault name in their environment
 metadata so they cannot be mistaken for real findings. Replaying or
diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php
index 387f871f8d796..72a9a0995f52c 100644
--- a/tools/encoding-fuzz/lib/Checks.php
+++ b/tools/encoding-fuzz/lib/Checks.php
@@ -14,8 +14,8 @@
  *  - valid ⟺ scrub returns the input unchanged
  *  - scrub output is always valid UTF-8
  *  - scrub is idempotent
- *  - `_wp_utf8_codepoint_count()` equals `mb_strlen()` of the scrubbed
- *    text (each maximal subpart counts as one code point)
+ *  - `_wp_utf8_codepoint_count()` equals the independent maximal-subpart
+ *    count for whole strings and bounded byte windows
  *  - `_wp_utf8_codepoint_span()` reports the original byte span for a
  *    requested number of code points, with invalid maximal subparts
  *    counted as one code point and `found_code_points` reporting the
@@ -263,30 +263,9 @@ public function run( string $input ): array {
 			}
 		}
 
-		// 6. Code point count agrees with the scrubbed length.
-		try {
-			$count    = ( $this->targets['codepoint_count'] )( $input );
-			$expected = mb_strlen( $ref_scrub, 'UTF-8' );
-			if ( $count !== $expected ) {
-				$failures[] = self::failure(
-					'codepoint-count-mismatch',
-					'codepoint_count',
-					array(
-						'got'      => $count,
-						'expected' => $expected,
-					)
-				);
-			}
-		} catch ( \Throwable $error ) {
-			$failures[] = self::failure(
-				'target-exception',
-				'codepoint_count',
-				array(
-					'target'  => 'codepoint_count',
-					'message' => $error->getMessage(),
-					'class'   => get_class( $error ),
-				)
-			);
+		// 6. Code point count agrees with whole-string and bounded-window references.
+		foreach ( $this->check_codepoint_count( $input, $ref_scrub ) as $failure ) {
+			$failures[] = $failure;
 		}
 
 		// 7. Code point span agrees with valid-text and maximal-subpart references.
@@ -323,6 +302,114 @@ public function run( string $input ): array {
 		return $failures;
 	}
 
+	/**
+	 * Tests `_wp_utf8_codepoint_count()` over whole strings and bounded byte
+	 * windows. Byte windows are counted as standalone strings: if a window
+	 * ends inside a valid multibyte character or invalid maximal subpart,
+	 * the truncated prefix counts as one invalid maximal subpart.
+	 *
+	 * @return array<int, array{check: string, signature: string, detail: array}>
+	 */
+	private function check_codepoint_count( string $input, string $ref_scrub ): array {
+		if ( ! isset( $this->targets['codepoint_count'] ) ) {
+			return array();
+		}
+
+		list( $offsets, $reference_scrub ) = self::reference_utf8_offsets_and_scrub( $input );
+		if ( $reference_scrub !== $ref_scrub ) {
+			return array(
+				self::failure(
+					'count-reference-disagreement',
+					'maximal-subpart-reference',
+					self::diff_detail( 'maximal-subpart-reference', $ref_scrub, $reference_scrub )
+				),
+			);
+		}
+
+		$failures = array();
+		$whole    = $this->assert_codepoint_count(
+			$input,
+			0,
+			null,
+			count( $offsets ) - 1,
+			'whole-string'
+		);
+
+		if ( null !== $whole ) {
+			$failures[] = $whole;
+		}
+
+		foreach ( self::codepoint_count_probes( $offsets, strlen( $input ), $input ) as $probe ) {
+			list( $byte_offset, $max_byte_length ) = $probe;
+			$expected                             = self::expected_codepoint_count_window( $input, $byte_offset, $max_byte_length );
+
+			$failure = $this->assert_codepoint_count(
+				$input,
+				$byte_offset,
+				$max_byte_length,
+				$expected,
+				'bounded-window'
+			);
+
+			if ( null !== $failure ) {
+				$failures[] = $failure;
+			}
+		}
+
+		return $failures;
+	}
+
+	private function assert_codepoint_count( string $input, int $byte_offset, ?int $max_byte_length, int $expected, string $property ): ?array {
+		try {
+			$actual = null === $max_byte_length
+				? ( $this->targets['codepoint_count'] )( $input )
+				: ( $this->targets['codepoint_count'] )( $input, $byte_offset, $max_byte_length );
+		} catch ( \Throwable $error ) {
+			return self::failure(
+				'target-exception',
+				'codepoint_count',
+				array(
+					'target'          => 'codepoint_count',
+					'property'        => $property,
+					'byte_offset'     => $byte_offset,
+					'max_byte_length' => $max_byte_length,
+					'message'         => $error->getMessage(),
+					'class'           => get_class( $error ),
+				)
+			);
+		}
+
+		if ( ! is_int( $actual ) ) {
+			return self::failure(
+				'codepoint-count-bad-return',
+				'codepoint_count',
+				array(
+					'property'        => $property,
+					'byte_offset'     => $byte_offset,
+					'max_byte_length' => $max_byte_length,
+					'type'            => get_debug_type( $actual ),
+				)
+			);
+		}
+
+		if ( $actual !== $expected ) {
+			return self::failure(
+				'codepoint-count-mismatch',
+				'codepoint_count',
+				array(
+					'property'        => $property,
+					'byte_offset'     => $byte_offset,
+					'max_byte_length' => $max_byte_length,
+					'got'             => $actual,
+					'expected'        => $expected,
+					'input_preview'   => self::preview( $input, max( 0, $byte_offset ) ),
+				)
+			);
+		}
+
+		return null;
+	}
+
 	/**
 	 * Tests `_wp_utf8_codepoint_span()` from known boundaries only.
 	 *
@@ -1388,6 +1475,93 @@ private static function expected_mb_substr_from_offsets( string $input, array $o
 		return substr( $input, $start_offset, $offsets[ $end_index ] - $start_offset );
 	}
 
+	/**
+	 * @param int[] $offsets Boundary offsets from `reference_utf8_offsets_and_scrub()`.
+	 * @return array<int, array{0: int, 1: int}> Byte offset and max byte length probes.
+	 */
+	private static function codepoint_count_probes( array $offsets, int $byte_length, string $salt ): array {
+		$segment_count = count( $offsets ) - 1;
+		$probes        = array(
+			array( -1, 0 ),
+			array( -1, 1 ),
+			array( -5, 10 ),
+			array( 0, -1 ),
+			array( 0, 0 ),
+			array( 0, $byte_length ),
+			array( 0, $byte_length + 1 ),
+			array( $byte_length, 0 ),
+			array( $byte_length, 1 ),
+			array( $byte_length + 1, 1 ),
+		);
+
+		foreach ( self::span_probe_indices( $segment_count, $salt . ':boundaries' ) as $segment_index ) {
+			$byte_offset = $offsets[ $segment_index ];
+			$remaining   = max( 0, $byte_length - $byte_offset );
+			$lengths     = array(
+				0,
+				1,
+				2,
+				3,
+				min( 7, $remaining ),
+				$remaining,
+				$remaining + 1,
+			);
+
+			if ( $segment_index < $segment_count ) {
+				$next_segment_length = $offsets[ $segment_index + 1 ] - $byte_offset;
+				$lengths[]           = max( 0, $next_segment_length - 1 );
+				$lengths[]           = $next_segment_length;
+				$lengths[]           = $next_segment_length + 1;
+			}
+
+			if ( $segment_index + 2 <= $segment_count ) {
+				$two_segment_length = $offsets[ $segment_index + 2 ] - $byte_offset;
+				$lengths[]          = max( 0, $two_segment_length - 1 );
+				$lengths[]          = $two_segment_length;
+			}
+
+			foreach ( $lengths as $length ) {
+				$probes[] = array( $byte_offset, $length );
+			}
+		}
+
+		foreach ( array( 0, 1, 2, intdiv( $byte_length, 2 ), max( 0, $byte_length - 1 ), $byte_length ) as $byte_offset ) {
+			$remaining = max( 0, $byte_length - $byte_offset );
+			foreach ( array( 0, 1, 2, min( 7, $remaining ), $remaining, $remaining + 1 ) as $length ) {
+				$probes[] = array( $byte_offset, $length );
+			}
+		}
+
+		$hash  = hash( 'sha256', $salt . ':count', true );
+		$range = max( 3, $byte_length + 2 );
+		for ( $i = 0; $i < 4; $i++ ) {
+			$byte_offset = ( ord( $hash[ $i ] ) % ( ( 2 * $range ) + 1 ) ) - $range;
+			$length      = ( ord( $hash[ $i + 4 ] ) % ( $byte_length + 8 ) ) - 2;
+			$probes[]    = array( $byte_offset, $length );
+		}
+
+		$unique = array();
+		foreach ( $probes as $probe ) {
+			$unique[ json_encode( $probe ) ] = $probe;
+		}
+
+		return array_values( $unique );
+	}
+
+	private static function expected_codepoint_count_window( string $input, int $byte_offset, int $max_byte_length ): int {
+		if ( $byte_offset < 0 || $max_byte_length < 0 ) {
+			return 0;
+		}
+
+		$window = substr( $input, $byte_offset, $max_byte_length );
+		if ( '' === $window ) {
+			return 0;
+		}
+
+		list( $offsets ) = self::reference_utf8_offsets_and_scrub( $window );
+		return count( $offsets ) - 1;
+	}
+
 	/**
 	 * @return array{0: int|false, 1: int} First code point and byte length.
 	 */
diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php
index 80a4cfcfa3800..8f26dc833a6c0 100644
--- a/tools/encoding-fuzz/lib/Targets.php
+++ b/tools/encoding-fuzz/lib/Targets.php
@@ -23,6 +23,9 @@
  *   ENCODING_FUZZ_FAULT=substr-scrub        substr slices scrubbed invalid input
  *   ENCODING_FUZZ_FAULT=substr-no-neg-len   substr ignores negative lengths
  *   ENCODING_FUZZ_FAULT=substr-force-utf8   substr ignores non-UTF-8 byte fallback
+ *   ENCODING_FUZZ_FAULT=count-invalid-bytes count treats invalid bytes individually
+ *   ENCODING_FUZZ_FAULT=count-range-minus1  count stops one byte early in bounded ranges
+ *   ENCODING_FUZZ_FAULT=count-ignore-offset count ignores the requested byte offset
  */
 class Targets {
 	/**
@@ -109,6 +112,18 @@ public static function resolve(): array {
 			case 'substr-force-utf8':
 				$targets['mb_substr'] = self::mb_substr_force_utf8( ... );
 				break;
+
+			case 'count-invalid-bytes':
+				$targets['codepoint_count'] = self::codepoint_count_invalid_bytes( ... );
+				break;
+
+			case 'count-range-minus1':
+				$targets['codepoint_count'] = self::codepoint_count_range_minus_one( ... );
+				break;
+
+			case 'count-ignore-offset':
+				$targets['codepoint_count'] = self::codepoint_count_ignore_offset( ... );
+				break;
 		}
 
 		return $targets;
@@ -253,4 +268,51 @@ public static function mb_substr_no_negative_length( $str, $start, $length = nul
 	public static function mb_substr_force_utf8( $str, $start, $length = null, $encoding = null ) {
 		return _mb_substr( $str, $start, $length, _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) ? $encoding : 'UTF-8' );
 	}
+
+	/**
+	 * Mutation target for the harness: counts one code point per byte of each
+	 * invalid span reported by `_wp_scan_utf8()` rather than one per span.
+	 */
+	public static function codepoint_count_invalid_bytes( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
+		$start  = $byte_offset ?? 0;
+		$budget = $max_byte_length ?? PHP_INT_MAX;
+
+		if ( $start < 0 || $budget < 0 ) {
+			return 0;
+		}
+
+		$text_end     = strlen( $text );
+		$stop         = $start + min( $text_end - $start, $budget );
+		$cursor       = $start;
+		$invalid_span = 0;
+		$total        = 0;
+
+		while ( $cursor < $text_end && $cursor < $stop ) {
+			$total  += _wp_scan_utf8( $text, $cursor, $invalid_span, $stop - $cursor );
+			$total  += $invalid_span;
+			$cursor += $invalid_span;
+		}
+
+		return $total;
+	}
+
+	/**
+	 * Mutation target for the harness: forwards to `_wp_utf8_codepoint_count()`
+	 * with every positive byte limit reduced by one.
+	 */
+	public static function codepoint_count_range_minus_one( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
+		$requested = $max_byte_length ?? PHP_INT_MAX;
+
+		// Only positive limits are shortened; zero and negative ones pass through.
+		$shortened = $requested > 0 ? $requested - 1 : $requested;
+
+		return _wp_utf8_codepoint_count( $text, $byte_offset, $shortened );
+	}
+
+	/**
+	 * Deliberately broken code point counter: discards `$byte_offset` and always counts from byte offset 0.
+	 */
+	public static function codepoint_count_ignore_offset( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
+		return _wp_utf8_codepoint_count( $text, 0, $max_byte_length );
+	}
 }
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
index 4977d1b5679e1..7e15f51945dab 100644
--- a/tools/encoding-fuzz/tests/harness-smoke.php
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -162,13 +162,31 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 ) );
 check( 'catches byte-dropping scrubber', in_array( 'scrub-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3f. Code point counter that counts invalid bytes individually.
+// 3f. Code point counter with a simple off-by-one drift on invalid input.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
-	'codepoint_count' => static fn( string $bytes ): int => _wp_utf8_codepoint_count( $bytes ) + ( wp_is_valid_utf8( $bytes ) ? 0 : 1 ),
+	'codepoint_count' => static fn( string $bytes, ?int $offset = 0, ?int $length = PHP_INT_MAX ): int => _wp_utf8_codepoint_count( $bytes, $offset, $length ) + ( wp_is_valid_utf8( $bytes ) ? 0 : 1 ),
 ) );
 check( 'catches off-by-one code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3g. Throwing target is reported, not fatal.
+// 3g. Code point counter that counts each byte in invalid maximal subparts.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'codepoint_count' => Targets::codepoint_count_invalid_bytes( ... ),
+) );
+check( 'catches invalid-byte-counting code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3h. Bounded counter that stops one byte early at the range end.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'codepoint_count' => Targets::codepoint_count_range_minus_one( ... ),
+) );
+check( 'catches range-end off-by-one code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3i. Bounded counter that ignores the byte offset.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'codepoint_count' => Targets::codepoint_count_ignore_offset( ... ),
+) );
+check( 'catches byte-offset-ignoring code point count', in_array( 'codepoint-count-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3j. Throwing target is reported, not fatal.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'is_valid_fb' => static function ( string $bytes ): bool {
 		throw new \RuntimeException( 'boom' );
@@ -176,13 +194,13 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 ) );
 check( 'reports throwing target', in_array( 'target-exception', $seen, true ), implode( ',', $seen ) );
 
-// 3h. Encoder that confuses ISO-8859-1 with Windows-1252 (0x80 becomes '€').
+// 3k. Encoder that confuses ISO-8859-1 with Windows-1252 (0x80 becomes '€').
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'utf8_encode_fb' => static fn( string $bytes ): string => str_replace( "\xC2\x80", "\xE2\x82\xAC", _wp_utf8_encode_fallback( $bytes ) ),
 ) );
 check( 'catches cp1252-confused encoder', in_array( 'utf8-encode-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3i. Encoder that passes high bytes through raw (invalid UTF-8 output).
+// 3l. Encoder that passes high bytes through raw (invalid UTF-8 output).
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'utf8_encode_fb' => static fn( string $bytes ): string => $bytes,
 ) );
@@ -192,27 +210,27 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 	implode( ',', $seen )
 );
 
-// 3j. Decoder that emits one '?' per invalid byte instead of per maximal
+// 3m. Decoder that emits one '?' per invalid byte instead of per maximal
 //     subpart (`E2 8C` becomes '??' instead of '?').
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'utf8_decode_fb' => Targets::decode_per_invalid_byte( ... ),
 ) );
 check( 'catches per-byte decoder', in_array( 'utf8-decode-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3k. Decoder that mangles a mappable code point on fully valid input.
+// 3n. Decoder that mangles a mappable code point on fully valid input.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'utf8_decode_fb' => static fn( string $bytes ): string => str_replace( "\xFC", "\xFD", _wp_utf8_decode_fallback( $bytes ) ),
 ) );
 check( 'catches decoder mangling valid input', in_array( 'utf8-decode-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3l. Decoder that drops U+0080 entirely; the encode→decode round trip
+// 3o. Decoder that drops U+0080 entirely; the encode→decode round trip
 //     must restore every input byte string exactly.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'utf8_decode_fb' => static fn( string $bytes ): string => str_replace( "\x80", '', _wp_utf8_decode_fallback( $bytes ) ),
 ) );
 check( 'catches round-trip violation', in_array( 'utf8-round-trip-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3m. Encoder that returns null (the fallbacks are untyped, so a broken
+// 3p. Encoder that returns null (the fallbacks are untyped, so a broken
 //     variant can return non-strings without throwing); must be reported,
 //     not silently skipped by every encode-side check.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
@@ -220,89 +238,89 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 ) );
 check( 'catches null-returning encoder', in_array( 'target-bad-return', $seen, true ), implode( ',', $seen ) );
 
-// 3n. Decoder that returns null only for some inputs; must be reported
+// 3q. Decoder that returns null only for some inputs; must be reported
 //     from both the direct call and the round-trip path without crashing.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'utf8_decode_fb' => static fn( string $bytes ) => str_contains( $bytes, "\x80" ) ? null : _wp_utf8_decode_fallback( $bytes ),
 ) );
 check( 'catches sometimes-null decoder', in_array( 'target-bad-return', $seen, true ), implode( ',', $seen ) );
 
-// 3o. Noncharacter detector that never finds anything.
+// 3r. Noncharacter detector that never finds anything.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'has_nonchars_fb' => static fn( string $text ): bool => false,
 ) );
 check( 'catches blind noncharacter detector', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3p. Detector that misses the contiguous U+FDD0–U+FDEF block (the
+// 3s. Detector that misses the contiguous U+FDD0–U+FDEF block (the
 //     plane-final pairs alone are a plausible spec misreading).
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'has_nonchars_fb' => Targets::nonchars_missing_fdd0_block( ... ),
 ) );
 check( 'catches detector missing U+FDD0 block', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3q. Over-eager detector that flags U+FDCF, just below the block.
+// 3t. Over-eager detector that flags U+FDCF, just below the block.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'has_nonchars' => Targets::nonchars_overeager( ... ),
 ) );
 check( 'catches over-eager noncharacter detector', in_array( 'noncharacters-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3r. Character encoder that confuses U+0080 with Windows-1252's euro sign.
+// 3u. Character encoder that confuses U+0080 with Windows-1252's euro sign.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'mb_chr' => static fn( int $code_point ) => 0x80 === $code_point ? "\xE2\x82\xAC" : _mb_chr( $code_point ),
 ) );
 check( 'catches cp1252-confused _mb_chr', in_array( 'mb-chr-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3s. Character decoder that accepts an invalid leading C0 byte.
+// 3v. Character decoder that accepts an invalid leading C0 byte.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'mb_ord' => static fn( string $bytes ) => str_starts_with( $bytes, "\xC0" ) ? 0 : _mb_ord( $bytes ),
 ) );
 check( 'catches invalid-accepting _mb_ord', in_array( 'mb-ord-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3t. Code point span that reports one extra byte.
+// 3w. Code point span that reports one extra byte.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'codepoint_span' => Targets::codepoint_span_off_by_one( ... ),
 ) );
 check( 'catches off-by-one code point span', in_array( 'codepoint-span-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3u. Code point span that treats invalid maximal subparts as one code
+// 3x. Code point span that treats invalid maximal subparts as one code
 //     point per byte instead of one code point per maximal subpart.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'codepoint_span' => Targets::codepoint_span_counts_invalid_bytes( ... ),
 ) );
 check( 'catches byte-counted invalid code point span', in_array( 'codepoint-span-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3v. Code point span that returns the right byte span but corrupts the
+// 3y. Code point span that returns the right byte span but corrupts the
 //     by-reference found count.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'codepoint_span' => Targets::codepoint_span_found_max( ... ),
 ) );
 check( 'catches wrong code point span found count', in_array( 'codepoint-span-found-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3w. Code point span that leaves found_code_points stale on empty spans.
+// 3z. Code point span that leaves found_code_points stale on empty spans.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'codepoint_span' => Targets::codepoint_span_stale_empty_found( ... ),
 ) );
 check( 'catches stale empty code point span found count', in_array( 'codepoint-span-found-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3x. UTF-8 substring that treats character offsets as byte offsets.
+// 3aa. UTF-8 substring that treats character offsets as byte offsets.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'mb_substr' => Targets::mb_substr_byte_level( ... ),
 ) );
 check( 'catches byte-offset _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3y. UTF-8 substring that slices scrubbed text, losing original invalid bytes.
+// 3ab. UTF-8 substring that slices scrubbed text, losing original invalid bytes.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'mb_substr' => Targets::mb_substr_scrub_invalid( ... ),
 ) );
 check( 'catches scrubbed-input _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3z. UTF-8 substring that ignores negative length semantics.
+// 3ac. UTF-8 substring that ignores negative length semantics.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'mb_substr' => Targets::mb_substr_no_negative_length( ... ),
 ) );
 check( 'catches negative-length _mb_substr', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) );
 
-// 3aa. Non-UTF-8 substring must fall back to byte-level substr().
+// 3ad. Non-UTF-8 substring must fall back to byte-level substr().
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'mb_substr' => Targets::mb_substr_force_utf8( ... ),
 ) );

From 1c208acee0af707cee235690158fae1546366772 Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Thu, 11 Jun 2026 14:50:15 +0200
Subject: [PATCH 10/14] Add bounded UTF-8 scan fuzzing

---
 progress-handoff-xZOoEn.md                  |  22 ++
 tools/encoding-fuzz/README.md               |  13 +-
 tools/encoding-fuzz/lib/Checks.php          | 289 +++++++++++++++++++-
 tools/encoding-fuzz/lib/Targets.php         |  85 ++++++
 tools/encoding-fuzz/tests/harness-smoke.php |  65 +++++
 5 files changed, 466 insertions(+), 8 deletions(-)

diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md
index aaf355609eada..5cb171e6e8a3a 100644
--- a/progress-handoff-xZOoEn.md
+++ b/progress-handoff-xZOoEn.md
@@ -77,3 +77,25 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn
   - Reviewer 2: satisfied after checking the new mutation modes through worker/replay/minimize.
   - Reviewer 3: satisfied after checking probe coverage, performance, docs/progress accuracy, and the smoke comment cleanup.
 - Commit: this step commit.
+
+### Step 4: bounded `_wp_scan_utf8()` properties
+
+- Status: done; included in the step 4 commit.
+- Prior step commit: `a0f6820eb1`.
+- Scope:
+  - Add direct `_wp_scan_utf8()` probes for `max_bytes`, `max_code_points`, negative limits, nonzero boundary starts, invalid spans, by-ref noncharacter flag reset, and scanned-region noncharacter reporting.
+  - Pin current scan semantics: valid multibyte characters that start before the byte limit are scanned whole, while invalid spans are bounded by `max_bytes`.
+  - Add mutation tests for ignored `max_bytes`, noncharacter leakage from outside the scanned region, missed noncharacters inside the scanned region, ASCII fast-path overrun of `max_code_points`, and stale noncharacter flags.
+- Verification:
+  - `php -l tools/encoding-fuzz/lib/Checks.php`
+  - `php -l tools/encoding-fuzz/lib/Targets.php`
+  - `php -l tools/encoding-fuzz/lib/Bootstrap.php`
+  - `php -l tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none`
+  - `git diff --check`
+- Review gate: satisfied by 3 adversarial reviewers.
+  - Reviewer 1: initially found stale `has_noncharacters` and negative-bound gaps; satisfied after adding stale-true probes and negative limit probes.
+  - Reviewer 2: initially found missing false-negative noncharacter mutation coverage; satisfied after adding `scan-miss-nonchars` and selector wiring checks.
+  - Reviewer 3: satisfied after checking probe volume, performance, README mutation count/list, fault list, and progress ordering.
+- Commit: this step commit.
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index 606a50ac70152..a8ec642a5258f 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -118,6 +118,10 @@ Internal invariants:
   maximal-subpart parser checks that invalid subparts count as one code
   point. Nonzero starts are probed only at known code point or
   maximal-subpart boundaries.
+- bounded `_wp_scan_utf8()` calls agree with an independent scan model for
+  `max_bytes`, `max_code_points`, negative limits, nonzero boundary starts,
+  invalid spans, forward progress, by-ref noncharacter flag reset, and
+  scanned-region noncharacter reporting
 - scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points`
   chunks reconstructs the same scrubbed text and always makes forward
   progress (chunk sizes derive from the input hash, so replays are exact)
@@ -208,7 +212,7 @@ php tools/encoding-fuzz/tests/harness-smoke.php
 ```
 
 Verifies the oracle battery, runs the real targets over the battery
-vectors, and — most importantly — mutation-tests the harness: thirty
+vectors, and — most importantly — mutation-tests the harness: thirty-five
 classes of deliberately broken implementations (validator accepting
 0xC0, validator rejecting noncharacters, non-maximal-subpart scrubber,
 identity scrubber, byte-dropping scrubber, off-by-one code point count,
@@ -222,13 +226,16 @@ detector, cp1252-confused `_mb_chr()`, invalid-accepting `_mb_ord()`,
 off-by-one code point span, invalid-subpart byte-counted span, and
 wrong or stale `found_code_points` span, byte-offset `_mb_substr()`,
 scrubbed-input `_mb_substr()`, negative-length `_mb_substr()`, and
-non-UTF-8 fallback drift)
+non-UTF-8 fallback drift, max-bytes-ignoring `_wp_scan_utf8()`,
+noncharacter-leaking `_wp_scan_utf8()`, noncharacter-missing
+`_wp_scan_utf8()`, ASCII-overrunning `_wp_scan_utf8()`, and
+stale-noncharacter-flag `_wp_scan_utf8()`)
 must all be caught. It also asserts generator determinism, the
 valid/invalid input mix, and the documented
 `wp_has_noncharacters()` divergence stance on ill-formed input.
 
 For end-to-end pipeline testing while the real implementations are
-healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8|count-invalid-bytes|count-range-minus1|count-ignore-offset`
+healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8|count-invalid-bytes|count-range-minus1|count-ignore-offset|scan-ignore-bytes|scan-nonchars-leak|scan-miss-nonchars|scan-ascii-overrun|scan-stale-nonchars`
 injects a broken target into worker, replay, and minimize alike.
 Fault-injected artifacts record the fault name in their environment
 metadata so they cannot be mistaken for real findings. Replaying or
diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php
index 72a9a0995f52c..2a48994979cb1 100644
--- a/tools/encoding-fuzz/lib/Checks.php
+++ b/tools/encoding-fuzz/lib/Checks.php
@@ -20,6 +20,10 @@
  *    requested number of code points, with invalid maximal subparts
  *    counted as one code point and `found_code_points` reporting the
  *    available/requested count
+ *  - bounded `_wp_scan_utf8()` calls agree with an independent scan model
+ *    for `max_bytes`, `max_code_points`, negative limits, nonzero boundary
+ *    starts, invalid spans, by-ref noncharacter flag reset, and
+ *    scanned-region noncharacter reporting
  *  - scanning with `_wp_scan_utf8()` in pseudo-random `max_code_points`
  *    chunks reconstructs the same scrubbed text and always makes
  *    forward progress
@@ -273,28 +277,33 @@ public function run( string $input ): array {
 			$failures[] = $failure;
 		}
 
-		// 8. Chunked scan reconstruction.
+		// 8. Direct bounded scan properties.
+		foreach ( $this->check_bounded_scan( $input, $ref_scrub ) as $failure ) {
+			$failures[] = $failure;
+		}
+
+		// 9. Chunked scan reconstruction.
 		$chunk_failure = $this->check_chunked_scan( $input, $ref_scrub );
 		if ( null !== $chunk_failure ) {
 			$failures[] = $chunk_failure;
 		}
 
-		// 9. Legacy utf8_encode()/utf8_decode() fallback differentials.
+		// 10. Legacy utf8_encode()/utf8_decode() fallback differentials.
 		foreach ( $this->check_utf8_encode_decode( $input, $ref_valid, $mb_validity ) as $failure ) {
 			$failures[] = $failure;
 		}
 
-		// 10. Noncharacter detection, on valid input only.
+		// 11. Noncharacter detection, on valid input only.
 		foreach ( $this->check_noncharacters( $input, $ref_valid ) as $failure ) {
 			$failures[] = $failure;
 		}
 
-		// 11. _mb_substr() UTF-8 and byte-fallback properties.
+		// 12. _mb_substr() UTF-8 and byte-fallback properties.
 		foreach ( $this->check_mb_substr( $input, $ref_valid, $ref_scrub ) as $failure ) {
 			$failures[] = $failure;
 		}
 
-		// 12. mb_chr()/mb_ord() polyfill differentials and isomorphisms.
+		// 13. mb_chr()/mb_ord() polyfill differentials and isomorphisms.
 		foreach ( $this->check_mb_chr_ord( $input ) as $failure ) {
 			$failures[] = $failure;
 		}
@@ -410,6 +419,151 @@ private function assert_codepoint_count( string $input, int $byte_offset, ?int $
 		return null;
 	}
 
+	/**
+	 * Probes the optional `_wp_scan_utf8()` limits directly and compares each
+	 * call against `expected_scan_utf8()`. Probes start at known code
+	 * point/maximal-subpart boundaries; starts inside continuation bytes or
+	 * inside invalid maximal subparts remain undefined for this property.
+	 *
+	 *
+	 * Skipped entirely when no `scan_utf8` target is registered. If the
+	 * reference parser's scrub output differs from `$ref_scrub`, a single
+	 * `scan-reference-disagreement` failure is returned and no probes run.
+	 *
+	 * @return array<int, array{check: string, signature: string, detail: array}>
+	 */
+	private function check_bounded_scan( string $input, string $ref_scrub ): array {
+		if ( ! isset( $this->targets['scan_utf8'] ) ) {
+			return array();
+		}
+
+		list( $boundaries, $parser_scrub ) = self::reference_utf8_offsets_and_scrub( $input );
+
+		// A parser that disagrees with the reference scrub cannot anchor probes.
+		if ( $parser_scrub !== $ref_scrub ) {
+			$detail = self::diff_detail( 'maximal-subpart-reference', $ref_scrub, $parser_scrub );
+
+			return array(
+				self::failure( 'scan-reference-disagreement', 'maximal-subpart-reference', $detail ),
+			);
+		}
+
+		$mismatches = array();
+		$probes     = self::scan_utf8_probes( $boundaries, strlen( $input ), $input );
+
+		foreach ( $probes as list( $from, $byte_cap, $point_cap ) ) {
+			$model   = self::expected_scan_utf8( $input, $from, $byte_cap, $point_cap );
+			$outcome = $this->assert_scan_utf8( $input, $from, $byte_cap, $point_cap, $model );
+
+			if ( null !== $outcome ) {
+				$mismatches[] = $outcome;
+			}
+		}
+
+		return $mismatches;
+	}
+
+	/**
+	 * Checks one probe against the model, first with the by-ref noncharacter
+	 * flag unset and then, when no noncharacter is expected, with the flag
+	 * pre-set to true so a target that leaves a stale flag behind is caught.
+	 *
+	 * @param array{count: int, at: int, invalid_length: int, has_noncharacters: bool} $expected
+	 * @return array|null The first failure found, or null when every run matches.
+	 */
+	private function assert_scan_utf8( string $input, int $start, ?int $max_bytes, ?int $max_code_points, array $expected ): ?array {
+		// First run: leave the by-ref flag unset.
+		$failure = $this->assert_scan_utf8_with_initial_has( $input, $start, $max_bytes, $max_code_points, null, $expected );
+
+		// A stale true flag is only observable when the expected flag is false.
+		if ( null !== $failure || $expected['has_noncharacters'] ) {
+			return $failure;
+		}
+
+		return $this->assert_scan_utf8_with_initial_has(
+			$input,
+			$start,
+			$max_bytes,
+			$max_code_points,
+			true,
+			$expected
+		);
+	}
+
+	/**
+	 * @param array{count: int, at: int, invalid_length: int, has_noncharacters: bool} $expected
+	 */
+	private function assert_scan_utf8_with_initial_has( string $input, int $start, ?int $max_bytes, ?int $max_code_points, ?bool $initial_has, array $expected ): ?array {
+		$at                = $start; // Handed to the target, then compared against the model's end position.
+		$invalid_length    = -1; // The model is not expected to produce -1, so an untouched value surfaces as a mismatch.
+		$has_noncharacters = $initial_has; // Seed for the flag; null leaves it unset.
+
+		try {
+			$count = ( $this->targets['scan_utf8'] )( $input, $at, $invalid_length, $max_bytes, $max_code_points, $has_noncharacters );
+		} catch ( \Throwable $error ) {
+			return self::failure(
+				'target-exception',
+				'scan_utf8',
+				array(
+					'target'          => 'scan_utf8',
+					'start'           => $start,
+					'max_bytes'       => $max_bytes,
+					'max_code_points' => $max_code_points,
+					'initial_has'     => $initial_has,
+					'message'         => $error->getMessage(),
+					'class'           => get_class( $error ),
+				)
+			);
+		}
+
+		if (
+			! is_int( $count ) ||
+			! is_int( $at ) ||
+			! is_int( $invalid_length ) ||
+			( ! is_bool( $has_noncharacters ) && ! in_array( $has_noncharacters, array( 0, 1 ), true ) ) // Int 0/1 is tolerated and cast to bool below.
+		) {
+			return self::failure(
+				'scan-utf8-bad-return',
+				'scan_utf8',
+				array(
+					'start'                  => $start,
+					'max_bytes'              => $max_bytes,
+					'max_code_points'        => $max_code_points,
+					'initial_has'            => $initial_has,
+					'count_type'             => get_debug_type( $count ),
+					'at_type'                => get_debug_type( $at ),
+					'invalid_length_type'    => get_debug_type( $invalid_length ),
+					'has_noncharacters_type' => get_debug_type( $has_noncharacters ),
+				)
+			);
+		}
+
+		$actual = array(
+			'count'             => $count,
+			'at'                => $at,
+			'invalid_length'    => $invalid_length,
+			'has_noncharacters' => (bool) $has_noncharacters,
+		);
+
+		if ( $actual !== $expected ) { // Strict array comparison: keys, key order, values, and types must all match.
+			return self::failure(
+				'scan-utf8-mismatch',
+				'scan_utf8',
+				array(
+					'start'           => $start,
+					'max_bytes'       => $max_bytes,
+					'max_code_points' => $max_code_points,
+					'initial_has'     => $initial_has,
+					'got'             => $actual,
+					'expected'        => $expected,
+					'input_preview'   => self::preview( $input, $start ),
+				)
+			);
+		}
+
+		return null;
+	}
+
 	/**
 	 * Tests `_wp_utf8_codepoint_span()` from known boundaries only.
 	 *
@@ -1562,6 +1716,131 @@ private static function expected_codepoint_count_window( string $input, int $byt
 		return count( $offsets ) - 1;
 	}
 
+	/**
+	 * Builds the deduplicated `(start, max_bytes, max_code_points)` probe list
+	 * for one input: fixed edge cases, per-boundary limit sweeps, and four
+	 * probes derived from a SHA-256 hash of the salt (deterministic per salt).
+	 *
+	 * @param int[] $offsets Boundary offsets from `reference_utf8_offsets_and_scrub()`.
+	 * @return array<int, array{0: int, 1: int|null, 2: int|null}> Start, max bytes, max code points.
+	 */
+	private static function scan_utf8_probes( array $offsets, int $byte_length, string $salt ): array {
+		$last_index = count( $offsets ) - 1;
+		$probes     = array(
+			array( 0, null, null ),
+			array( 0, 0, null ),
+			array( 0, null, 0 ),
+			array( $byte_length, null, null ),
+			array( $byte_length, 1, 1 ),
+		);
+
+		foreach ( self::span_probe_indices( $last_index, $salt . ':scan' ) as $index ) {
+			$from       = $offsets[ $index ];
+			$bytes_left = max( 0, $byte_length - $from );
+			$units_left = $last_index - $index;
+
+			$byte_caps = array( null, -1, 0, 1, min( 7, $bytes_left ), $bytes_left, $bytes_left + 1 );
+			if ( $index < $last_index ) {
+				$one_unit = $offsets[ $index + 1 ] - $from;
+				array_push( $byte_caps, max( 0, $one_unit - 1 ), $one_unit, $one_unit + 1 );
+			}
+			if ( $index + 2 <= $last_index ) {
+				$two_units = $offsets[ $index + 2 ] - $from;
+				array_push( $byte_caps, max( 0, $two_units - 1 ), $two_units );
+			}
+
+			foreach ( array_values( array_unique( $byte_caps ) ) as $byte_cap ) {
+				$probes[] = array( $from, $byte_cap, null );
+			}
+
+			$point_caps = array( null, -1, 0, 1, 2, min( 7, $units_left ), $units_left, $units_left + 1 );
+			foreach ( array_values( array_unique( $point_caps ) ) as $point_cap ) {
+				$probes[] = array( $from, null, $point_cap );
+			}
+
+			foreach ( array( -1, 0, 1, min( 7, $bytes_left ), $bytes_left ) as $byte_cap ) {
+				foreach ( array( -1, 0, 1, min( 3, $units_left ) ) as $point_cap ) {
+					$probes[] = array( $from, $byte_cap, $point_cap );
+				}
+			}
+		}
+
+		// Pseudo-random probes: hash bytes 0-3 pick the start, 4-7 and 8-11 the limits.
+		$digest = hash( 'sha256', $salt . ':scan-random', true );
+		foreach ( range( 0, 3 ) as $round ) {
+			$index      = ord( $digest[ $round ] ) % ( $last_index + 1 );
+			$from       = $offsets[ $index ];
+			$bytes_left = max( 0, $byte_length - $from );
+			$units_left = $last_index - $index;
+			$probes[]   = array( $from, ord( $digest[ $round + 4 ] ) % ( $bytes_left + 2 ), ord( $digest[ $round + 8 ] ) % ( $units_left + 2 ) );
+		}
+
+		// Keying by the JSON form drops duplicates while keeping first-seen order.
+		$deduped = array();
+		foreach ( $probes as $probe ) {
+			$deduped[ json_encode( $probe ) ] = $probe;
+		}
+
+		return array_values( $deduped );
+	}
+
+	/**
+	 * Independent model of one bounded `_wp_scan_utf8()` call, built on the
+	 * maximal-subpart reference parser rather than on the target.
+	 *
+	 * The scan window ends at `$start + $max_bytes`, clamped to the input
+	 * length; null limits mean "unbounded". A valid character that begins
+	 * inside the window is consumed whole even if it extends past the window
+	 * end, whereas a reported invalid span is clipped to the window. The walk
+	 * stops at the window end, at the code point limit, or at the first
+	 * invalid span, whichever comes first. Negative limits yield an empty
+	 * result positioned at `$start`.
+	 *
+	 * @return array{count: int, at: int, invalid_length: int, has_noncharacters: bool}
+	 */
+	private static function expected_scan_utf8( string $input, int $start, ?int $max_bytes, ?int $max_code_points ): array {
+		$window_end    = min( strlen( $input ), $start + ( $max_bytes ?? PHP_INT_MAX ) );
+		$point_budget  = $max_code_points ?? PHP_INT_MAX;
+		$cursor        = $start;
+		$scanned       = 0;
+		$invalid_bytes = 0;
+		$saw_nonchar   = false;
+
+		// Walk whole segments until a limit or an invalid span ends the scan.
+		while ( $cursor < $window_end && $scanned < $point_budget ) {
+			list( $unit_length, $is_valid ) = self::reference_utf8_segment( $input, $cursor );
+
+			if ( ! $is_valid ) {
+				// Invalid spans never extend beyond the byte window.
+				$invalid_bytes = min( $unit_length, $window_end - $cursor );
+				break;
+			}
+
+			list( $code_point ) = self::first_code_point_or_false( substr( $input, $cursor, $unit_length ) );
+			if ( is_int( $code_point ) && self::is_noncharacter_code_point( $code_point ) ) {
+				$saw_nonchar = true;
+			}
+
+			++$scanned;
+			$cursor += $unit_length;
+		}
+
+		// Key order matters: the result is compared against the target's with `!==`.
+		return array(
+			'count'             => $scanned,
+			'at'                => $cursor,
+			'invalid_length'    => $invalid_bytes,
+			'has_noncharacters' => $saw_nonchar,
+		);
+	}
+
+	private static function is_noncharacter_code_point( int $code_point ): bool {
+		// U+FDD0..U+FDEF, or a code point whose low 16 bits are FFFE or FFFF.
+		$in_fdd0_block  = $code_point >= 0xFDD0 && $code_point <= 0xFDEF;
+		$is_plane_final = 0xFFFE === ( $code_point & 0xFFFE );
+		return $in_fdd0_block || $is_plane_final;
+	}
+
 	/**
 	 * @return array{0: int|false, 1: int} First code point and byte length.
 	 */
diff --git a/tools/encoding-fuzz/lib/Targets.php b/tools/encoding-fuzz/lib/Targets.php
index 8f26dc833a6c0..67bb56c08e078 100644
--- a/tools/encoding-fuzz/lib/Targets.php
+++ b/tools/encoding-fuzz/lib/Targets.php
@@ -26,6 +26,11 @@
  *   ENCODING_FUZZ_FAULT=count-invalid-bytes count treats invalid bytes individually
  *   ENCODING_FUZZ_FAULT=count-range-minus1  count stops one byte early in bounded ranges
  *   ENCODING_FUZZ_FAULT=count-ignore-offset count ignores the requested byte offset
+ *   ENCODING_FUZZ_FAULT=scan-ignore-bytes   scan ignores max_bytes
+ *   ENCODING_FUZZ_FAULT=scan-nonchars-leak  scan reports noncharacters outside scanned region
+ *   ENCODING_FUZZ_FAULT=scan-miss-nonchars  scan misses noncharacters inside scanned region
+ *   ENCODING_FUZZ_FAULT=scan-ascii-overrun  scan ASCII fast path overruns max_code_points
+ *   ENCODING_FUZZ_FAULT=scan-stale-nonchars scan leaves a stale noncharacter flag
  */
 class Targets {
 	/**
@@ -46,6 +51,7 @@ public static function resolve(): array {
 			'mb_ord'          => '_mb_ord',
 			'codepoint_span'  => '_wp_utf8_codepoint_span',
 			'mb_substr'       => '_mb_substr',
+			'scan_utf8'       => '_wp_scan_utf8',
 		);
 
 		switch ( getenv( 'ENCODING_FUZZ_FAULT' ) ) {
@@ -124,6 +130,26 @@ public static function resolve(): array {
 			case 'count-ignore-offset':
 				$targets['codepoint_count'] = self::codepoint_count_ignore_offset( ... );
 				break;
+
+			case 'scan-ignore-bytes':
+				$targets['scan_utf8'] = self::scan_utf8_ignore_max_bytes( ... );
+				break;
+
+			case 'scan-nonchars-leak':
+				$targets['scan_utf8'] = self::scan_utf8_noncharacters_leak( ... );
+				break;
+
+			case 'scan-miss-nonchars':
+				$targets['scan_utf8'] = self::scan_utf8_miss_noncharacters( ... );
+				break;
+
+			case 'scan-ascii-overrun':
+				$targets['scan_utf8'] = self::scan_utf8_ascii_overrun( ... );
+				break;
+
+			case 'scan-stale-nonchars':
+				$targets['scan_utf8'] = self::scan_utf8_stale_noncharacters( ... );
+				break;
 		}
 
 		return $targets;
@@ -315,4 +341,63 @@ public static function codepoint_count_range_minus_one( string $text, ?int $byte
 	public static function codepoint_count_ignore_offset( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
 		return _wp_utf8_codepoint_count( $text, 0, $max_byte_length );
 	}
+
+	/**
+	 * Deliberately broken scan: ignores the byte limit.
+	 */
+	public static function scan_utf8_ignore_max_bytes( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
+		return _wp_scan_utf8( $bytes, $at, $invalid_length, null, $max_code_points, $has_noncharacters );
+	}
+
+	/**
+	 * Deliberately broken scan: leaks noncharacters from outside the scanned
+	 * region into `$has_noncharacters`.
+	 */
+	public static function scan_utf8_noncharacters_leak( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
+		$count = _wp_scan_utf8( $bytes, $at, $invalid_length, $max_bytes, $max_code_points, $has_noncharacters );
+
+		if ( _wp_has_noncharacters_fallback( $bytes ) ) {
+			$has_noncharacters = true;
+		}
+
+		return $count;
+	}
+
+	/**
+	 * Deliberately broken scan: misses noncharacters inside the scanned
+	 * region.
+	 */
+	public static function scan_utf8_miss_noncharacters( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
+		$count             = _wp_scan_utf8( $bytes, $at, $invalid_length, $max_bytes, $max_code_points, $has_noncharacters );
+		$has_noncharacters = false;
+
+		return $count;
+	}
+
+	/**
+	 * Deliberately broken scan: the ASCII fast path consumes one extra code
+	 * point when a code point limit is supplied.
+	 */
+	public static function scan_utf8_ascii_overrun( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
+		if ( null !== $max_code_points && $at < strlen( $bytes ) && ord( $bytes[ $at ] ) <= 0x7F ) {
+			return _wp_scan_utf8( $bytes, $at, $invalid_length, $max_bytes, $max_code_points + 1, $has_noncharacters );
+		}
+
+		return _wp_scan_utf8( $bytes, $at, $invalid_length, $max_bytes, $max_code_points, $has_noncharacters );
+	}
+
+	/**
+	 * Deliberately broken scan: preserves a stale noncharacter flag instead
+	 * of resetting it for the current scan.
+	 */
+	public static function scan_utf8_stale_noncharacters( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
+		$initial_has = $has_noncharacters;
+		$count       = _wp_scan_utf8( $bytes, $at, $invalid_length, $max_bytes, $max_code_points, $has_noncharacters );
+
+		if ( true === $initial_has && ! (bool) $has_noncharacters ) {
+			$has_noncharacters = true;
+		}
+
+		return $count;
+	}
 }
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
index 7e15f51945dab..b5f6d3d11242c 100644
--- a/tools/encoding-fuzz/tests/harness-smoke.php
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -108,6 +108,7 @@ function check( string $label, bool $ok, string $detail = '' ): void {
 	'mb_ord'          => '_mb_ord',
 	'codepoint_span'  => '_wp_utf8_codepoint_span',
 	'mb_substr'       => '_mb_substr',
+	'scan_utf8'       => '_wp_scan_utf8',
 );
 
 /**
@@ -127,6 +128,35 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 	return array_keys( $seen );
 }
 
+/**
+ * Runs every battery vector through `Targets::resolve()` with a fault
+ * environment variable, proving the CLI fault selector names are wired.
+ *
+ * @return string[] Distinct check names observed.
+ */
+function fault_run( Oracles $oracles, array $vectors, string $fault ): array {
+	$previous_fault = getenv( 'ENCODING_FUZZ_FAULT' ); // false when the variable is not set.
+	putenv( "ENCODING_FUZZ_FAULT={$fault}" );
+
+	try {
+		$checks = new Checks( $oracles, Targets::resolve() ); // resolve() selects the mutant from ENCODING_FUZZ_FAULT.
+		$seen   = array();
+		foreach ( $vectors as $bytes ) {
+			foreach ( $checks->run( $bytes ) as $failure ) {
+				$seen[ $failure['check'] ] = true; // Keyed by check name so repeats collapse.
+			}
+		}
+	} finally { // Put the caller's environment back even if a check throws.
+		if ( false === $previous_fault ) {
+			putenv( 'ENCODING_FUZZ_FAULT' ); // No "=value": removes the variable instead of setting it to ''.
+		} else {
+			putenv( "ENCODING_FUZZ_FAULT={$previous_fault}" );
+		}
+	}
+
+	return array_keys( $seen );
+}
+
 // 3a. Validator that wrongly accepts a never-valid byte.
 $seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
 	'is_valid_fb' => static fn( string $bytes ): bool => str_contains( $bytes, "\xC0" ) ? true : _wp_is_valid_utf8_fallback( $bytes ),
@@ -326,6 +356,41 @@ function broken_run( Oracles $oracles, array $real, array $vectors, array $overr
 ) );
 check( 'catches non-UTF-8 _mb_substr fallback drift', in_array( 'mb-substr-mismatch', $seen, true ), implode( ',', $seen ) );
 
+// 3ae. Bounded scan that ignores max_bytes.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'scan_utf8' => Targets::scan_utf8_ignore_max_bytes( ... ),
+) );
+check( 'catches max_bytes-ignoring scan', in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3af. Bounded scan that leaks noncharacters from outside the scanned region.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'scan_utf8' => Targets::scan_utf8_noncharacters_leak( ... ),
+) );
+check( 'catches noncharacter-leaking scan', in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3ag. Bounded scan that misses noncharacters inside the scanned region.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'scan_utf8' => Targets::scan_utf8_miss_noncharacters( ... ),
+) );
+check( 'catches noncharacter-missing scan', in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3ah. Bounded scan whose ASCII fast path overruns max_code_points.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'scan_utf8' => Targets::scan_utf8_ascii_overrun( ... ),
+) );
+check( 'catches ASCII-overrunning scan', in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) );
+
+// 3ai. Bounded scan that preserves a stale noncharacter flag.
+$seen = broken_run( $oracles, $real_targets, $battery_vectors, array(
+	'scan_utf8' => Targets::scan_utf8_stale_noncharacters( ... ),
+) );
+check( 'catches stale noncharacter scan flag', in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) );
+
+foreach ( array( 'scan-ignore-bytes', 'scan-nonchars-leak', 'scan-miss-nonchars', 'scan-ascii-overrun', 'scan-stale-nonchars' ) as $fault ) {
+	$seen = fault_run( $oracles, $battery_vectors, $fault );
+	check( "fault selector {$fault} is wired", in_array( 'scan-utf8-mismatch', $seen, true ), implode( ',', $seen ) );
+}
+
 // ---------------------------------------------------------------------
 // 4. Generator determinism and mix.
 // ---------------------------------------------------------------------

From 4005f40d3ccd630613f31179aeb6a5d3970356cc Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Thu, 11 Jun 2026 15:07:15 +0200
Subject: [PATCH 11/14] Add deterministic UTF-8 boundary corpus

---
 progress-handoff-xZOoEn.md                  |  27 ++
 tools/encoding-fuzz/README.md               |  21 +-
 tools/encoding-fuzz/corpus.php              | 184 ++++++++++++
 tools/encoding-fuzz/lib/Corpus.php          | 310 ++++++++++++++++++++
 tools/encoding-fuzz/tests/harness-smoke.php |  89 +++++-
 5 files changed, 625 insertions(+), 6 deletions(-)
 create mode 100644 tools/encoding-fuzz/corpus.php
 create mode 100644 tools/encoding-fuzz/lib/Corpus.php

diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md
index 5cb171e6e8a3a..d618244deca93 100644
--- a/progress-handoff-xZOoEn.md
+++ b/progress-handoff-xZOoEn.md
@@ -99,3 +99,30 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn
   - Reviewer 2: initially found missing false-negative noncharacter mutation coverage; satisfied after adding `scan-miss-nonchars` and selector wiring checks.
   - Reviewer 3: satisfied after checking probe volume, performance, README mutation count/list, fault list, and progress ordering.
 - Commit: this step commit.
+
+### Step 5: deterministic short-boundary corpus
+
+- Status: done; included in the step 5 commit.
+- Prior step commit: `1c208acee0`.
+- Scope:
+  - Add a deterministic short-boundary corpus separate from the random generator so random `(seed, case)` derivation remains stable.
+  - Cover lead-byte boundary classes crossed with boundary continuation positions, adjacent invalid maximal subparts, valid/malformed sandwiches, EOF truncations, and noncharacter boundary neighbors.
+  - Add a standalone corpus runner and smoke coverage for the new fixed cases.
+- Verification:
+  - `php -l tools/encoding-fuzz/lib/Checks.php`
+  - `php -l tools/encoding-fuzz/lib/Targets.php`
+  - `php -l tools/encoding-fuzz/lib/Bootstrap.php`
+  - `php -l tools/encoding-fuzz/lib/Corpus.php`
+  - `php -l tools/encoding-fuzz/corpus.php`
+  - `php -l tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/corpus.php --external none`
+  - `php -d disable_functions=utf8_encode,utf8_decode tools/encoding-fuzz/corpus.php --external none`
+  - `ENCODING_FUZZ_FAULT=scan-ignore-bytes php tools/encoding-fuzz/corpus.php --external none --output-dir /tmp/encoding-fuzz-corpus-fault`
+  - `php tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none`
+  - `git diff --cached --check`
+- Review gate: satisfied by 3 adversarial reviewers.
+  - Reviewer 1: initially noted byte-level dedupe hid intended labels; satisfied after preserving label-level corpus entries.
+  - Reviewer 2: noted smoke skipped the new CLI/artifact path; satisfied after adding CLI smoke coverage, fail-closed artifact writes, and manual faulted artifact verification.
+  - Reviewer 3: noted count/fingerprint/runtime and oracle-event ordering gaps; satisfied after pinning corpus count/fingerprint, updating smoke docs, and making CLI smoke parse NDJSON by record type.
+- Commit: this step commit.
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index a8ec642a5258f..e3563af7df3f1 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -142,7 +142,7 @@ Internal invariants:
 
 ## Inputs
 
-Each case is fully determined by `(seed, case index)` **for a given
+Random cases are fully determined by `(seed, case index)` **for a given
 generator version**: changing the generator (e.g. its boundary code
 point list) invalidates `--seed`/`--case` re-derivation of older
 findings. Failure artifacts embed the input bytes, so `--failure` and
@@ -156,6 +156,14 @@ ISO-8859-1-ish text, UTF-16 with/without BOM, long ASCII runs with
 broken tails (`strspn()` fast-path stress), and repeated motifs.
 Roughly a third of generated inputs are fully valid UTF-8.
 
+A separate deterministic short-boundary corpus lives outside the random
+generator so changing the fixed corpus does not perturb random
+`(seed, case)` reproduction. It covers lead-byte boundary classes
+crossed with boundary second/third/fourth byte positions, adjacent
+invalid maximal subparts, valid text immediately before and after
+malformed prefixes, EOF truncations at each prefix length, and
+noncharacter boundary neighbors.
+
 ## Common Commands
 
 Run one worker batch:
@@ -164,6 +172,12 @@ Run one worker batch:
 php tools/encoding-fuzz/worker.php --seed 1 --cases 5000
 ```
 
+Run the deterministic short-boundary corpus:
+
+```sh
+php tools/encoding-fuzz/corpus.php
+```
+
 Run parallel lanes for a minute (artifacts under `artifacts/encoding-fuzz/`):
 
 ```sh
@@ -231,8 +245,9 @@ noncharacter-leaking `_wp_scan_utf8()`, noncharacter-missing
 `_wp_scan_utf8()`, ASCII-overrunning `_wp_scan_utf8()`, and
 stale-noncharacter-flag `_wp_scan_utf8()`)
 must all be caught. It also asserts generator determinism, the
-valid/invalid input mix, and the documented
-`wp_has_noncharacters()` divergence stance on ill-formed input.
+valid/invalid input mix, the deterministic short-boundary corpus, and
+the documented `wp_has_noncharacters()` divergence stance on ill-formed
+input.
 
 For end-to-end pipeline testing while the real implementations are
 healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8|count-invalid-bytes|count-range-minus1|count-ignore-offset|scan-ignore-bytes|scan-nonchars-leak|scan-miss-nonchars|scan-ascii-overrun|scan-stale-nonchars`
diff --git a/tools/encoding-fuzz/corpus.php b/tools/encoding-fuzz/corpus.php
new file mode 100644
index 0000000000000..460beb7677caf
--- /dev/null
+++ b/tools/encoding-fuzz/corpus.php
@@ -0,0 +1,184 @@
+<?php
+/**
+ * Runs deterministic fixed corpora that complement the random generator.
+ *
+ *     php tools/encoding-fuzz/corpus.php --external none
+ *
+ * Exit codes: 0 all cases passed, 1 failures found, 2 harness error.
+ */
+
+namespace EncodingFuzz;
+
+require __DIR__ . '/lib/autoload.php';
+
+error_reporting( E_ALL );
+ini_set( 'display_errors', 'stderr' );
+ini_set( 'memory_limit', '512M' );
+
+$options = Cli::parse_args(
+	$argv,
+	array(
+		'external'       => 'auto',
+		'output-dir'     => '',
+		'progress-every' => 0,
+	)
+);
+
+Bootstrap::load_targets();
+
+$oracles = Oracles::build( Cli::resolve_externals( $options['external'] ) );
+foreach ( $oracles->drain_events() as $event ) {
+	Cli::emit( array( 'type' => 'oracle-event' ) + $event );
+}
+
+if ( ! $oracles->has_required() ) {
+	Cli::emit(
+		array(
+			'type'   => 'fatal',
+			'reason' => 'mbstring oracle unavailable or failed the battery; cannot run corpus without a primary oracle',
+		)
+	);
+	exit( 2 );
+}
+
+$output_dir = $options['output-dir'];
+if ( '' !== $output_dir && ! is_dir( $output_dir ) && ! mkdir( $output_dir, 0777, true ) ) {
+	Cli::emit(
+		array(
+			'type'   => 'fatal',
+			'reason' => "cannot create output dir {$output_dir}",
+		)
+	);
+	exit( 2 );
+}
+
+$checks     = new Checks( $oracles );
+$mb_valid   = $oracles->validity_oracles()['mb'];
+$cases      = Corpus::short_boundary_cases();
+$stats      = array(
+	'cases'        => 0,
+	'failures'     => 0,
+	'valid_inputs' => 0,
+	'bytes'        => 0,
+	'by_strategy'  => array(
+		'short-boundary-corpus' => 0,
+	),
+);
+$started_at = microtime( true );
+
+Cli::emit(
+	array(
+		'type'        => 'start',
+		'corpus'      => 'short-boundary',
+		'cases'       => count( $cases ),
+		'environment' => Cli::environment_metadata( $oracles ),
+	)
+);
+
+foreach ( $cases as $case => $entry ) {
+	$input    = $entry['bytes'];
+	$label    = $entry['label'];
+	$failures = $checks->run( $input );
+
+	++$stats['cases'];
+	++$stats['by_strategy']['short-boundary-corpus'];
+	$stats['bytes'] += strlen( $input );
+	if ( $mb_valid( $input ) ) {
+		++$stats['valid_inputs'];
+	}
+
+	foreach ( $oracles->drain_events() as $event ) {
+		Cli::emit( array( 'type' => 'oracle-event', 'case' => $case, 'corpus_label' => $label ) + $event );
+	}
+
+	if ( array() !== $failures ) {
+		$stats['failures'] += count( $failures );
+		// Describe the failing case once; the raw input bytes travel base64-encoded.
+		$record = array(
+			'type'         => 'failure',
+			'corpus'       => 'short-boundary',
+			'case'         => $case,
+			'corpus_label' => $label,
+			'strategy'     => 'short-boundary-corpus',
+			'input_size'   => strlen( $input ),
+			'signatures'   => array_values( array_unique( array_column( $failures, 'signature' ) ) ),
+			'failures'     => $failures,
+			'input_base64' => base64_encode( $input ),
+		);
+
+		if ( '' !== $output_dir ) {
+			$case_dir = "{$output_dir}/failure-corpus-short-boundary-case{$case}";
+			if ( ! is_dir( $case_dir ) && ! mkdir( $case_dir, 0777, true ) ) {
+				Cli::emit(
+					array(
+						'type'   => 'fatal',
+						'reason' => "cannot create artifact dir {$case_dir}",
+					)
+				);
+				$oracles->shutdown();
+				exit( 2 );
+			}
+			if ( false === file_put_contents( "{$case_dir}/input.bin", $input ) ) {
+				Cli::emit(
+					array(
+						'type'   => 'fatal',
+						'reason' => "cannot write {$case_dir}/input.bin",
+					)
+				);
+				$oracles->shutdown();
+				exit( 2 );
+			}
+			// Encode before writing: json_encode() returns false on failure, which file_put_contents() would write as an empty file.
+			$artifact                = $record;
+			$artifact['environment'] = Cli::environment_metadata( $oracles );
+			$artifact['git']         = Cli::git_metadata( Bootstrap::repo_root() );
+			$artifact_json           = json_encode( $artifact, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES );
+			if (
+				false === $artifact_json || false === file_put_contents( "{$case_dir}/failure.json", $artifact_json )
+			) {
+				Cli::emit(
+					array(
+						'type'   => 'fatal',
+						'reason' => false === $artifact_json ? "cannot encode {$case_dir}/failure.json: " . json_last_error_msg() : "cannot write {$case_dir}/failure.json",
+					)
+				);
+				$oracles->shutdown();
+				exit( 2 );
+			}
+			$record['artifact_dir'] = $case_dir;
+		}
+
+		Cli::emit( $record );
+	}
+
+	if (
+		$options['progress-every'] > 0 &&
+		0 === ( $stats['cases'] % max( 1, $options['progress-every'] ) )
+	) {
+		$elapsed = microtime( true ) - $started_at;
+		Cli::emit(
+			array(
+				'type'          => 'progress',
+				'corpus'        => 'short-boundary',
+				'case'          => $case,
+				'cases_done'    => $stats['cases'],
+				'failures'      => $stats['failures'],
+				'cases_per_sec' => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null,
+			)
+		);
+	}
+}
+
+$elapsed = microtime( true ) - $started_at;
+Cli::emit(
+	array(
+		'type'          => 'done',
+		'corpus'        => 'short-boundary',
+		'stats'         => $stats,
+		'elapsed_sec'   => round( $elapsed, 2 ),
+		'cases_per_sec' => $elapsed > 0 ? round( $stats['cases'] / $elapsed, 1 ) : null,
+	)
+);
+
+$oracles->shutdown();
+exit( $stats['failures'] > 0 ? 1 : 0 );
diff --git a/tools/encoding-fuzz/lib/Corpus.php b/tools/encoding-fuzz/lib/Corpus.php
new file mode 100644
index 0000000000000..1ff98071b3892
--- /dev/null
+++ b/tools/encoding-fuzz/lib/Corpus.php
@@ -0,0 +1,310 @@
+<?php
+namespace EncodingFuzz;
+
+/**
+ * Deterministic fixed corpora that complement, but do not perturb, the
+ * pseudo-random generator.
+ */
+class Corpus {
+	/**
+	 * Required UTF-8 boundary representatives from the handoff, plus adjacent
+	 * comparison bytes needed to hit the Unicode Standard Table 3-7 cut points.
+	 */
+	private const BOUNDARY_BYTES = array(
+		0x00,
+		0x7E,
+		0x7F,
+		0x80,
+		0x8F,
+		0x90,
+		0x9F,
+		0xA0,
+		0xBF,
+		0xC0,
+		0xC1,
+		0xC2,
+		0xDF,
+		0xE0,
+		0xE1,
+		0xEC,
+		0xED,
+		0xEE,
+		0xEF,
+		0xF0,
+		0xF1,
+		0xF3,
+		0xF4,
+		0xF5,
+		0xFE,
+		0xFF,
+	);
+
+	/**
+	 * The exact lead byte class representatives named in the handoff.
+	 */
+	private const LEAD_CLASS_BYTES = array(
+		0x7F,
+		0x80,
+		0xBF,
+		0xC0,
+		0xC1,
+		0xC2,
+		0xDF,
+		0xE0,
+		0xE1,
+		0xEC,
+		0xED,
+		0xEE,
+		0xEF,
+		0xF0,
+		0xF1,
+		0xF3,
+		0xF4,
+		0xF5,
+		0xFE,
+		0xFF,
+	);
+
+	private const TWO_BYTE_LEADS   = array( 0xC0, 0xC1, 0xC2, 0xDF );
+	private const THREE_BYTE_LEADS = array( 0xE0, 0xE1, 0xEC, 0xED, 0xEE, 0xEF );
+	private const FOUR_BYTE_LEADS  = array( 0xF0, 0xF1, 0xF3, 0xF4, 0xF5 );
+
+	/**
+	 * @return array<int, array{label: string, bytes: string}>
+	 */
+	public static function short_boundary_cases(): array {
+		$cases = array();
+
+		self::add_lead_boundary_cases( $cases );
+		self::add_adjacent_invalid_cases( $cases );
+		self::add_sandwich_cases( $cases );
+		self::add_truncation_cases( $cases );
+		self::add_noncharacter_boundary_cases( $cases );
+
+		return $cases;
+	}
+
+	/**
+	 * @param array<int, array{label: string, bytes: string}> $cases
+	 */
+	private static function add_lead_boundary_cases( array &$cases ): void {
+		foreach ( self::LEAD_CLASS_BYTES as $lead ) {
+			self::add_case( $cases, 'lead:' . self::hex_byte( $lead ), self::bytes( $lead ) );
+		}
+
+		foreach ( self::TWO_BYTE_LEADS as $lead ) {
+			foreach ( self::BOUNDARY_BYTES as $second ) {
+				self::add_case(
+					$cases,
+					sprintf( 'two-second:%02x-%02x', $lead, $second ),
+					self::bytes( $lead, $second )
+				);
+			}
+		}
+
+		foreach ( self::THREE_BYTE_LEADS as $lead ) {
+			list( $base_second, $base_third ) = self::three_byte_baseline( $lead );
+			foreach ( self::BOUNDARY_BYTES as $second ) {
+				self::add_case(
+					$cases,
+					sprintf( 'three-second:%02x-%02x-%02x', $lead, $second, $base_third ),
+					self::bytes( $lead, $second, $base_third )
+				);
+			}
+
+			foreach ( self::BOUNDARY_BYTES as $third ) {
+				self::add_case(
+					$cases,
+					sprintf( 'three-third:%02x-%02x-%02x', $lead, $base_second, $third ),
+					self::bytes( $lead, $base_second, $third )
+				);
+			}
+		}
+
+		foreach ( self::FOUR_BYTE_LEADS as $lead ) {
+			list( $base_second, $base_third, $base_fourth ) = self::four_byte_baseline( $lead );
+			foreach ( self::BOUNDARY_BYTES as $second ) {
+				self::add_case(
+					$cases,
+					sprintf( 'four-second:%02x-%02x-%02x-%02x', $lead, $second, $base_third, $base_fourth ),
+					self::bytes( $lead, $second, $base_third, $base_fourth )
+				);
+			}
+
+			foreach ( self::BOUNDARY_BYTES as $third ) {
+				self::add_case(
+					$cases,
+					sprintf( 'four-third:%02x-%02x-%02x-%02x', $lead, $base_second, $third, $base_fourth ),
+					self::bytes( $lead, $base_second, $third, $base_fourth )
+				);
+			}
+
+			foreach ( self::BOUNDARY_BYTES as $fourth ) {
+				self::add_case(
+					$cases,
+					sprintf( 'four-fourth:%02x-%02x-%02x-%02x', $lead, $base_second, $base_third, $fourth ),
+					self::bytes( $lead, $base_second, $base_third, $fourth )
+				);
+			}
+		}
+	}
+
+	/**
+	 * @param array<int, array{label: string, bytes: string}> $cases
+	 */
+	private static function add_adjacent_invalid_cases( array &$cases ): void {
+		$adjacent = array(
+			'continuation-run'    => "\x80\xBF\x80",
+			'never-valid-leads'   => "\xC0\xC1\xF5\xFE\xFF",
+			'overlong-pair'       => "\xE0\x80\xE0\x9F",
+			'surrogate-pair'      => "\xED\xA0\xED\xB0",
+			'past-range-pair'     => "\xF4\x90\xF5\x80",
+			'truncated-three'     => "\xE2\x8C\xE2\x8C",
+			'truncated-four'      => "\xF1\x80\x80\xF0\x90",
+			'unicode-table-3-8'   => "\xF1\x80\x80\xE1\x80\xC2",
+			'bad-lead-after-cont' => "\x80\xF5\xBF\xFE",
+		);
+
+		foreach ( $adjacent as $label => $bytes ) {
+			self::add_case( $cases, "adjacent-invalid:{$label}", $bytes );
+		}
+	}
+
+	/**
+	 * @param array<int, array{label: string, bytes: string}> $cases
+	 */
+	private static function add_sandwich_cases( array &$cases ): void {
+		$valid_atoms = array(
+			'ascii'        => 'a',
+			'two-byte'     => "\xC2\x80",
+			'three-byte'   => "\xE2\x9C\x8F",
+			'four-byte'    => "\xF0\x90\x80\x80",
+			'noncharacter' => "\xEF\xBF\xBE",
+		);
+		$malformed = array(
+			'lone-continuation' => "\x80",
+			'never-valid-c0'    => "\xC0",
+			'truncated-two'     => "\xC2",
+			'overlong-three'    => "\xE0\x80",
+			'surrogate'         => "\xED\xA0",
+			'truncated-three'   => "\xE2\x8C",
+			'truncated-four'    => "\xF1\x80\x80",
+			'past-range'        => "\xF4\x90",
+			'never-valid-f5'    => "\xF5",
+			'never-valid-ff'    => "\xFF",
+		);
+
+		foreach ( $valid_atoms as $valid_label => $valid ) {
+			foreach ( $malformed as $bad_label => $bad ) {
+				self::add_case( $cases, "sandwich:{$valid_label}-before-{$bad_label}", $valid . $bad );
+				self::add_case( $cases, "sandwich:{$bad_label}-before-{$valid_label}", $bad . $valid );
+				self::add_case( $cases, "sandwich:{$valid_label}-around-{$bad_label}", $valid . $bad . $valid );
+			}
+		}
+	}
+
+	/**
+	 * @param array<int, array{label: string, bytes: string}> $cases
+	 */
+	private static function add_truncation_cases( array &$cases ): void {
+		$complete = array(
+			'two-min'      => "\xC2\x80",
+			'two-max'      => "\xDF\xBF",
+			'three-min'    => "\xE0\xA0\x80",
+			'three-mid'    => "\xE1\x80\x80",
+			'surrogate-hi' => "\xED\x9F\xBF",
+			'nonchar'      => "\xEF\xBF\xBE",
+			'four-min'     => "\xF0\x90\x80\x80",
+			'four-mid'     => "\xF1\x80\x80\x80",
+			'four-max'     => "\xF4\x8F\xBF\xBF",
+		);
+
+		foreach ( $complete as $label => $bytes ) {
+			$length = strlen( $bytes );
+			for ( $prefix_length = 1; $prefix_length < $length; $prefix_length++ ) {
+				$prefix = substr( $bytes, 0, $prefix_length );
+				self::add_case( $cases, "truncation:{$label}-{$prefix_length}", $prefix );
+				self::add_case( $cases, "truncation:ascii-{$label}-{$prefix_length}", 'a' . $prefix );
+			}
+		}
+	}
+
+	/**
+	 * @param array<int, array{label: string, bytes: string}> $cases
+	 */
+	private static function add_noncharacter_boundary_cases( array &$cases ): void {
+		$code_points = array(
+			0xFDCF,
+			0xFDD0,
+			0xFDEF,
+			0xFDF0,
+			0xFFFD,
+			0xFFFE,
+			0xFFFF,
+		);
+
+		for ( $plane = 0; $plane <= 0x10; $plane++ ) {
+			$final         = ( $plane << 16 ) | 0xFFFF;
+			$code_points[] = $final - 2;
+			$code_points[] = $final - 1;
+			$code_points[] = $final;
+		}
+
+		foreach ( array_values( array_unique( $code_points ) ) as $code_point ) {
+			$bytes = Generator::encode_code_point( $code_point );
+			$label = sprintf( 'noncharacter-boundary:u+%04x', $code_point );
+			self::add_case( $cases, $label, $bytes );
+			self::add_case( $cases, "{$label}-embedded", 'a' . $bytes . 'b' );
+		}
+	}
+
+	/**
+	 * @param array<int, array{label: string, bytes: string}> $cases
+	 */
+	private static function add_case( array &$cases, string $label, string $bytes ): void {
+		$cases[] = array(
+			'label' => $label,
+			'bytes' => $bytes,
+		);
+	}
+
+	/**
+	 * @return array{0: int, 1: int}
+	 */
+	private static function three_byte_baseline( int $lead ): array {
+		switch ( $lead ) {
+			case 0xE0:
+				return array( 0xA0, 0x80 );
+			case 0xED:
+				return array( 0x9F, 0xBF );
+			default:
+				return array( 0x80, 0x80 );
+		}
+	}
+
+	/**
+	 * @return array{0: int, 1: int, 2: int}
+	 */
+	private static function four_byte_baseline( int $lead ): array {
+		switch ( $lead ) {
+			case 0xF0:
+				return array( 0x90, 0x80, 0x80 );
+			case 0xF4:
+				return array( 0x8F, 0xBF, 0xBF );
+			default:
+				return array( 0x80, 0x80, 0x80 );
+		}
+	}
+
+	private static function bytes( int ...$bytes ): string {
+		$out = '';
+		foreach ( $bytes as $byte ) {
+			$out .= chr( $byte );
+		}
+		return $out;
+	}
+
+	private static function hex_byte( int $byte ): string {
+		return sprintf( '%02x', $byte );
+	}
+}
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
index b5f6d3d11242c..6fcd660776fe6 100644
--- a/tools/encoding-fuzz/tests/harness-smoke.php
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -9,7 +9,9 @@
  *     masquerade as "no findings".
  *  4. The generator is deterministic and produces the advertised mix of
  *     valid and invalid inputs across all strategies.
- *  5. A short real fuzz run completes.
+ *  5. The deterministic short-boundary corpus is stable and clean.
+ *  6. A short real fuzz run completes.
+ *  7. The one-shot exhaustive companion test passes and catches its mutant.
  *
  * Exit codes: 0 pass, 1 fail.
  */
@@ -418,7 +420,88 @@ function fault_run( Oracles $oracles, array $vectors, string $fault ): array {
 );
 
 // ---------------------------------------------------------------------
-// 5. Short real fuzz run.
+// 5. Deterministic short-boundary corpus.
+// ---------------------------------------------------------------------
+$corpus_cases      = Corpus::short_boundary_cases();
+$corpus_categories = array();
+foreach ( $corpus_cases as $entry ) {
+	$category                       = explode( ':', $entry['label'], 2 )[0];
+	$corpus_categories[ $category ] = true;
+}
+
+$expected_categories = array(
+	'lead',
+	'two-second',
+	'three-second',
+	'three-third',
+	'four-second',
+	'four-third',
+	'four-fourth',
+	'adjacent-invalid',
+	'sandwich',
+	'truncation',
+	'noncharacter-boundary',
+);
+$missing_categories  = array_values( array_diff( $expected_categories, array_keys( $corpus_categories ) ) );
+check(
+	'short-boundary corpus has broad deterministic coverage',
+	1133 === count( $corpus_cases ) && array() === $missing_categories,
+	'count ' . count( $corpus_cases ) . ', missing ' . implode( ',', $missing_categories )
+);
+
+$corpus_fingerprint = static function ( array $cases ): string {
+	$parts = array();
+	foreach ( $cases as $entry ) {
+		$parts[] = $entry['label'] . '=' . bin2hex( $entry['bytes'] );
+	}
+	return hash( 'sha256', implode( "\n", $parts ) );
+};
+check(
+	'short-boundary corpus deterministic',
+	'93f63dec5d9534e0ed1db643d5eb0596ececb0807cc3fb92cc6fe21fc4c60fbd' === $corpus_fingerprint( $corpus_cases )
+);
+
+$corpus_failures = 0;
+foreach ( $corpus_cases as $entry ) {
+	$failures = $checks->run( $entry['bytes'] );
+	foreach ( $failures as $failure ) {
+		++$corpus_failures;
+		echo "  corpus finding: {$failure['signature']} on {$entry['label']} " . bin2hex( $entry['bytes'] ) . "\n";
+	}
+}
+check( 'short-boundary corpus clean (' . count( $corpus_cases ) . ' cases)', 0 === $corpus_failures );
+
+$corpus_command = escapeshellarg( PHP_BINARY ) . ' ' . escapeshellarg( __DIR__ . '/../corpus.php' ) . ' --external none';
+exec( "{$corpus_command} 2>&1", $corpus_output, $corpus_code );
+$corpus_start = null;
+$corpus_done  = null;
+foreach ( $corpus_output as $line ) {
+	$record = json_decode( $line, true );
+	if ( ! is_array( $record ) ) {
+		continue;
+	}
+
+	if ( 'start' === ( $record['type'] ?? null ) ) {
+		$corpus_start = $record;
+	} elseif ( 'done' === ( $record['type'] ?? null ) ) {
+		$corpus_done = $record;
+	}
+}
+check(
+	'short-boundary corpus CLI clean',
+	0 === $corpus_code &&
+	is_array( $corpus_start ) &&
+	is_array( $corpus_done ) &&
+	'start' === ( $corpus_start['type'] ?? null ) &&
+	'done' === ( $corpus_done['type'] ?? null ) &&
+	1133 === ( $corpus_start['cases'] ?? null ) &&
+	1133 === ( $corpus_done['stats']['cases'] ?? null ) &&
+	0 === ( $corpus_done['stats']['failures'] ?? null ),
+	implode( ' | ', array_slice( $corpus_output, -3 ) )
+);
+
+// ---------------------------------------------------------------------
+// 6. Short real fuzz run.
 // ---------------------------------------------------------------------
 $fuzz_failures = 0;
 for ( $i = 0; $i < 300; $i++ ) {
@@ -432,7 +515,7 @@ function fault_run( Oracles $oracles, array $vectors, string $fault ): array {
 check( '300-case fuzz run clean (real findings would also surface here)', 0 === $fuzz_failures );
 
 // ---------------------------------------------------------------------
-// 6. One-shot exhaustive companion test: must pass, and its detection
+// 7. One-shot exhaustive companion test: must pass, and its detection
 //    must provably fire (same mutation-testing rule as everything else).
 // ---------------------------------------------------------------------
 $exhaustive = escapeshellarg( PHP_BINARY ) . ' ' . escapeshellarg( __DIR__ . '/code-point-to-utf8-exhaustive.php' );

From a6d67b18f06ac4a0cf8c2e2fc00cc7f1f39dcf26 Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Thu, 11 Jun 2026 15:21:17 +0200
Subject: [PATCH 12/14] Add encoding fuzzer environment matrix

---
 progress-handoff-xZOoEn.md           |  29 +++
 tools/encoding-fuzz/README.md        |  14 ++
 tools/encoding-fuzz/lib/Cli.php      |  16 +-
 tools/encoding-fuzz/lib/wp-stubs.php |   9 +-
 tools/encoding-fuzz/matrix.php       | 255 +++++++++++++++++++++++++++
 5 files changed, 316 insertions(+), 7 deletions(-)
 create mode 100644 tools/encoding-fuzz/matrix.php

diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md
index d618244deca93..6b549275cd59b 100644
--- a/progress-handoff-xZOoEn.md
+++ b/progress-handoff-xZOoEn.md
@@ -126,3 +126,32 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn
   - Reviewer 2: noted smoke skipped the new CLI/artifact path; satisfied after adding CLI smoke coverage, fail-closed artifact writes, and manual faulted artifact verification.
   - Reviewer 3: noted count/fingerprint/runtime and oracle-event ordering gaps; satisfied after pinning corpus count/fingerprint, updating smoke docs, and making CLI smoke parse NDJSON by record type.
 - Commit: this step commit.
+
+### Step 6: environment matrix
+
+- Status: done; included in the step 6 commit.
+- Prior step commit: `4005f40d3c`.
+- Scope:
+  - Add a compact environment matrix command that runs the fixed corpus under current environment, forced no-PCRE-u target branch, simulated PHP 9 native `utf8_encode()` / `utf8_decode()` absence, and missing primary mbstring oracle functions.
+  - Add a fuzzer-only PCRE-u override in `wp-stubs.php` so the fallback `wp_has_noncharacters()` branch can be exercised without a separate PHP build.
+  - Document that a true no-mbstring target run still requires a PHP build without mbstring because the local harness fails closed without its mb-backed primary oracle.
+- Verification:
+  - `php -l tools/encoding-fuzz/lib/Checks.php`
+  - `php -l tools/encoding-fuzz/lib/Targets.php`
+  - `php -l tools/encoding-fuzz/lib/Bootstrap.php`
+  - `php -l tools/encoding-fuzz/lib/Cli.php`
+  - `php -l tools/encoding-fuzz/lib/wp-stubs.php`
+  - `php -l tools/encoding-fuzz/matrix.php`
+  - `php -l tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/matrix.php`
+  - `env ENCODING_FUZZ_FORCE_PCRE_U=0 php tools/encoding-fuzz/matrix.php`
+  - `php tools/encoding-fuzz/corpus.php --external none`
+  - `php -d disable_functions=utf8_encode,utf8_decode tools/encoding-fuzz/corpus.php --external none`
+  - `php tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none`
+  - `git diff --cached --check`
+- Review gate: satisfied by 3 adversarial reviewers.
+  - Reviewer 1: initially noted PCRE override metadata and force-on risks; satisfied after adding `pcre_u_override` metadata and making the override force-off only.
+  - Reviewer 2: initially found matrix pipe-deadlock, exit-code, and NDJSON-shape issues; satisfied after nonblocking pipe reads, harness-error exit `2`, and stricter record parsing.
+  - Reviewer 3: initially found the matrix exit-code contract mismatch; satisfied after preserving exit `2` for harness-error-shaped failures and checking docs/progress accuracy.
+- Commit: this step commit.
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index e3563af7df3f1..e89e5d712a516 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -178,6 +178,20 @@ Run the deterministic short-boundary corpus:
 php tools/encoding-fuzz/corpus.php
 ```
 
+Run the compact environment matrix:
+
+```sh
+php tools/encoding-fuzz/matrix.php
+```
+
+The matrix runs the fixed corpus in the current environment, with the
+fuzzer's PCRE-u branch forced off, with native `utf8_encode()` /
+`utf8_decode()` disabled to simulate PHP 9, and with the primary mbstring
+oracle functions disabled to verify the harness fails closed. A true
+no-mbstring target run still requires a PHP build without mbstring; the
+local harness intentionally refuses to fuzz without the mb-backed primary
+oracle.
+
 Run parallel lanes for a minute (artifacts under `artifacts/encoding-fuzz/`):
 
 ```sh
diff --git a/tools/encoding-fuzz/lib/Cli.php b/tools/encoding-fuzz/lib/Cli.php
index 3ddd47679b5ab..e6d6b62c59b98 100644
--- a/tools/encoding-fuzz/lib/Cli.php
+++ b/tools/encoding-fuzz/lib/Cli.php
@@ -103,15 +103,21 @@ public static function git_metadata( string $repo_root ): array {
 	}
 
 	public static function environment_metadata( Oracles $oracles ): array {
+		$forced_pcre_u = getenv( 'ENCODING_FUZZ_FORCE_PCRE_U' );
+		$pcre_override = false !== $forced_pcre_u && in_array( strtolower( $forced_pcre_u ), array( '0', 'false', 'no', 'off' ), true )
+			? 'off'
+			: null;
+
 		return array(
-			'php'     => PHP_VERSION,
-			'os'      => PHP_OS_FAMILY,
-			'oracles' => $oracles->names(),
+			'php'             => PHP_VERSION,
+			'os'              => PHP_OS_FAMILY,
+			'oracles'         => $oracles->names(),
 			// Which environment branch of utf8.php loaded (PCRE vs fallback).
-			'pcre_u'  => function_exists( '_wp_can_use_pcre_u' ) ? _wp_can_use_pcre_u() : null,
+			'pcre_u'          => function_exists( '_wp_can_use_pcre_u' ) ? _wp_can_use_pcre_u() : null,
+			'pcre_u_override' => $pcre_override,
 			// Mark fault-injected artifacts so they can never be mistaken
 			// for real findings.
-			'fault'   => getenv( 'ENCODING_FUZZ_FAULT' ) ?: null,
+			'fault'           => getenv( 'ENCODING_FUZZ_FAULT' ) ?: null,
 		);
 	}
 }
diff --git a/tools/encoding-fuzz/lib/wp-stubs.php b/tools/encoding-fuzz/lib/wp-stubs.php
index f86bd4b367332..2eaa662cbbeba 100644
--- a/tools/encoding-fuzz/lib/wp-stubs.php
+++ b/tools/encoding-fuzz/lib/wp-stubs.php
@@ -8,8 +8,13 @@
 	function _wp_can_use_pcre_u( $set = null ): bool {
 		static $utf8_pcre = null;
 		if ( null === $utf8_pcre ) {
-			// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
-			$utf8_pcre = false !== @preg_match( '/^./u', 'a' );
+			$forced = getenv( 'ENCODING_FUZZ_FORCE_PCRE_U' );
+			if ( false !== $forced && in_array( strtolower( $forced ), array( '0', 'false', 'no', 'off' ), true ) ) {
+				$utf8_pcre = false;
+			} else {
+				// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
+				$utf8_pcre = false !== @preg_match( '/^./u', 'a' );
+			}
 		}
 		return (bool) $utf8_pcre;
 	}
diff --git a/tools/encoding-fuzz/matrix.php b/tools/encoding-fuzz/matrix.php
new file mode 100644
index 0000000000000..98f72f63cd498
--- /dev/null
+++ b/tools/encoding-fuzz/matrix.php
@@ -0,0 +1,255 @@
+<?php
+/**
+ * Runs a compact environment matrix for the encoding fuzzer.
+ *
+ *     php tools/encoding-fuzz/matrix.php
+ *
+ * Exit codes: 0 pass, 1 matrix failure, 2 harness error.
+ */
+
+namespace EncodingFuzz;
+
+require __DIR__ . '/lib/autoload.php';
+
+error_reporting( E_ALL );
+ini_set( 'display_errors', 'stderr' );
+
+/**
+ * Runs a child process to completion (120 s cap), collecting stdout and stderr separately.
+ *
+ * @param string[]              $command Program and arguments, handed to proc_open() as an array.
+ * @param array<string, string> $env     Variables layered over the inherited environment.
+ * @return array{code: int, stdout: string, stderr: string} Child exit status, or 2 when spawning, polling, or the deadline fails.
+ */
+function matrix_run_command( array $command, array $env = array() ): array {
+	$process = proc_open(
+		$command,
+		array(
+			0 => array( 'file', '/dev/null', 'r' ),
+			1 => array( 'pipe', 'w' ),
+			2 => array( 'pipe', 'w' ),
+		),
+		$pipes,
+		null,
+		array_merge( getenv(), $env ) // getenv(), not $_ENV: $_ENV is empty unless variables_order contains "E".
+	);
+
+	if ( ! is_resource( $process ) ) {
+		return array(
+			'code'   => 2,
+			'stdout' => '',
+			'stderr' => 'proc_open failed',
+		);
+	}
+
+	stream_set_blocking( $pipes[1], false );
+	stream_set_blocking( $pipes[2], false );
+
+	$stdout   = '';
+	$stderr   = '';
+	$open     = array( 1 => $pipes[1], 2 => $pipes[2] ); // Keyed by descriptor number; entries are removed at EOF.
+	$deadline = microtime( true ) + 120;
+
+	while ( array() !== $open ) {
+		if ( microtime( true ) > $deadline ) {
+			proc_terminate( $process, 9 );
+			foreach ( $open as $pipe ) {
+				fclose( $pipe );
+			}
+			proc_close( $process );
+			return array(
+				'code'   => 2,
+				'stdout' => $stdout,
+				'stderr' => $stderr . "\nmatrix child timed out",
+			);
+		}
+
+		$read   = array_values( $open );
+		$write  = null;
+		$except = null;
+		$ready  = stream_select( $read, $write, $except, 1, 0 );
+
+		if ( false === $ready ) {
+			proc_terminate( $process, 9 ); // Otherwise proc_close() below would wait on a child that is still running.
+			foreach ( $open as $pipe ) {
+				fclose( $pipe );
+			}
+			proc_close( $process );
+			return array(
+				'code'   => 2,
+				'stdout' => $stdout,
+				'stderr' => $stderr . "\nmatrix stream_select failed",
+			);
+		}
+
+		foreach ( $read as $pipe ) {
+			$chunk = stream_get_contents( $pipe );
+			if ( false === $chunk || '' === $chunk ) {
+				continue;
+			}
+
+			if ( $pipe === $pipes[1] ) {
+				$stdout .= $chunk;
+			} else {
+				$stderr .= $chunk;
+			}
+		}
+
+		foreach ( $open as $index => $pipe ) {
+			if ( feof( $pipe ) ) {
+				$chunk = stream_get_contents( $pipe );
+				if ( is_string( $chunk ) && '' !== $chunk ) {
+					if ( 1 === $index ) {
+						$stdout .= $chunk;
+					} else {
+						$stderr .= $chunk;
+					}
+				}
+				fclose( $pipe );
+				unset( $open[ $index ] );
+			}
+		}
+	}
+
+	return array(
+		'code'   => proc_close( $process ),
+		'stdout' => (string) $stdout,
+		'stderr' => (string) $stderr,
+	);
+}
+
+/**
+ * Splits NDJSON text into typed records and the non-empty lines that are not records.
+ *
+ * @return array{records: array<int, array<string, mixed>>, malformed: string[]}
+ */
+function matrix_decode_ndjson( string $stdout ): array {
+	$result = array(
+		'records'   => array(),
+		'malformed' => array(),
+	);
+
+	foreach ( explode( "\n", trim( $stdout ) ) as $text ) {
+		if ( '' === $text ) {
+			continue;
+		}
+
+		// A record decodes to an array carrying a string "type"; every other line is kept verbatim as malformed.
+		$decoded  = json_decode( $text, true );
+		$is_typed = is_array( $decoded ) && isset( $decoded['type'] ) && is_string( $decoded['type'] );
+		$result[ $is_typed ? 'records' : 'malformed' ][] = $is_typed ? $decoded : $text;
+	}
+	return $result;
+}
+
+/**
+ * Returns the earliest record whose "type" equals $type, or null when none does.
+ *
+ * @param array<int, array<string, mixed>> $records
+ */
+function matrix_first_record( array $records, string $type ): ?array {
+	$is_wanted = static fn ( $record ): bool => $type === ( $record['type'] ?? null );
+	$matches   = array_filter( $records, $is_wanted );
+
+	return array() === $matches ? null : reset( $matches );
+}
+
+/**
+ * Reports whether any record is an "oracle-event" whose "oracle" field equals $oracle.
+ *
+ * @param array<int, array<string, mixed>> $records
+ */
+function matrix_has_oracle_event( array $records, string $oracle ): bool {
+	$is_event = static fn ( $record ): bool =>
+		'oracle-event' === ( $record['type'] ?? null ) && $oracle === ( $record['oracle'] ?? null );
+
+	return array() !== array_filter( $records, $is_event );
+}
+
+$cases = array( // Each 'check' returns array( passed, detail for the FAIL line, failure looks like a harness error ).
+	array(
+		'name'    => 'current-corpus', // Baseline: PCRE-u must be reported live and the corpus clean.
+		'command' => array( PHP_BINARY, __DIR__ . '/corpus.php', '--external', 'none' ),
+		'env'     => array( 'ENCODING_FUZZ_FORCE_PCRE_U' => '' ), // '' is not a force-off spelling, so an inherited override is neutralised.
+		'check'   => static function ( array $run ): array {
+			$decoded = matrix_decode_ndjson( $run['stdout'] );
+			$records = $decoded['records'];
+			$start   = matrix_first_record( $records, 'start' );
+			$done    = matrix_first_record( $records, 'done' );
+			$ok      = 0 === $run['code'] &&
+				array() === $decoded['malformed'] &&
+				is_array( $start ) &&
+				is_array( $done ) &&
+				true === ( $start['environment']['pcre_u'] ?? null ) &&
+				0 === ( $done['stats']['failures'] ?? null );
+			return array( $ok, is_array( $done ) ? json_encode( $done['stats'] ?? null, JSON_UNESCAPED_SLASHES ) : trim( $run['stderr'] ), 2 === $run['code'] || array() !== $decoded['malformed'] );
+		},
+	),
+	array(
+		'name'    => 'forced-no-pcre-corpus', // Override forces PCRE-u off; metadata must say so and the corpus stay clean.
+		'command' => array( PHP_BINARY, __DIR__ . '/corpus.php', '--external', 'none' ),
+		'env'     => array( 'ENCODING_FUZZ_FORCE_PCRE_U' => '0' ),
+		'check'   => static function ( array $run ): array {
+			$decoded = matrix_decode_ndjson( $run['stdout'] );
+			$records = $decoded['records'];
+			$start   = matrix_first_record( $records, 'start' );
+			$done    = matrix_first_record( $records, 'done' );
+			$ok      = 0 === $run['code'] &&
+				array() === $decoded['malformed'] &&
+				is_array( $start ) &&
+				is_array( $done ) &&
+				false === ( $start['environment']['pcre_u'] ?? null ) &&
+				'off' === ( $start['environment']['pcre_u_override'] ?? null ) &&
+				0 === ( $done['stats']['failures'] ?? null );
+			return array( $ok, is_array( $start ) ? json_encode( $start['environment'] ?? null, JSON_UNESCAPED_SLASHES ) : trim( $run['stderr'] ), 2 === $run['code'] || array() !== $decoded['malformed'] );
+		},
+	),
+	array(
+		'name'    => 'native-unavailable-corpus', // utf8_encode()/utf8_decode() disabled; expects a 'native' oracle-event and a clean corpus.
+		'command' => array( PHP_BINARY, '-d', 'disable_functions=utf8_encode,utf8_decode', __DIR__ . '/corpus.php', '--external', 'none' ),
+		'env'     => array( 'ENCODING_FUZZ_FORCE_PCRE_U' => '' ),
+		'check'   => static function ( array $run ): array {
+			$decoded = matrix_decode_ndjson( $run['stdout'] );
+			$records = $decoded['records'];
+			$done    = matrix_first_record( $records, 'done' );
+			$ok      = 0 === $run['code'] &&
+				array() === $decoded['malformed'] &&
+				is_array( $done ) &&
+				matrix_has_oracle_event( $records, 'native' ) &&
+				0 === ( $done['stats']['failures'] ?? null );
+			return array( $ok, is_array( $done ) ? json_encode( $done['stats'] ?? null, JSON_UNESCAPED_SLASHES ) : trim( $run['stderr'] ), 2 === $run['code'] || array() !== $decoded['malformed'] );
+		},
+	),
+	array(
+		'name'    => 'mb-oracle-unavailable-fails-closed', // mb oracle functions disabled; the child must exit 2 with a 'fatal' record.
+		'command' => array( PHP_BINARY, '-d', 'disable_functions=mb_check_encoding,mb_scrub', __DIR__ . '/corpus.php', '--external', 'none' ),
+		'env'     => array( 'ENCODING_FUZZ_FORCE_PCRE_U' => '' ),
+		'check'   => static function ( array $run ): array {
+			$decoded = matrix_decode_ndjson( $run['stdout'] );
+			$records = $decoded['records'];
+			$fatal   = matrix_first_record( $records, 'fatal' );
+			$ok      = 2 === $run['code'] &&
+				array() === $decoded['malformed'] &&
+				matrix_has_oracle_event( $records, 'mb' ) &&
+				is_array( $fatal );
+			return array( $ok, is_array( $fatal ) ? (string) ( $fatal['reason'] ?? '' ) : trim( $run['stderr'] ), array() !== $decoded['malformed'] || ( 2 === $run['code'] && ! $ok ) );
+		},
+	),
+);
+
+$failed        = 0;
+$harness_error = false; // Sticky: becomes true once any failing case flags its result as a harness error.
+foreach ( $cases as $case ) {
+	$run = matrix_run_command( $case['command'], $case['env'] ?? array() );
+	list( $ok, $detail, $case_harness_error ) = $case['check']( $run );
+	if ( $ok ) {
+		echo "PASS {$case['name']}\n";
+	} else {
+		++$failed;
+		$harness_error = $harness_error || $case_harness_error;
+		echo "FAIL {$case['name']}: exit {$run['code']}; {$detail}\n";
+	}
+}
+// Exit status: 0 when every case passed, 2 when any failure was flagged as a harness error, otherwise 1.
+echo $failed > 0 ? "\n{$failed} matrix check(s) FAILED\n" : "\nAll matrix checks passed\n";
+exit( $failed > 0 ? ( $harness_error ? 2 : 1 ) : 0 );

From 56dc15b1f6292fd06d694958b606dd4f96d4e5cd Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Thu, 11 Jun 2026 15:26:46 +0200
Subject: [PATCH 13/14] Document invalid-input noncharacter policy

---
 progress-handoff-xZOoEn.md    | 23 +++++++++++++++++++++++
 tools/encoding-fuzz/README.md | 14 ++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/progress-handoff-xZOoEn.md b/progress-handoff-xZOoEn.md
index 6b549275cd59b..b35700e57dd19 100644
--- a/progress-handoff-xZOoEn.md
+++ b/progress-handoff-xZOoEn.md
@@ -155,3 +155,26 @@ Source handoff: `/var/folders/v7/flqy7j3s3q72cql9ppnrbqth0000gn/T/handoff-xZOoEn
   - Reviewer 2: initially found matrix pipe-deadlock, exit-code, and NDJSON-shape issues; satisfied after nonblocking pipe reads, harness-error exit `2`, and stricter record parsing.
   - Reviewer 3: initially found the matrix exit-code contract mismatch; satisfied after preserving exit `2` for harness-error-shaped failures and checking docs/progress accuracy.
 - Commit: this step commit.
+
+### Step 7: invalid-input noncharacter policy
+
+- Status: done; included in the step 7 commit.
+- Prior step commit: `a6d67b18f0`.
+- Scope:
+  - Do not broaden invalid-input noncharacter fuzzing.
+  - Document the currently pinned divergence between the PCRE-u public path and `_wp_has_noncharacters_fallback()` on ill-formed input.
+  - Record that further fuzz expansion is blocked on a Core policy decision: document `wp_has_noncharacters()` as valid-input-only or align public/fallback behavior on ill-formed input.
+- Verification:
+  - `php -l tools/encoding-fuzz/lib/Checks.php`
+  - `php -l tools/encoding-fuzz/lib/Targets.php`
+  - `php -l tools/encoding-fuzz/lib/Bootstrap.php`
+  - `php -l tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/tests/harness-smoke.php`
+  - `php tools/encoding-fuzz/worker.php --seed 1 --cases 200 --external none`
+  - `git diff --cached --check`
+  - Manual probe confirmed `wp_has_noncharacters( "\xC0\xEF\xBF\xBE" ) === false` and `_wp_has_noncharacters_fallback( "\xC0\xEF\xBF\xBE" ) === true` in the current PCRE-u environment.
+- Review gate: satisfied by 3 adversarial reviewers.
+  - Reviewer 1: satisfied after checking the README policy text against current public/fallback behavior and the handoff.
+  - Reviewer 2: satisfied after confirming the diff is docs/progress only and does not broaden invalid-input noncharacter fuzzing.
+  - Reviewer 3: satisfied after checking previous steps are complete, staged scope is limited, and this section is updated before commit.
+- Commit: this step commit.
diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index e89e5d712a516..ef8a0d7742dc2 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -140,6 +140,20 @@ Internal invariants:
   with native `mb_substr()`, and explicit non-UTF-8 encodings fall back to
   byte-level `substr()` semantics
 
+## Invalid-Input Noncharacter Policy
+
+Noncharacter differentials intentionally remain valid-input-only. The
+current invalid-input divergence is pinned by smoke, not fuzz-expanded:
+on hosts using the PCRE-u branch,
+`wp_has_noncharacters( "\xC0\xEF\xBF\xBE" )` returns false because the
+regular expression fails on ill-formed UTF-8, while
+`_wp_has_noncharacters_fallback( "\xC0\xEF\xBF\xBE" )` returns true
+because the fallback scanner skips the invalid byte and finds U+FFFE.
+
+Do not add invalid-input noncharacter fuzzing until Core decides whether
+`wp_has_noncharacters()` is documented as valid-input-only or the public
+and fallback paths are aligned on ill-formed input.
+
 ## Inputs
 
 Random cases are fully determined by `(seed, case index)` **for a given

From 98303df4dd805a1482ff8d26b2bfce11581366c1 Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Thu, 11 Jun 2026 17:42:29 +0200
Subject: [PATCH 14/14] Adapt encoding fuzzer to trunk noncharacter behavior

---
 tools/encoding-fuzz/README.md               | 61 ++++++++-------------
 tools/encoding-fuzz/lib/Checks.php          | 38 ++++---------
 tools/encoding-fuzz/lib/Oracles.php         | 58 +++++++++++++++-----
 tools/encoding-fuzz/lib/wp-stubs.php        |  4 ++
 tools/encoding-fuzz/tests/harness-smoke.php | 42 +++++++-------
 5 files changed, 105 insertions(+), 98 deletions(-)

diff --git a/tools/encoding-fuzz/README.md b/tools/encoding-fuzz/README.md
index ef8a0d7742dc2..75eb6a8ed16db 100644
--- a/tools/encoding-fuzz/README.md
+++ b/tools/encoding-fuzz/README.md
@@ -5,7 +5,7 @@ Differential fuzzer for the WordPress UTF-8 functions:
 - `wp_is_valid_utf8()` / `_wp_is_valid_utf8_fallback()`
 - `wp_scrub_utf8()` / `_wp_scrub_utf8_fallback()`
 - `_wp_utf8_encode_fallback()` / `_wp_utf8_decode_fallback()`
-- `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()` (valid input only)
+- `wp_has_noncharacters()` / `_wp_has_noncharacters_fallback()`
 - `_mb_chr()` / `_mb_ord()`
 - `_mb_substr()`
 - `_wp_utf8_codepoint_count()`, `_wp_utf8_codepoint_span()`, and the
@@ -23,7 +23,8 @@ Every result is compared against independent known-good implementations:
 
 | Oracle    | Backing                              | Validity | Scrub | Encode | Decode | Nonchars |
 |-----------|--------------------------------------|----------|-------|--------|--------|----------|
-| `mb`      | `mb_check_encoding()` / `mb_scrub()` / `mb_convert_encoding()` / `mb_str_split()`+`mb_ord()` | ✓ | ✓ (primary) | ✓ (primary) | ✓ (primary) | ✓ (valid input only) |
+| `bytes`   | independent UTF-8 noncharacter byte-sequence list | | | | | ✓ (primary) |
+| `mb`      | `mb_check_encoding()` / `mb_scrub()` / `mb_convert_encoding()` / `mb_str_split()`+`mb_ord()` | ✓ | ✓ (primary) | ✓ (primary) | ✓ (primary) | ✓ (valid UTF-8 cross-check) |
 | `pcre`    | PCRE2 strict UTF validation          | ✓        |       |        |        |          |
 | `intl`    | ICU `UConverter::transcode()`        |          | ✓     |        |        |          |
 | `python3` | CPython codec, persistent subprocess | ✓        | ✓     |        |        |          |
@@ -46,14 +47,14 @@ the PHP 9 polyfill in `compat.php` prefers `mb_convert_encoding()`
 with `_wp_utf8_decode_fallback()` as its mbstring-less shadow
 (ticket #63863).
 
-The `mb` noncharacter oracle (a trivial decode-and-test over
-`mb_str_split()` / `mb_ord()`) backs the `wp_has_noncharacters()`
-differential. Like every oracle it must pass a hand-derived battery,
-which covers the boundaries and interior of the U+FDD0–U+FDEF block
-and the final two code points of every plane with their neighbors —
-the PCRE implementation under test enumerates each plane as a separate
-hand-typed escape, so per-plane coverage is the point. It is defined
-on valid input only — see the noncharacter policy under Checks.
+The primary noncharacter oracle is an independent list of UTF-8 byte
+sequences for U+FDD0–U+FDEF and the final two code points of every
+plane. It is defined over arbitrary bytes, matching the public
+function's byte-sequence contract. On valid UTF-8, a trivial mb
+decode-and-test oracle (`mb_str_split()` / `mb_ord()`) cross-checks the
+byte oracle. The battery covers boundaries and interior points of the
+U+FDD0–U+FDEF block, every plane-final pair with neighbors, and
+ill-formed surrounds.
 
 Because native and mb decoding agree on *every* valid code point
 (verified exhaustively over U+0000–U+10FFFF), the valid-input-only
@@ -86,22 +87,10 @@ decode oracle on valid input only). Oracle-vs-oracle disagreements
 are reported separately (`oracle-disagreement`) so they don't masquerade
 as WordPress bugs.
 
-Noncharacter detection is a three-way differential on **valid input
-only**: `wp_has_noncharacters()` (the PCRE branch on hosts with
-PCRE-u; without PCRE-u the public function aliases the fallback and
-the differential degenerates to two distinct implementations — the
-worker records which branch loaded as `pcre_u` in its environment
-metadata), `_wp_has_noncharacters_fallback()`, and the trivial mb
-reference must agree. On ill-formed input the public function's answer
-depends on which environment branch of `utf8.php` loaded — the PCRE
-branch returns false for any ill-formed input because `preg_match`
-fails, while the fallback skips invalid spans and reports the
-noncharacters around them (`"\xC0\xEF\xBF\xBE"`: PCRE false, fallback
-true). The fuzzer's stance is that behavior is undefined unless
-`wp_is_valid_utf8()`; the divergence itself is pinned by a fixed
-regression vector in the smoke test, and aligning the implementations
-(or documenting the stance in core) is an open upstream question for
-the function author.
+Noncharacter detection is checked on arbitrary bytes:
+`wp_has_noncharacters()`, the deprecated `_wp_has_noncharacters_fallback()`
+wrapper, and the independent byte-sequence reference must agree. On
+valid UTF-8, the mb decode-and-test reference must also agree.
 
 Internal invariants:
 
@@ -142,17 +131,12 @@ Internal invariants:
 
 ## Invalid-Input Noncharacter Policy
 
-Noncharacter differentials intentionally remain valid-input-only. The
-current invalid-input divergence is pinned by smoke, not fuzz-expanded:
-on hosts using the PCRE-u branch,
-`wp_has_noncharacters( "\xC0\xEF\xBF\xBE" )` returns false because the
-regular expression fails on ill-formed UTF-8, while
-`_wp_has_noncharacters_fallback( "\xC0\xEF\xBF\xBE" )` returns true
-because the fallback scanner skips the invalid byte and finds U+FFFE.
-
-Do not add invalid-input noncharacter fuzzing until Core decides whether
-`wp_has_noncharacters()` is documented as valid-input-only or the public
-and fallback paths are aligned on ill-formed input.
+Trunk aligned the invalid-input behavior: `wp_has_noncharacters()`
+matches the UTF-8 byte sequences for noncharacters directly, so
+malformed bytes elsewhere in the string do not suppress detection.
+`_wp_has_noncharacters_fallback()` is deprecated and delegates to the
+public function. The fuzzer therefore includes invalid-input
+noncharacter cases in the normal differential.
 
 ## Inputs
 
@@ -274,8 +258,7 @@ noncharacter-leaking `_wp_scan_utf8()`, noncharacter-missing
 stale-noncharacter-flag `_wp_scan_utf8()`)
 must all be caught. It also asserts generator determinism, the
 valid/invalid input mix, the deterministic short-boundary corpus, and
-the documented `wp_has_noncharacters()` divergence stance on ill-formed
-input.
+the aligned `wp_has_noncharacters()` behavior on ill-formed input.
 
 For end-to-end pipeline testing while the real implementations are
 healthy, `ENCODING_FUZZ_FAULT=accept-c0|non-maximal|encode-cp1252|decode-per-byte|nonchars-miss-fdd0|nonchars-overeager|span-off-by-one|span-invalid-bytes|span-found-max|span-found-stale|substr-byte-level|substr-scrub|substr-no-neg-len|substr-force-utf8|count-invalid-bytes|count-range-minus1|count-ignore-offset|scan-ignore-bytes|scan-nonchars-leak|scan-miss-nonchars|scan-ascii-overrun|scan-stale-nonchars`
diff --git a/tools/encoding-fuzz/lib/Checks.php b/tools/encoding-fuzz/lib/Checks.php
index 2a48994979cb1..2d040a7b9e6d9 100644
--- a/tools/encoding-fuzz/lib/Checks.php
+++ b/tools/encoding-fuzz/lib/Checks.php
@@ -28,11 +28,10 @@
  *    chunks reconstructs the same scrubbed text and always makes
  *    forward progress
  *
- * Noncharacter detection (VALID input only — the public function's
- * answer on ill-formed input depends on which environment branch of
- * `utf8.php` loaded, a documented divergence pinned by the smoke test):
+ * Noncharacter detection:
  *  - `wp_has_noncharacters()` and `_wp_has_noncharacters_fallback()` vs
- *    a trivial decode-and-test reference.
+ *    an independent UTF-8 noncharacter byte-sequence oracle, with an mb
+ *    decode-and-test cross-check on valid UTF-8.
  *
  * Legacy `utf8_encode()` / `utf8_decode()` fallbacks:
  *  - `_wp_utf8_encode_fallback()` vs every encode oracle on arbitrary
@@ -293,7 +292,7 @@ public function run( string $input ): array {
 			$failures[] = $failure;
 		}
 
-		// 11. Noncharacter detection, on valid input only.
+		// 11. Noncharacter detection.
 		foreach ( $this->check_noncharacters( $input, $ref_valid ) as $failure ) {
 			$failures[] = $failure;
 		}
@@ -1065,37 +1064,24 @@ private function check_mb_chr_ord( string $input ): array {
 	}
 
 	/**
-	 * Three-way differential for noncharacter detection on VALID input:
-	 * the public `wp_has_noncharacters()` (the PCRE branch on hosts with
-	 * PCRE-u; otherwise it aliases the fallback and this degenerates to
-	 * two distinct implementations), the `_wp_scan_utf8()`-based
-	 * fallback, and the trivial mb reference must all agree.
-	 *
-	 * Ill-formed input is deliberately skipped: the PCRE branch answers
-	 * false on any ill-formed input (`preg_match` fails) while the
-	 * fallback skips invalid spans and reports noncharacters around
-	 * them, so the same public function answers differently depending
-	 * on which environment branch loaded. That stance — behavior is
-	 * undefined unless `wp_is_valid_utf8()` — is pinned by a fixed
-	 * regression vector in the smoke test, not fuzzed.
+	 * Differential for noncharacter detection over arbitrary bytes. The
+	 * primary oracle searches for the UTF-8 byte sequences that encode
+	 * Unicode noncharacters. On valid UTF-8 input, the trivial mb
+	 * decode-and-test oracle is also cross-checked.
 	 *
 	 * @return array<int, array{check: string, signature: string, detail: array}>
 	 */
 	private function check_noncharacters( string $input, bool $ref_valid ): array {
-		if ( ! $ref_valid ) {
-			return array();
-		}
-
 		$oracles = $this->oracles->noncharacter_oracles();
-		if ( ! isset( $oracles['mb'] ) ) {
+		if ( ! isset( $oracles['bytes'] ) ) {
 			return array();
 		}
 
 		$failures = array();
-		$expected = $oracles['mb']( $input );
+		$expected = $oracles['bytes']( $input );
 
 		foreach ( $oracles as $name => $oracle ) {
-			if ( 'mb' === $name ) {
+			if ( 'bytes' === $name || ( 'mb' === $name && ! $ref_valid ) ) {
 				continue;
 			}
 
@@ -1138,7 +1124,7 @@ private function check_noncharacters( string $input, bool $ref_valid ): array {
 						'target'        => $key,
 						'got'           => $result,
 						'expected'      => $expected,
-						'oracle'        => 'mb',
+						'oracle'        => 'bytes',
 						'input_preview' => self::preview( $input ),
 					)
 				);
diff --git a/tools/encoding-fuzz/lib/Oracles.php b/tools/encoding-fuzz/lib/Oracles.php
index 27c8822c07b79..72c5fe736cdf7 100644
--- a/tools/encoding-fuzz/lib/Oracles.php
+++ b/tools/encoding-fuzz/lib/Oracles.php
@@ -8,9 +8,9 @@
  * Scrub oracles answer "what does maximal-subpart replacement produce?".
  * Encode oracles answer "what is this ISO-8859-1 text as UTF-8?".
  * Decode oracles answer "what is this UTF-8 text as ISO-8859-1?".
- * Noncharacter oracles answer "does this VALID UTF-8 text contain a
- * Unicode noncharacter?" (U+FDD0–U+FDEF, or any code point whose low
- * sixteen bits are FFFE or FFFF). They are defined on valid input only.
+ * Noncharacter oracles answer "do these bytes contain the UTF-8 encoding
+ * of a Unicode noncharacter?" (U+FDD0–U+FDEF, or any code point whose low
+ * sixteen bits are FFFE or FFFF).
  *
  *  - mbstring:  `mb_check_encoding()` / `mb_scrub()` (maximal subpart
  *               since PHP 8.1.6), `mb_convert_encoding()` for the
@@ -66,7 +66,7 @@ class Oracles {
 	/** @var array<string, bool> Decode oracles trusted on valid UTF-8 input only. */
 	private array $decode_valid_only = array();
 
-	/** @var array<string, callable(string): bool> Defined on valid UTF-8 input only. */
+	/** @var array<string, callable(string): bool> */
 	private array $noncharacters = array();
 
 	/** @var ExternalOracle[] */
@@ -120,12 +120,36 @@ public static function build( array $external_names ): self {
 			? 'mb_ord'
 			: ( function_exists( '_mb_ord' ) ? '_mb_ord' : null );
 
+		$oracles->noncharacters['bytes'] = static function ( string $bytes ): bool {
+			static $noncharacter_sequences = null;
+			if ( null === $noncharacter_sequences ) {
+				$noncharacter_sequences = array();
+
+				for ( $code_point = 0xFDD0; $code_point <= 0xFDEF; $code_point++ ) {
+					$noncharacter_sequences[] = Generator::encode_code_point( $code_point );
+				}
+
+				for ( $plane = 0; $plane <= 0x10; $plane++ ) {
+					$final                     = ( $plane << 16 ) | 0xFFFF;
+					$noncharacter_sequences[] = Generator::encode_code_point( $final - 1 );
+					$noncharacter_sequences[] = Generator::encode_code_point( $final );
+				}
+			}
+
+			foreach ( $noncharacter_sequences as $sequence ) {
+				if ( str_contains( $bytes, $sequence ) ) {
+					return true;
+				}
+			}
+
+			return false;
+		};
+
 		if ( function_exists( 'mb_str_split' ) && null !== $mb_ord ) {
 			/*
 			 * Trivial decode-and-test reference for noncharacter detection,
-			 * independent of both implementations under test (the PCRE
-			 * character-class regex and the `_wp_scan_utf8()`-based scan).
-			 * Callers must pass valid UTF-8.
+			 * independent of the byte-sequence search. Callers must pass
+			 * valid UTF-8.
 			 */
 			$oracles->noncharacters['mb'] = static function ( string $valid_utf8 ) use ( $mb_ord ): bool {
 				foreach ( mb_str_split( $valid_utf8, 1, 'UTF-8' ) as $character ) {
@@ -327,11 +351,9 @@ public static function decode_battery(): array {
 
 	/**
 	 * Known-answer vectors for the noncharacter oracles. All inputs are
-	 * valid UTF-8 (the question is only defined there) and cover the
-	 * boundaries AND interior of the U+FDD0–U+FDEF block plus the final
-	 * two code points of EVERY plane with their U+xFFFD neighbors — the
-	 * PCRE implementation under test enumerates each plane as a separate
-	 * hand-typed escape, exactly where a single-plane typo would hide.
+	 * either valid UTF-8 or ill-formed byte strings; together they cover
+	 * the boundaries AND interior of the U+FDD0–U+FDEF block plus the final
+	 * two code points of EVERY plane with their U+xFFFD neighbors.
 	 *
 	 * Expectations are hand-derived from the Unicode definition; bytes
 	 * for the looped vectors come from the pure-arithmetic
@@ -339,12 +361,16 @@ public static function decode_battery(): array {
 	 * against `mb_chr()` by `tests/code-point-to-utf8-exhaustive.php`),
 	 * keeping the encoding independent of the mbstring-backed oracle.
 	 *
-	 * @return array<int, array{0: string, 1: bool}> [valid utf8 bytes, has noncharacters]
+	 * @return array<int, array{0: string, 1: bool}> [bytes, has noncharacters]
 	 */
 	public static function noncharacter_battery(): array {
 		$vectors = array(
 			array( '', false ),
 			array( 'abc', false ),
+			array( "\xC0abc", false ),
+			array( "\xC0\xEF\xBF\xBE", true ),
+			array( "\xC0a\xEF\xB7\x90b", true ),
+			array( "\xC0\xEF\xB7\x8F", false ),
 			array( "\u{FDCF}", false ),       // Last code point before the contiguous block.
 			array( "\u{FDD0}", true ),        // First of the contiguous block.
 			array( "\u{FDDA}", true ),        // Interior of the block: a lookup-table bug
@@ -423,6 +449,10 @@ private function verify_battery(): void {
 			list( $bytes, $expected ) = $vector;
 
 			foreach ( $this->noncharacters as $name => $check ) {
+				if ( 'mb' === $name && ( ! function_exists( 'mb_check_encoding' ) || ! mb_check_encoding( $bytes, 'UTF-8' ) ) ) {
+					continue;
+				}
+
 				$got = $check( $bytes );
 				if ( $got !== $expected ) {
 					$this->disable( $name, sprintf(
@@ -514,7 +544,7 @@ public function decode_oracle_is_valid_only( string $name ): bool {
 		return $this->decode_valid_only[ $name ] ?? false;
 	}
 
-	/** @return array<string, callable(string): bool> Defined on valid UTF-8 input only. */
+	/** @return array<string, callable(string): bool> Some oracles (e.g. 'mb') are defined on valid UTF-8 input only. */
 	public function noncharacter_oracles(): array {
 		return $this->noncharacters;
 	}
diff --git a/tools/encoding-fuzz/lib/wp-stubs.php b/tools/encoding-fuzz/lib/wp-stubs.php
index 2eaa662cbbeba..3ece6f3292650 100644
--- a/tools/encoding-fuzz/lib/wp-stubs.php
+++ b/tools/encoding-fuzz/lib/wp-stubs.php
@@ -25,3 +25,7 @@ function get_option( $option, $default_value = false ) {
 		return 'blog_charset' === $option ? 'UTF-8' : $default_value;
 	}
 }
+
+if ( ! function_exists( '_deprecated_function' ) ) {
+	function _deprecated_function( $function_name, $version, $replacement = '' ): void {}
+}
diff --git a/tools/encoding-fuzz/tests/harness-smoke.php b/tools/encoding-fuzz/tests/harness-smoke.php
index 6fcd660776fe6..31f188685a9bf 100644
--- a/tools/encoding-fuzz/tests/harness-smoke.php
+++ b/tools/encoding-fuzz/tests/harness-smoke.php
@@ -71,27 +71,31 @@ function check( string $label, bool $ok, string $detail = '' ): void {
 check( 'real targets clean on battery', array() === $battery_fails, implode( '; ', $battery_fails ) );
 
 /*
- * Documented stance: `wp_has_noncharacters()` is undefined on ill-formed
- * input. On hosts with PCRE-u the public function answers false on ANY
- * ill-formed input (`preg_match` fails) while the fallback skips invalid
- * spans and reports the noncharacters around them. This regression
- * vector pins the divergence; if it ever changes, the semantics were
- * touched and the valid-input-only fuzzing policy must be revisited.
+ * Trunk aligned the behavior on ill-formed input: the public function now
+ * searches for noncharacter UTF-8 byte sequences directly, and the old
+ * private fallback survives only as a deprecated wrapper.
  */
 $nonchar_probe = "\xC0\xEF\xBF\xBE"; // Invalid byte, then U+FFFE.
-if ( _wp_can_use_pcre_u() ) {
-	check(
-		'documented wp_has_noncharacters divergence on ill-formed input unchanged',
-		false === wp_has_noncharacters( $nonchar_probe ) && true === _wp_has_noncharacters_fallback( $nonchar_probe ),
-		sprintf(
-			'public: %s, fallback: %s',
-			var_export( wp_has_noncharacters( $nonchar_probe ), true ),
-			var_export( _wp_has_noncharacters_fallback( $nonchar_probe ), true )
-		)
-	);
-} else {
-	echo "SKIP documented wp_has_noncharacters divergence (no PCRE-u: public function aliases the fallback)\n";
-}
+check(
+	'wp_has_noncharacters detects noncharacters inside ill-formed input',
+	true === wp_has_noncharacters( $nonchar_probe ) && true === _wp_has_noncharacters_fallback( $nonchar_probe ),
+	sprintf(
+		'public: %s, fallback: %s',
+		var_export( wp_has_noncharacters( $nonchar_probe ), true ),
+		var_export( _wp_has_noncharacters_fallback( $nonchar_probe ), true )
+	)
+);
+
+$nonchar_absent_probe = "\xC0abc";
+check(
+	'wp_has_noncharacters ignores ill-formed input without noncharacters',
+	false === wp_has_noncharacters( $nonchar_absent_probe ) && false === _wp_has_noncharacters_fallback( $nonchar_absent_probe ),
+	sprintf(
+		'public: %s, fallback: %s',
+		var_export( wp_has_noncharacters( $nonchar_absent_probe ), true ),
+		var_export( _wp_has_noncharacters_fallback( $nonchar_absent_probe ), true )
+	)
+);
 
 // ---------------------------------------------------------------------
 // 3. Broken implementations must be caught.