From c6ec64a8a510f4e146c9e1624272ba4a58db58d4 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 22:10:21 +0200 Subject: [PATCH 1/3] Charset: Document legacy UTF-8 helper divergences --- handoffs/legacy-utf8-divergence-report.md | 397 ++++++++++++++++++++++ 1 file changed, 397 insertions(+) create mode 100644 handoffs/legacy-utf8-divergence-report.md diff --git a/handoffs/legacy-utf8-divergence-report.md b/handoffs/legacy-utf8-divergence-report.md new file mode 100644 index 0000000000000..9b26c144826c0 --- /dev/null +++ b/handoffs/legacy-utf8-divergence-report.md @@ -0,0 +1,397 @@ +# Legacy UTF-8 Helper Divergence Report + +## Scope + +This report covers the current `utf8-survey` checkout and the handoff in +`encoding-fuzzer/handoffs/legacy-utf8-divergence-survey.md`. + +No production code was changed for this survey. The throwaway runner lived at +`/private/tmp/legacy_utf8_divergence_survey.php` and reused the generator and +oracle battery from the adjacent `encoding-fuzzer/tools/encoding-fuzz/` +checkout. It loaded this checkout's `compat.php`, `compat-utf8.php`, +`utf8.php`, and `formatting.php` with minimal stubs. + +The generated pass was deterministic: case `N` used +`new EncodingFuzz\Prng( "legacy-utf8-divergence:N" )`. The exact command was: + +```sh +php /private/tmp/legacy_utf8_divergence_survey.php 3000000 256 > /private/tmp/legacy_utf8_divergence_survey_results.json +``` + +Important current-branch note: the handoff describes +`wp_check_invalid_utf8()` as PCRE-based. That is historically correct, but this +checkout already contains the 6.9-era rewrite from `d1e7f5625b`, so the current +implementation now calls `wp_is_valid_utf8()` and `wp_scrub_utf8()` when +`blog_charset` is UTF-8. 
+ +## Environment + +- PHP: 8.4.21 +- Extensions present: `mbstring`, `intl` +- PCRE Unicode support: yes +- Generated pass: 3,000,000 inputs, 366,593,389 bytes, max generated input + size 256 bytes +- Generator strategies: random bytes, random ASCII, valid UTF-8, + mutated-valid UTF-8, atom splices, latin1-ish text, UTF-16 bytes, + ASCII fast paths, repeated motifs +- Oracle battery: `EncodingFuzz\Oracles::battery()` + +`wp_check_invalid_utf8()` caches the first `is_utf8_charset()` result in a +static. To compare UTF-8 and non-UTF-8 charset behavior in one process, the +runner used an equivalent uncached copy of the current implementation for that +matrix. This models first-call behavior in a fresh request under each charset. + +## Aggregate Results + +| Measurement | Count | +| --- | ---: | +| Generated inputs | 3,000,000 | +| Strict-valid inputs | 1,128,174 | +| Strict-invalid inputs | 1,871,826 | +| `seems_utf8()` accepted strict-invalid input | 92,007 | +| `seems_utf8()` rejected strict-invalid input | 1,779,819 | +| `seems_utf8()` rejected strict-valid input | 0 | +| `wp_check_invalid_utf8( $s, false )` returned `''` for invalid UTF-8 under UTF-8 charset | 1,871,826 | +| `wp_check_invalid_utf8( $s, true )` matched `wp_scrub_utf8( $s )` under UTF-8 charset | 1,871,826 | +| `wp_check_invalid_utf8( $s, true )` mismatched `wp_scrub_utf8( $s )` under UTF-8 charset | 0 | +| `wp_check_invalid_utf8()` passed invalid bytes through under ISO-8859-1 charset | 1,871,826 | + +`seems_utf8()` accepted strict-invalid inputs in exactly these buckets: + +| Divergence class | Generated examples | +| --- | ---: | +| UTF-16 surrogate sequence | 21,051 | +| Code point above `U+10FFFF` | 14,770 | +| Obsolete 5-byte sequence | 6,850 | +| Obsolete 6-byte sequence | 6,764 | +| Overlong 2-byte sequence | 14,686 | +| Overlong 3-byte sequence | 13,775 | +| Overlong 4-byte sequence | 14,111 | + +No generated class showed `wp_check_invalid_utf8( $s, true )` diverging from +`wp_scrub_utf8( 
$s )` when `blog_charset` was UTF-8. + +## Divergence Matrix + +All byte strings are hex. `R` means one `U+FFFD` replacement character, encoded +as `EF BF BD`. `same` means the original byte string is returned unchanged. + +| Input class | Minimal bytes | `wp_is_valid_utf8()` | `seems_utf8()` | `wp_check_invalid_utf8( false )`, UTF-8 charset | `wp_check_invalid_utf8( true )`, UTF-8 charset | `wp_check_invalid_utf8()`, non-UTF-8 charset | +| --- | --- | --- | --- | --- | --- | --- | +| ASCII | `41` | accept | accept | same | same | same | +| Valid 2-byte lower edge | `C2 80` | accept | accept | same | same | same | +| Valid 3-byte lower edge | `E0 A0 80` | accept | accept | same | same | same | +| Valid 4-byte upper edge | `F4 8F BF BF` | accept | accept | same | same | same | +| Noncharacter `U+FFFE` | `EF BF BE` | accept | accept | same | same | same | +| Replacement character `U+FFFD` | `EF BF BD` | accept | accept | same | same | same | +| Lone continuation | `80` | reject | reject | `''` | `R` | same | +| Invalid `FE`/`FF` lead | `FE` | reject | reject | `''` | `R` | same | +| Truncated 2-byte sequence | `C2` | reject | reject | `''` | `R` | same | +| Truncated 3-byte sequence | `E2 8C` | reject | reject | `''` | `R` | same | +| Truncated 4-byte sequence | `F1 80 80` | reject | reject | `''` | `R` | same | +| Overlong 2-byte sequence | `C0 80` | reject | accept | `''` | `R R` | same | +| Overlong 3-byte sequence | `E0 80 80` | reject | accept | `''` | `R R R` | same | +| Overlong 4-byte sequence | `F0 80 80 80` | reject | accept | `''` | `R R R R` | same | +| UTF-16 surrogate sequence | `ED A0 80` | reject | accept | `''` | `R R R` | same | +| Above `U+10FFFF`, `F4` form | `F4 90 80 80` | reject | accept | `''` | `R R R R` | same | +| Above `U+10FFFF`, `F5` form | `F5 80 80 80` | reject | accept | `''` | `R R R R` | same | +| Obsolete 5-byte sequence | `F8 80 80 80 80` | reject | accept | `''` | `R R R R R` | same | +| Obsolete 6-byte sequence | `FC 80 80 80 80 
80` | reject | accept | `''` | `R R R R R R` | same | +| Valid text plus overlong bytes | `41 C0 80 5A` | reject | accept | `''` | `41 R R 5A` | same | + +## Divergence Classes + +### `seems_utf8()` accepts overlong encodings + +Representative inputs: `C0 80`, `E0 80 80`, `F0 80 80 80`. + +Classification: accidental if the caller expects valid UTF-8; historically +load-bearing only as a loose structural heuristic. + +Evidence: + +- `src/wp-includes/formatting.php` says the function checks whether the string + "fits a UTF-8 model", not whether it is well-formed UTF-8. +- Core Trac #38044 was specifically opened to make `seems_utf8()` RFC 3629 + compliant and calls out overlong acceptance as a defect: + https://core.trac.wordpress.org/ticket/38044 +- Commit `bb6ed3ba22` introduced `wp_is_valid_utf8()` and deprecated + `seems_utf8()` instead of tightening the old function in place. + +Impact: + +Replacing `seems_utf8()` with `wp_is_valid_utf8()` is behavior-changing for +saved data containing these bytes: old code reports "yes"; strict validation +reports "no". + +### `seems_utf8()` accepts UTF-16 surrogate encodings + +Representative input: `ED A0 80`. + +Classification: accidental. Surrogate halves are not Unicode scalar values and +are rejected by the strict validator and by the fuzzer battery. + +Evidence: + +- Trac #38044 explicitly names surrogate acceptance as part of the RFC 3629 + compliance problem: https://core.trac.wordpress.org/ticket/38044 +- The current `wp_is_valid_utf8()` docblock gives surrogate halves as invalid + examples. + +Impact: + +Same as overlongs: `wp_is_valid_utf8()` is the correct replacement for +validation, but it is not a byte-for-byte-compatible replacement. + +### `seems_utf8()` accepts code points above `U+10FFFF` + +Representative inputs: `F4 90 80 80`, `F5 80 80 80`. + +Classification: accidental. The code accepts any `F0`-`F7` lead followed by +three continuation bytes, but modern UTF-8 stops at `F4 8F BF BF`. 
+ +Evidence: + +- The current `wp_is_valid_utf8()` docblock defines well-formed UTF-8 as + excluding characters above the representable range. +- Trac #38044 frames the replacement around RFC 3629 compliance, whose range is + `U+0000..U+10FFFF`. + +Impact: + +Strict replacement rejects bytes that the legacy heuristic accepted. Treat this +as a migration break for data-validation callers. + +### `seems_utf8()` accepts obsolete 5- and 6-byte forms + +Representative inputs: `F8 80 80 80 80`, `FC 80 80 80 80 80`. + +Classification: documented historical looseness, not valid UTF-8. The +docblock warns that the function checks 5-byte sequences even though UTF-8 has a +maximum length of 4 bytes; the code also accepts 6-byte forms. + +Evidence: + +- The 5-byte warning was added in the 2009 cleanup associated with Trac #9692: + https://core.trac.wordpress.org/ticket/9692 +- Trac #38044 records the later decision to deprecate rather than repair this + legacy behavior in place. + +Impact: + +This is the clearest documented non-strict behavior. A strict replacement is +still desirable for validation, but compatibility notes should call out the +change explicitly. + +### `wp_check_invalid_utf8( $s, false )` rejects the whole string + +Representative input: `41 C0 80 5A`. + +Classification: intentional security behavior. Under UTF-8 charset, any invalid +span makes the default mode return `''`; it does not preserve valid surrounding +text. + +Evidence: + +- Trac #8767 introduced the helper in a security/XSS context and discussed the + default empty-string behavior as the more conservative validator-like option: + https://core.trac.wordpress.org/ticket/8767 +- The current docblock documents this default mode. + +Impact: + +`wp_scrub_utf8()` is not a drop-in replacement for default-mode callers because +it preserves the string and inserts replacement characters. That can be a better +user experience in some contexts, but it changes escaping and sanitization +behavior. 
+ +### `wp_check_invalid_utf8( $s, true )` now scrubs with `U+FFFD` + +Representative input: `C0 80` produces `R R`. + +Classification: intentional current behavior. On this branch, `$strip = true` +matches `wp_scrub_utf8()` for all generated strict-invalid inputs under UTF-8 +charset. + +Evidence: + +- Trac #63837 states the plan to rely on `wp_is_valid_utf8()` and add + `wp_scrub_utf8()` for replacement-character scrubbing: + https://core.trac.wordpress.org/ticket/63837 +- Commit `d1e7f5625b` says the old `$strip` defect was fixed and invalid bytes + are now replaced with `U+FFFD` for stronger security guarantees. + +Impact: + +For UTF-8 charset requests, `wp_scrub_utf8()` is behavior-equivalent to +`wp_check_invalid_utf8( $s, true )` except for the legacy function's +`blog_charset` gate. + +### `wp_check_invalid_utf8()` passes through all bytes for non-UTF-8 charsets + +Representative input under `ISO-8859-1` charset: `C0 80` returns `C0 80` in +both modes. + +Classification: intentional environment sensitivity. + +Evidence: + +- The current docblock says the function only performs work when + `blog_charset` is UTF-8. +- Trac #63837 calls out that the function assumes input strings are encoded + with `blog_charset`, and says that point is inherent to how it works: + https://core.trac.wordpress.org/ticket/63837 + +Impact: + +Neither `wp_is_valid_utf8()` nor `wp_scrub_utf8()` is a drop-in replacement +where the caller intentionally wants `blog_charset`-dependent passthrough. + +## Current Core Callers + +### `seems_utf8()` + +No production callers remain in this checkout. The only in-tree production +reference found by `rg` is the function definition itself. + +Migration guidance: + +- For validation callers, `wp_is_valid_utf8()` is the intended replacement, but + it is behavior-changing for overlongs, surrogates, above-range code points, + and 5/6-byte forms. +- For charset-guessing callers, `wp_is_valid_utf8()` is not a semantic drop-in. 
+ Such callers should make the heuristic explicit instead of using + `seems_utf8()`. + +### `esc_js()` + +Current call: `wp_check_invalid_utf8( $text )`. + +Migration guidance: + +- `wp_is_valid_utf8()` is not a drop-in; it returns a boolean and does not + produce escaped text. +- `wp_scrub_utf8()` is behavior-changing; invalid input would be preserved with + `U+FFFD` instead of blanked before JavaScript escaping. +- Keep `wp_check_invalid_utf8()` unless the security model is explicitly + changed from whole-string rejection to scrubbing. + +### `esc_html()` + +Current call: `wp_check_invalid_utf8( $text )`. + +Migration guidance: + +- `wp_scrub_utf8()` is behavior-changing but may be a future product decision if + preserving partially valid display text is preferred. +- It is not a drop-in for current behavior because default-mode + `wp_check_invalid_utf8()` returns `''` for any invalid UTF-8 under UTF-8 + charset and passes raw bytes through under non-UTF-8 charset. + +### `esc_attr()` + +Current call: `wp_check_invalid_utf8( $text )`. + +Migration guidance: + +- Attribute context is especially sensitive to partial decoding and downstream + parser behavior. Keep whole-string rejection unless a dedicated security + review approves replacement-character scrubbing. +- `wp_is_valid_utf8()` is not a drop-in output function. + +### `esc_xml()` + +Current call: `wp_check_invalid_utf8( $text )`. + +Migration guidance: + +- `wp_scrub_utf8()` would be plausible for XML generation because XML requires + valid character data, but it changes the output contract from blanking to + replacement. +- A direct replacement needs XML-specific review, especially because XML also + has character restrictions beyond UTF-8 well-formedness. + +### `_sanitize_text_fields()`, via `sanitize_text_field()` and `sanitize_textarea_field()` + +Current call: `wp_check_invalid_utf8( $str )`. 
+ +Migration guidance: + +- `wp_scrub_utf8()` is behavior-changing: stored/sanitized values that + currently become empty would retain valid surrounding text and replacement + characters. +- This may be user-friendlier, but it is not a drop-in. Treat it as a product + and compatibility decision. + +### `_wp_json_convert_string()` + +Current fallback call: `wp_check_invalid_utf8( $input_string, true )`, only when +`mb_convert_encoding()` is unavailable. + +Migration guidance: + +- Under UTF-8 charset on this branch, `wp_scrub_utf8()` is behavior-equivalent + for generated invalid inputs and is the clearer operation. +- It is still not a full drop-in because `wp_check_invalid_utf8()` preserves raw + input when `blog_charset` is not UTF-8. +- JSON output must be UTF-8, so this is the best candidate for a targeted future + migration away from `wp_check_invalid_utf8()`. + +## Recommendations + +### `seems_utf8()`: keep deprecated; do not repair in place + +The function is a loose structural heuristic with no remaining production core +callers. It accepts several classes of invalid UTF-8 by design of its bit-mask +model, and changing the implementation in place would silently change external +caller behavior. + +Recommended action: + +- Keep the existing deprecation in favor of `wp_is_valid_utf8()`. +- Do not include it in continuous differential fuzzing against strict UTF-8 + validation; the known divergences are permanent unless the deprecated + function is removed or deliberately changed in a backward-incompatible way. +- If docs are touched, say explicitly that it accepts overlongs, surrogates, + above-range code points, and obsolete 5/6-byte forms. The current docblock + mentions 5-byte sequences, but not the full divergence set. + +### `wp_check_invalid_utf8()`: document and leave for default-mode callers + +The current branch has already removed the historical PCRE dependency for +UTF-8 charset requests.
The remaining divergences are semantic: + +- default mode rejects the entire invalid string; +- strip mode scrubs with `U+FFFD`; +- all modes pass bytes through when `blog_charset` is not UTF-8. + +Recommended action: + +- Keep default-mode calls in escaping and sanitization until each context has an + explicit security and compatibility decision. +- Prefer `wp_scrub_utf8()` for new code that unconditionally wants valid UTF-8 + output and does not want `blog_charset` sensitivity. +- Consider a targeted follow-up for `_wp_json_convert_string()`'s fallback path, + because JSON wants UTF-8 and current `$strip = true` behavior already matches + `wp_scrub_utf8()` under UTF-8 charset. + +## Sources Checked + +- Local function history: `git log -L :seems_utf8:src/wp-includes/formatting.php` +- Local function history: `git log -L :wp_check_invalid_utf8:src/wp-includes/formatting.php` +- Current `seems_utf8()` deprecation and `wp_is_valid_utf8()` introduction: + commit `bb6ed3ba22` +- Current `wp_check_invalid_utf8()` / `wp_scrub_utf8()` rewrite: + commit `d1e7f5625b` +- Trac #9692, `seems_utf8()` cleanup: + https://core.trac.wordpress.org/ticket/9692 +- Trac #8767, original `wp_check_invalid_utf8()` security refactor: + https://core.trac.wordpress.org/ticket/8767 +- Trac #38044, RFC 3629 compliance and `wp_is_valid_utf8()`: + https://core.trac.wordpress.org/ticket/38044 +- Trac #63837, `wp_check_invalid_utf8()` rewrite and `wp_scrub_utf8()`: + https://core.trac.wordpress.org/ticket/63837 +- Trac #29717, historical PCRE behavior and caller importance: + https://core.trac.wordpress.org/ticket/29717 +- Trac #63863, standardizing UTF-8 handling and fallbacks: + https://core.trac.wordpress.org/ticket/63863 From 700d7c8c910fa44d65d097cf31048eb4078abd60 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 23:07:16 +0200 Subject: [PATCH 2/3] Charset: Add legacy UTF-8 survey runner --- .../legacy-utf8-divergence-survey-runner.php | 365 ++++++++++++++++++ 1 file 
changed, 365 insertions(+) create mode 100644 handoffs/legacy-utf8-divergence-survey-runner.php diff --git a/handoffs/legacy-utf8-divergence-survey-runner.php b/handoffs/legacy-utf8-divergence-survey-runner.php new file mode 100644 index 0000000000000..12784fd6a2219 --- /dev/null +++ b/handoffs/legacy-utf8-divergence-survey-runner.php @@ -0,0 +1,365 @@ + /tmp/legacy_utf8_divergence_survey_results.json + * + * The runner expects the encoding fuzzer checkout beside this repository by + * default, or in ENCODING_FUZZER_ROOT when provided. It deliberately loads a + * small subset of WordPress with stubs because the report is about byte-level + * helper behavior, not full WordPress bootstrap behavior. + */ + +use EncodingFuzz\Generator; +use EncodingFuzz\Oracles; +use EncodingFuzz\Prng; + +function survey_repo_root(): string { + return dirname( __DIR__ ); +} + +function survey_fuzzer_root(): string { + $from_env = getenv( 'ENCODING_FUZZER_ROOT' ); + if ( is_string( $from_env ) && '' !== $from_env ) { + return rtrim( $from_env, '/' ); + } + + return dirname( survey_repo_root() ) . '/encoding-fuzzer'; +} + +$fuzzer_root = survey_fuzzer_root(); +require $fuzzer_root . '/tools/encoding-fuzz/lib/autoload.php'; + +$GLOBALS['survey_blog_charset'] = 'UTF-8'; + +function _deprecated_function( $function_name, $version, $replacement = '' ) {} +function _deprecated_argument( $function_name, $version, $message = '' ) {} +function get_option( $name ) { + return 'blog_charset' === $name ? $GLOBALS['survey_blog_charset'] : null; +} +function mbstring_binary_safe_encoding( $reset = false ) {} +function reset_mbstring_encoding() {} + +require survey_repo_root() . '/src/wp-includes/compat.php'; + +function is_utf8_charset( $blog_charset = null ) { + return _is_utf8_charset( $blog_charset ?? get_option( 'blog_charset' ) ); +} + +require survey_repo_root() . '/src/wp-includes/compat-utf8.php'; +require survey_repo_root() . '/src/wp-includes/utf8.php'; +require survey_repo_root() . 
'/src/wp-includes/formatting.php'; + +function hx( string $bytes ): string { + return strtoupper( trim( chunk_split( bin2hex( $bytes ), 2, ' ' ) ) ); +} + +function visible( string $bytes ): string { + if ( '' === $bytes ) { + return "''"; + } + + return hx( $bytes ); +} + +function check_invalid_with_charset( string $bytes, bool $strip, string $charset ): string { + $GLOBALS['survey_blog_charset'] = $charset; + + return wp_check_invalid_utf8_uncached( $bytes, $strip ); +} + +function wp_check_invalid_utf8_uncached( string $text, bool $strip ): string { + $text = (string) $text; + + if ( 0 === strlen( $text ) ) { + return ''; + } + + if ( ! is_utf8_charset() || wp_is_valid_utf8( $text ) ) { + return $text; + } + + return $strip ? wp_scrub_utf8( $text ) : ''; +} + +function invalid_class( string $bytes ): string { + $length = strlen( $bytes ); + for ( $i = 0; $i < $length; $i++ ) { + $b0 = ord( $bytes[ $i ] ); + if ( $b0 < 0x80 ) { + continue; + } + + if ( $b0 >= 0x80 && $b0 <= 0xBF ) { + return 'lone continuation byte'; + } + + if ( $b0 >= 0xC0 && $b0 <= 0xC1 ) { + return has_continuations( $bytes, $i + 1, 1 ) ? 'overlong 2-byte sequence' : 'truncated C0/C1 lead'; + } + + if ( $b0 >= 0xC2 && $b0 <= 0xDF ) { + if ( ! has_continuations( $bytes, $i + 1, 1 ) ) { + return 'truncated 2-byte sequence'; + } + $i += 1; + continue; + } + + if ( 0xE0 === $b0 ) { + if ( ! has_continuations( $bytes, $i + 1, 2 ) ) { + return 'truncated 3-byte sequence'; + } + if ( ord( $bytes[ $i + 1 ] ) < 0xA0 ) { + return 'overlong 3-byte sequence'; + } + $i += 2; + continue; + } + + if ( $b0 >= 0xE1 && $b0 <= 0xEC ) { + if ( ! has_continuations( $bytes, $i + 1, 2 ) ) { + return 'truncated 3-byte sequence'; + } + $i += 2; + continue; + } + + if ( 0xED === $b0 ) { + if ( ! 
has_continuations( $bytes, $i + 1, 2 ) ) { + return 'truncated 3-byte sequence'; + } + if ( ord( $bytes[ $i + 1 ] ) >= 0xA0 ) { + return 'UTF-16 surrogate sequence'; + } + $i += 2; + continue; + } + + if ( $b0 >= 0xEE && $b0 <= 0xEF ) { + if ( ! has_continuations( $bytes, $i + 1, 2 ) ) { + return 'truncated 3-byte sequence'; + } + $i += 2; + continue; + } + + if ( 0xF0 === $b0 ) { + if ( ! has_continuations( $bytes, $i + 1, 3 ) ) { + return 'truncated 4-byte sequence'; + } + if ( ord( $bytes[ $i + 1 ] ) < 0x90 ) { + return 'overlong 4-byte sequence'; + } + $i += 3; + continue; + } + + if ( $b0 >= 0xF1 && $b0 <= 0xF3 ) { + if ( ! has_continuations( $bytes, $i + 1, 3 ) ) { + return 'truncated 4-byte sequence'; + } + $i += 3; + continue; + } + + if ( 0xF4 === $b0 ) { + if ( ! has_continuations( $bytes, $i + 1, 3 ) ) { + return 'truncated 4-byte sequence'; + } + if ( ord( $bytes[ $i + 1 ] ) > 0x8F ) { + return 'code point above U+10FFFF'; + } + $i += 3; + continue; + } + + if ( $b0 >= 0xF5 && $b0 <= 0xF7 ) { + return has_continuations( $bytes, $i + 1, 3 ) ? 'code point above U+10FFFF' : 'invalid F5-F7 lead'; + } + + if ( $b0 >= 0xF8 && $b0 <= 0xFB ) { + return has_continuations( $bytes, $i + 1, 4 ) ? 'obsolete 5-byte sequence' : 'invalid F8-FB lead'; + } + + if ( $b0 >= 0xFC && $b0 <= 0xFD ) { + return has_continuations( $bytes, $i + 1, 5 ) ? 
'obsolete 6-byte sequence' : 'invalid FC-FD lead'; + } + + return 'FE/FF invalid lead'; + } + + return 'valid'; +} + +function has_continuations( string $bytes, int $start, int $count ): bool { + for ( $i = 0; $i < $count; $i++ ) { + $at = $start + $i; + if ( $at >= strlen( $bytes ) ) { + return false; + } + + $b = ord( $bytes[ $at ] ); + if ( ( $b & 0xC0 ) !== 0x80 ) { + return false; + } + } + + return true; +} + +function vector_row( string $name, string $bytes ): array { + $valid = wp_is_valid_utf8( $bytes ); + $seems = seems_utf8( $bytes ); + $scrubbed = wp_scrub_utf8( $bytes ); + $check_keep = check_invalid_with_charset( $bytes, false, 'UTF-8' ); + $check_strip = check_invalid_with_charset( $bytes, true, 'UTF-8' ); + $latin_keep = check_invalid_with_charset( $bytes, false, 'ISO-8859-1' ); + $latin_strip = check_invalid_with_charset( $bytes, true, 'ISO-8859-1' ); + + return array( + 'name' => $name, + 'hex' => hx( $bytes ), + 'class' => invalid_class( $bytes ), + 'wp_is_valid_utf8' => $valid, + 'seems_utf8' => $seems, + 'wp_scrub_utf8_hex' => visible( $scrubbed ), + 'check_utf8_keep_hex' => visible( $check_keep ), + 'check_utf8_strip_hex' => visible( $check_strip ), + 'check_latin_keep_hex' => visible( $latin_keep ), + 'check_latin_strip_hex' => visible( $latin_strip ), + ); +} + +$vectors = array( + 'ascii' => 'A', + 'valid 2-byte lower edge' => "\xC2\x80", + 'valid 3-byte lower edge' => "\xE0\xA0\x80", + 'valid 4-byte upper edge' => "\xF4\x8F\xBF\xBF", + 'valid noncharacter U+FFFE' => "\xEF\xBF\xBE", + 'valid replacement U+FFFD' => "\xEF\xBF\xBD", + 'lone continuation' => "\x80", + 'FE invalid lead' => "\xFE", + 'truncated 2-byte' => "\xC2", + 'truncated 3-byte' => "\xE2\x8C", + 'truncated 4-byte' => "\xF1\x80\x80", + 'overlong 2-byte' => "\xC0\x80", + 'overlong 3-byte' => "\xE0\x80\x80", + 'overlong 4-byte' => "\xF0\x80\x80\x80", + 'surrogate U+D800' => "\xED\xA0\x80", + 'above U+10FFFF F4' => "\xF4\x90\x80\x80", + 'above U+10FFFF F5' => "\xF5\x80\x80\x80", 
+ 'obsolete 5-byte' => "\xF8\x80\x80\x80\x80", + 'obsolete 6-byte' => "\xFC\x80\x80\x80\x80\x80", + 'mixed invalid in text' => "A\xC0\x80Z", +); + +$rows = array(); +foreach ( $vectors as $name => $bytes ) { + $rows[] = vector_row( $name, $bytes ); +} + +$battery_rows = array(); +foreach ( Oracles::battery() as $i => $vector ) { + $battery_rows[] = vector_row( "battery {$i}", $vector[0] ); +} + +$cases = (int) ( $argv[1] ?? 100000 ); +$max_bytes = (int) ( $argv[2] ?? 256 ); +$stats = array( + 'cases' => 0, + 'bytes' => 0, + 'strict_valid' => 0, + 'strict_invalid' => 0, + 'seems_accepts_strict_invalid' => 0, + 'seems_rejects_strict_invalid' => 0, + 'seems_rejects_strict_valid' => 0, + 'check_utf8_keep_empty_on_invalid' => 0, + 'check_utf8_strip_matches_scrub' => 0, + 'check_utf8_strip_mismatches_scrub' => 0, + 'check_latin1_passthrough_on_invalid' => 0, + 'seems_accepts_invalid_by_class' => array(), + 'seems_rejects_invalid_by_class' => array(), + 'first_example_by_class' => array(), + 'strategy_counts' => array(), +); + +$start = microtime( true ); +for ( $case = 0; $case < $cases; $case++ ) { + $prng = new Prng( "legacy-utf8-divergence:{$case}" ); + $generator = new Generator( $prng, $max_bytes ); + $generated = $generator->generate(); + $bytes = $generated['bytes']; + $strategy = $generated['strategy']; + $valid = wp_is_valid_utf8( $bytes ); + $seems = seems_utf8( $bytes ); + $class = $valid ? 'valid' : invalid_class( $bytes ); + $scrubbed = wp_scrub_utf8( $bytes ); + + ++$stats['cases']; + $stats['bytes'] += strlen( $bytes ); + $stats['strategy_counts'][ $strategy ] = ( $stats['strategy_counts'][ $strategy ] ?? 0 ) + 1; + + if ( $valid ) { + ++$stats['strict_valid']; + if ( ! $seems ) { + ++$stats['seems_rejects_strict_valid']; + } + continue; + } + + ++$stats['strict_invalid']; + if ( $seems ) { + ++$stats['seems_accepts_strict_invalid']; + $stats['seems_accepts_invalid_by_class'][ $class ] = ( $stats['seems_accepts_invalid_by_class'][ $class ] ?? 
0 ) + 1; + } else { + ++$stats['seems_rejects_strict_invalid']; + $stats['seems_rejects_invalid_by_class'][ $class ] = ( $stats['seems_rejects_invalid_by_class'][ $class ] ?? 0 ) + 1; + } + + if ( ! isset( $stats['first_example_by_class'][ $class ] ) ) { + $stats['first_example_by_class'][ $class ] = hx( strlen( $bytes ) > 24 ? substr( $bytes, 0, 24 ) : $bytes ); + } + + if ( '' === check_invalid_with_charset( $bytes, false, 'UTF-8' ) ) { + ++$stats['check_utf8_keep_empty_on_invalid']; + } + + if ( check_invalid_with_charset( $bytes, true, 'UTF-8' ) === $scrubbed ) { + ++$stats['check_utf8_strip_matches_scrub']; + } else { + ++$stats['check_utf8_strip_mismatches_scrub']; + } + + if ( + check_invalid_with_charset( $bytes, false, 'ISO-8859-1' ) === $bytes && + check_invalid_with_charset( $bytes, true, 'ISO-8859-1' ) === $bytes + ) { + ++$stats['check_latin1_passthrough_on_invalid']; + } +} + +ksort( $stats['seems_accepts_invalid_by_class'] ); +ksort( $stats['seems_rejects_invalid_by_class'] ); +ksort( $stats['strategy_counts'] ); + +$stats['elapsed_sec'] = round( microtime( true ) - $start, 3 ); + +echo json_encode( + array( + 'environment' => array( + 'php' => PHP_VERSION, + 'mbstring' => extension_loaded( 'mbstring' ), + 'intl' => extension_loaded( 'intl' ), + 'pcre_u' => _wp_can_use_pcre_u(), + 'cases' => $cases, + 'max_bytes' => $max_bytes, + ), + 'vectors' => $rows, + 'battery' => $battery_rows, + 'stats' => $stats, + ), + JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES +); From bce0942d855443b3d4bdfeff6925c944a2da06c1 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 23:07:49 +0200 Subject: [PATCH 3/3] Charset: Remove legacy UTF-8 survey runner --- handoffs/legacy-utf8-divergence-report.md | 4 + .../legacy-utf8-divergence-survey-runner.php | 365 ------------------ 2 files changed, 4 insertions(+), 365 deletions(-) delete mode 100644 handoffs/legacy-utf8-divergence-survey-runner.php diff --git a/handoffs/legacy-utf8-divergence-report.md 
b/handoffs/legacy-utf8-divergence-report.md index 9b26c144826c0..e9519a449ec62 100644 --- a/handoffs/legacy-utf8-divergence-report.md +++ b/handoffs/legacy-utf8-divergence-report.md @@ -18,6 +18,10 @@ The generated pass was deterministic: case `N` used php /private/tmp/legacy_utf8_divergence_survey.php 3000000 256 > /private/tmp/legacy_utf8_divergence_survey_results.json ``` +For auditability, a cleaned-up copy of the throwaway runner was committed in +`700d7c8c910f` (`Charset: Add legacy UTF-8 survey runner`) and removed in the +follow-up commit after this report recorded that provenance. + Important current-branch note: the handoff describes `wp_check_invalid_utf8()` as PCRE-based. That is historically correct, but this checkout already contains the 6.9-era rewrite from `d1e7f5625b`, so the current diff --git a/handoffs/legacy-utf8-divergence-survey-runner.php b/handoffs/legacy-utf8-divergence-survey-runner.php deleted file mode 100644 index 12784fd6a2219..0000000000000 --- a/handoffs/legacy-utf8-divergence-survey-runner.php +++ /dev/null @@ -1,365 +0,0 @@ - /tmp/legacy_utf8_divergence_survey_results.json - * - * The runner expects the encoding fuzzer checkout beside this repository by - * default, or in ENCODING_FUZZER_ROOT when provided. It deliberately loads a - * small subset of WordPress with stubs because the report is about byte-level - * helper behavior, not full WordPress bootstrap behavior. - */ - -use EncodingFuzz\Generator; -use EncodingFuzz\Oracles; -use EncodingFuzz\Prng; - -function survey_repo_root(): string { - return dirname( __DIR__ ); -} - -function survey_fuzzer_root(): string { - $from_env = getenv( 'ENCODING_FUZZER_ROOT' ); - if ( is_string( $from_env ) && '' !== $from_env ) { - return rtrim( $from_env, '/' ); - } - - return dirname( survey_repo_root() ) . '/encoding-fuzzer'; -} - -$fuzzer_root = survey_fuzzer_root(); -require $fuzzer_root . 
'/tools/encoding-fuzz/lib/autoload.php'; - -$GLOBALS['survey_blog_charset'] = 'UTF-8'; - -function _deprecated_function( $function_name, $version, $replacement = '' ) {} -function _deprecated_argument( $function_name, $version, $message = '' ) {} -function get_option( $name ) { - return 'blog_charset' === $name ? $GLOBALS['survey_blog_charset'] : null; -} -function mbstring_binary_safe_encoding( $reset = false ) {} -function reset_mbstring_encoding() {} - -require survey_repo_root() . '/src/wp-includes/compat.php'; - -function is_utf8_charset( $blog_charset = null ) { - return _is_utf8_charset( $blog_charset ?? get_option( 'blog_charset' ) ); -} - -require survey_repo_root() . '/src/wp-includes/compat-utf8.php'; -require survey_repo_root() . '/src/wp-includes/utf8.php'; -require survey_repo_root() . '/src/wp-includes/formatting.php'; - -function hx( string $bytes ): string { - return strtoupper( trim( chunk_split( bin2hex( $bytes ), 2, ' ' ) ) ); -} - -function visible( string $bytes ): string { - if ( '' === $bytes ) { - return "''"; - } - - return hx( $bytes ); -} - -function check_invalid_with_charset( string $bytes, bool $strip, string $charset ): string { - $GLOBALS['survey_blog_charset'] = $charset; - - return wp_check_invalid_utf8_uncached( $bytes, $strip ); -} - -function wp_check_invalid_utf8_uncached( string $text, bool $strip ): string { - $text = (string) $text; - - if ( 0 === strlen( $text ) ) { - return ''; - } - - if ( ! is_utf8_charset() || wp_is_valid_utf8( $text ) ) { - return $text; - } - - return $strip ? wp_scrub_utf8( $text ) : ''; -} - -function invalid_class( string $bytes ): string { - $length = strlen( $bytes ); - for ( $i = 0; $i < $length; $i++ ) { - $b0 = ord( $bytes[ $i ] ); - if ( $b0 < 0x80 ) { - continue; - } - - if ( $b0 >= 0x80 && $b0 <= 0xBF ) { - return 'lone continuation byte'; - } - - if ( $b0 >= 0xC0 && $b0 <= 0xC1 ) { - return has_continuations( $bytes, $i + 1, 1 ) ? 
'overlong 2-byte sequence' : 'truncated C0/C1 lead'; - } - - if ( $b0 >= 0xC2 && $b0 <= 0xDF ) { - if ( ! has_continuations( $bytes, $i + 1, 1 ) ) { - return 'truncated 2-byte sequence'; - } - $i += 1; - continue; - } - - if ( 0xE0 === $b0 ) { - if ( ! has_continuations( $bytes, $i + 1, 2 ) ) { - return 'truncated 3-byte sequence'; - } - if ( ord( $bytes[ $i + 1 ] ) < 0xA0 ) { - return 'overlong 3-byte sequence'; - } - $i += 2; - continue; - } - - if ( $b0 >= 0xE1 && $b0 <= 0xEC ) { - if ( ! has_continuations( $bytes, $i + 1, 2 ) ) { - return 'truncated 3-byte sequence'; - } - $i += 2; - continue; - } - - if ( 0xED === $b0 ) { - if ( ! has_continuations( $bytes, $i + 1, 2 ) ) { - return 'truncated 3-byte sequence'; - } - if ( ord( $bytes[ $i + 1 ] ) >= 0xA0 ) { - return 'UTF-16 surrogate sequence'; - } - $i += 2; - continue; - } - - if ( $b0 >= 0xEE && $b0 <= 0xEF ) { - if ( ! has_continuations( $bytes, $i + 1, 2 ) ) { - return 'truncated 3-byte sequence'; - } - $i += 2; - continue; - } - - if ( 0xF0 === $b0 ) { - if ( ! has_continuations( $bytes, $i + 1, 3 ) ) { - return 'truncated 4-byte sequence'; - } - if ( ord( $bytes[ $i + 1 ] ) < 0x90 ) { - return 'overlong 4-byte sequence'; - } - $i += 3; - continue; - } - - if ( $b0 >= 0xF1 && $b0 <= 0xF3 ) { - if ( ! has_continuations( $bytes, $i + 1, 3 ) ) { - return 'truncated 4-byte sequence'; - } - $i += 3; - continue; - } - - if ( 0xF4 === $b0 ) { - if ( ! has_continuations( $bytes, $i + 1, 3 ) ) { - return 'truncated 4-byte sequence'; - } - if ( ord( $bytes[ $i + 1 ] ) > 0x8F ) { - return 'code point above U+10FFFF'; - } - $i += 3; - continue; - } - - if ( $b0 >= 0xF5 && $b0 <= 0xF7 ) { - return has_continuations( $bytes, $i + 1, 3 ) ? 'code point above U+10FFFF' : 'invalid F5-F7 lead'; - } - - if ( $b0 >= 0xF8 && $b0 <= 0xFB ) { - return has_continuations( $bytes, $i + 1, 4 ) ? 
'obsolete 5-byte sequence' : 'invalid F8-FB lead'; - } - - if ( $b0 >= 0xFC && $b0 <= 0xFD ) { - return has_continuations( $bytes, $i + 1, 5 ) ? 'obsolete 6-byte sequence' : 'invalid FC-FD lead'; - } - - return 'FE/FF invalid lead'; - } - - return 'valid'; -} - -function has_continuations( string $bytes, int $start, int $count ): bool { - for ( $i = 0; $i < $count; $i++ ) { - $at = $start + $i; - if ( $at >= strlen( $bytes ) ) { - return false; - } - - $b = ord( $bytes[ $at ] ); - if ( ( $b & 0xC0 ) !== 0x80 ) { - return false; - } - } - - return true; -} - -function vector_row( string $name, string $bytes ): array { - $valid = wp_is_valid_utf8( $bytes ); - $seems = seems_utf8( $bytes ); - $scrubbed = wp_scrub_utf8( $bytes ); - $check_keep = check_invalid_with_charset( $bytes, false, 'UTF-8' ); - $check_strip = check_invalid_with_charset( $bytes, true, 'UTF-8' ); - $latin_keep = check_invalid_with_charset( $bytes, false, 'ISO-8859-1' ); - $latin_strip = check_invalid_with_charset( $bytes, true, 'ISO-8859-1' ); - - return array( - 'name' => $name, - 'hex' => hx( $bytes ), - 'class' => invalid_class( $bytes ), - 'wp_is_valid_utf8' => $valid, - 'seems_utf8' => $seems, - 'wp_scrub_utf8_hex' => visible( $scrubbed ), - 'check_utf8_keep_hex' => visible( $check_keep ), - 'check_utf8_strip_hex' => visible( $check_strip ), - 'check_latin_keep_hex' => visible( $latin_keep ), - 'check_latin_strip_hex' => visible( $latin_strip ), - ); -} - -$vectors = array( - 'ascii' => 'A', - 'valid 2-byte lower edge' => "\xC2\x80", - 'valid 3-byte lower edge' => "\xE0\xA0\x80", - 'valid 4-byte upper edge' => "\xF4\x8F\xBF\xBF", - 'valid noncharacter U+FFFE' => "\xEF\xBF\xBE", - 'valid replacement U+FFFD' => "\xEF\xBF\xBD", - 'lone continuation' => "\x80", - 'FE invalid lead' => "\xFE", - 'truncated 2-byte' => "\xC2", - 'truncated 3-byte' => "\xE2\x8C", - 'truncated 4-byte' => "\xF1\x80\x80", - 'overlong 2-byte' => "\xC0\x80", - 'overlong 3-byte' => "\xE0\x80\x80", - 'overlong 4-byte' => 
"\xF0\x80\x80\x80", - 'surrogate U+D800' => "\xED\xA0\x80", - 'above U+10FFFF F4' => "\xF4\x90\x80\x80", - 'above U+10FFFF F5' => "\xF5\x80\x80\x80", - 'obsolete 5-byte' => "\xF8\x80\x80\x80\x80", - 'obsolete 6-byte' => "\xFC\x80\x80\x80\x80\x80", - 'mixed invalid in text' => "A\xC0\x80Z", -); - -$rows = array(); -foreach ( $vectors as $name => $bytes ) { - $rows[] = vector_row( $name, $bytes ); -} - -$battery_rows = array(); -foreach ( Oracles::battery() as $i => $vector ) { - $battery_rows[] = vector_row( "battery {$i}", $vector[0] ); -} - -$cases = (int) ( $argv[1] ?? 100000 ); -$max_bytes = (int) ( $argv[2] ?? 256 ); -$stats = array( - 'cases' => 0, - 'bytes' => 0, - 'strict_valid' => 0, - 'strict_invalid' => 0, - 'seems_accepts_strict_invalid' => 0, - 'seems_rejects_strict_invalid' => 0, - 'seems_rejects_strict_valid' => 0, - 'check_utf8_keep_empty_on_invalid' => 0, - 'check_utf8_strip_matches_scrub' => 0, - 'check_utf8_strip_mismatches_scrub' => 0, - 'check_latin1_passthrough_on_invalid' => 0, - 'seems_accepts_invalid_by_class' => array(), - 'seems_rejects_invalid_by_class' => array(), - 'first_example_by_class' => array(), - 'strategy_counts' => array(), -); - -$start = microtime( true ); -for ( $case = 0; $case < $cases; $case++ ) { - $prng = new Prng( "legacy-utf8-divergence:{$case}" ); - $generator = new Generator( $prng, $max_bytes ); - $generated = $generator->generate(); - $bytes = $generated['bytes']; - $strategy = $generated['strategy']; - $valid = wp_is_valid_utf8( $bytes ); - $seems = seems_utf8( $bytes ); - $class = $valid ? 'valid' : invalid_class( $bytes ); - $scrubbed = wp_scrub_utf8( $bytes ); - - ++$stats['cases']; - $stats['bytes'] += strlen( $bytes ); - $stats['strategy_counts'][ $strategy ] = ( $stats['strategy_counts'][ $strategy ] ?? 0 ) + 1; - - if ( $valid ) { - ++$stats['strict_valid']; - if ( ! 
$seems ) { - ++$stats['seems_rejects_strict_valid']; - } - continue; - } - - ++$stats['strict_invalid']; - if ( $seems ) { - ++$stats['seems_accepts_strict_invalid']; - $stats['seems_accepts_invalid_by_class'][ $class ] = ( $stats['seems_accepts_invalid_by_class'][ $class ] ?? 0 ) + 1; - } else { - ++$stats['seems_rejects_strict_invalid']; - $stats['seems_rejects_invalid_by_class'][ $class ] = ( $stats['seems_rejects_invalid_by_class'][ $class ] ?? 0 ) + 1; - } - - if ( ! isset( $stats['first_example_by_class'][ $class ] ) ) { - $stats['first_example_by_class'][ $class ] = hx( strlen( $bytes ) > 24 ? substr( $bytes, 0, 24 ) : $bytes ); - } - - if ( '' === check_invalid_with_charset( $bytes, false, 'UTF-8' ) ) { - ++$stats['check_utf8_keep_empty_on_invalid']; - } - - if ( check_invalid_with_charset( $bytes, true, 'UTF-8' ) === $scrubbed ) { - ++$stats['check_utf8_strip_matches_scrub']; - } else { - ++$stats['check_utf8_strip_mismatches_scrub']; - } - - if ( - check_invalid_with_charset( $bytes, false, 'ISO-8859-1' ) === $bytes && - check_invalid_with_charset( $bytes, true, 'ISO-8859-1' ) === $bytes - ) { - ++$stats['check_latin1_passthrough_on_invalid']; - } -} - -ksort( $stats['seems_accepts_invalid_by_class'] ); -ksort( $stats['seems_rejects_invalid_by_class'] ); -ksort( $stats['strategy_counts'] ); - -$stats['elapsed_sec'] = round( microtime( true ) - $start, 3 ); - -echo json_encode( - array( - 'environment' => array( - 'php' => PHP_VERSION, - 'mbstring' => extension_loaded( 'mbstring' ), - 'intl' => extension_loaded( 'intl' ), - 'pcre_u' => _wp_can_use_pcre_u(), - 'cases' => $cases, - 'max_bytes' => $max_bytes, - ), - 'vectors' => $rows, - 'battery' => $battery_rows, - 'stats' => $stats, - ), - JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES -);