From c6ec64a8a510f4e146c9e1624272ba4a58db58d4 Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Wed, 10 Jun 2026 22:10:21 +0200
Subject: [PATCH 1/3] Charset: Document legacy UTF-8 helper divergences

---
 handoffs/legacy-utf8-divergence-report.md | 397 ++++++++++++++++++++++
 1 file changed, 397 insertions(+)
 create mode 100644 handoffs/legacy-utf8-divergence-report.md

diff --git a/handoffs/legacy-utf8-divergence-report.md b/handoffs/legacy-utf8-divergence-report.md
new file mode 100644
index 0000000000000..9b26c144826c0
--- /dev/null
+++ b/handoffs/legacy-utf8-divergence-report.md
@@ -0,0 +1,397 @@
+# Legacy UTF-8 Helper Divergence Report
+
+## Scope
+
+This report covers the current `utf8-survey` checkout and the handoff in
+`encoding-fuzzer/handoffs/legacy-utf8-divergence-survey.md`.
+
+No production code was changed for this survey. The throwaway runner lived at
+`/private/tmp/legacy_utf8_divergence_survey.php` and reused the generator and
+oracle battery from the adjacent `encoding-fuzzer/tools/encoding-fuzz/`
+checkout. It loaded this checkout's `compat.php`, `compat-utf8.php`,
+`utf8.php`, and `formatting.php` with minimal stubs.
+
+The generated pass was deterministic: case `N` used
+`new EncodingFuzz\Prng( "legacy-utf8-divergence:N" )`. The exact command was:
+
+```sh
+php /private/tmp/legacy_utf8_divergence_survey.php 3000000 256 > /private/tmp/legacy_utf8_divergence_survey_results.json
+```
+
+Important current-branch note: the handoff describes
+`wp_check_invalid_utf8()` as PCRE-based. That is historically correct, but this
+checkout already contains the 6.9-era rewrite from `d1e7f5625b`, so the current
+implementation now calls `wp_is_valid_utf8()` and `wp_scrub_utf8()` when
+`blog_charset` is UTF-8.
+
+## Environment
+
+- PHP: 8.4.21
+- Extensions present: `mbstring`, `intl`
+- PCRE Unicode support: yes
+- Generated pass: 3,000,000 inputs, 366,593,389 bytes, max generated input
+  size 256 bytes
+- Generator strategies: random bytes, random ASCII, valid UTF-8,
+  mutated-valid UTF-8, atom splices, latin1-ish text, UTF-16 bytes,
+  ASCII fast paths, repeated motifs
+- Oracle battery: `EncodingFuzz\Oracles::battery()`
+
+`wp_check_invalid_utf8()` caches the first `is_utf8_charset()` result in a
+static. To compare UTF-8 and non-UTF-8 charset behavior in one process, the
+runner used an equivalent uncached copy of the current implementation for that
+matrix. This models first-call behavior in a fresh request under each charset.
+
+## Aggregate Results
+
+| Measurement | Count |
+| --- | ---: |
+| Generated inputs | 3,000,000 |
+| Strict-valid inputs | 1,128,174 |
+| Strict-invalid inputs | 1,871,826 |
+| `seems_utf8()` accepted strict-invalid input | 92,007 |
+| `seems_utf8()` rejected strict-invalid input | 1,779,819 |
+| `seems_utf8()` rejected strict-valid input | 0 |
+| `wp_check_invalid_utf8( $s, false )` returned `''` for invalid UTF-8 under UTF-8 charset | 1,871,826 |
+| `wp_check_invalid_utf8( $s, true )` matched `wp_scrub_utf8( $s )` under UTF-8 charset | 1,871,826 |
+| `wp_check_invalid_utf8( $s, true )` mismatched `wp_scrub_utf8( $s )` under UTF-8 charset | 0 |
+| `wp_check_invalid_utf8()` passed invalid bytes through under ISO-8859-1 charset | 1,871,826 |
+
+`seems_utf8()` accepted strict-invalid inputs in exactly these buckets (each input is counted once, under its first ill-formed sequence):
+
+| Divergence class | Generated examples |
+| --- | ---: |
+| UTF-16 surrogate sequence | 21,051 |
+| Code point above `U+10FFFF` | 14,770 |
+| Obsolete 5-byte sequence | 6,850 |
+| Obsolete 6-byte sequence | 6,764 |
+| Overlong 2-byte sequence | 14,686 |
+| Overlong 3-byte sequence | 13,775 |
+| Overlong 4-byte sequence | 14,111 |
+
+No generated class showed `wp_check_invalid_utf8( $s, true )` diverging from
+`wp_scrub_utf8( $s )` when `blog_charset` was UTF-8.
+
+## Divergence Matrix
+
+All byte strings are hex. `R` means one `U+FFFD` replacement character, encoded
+as `EF BF BD`. `same` means the original byte string is returned unchanged.
+
+| Input class | Minimal bytes | `wp_is_valid_utf8()` | `seems_utf8()` | `wp_check_invalid_utf8( false )`, UTF-8 charset | `wp_check_invalid_utf8( true )`, UTF-8 charset | `wp_check_invalid_utf8()`, non-UTF-8 charset |
+| --- | --- | --- | --- | --- | --- | --- |
+| ASCII | `41` | accept | accept | same | same | same |
+| Valid 2-byte lower edge | `C2 80` | accept | accept | same | same | same |
+| Valid 3-byte lower edge | `E0 A0 80` | accept | accept | same | same | same |
+| Valid 4-byte upper edge | `F4 8F BF BF` | accept | accept | same | same | same |
+| Noncharacter `U+FFFE` | `EF BF BE` | accept | accept | same | same | same |
+| Replacement character `U+FFFD` | `EF BF BD` | accept | accept | same | same | same |
+| Lone continuation | `80` | reject | reject | `''` | `R` | same |
+| Invalid `FE`/`FF` lead | `FE` | reject | reject | `''` | `R` | same |
+| Truncated 2-byte sequence | `C2` | reject | reject | `''` | `R` | same |
+| Truncated 3-byte sequence | `E2 8C` | reject | reject | `''` | `R` | same |
+| Truncated 4-byte sequence | `F1 80 80` | reject | reject | `''` | `R` | same |
+| Overlong 2-byte sequence | `C0 80` | reject | accept | `''` | `R R` | same |
+| Overlong 3-byte sequence | `E0 80 80` | reject | accept | `''` | `R R R` | same |
+| Overlong 4-byte sequence | `F0 80 80 80` | reject | accept | `''` | `R R R R` | same |
+| UTF-16 surrogate sequence | `ED A0 80` | reject | accept | `''` | `R R R` | same |
+| Above `U+10FFFF`, `F4` form | `F4 90 80 80` | reject | accept | `''` | `R R R R` | same |
+| Above `U+10FFFF`, `F5` form | `F5 80 80 80` | reject | accept | `''` | `R R R R` | same |
+| Obsolete 5-byte sequence | `F8 80 80 80 80` | reject | accept | `''` | `R R R R R` | same |
+| Obsolete 6-byte sequence | `FC 80 80 80 80 80` | reject | accept | `''` | `R R R R R R` | same |
+| Valid text plus overlong bytes | `41 C0 80 5A` | reject | accept | `''` | `41 R R 5A` | same |
+
+## Divergence Classes
+
+### `seems_utf8()` accepts overlong encodings
+
+Representative inputs: `C0 80`, `E0 80 80`, `F0 80 80 80`.
+
+Classification: accidental if the caller expects valid UTF-8; historically
+load-bearing only as a loose structural heuristic.
+
+Evidence:
+
+- `src/wp-includes/formatting.php` says the function checks whether the string
+  "fits a UTF-8 model", not whether it is well-formed UTF-8.
+- Core Trac #38044 was specifically opened to make `seems_utf8()` RFC 3629
+  compliant and calls out overlong acceptance as a defect:
+  https://core.trac.wordpress.org/ticket/38044
+- Commit `bb6ed3ba22` introduced `wp_is_valid_utf8()` and deprecated
+  `seems_utf8()` instead of tightening the old function in place.
+
+Impact:
+
+Replacing `seems_utf8()` with `wp_is_valid_utf8()` is behavior-changing for
+saved data containing these bytes: old code reports "yes"; strict validation
+reports "no".
+
+### `seems_utf8()` accepts UTF-16 surrogate encodings
+
+Representative input: `ED A0 80`.
+
+Classification: accidental. Surrogate halves are not Unicode scalar values and
+are rejected by the strict validator and by the fuzzer battery.
+
+Evidence:
+
+- Trac #38044 explicitly names surrogate acceptance as part of the RFC 3629
+  compliance problem: https://core.trac.wordpress.org/ticket/38044
+- The current `wp_is_valid_utf8()` docblock gives surrogate halves as invalid
+  examples.
+
+Impact:
+
+Same as overlongs: `wp_is_valid_utf8()` is the correct replacement for
+validation, but it is not behavior-compatible: it rejects these byte sequences.
+
+### `seems_utf8()` accepts code points above `U+10FFFF`
+
+Representative inputs: `F4 90 80 80`, `F5 80 80 80`.
+
+Classification: accidental. The code accepts any `F0`-`F7` lead followed by
+three continuation bytes, but modern UTF-8 stops at `F4 8F BF BF`.
+
+Evidence:
+
+- The current `wp_is_valid_utf8()` docblock defines well-formed UTF-8 as
+  excluding characters above the representable range.
+- Trac #38044 frames the replacement around RFC 3629 compliance, whose range is
+  `U+0000..U+10FFFF`.
+
+Impact:
+
+Strict replacement rejects bytes that the legacy heuristic accepted. Treat this
+as a migration break for data-validation callers.
+
+### `seems_utf8()` accepts obsolete 5- and 6-byte forms
+
+Representative inputs: `F8 80 80 80 80`, `FC 80 80 80 80 80`.
+
+Classification: documented historical looseness, not valid UTF-8. The
+docblock warns that the function checks 5-byte sequences even though UTF-8 has a
+maximum length of 4 bytes; the code also accepts 6-byte forms.
+
+Evidence:
+
+- The 5-byte warning was added in the 2009 cleanup associated with Trac #9692:
+  https://core.trac.wordpress.org/ticket/9692
+- Trac #38044 records the later decision to deprecate rather than repair this
+  legacy behavior in place.
+
+Impact:
+
+This is the clearest documented non-strict behavior. A strict replacement is
+still desirable for validation, but compatibility notes should call out the
+change explicitly.
+
+### `wp_check_invalid_utf8( $s, false )` rejects the whole string
+
+Representative input: `41 C0 80 5A`.
+
+Classification: intentional security behavior. Under UTF-8 charset, any invalid
+span makes the default mode return `''`; it does not preserve valid surrounding
+text.
+
+Evidence:
+
+- Trac #8767 introduced the helper in a security/XSS context and discussed the
+  default empty-string behavior as the more conservative validator-like option:
+  https://core.trac.wordpress.org/ticket/8767
+- The current docblock documents this default mode.
+
+Impact:
+
+`wp_scrub_utf8()` is not a drop-in replacement for default-mode callers because
+it preserves the string and inserts replacement characters. That can be a better
+user experience in some contexts, but it changes escaping and sanitization
+behavior.
+
+### `wp_check_invalid_utf8( $s, true )` now scrubs with `U+FFFD`
+
+Representative input: `C0 80` produces `R R`.
+
+Classification: intentional current behavior. On this branch, `$strip = true`
+matches `wp_scrub_utf8()` for all generated strict-invalid inputs under UTF-8
+charset.
+
+Evidence:
+
+- Trac #63837 states the plan to rely on `wp_is_valid_utf8()` and add
+  `wp_scrub_utf8()` for replacement-character scrubbing:
+  https://core.trac.wordpress.org/ticket/63837
+- Commit `d1e7f5625b` says the old `$strip` defect was fixed and invalid bytes
+  are now replaced with `U+FFFD` for stronger security guarantees.
+
+Impact:
+
+When `blog_charset` is UTF-8, `wp_scrub_utf8( $s )` is behavior-equivalent to
+`wp_check_invalid_utf8( $s, true )`. The only remaining difference is the legacy
+function's `blog_charset` gate, which returns non-UTF-8-site input unchanged.
+
+### `wp_check_invalid_utf8()` passes through all bytes for non-UTF-8 charsets
+
+Representative input under `ISO-8859-1` charset: `C0 80` returns `C0 80` in
+both modes.
+
+Classification: intentional environment sensitivity.
+
+Evidence:
+
+- The current docblock says the function only performs work when
+  `blog_charset` is UTF-8.
+- Trac #63837 calls out that the function assumes input strings are encoded
+  with `blog_charset`, and says that point is inherent to how it works:
+  https://core.trac.wordpress.org/ticket/63837
+
+Impact:
+
+Neither `wp_is_valid_utf8()` nor `wp_scrub_utf8()` is a drop-in replacement
+where the caller intentionally wants `blog_charset`-dependent passthrough.
+
+## Current Core Callers
+
+### `seems_utf8()`
+
+No production callers remain in this checkout. The only in-tree production
+reference found by `rg` is the function definition itself.
+
+Migration guidance:
+
+- For validation callers, `wp_is_valid_utf8()` is the intended replacement, but
+  it is behavior-changing for overlongs, surrogates, above-range code points,
+  and 5/6-byte forms.
+- For charset-guessing callers, `wp_is_valid_utf8()` is not a semantic drop-in.
+  Such callers should make the heuristic explicit instead of using
+  `seems_utf8()`.
+
+### `esc_js()`
+
+Current call: `wp_check_invalid_utf8( $text )`.
+
+Migration guidance:
+
+- `wp_is_valid_utf8()` is not a drop-in; it returns a boolean rather than the
+  checked string that `esc_js()` goes on to escape.
+- `wp_scrub_utf8()` is behavior-changing; invalid input would be preserved with
+  `U+FFFD` instead of blanked before JavaScript escaping.
+- Keep `wp_check_invalid_utf8()` unless the security model is explicitly
+  changed from whole-string rejection to scrubbing.
+
+### `esc_html()`
+
+Current call: `wp_check_invalid_utf8( $text )`.
+
+Migration guidance:
+
+- `wp_scrub_utf8()` is behavior-changing but may be a future product decision if
+  preserving partially valid display text is preferred.
+- It is not a drop-in for current behavior because default-mode
+  `wp_check_invalid_utf8()` returns `''` for any invalid UTF-8 under UTF-8
+  charset and passes raw bytes through under non-UTF-8 charset.
+
+### `esc_attr()`
+
+Current call: `wp_check_invalid_utf8( $text )`.
+
+Migration guidance:
+
+- Attribute context is especially sensitive to partial decoding and downstream
+  parser behavior. Keep whole-string rejection unless a dedicated security
+  review approves replacement-character scrubbing.
+- `wp_is_valid_utf8()` is not a drop-in: it returns a boolean, not a string.
+
+### `esc_xml()`
+
+Current call: `wp_check_invalid_utf8( $text )`.
+
+Migration guidance:
+
+- `wp_scrub_utf8()` would be plausible for XML generation because XML requires
+  valid character data, but it changes the output contract from blanking to
+  replacement.
+- A direct replacement needs XML-specific review, especially because XML also
+  has character restrictions beyond UTF-8 well-formedness.
+
+### `_sanitize_text_fields()`, via `sanitize_text_field()` and `sanitize_textarea_field()`
+
+Current call: `wp_check_invalid_utf8( $str )`.
+
+Migration guidance:
+
+- `wp_scrub_utf8()` is behavior-changing: stored/sanitized values that
+  currently become empty would retain valid surrounding text and replacement
+  characters.
+- This may be user-friendlier, but it is not a drop-in. Treat it as a product
+  and compatibility decision.
+
+### `_wp_json_convert_string()`
+
+Current fallback call: `wp_check_invalid_utf8( $input_string, true )`, only when
+`mb_convert_encoding()` is unavailable.
+
+Migration guidance:
+
+- Under UTF-8 charset on this branch, `wp_scrub_utf8()` is behavior-equivalent
+  for generated invalid inputs and is the clearer operation.
+- It is still not a full drop-in because `wp_check_invalid_utf8()` preserves raw
+  input when `blog_charset` is not UTF-8.
+- JSON output must be UTF-8, so this is the best candidate for a targeted future
+  migration away from `wp_check_invalid_utf8()`.
+
+## Recommendations
+
+### `seems_utf8()`: keep deprecated; do not repair in place
+
+The function is a loose structural heuristic with no remaining production core
+callers. It accepts several classes of invalid UTF-8 by design of its bit-mask
+model, and changing the implementation in place would silently change external
+caller behavior.
+
+Recommended action:
+
+- Keep the existing deprecation to `wp_is_valid_utf8()`.
+- Do not include it in continuous differential fuzzing against strict UTF-8
+  validation; the known divergences are permanent unless the deprecated
+  function is removed or changed in a backward-incompatible way.
+- If docs are touched, say explicitly that it accepts overlongs, surrogates,
+  above-range code points, and obsolete 5/6-byte forms. The current docblock
+  mentions 5-byte sequences, but not the full divergence set.
+
+### `wp_check_invalid_utf8()`: document and leave for default-mode callers
+
+The current branch has already removed the historical PCRE dependency for
+UTF-8 charset requests. The remaining divergences are semantic:
+
+- default mode rejects the entire invalid string;
+- strip mode scrubs with `U+FFFD`;
+- all modes pass bytes through when `blog_charset` is not UTF-8.
+
+Recommended action:
+
+- Keep default-mode calls in escaping and sanitization until each context has an
+  explicit security and compatibility decision.
+- Prefer `wp_scrub_utf8()` for new code that unconditionally wants valid UTF-8
+  output and does not want `blog_charset` sensitivity.
+- Consider a targeted follow-up for `_wp_json_convert_string()`'s fallback path,
+  because JSON wants UTF-8 and current `$strip = true` behavior already matches
+  `wp_scrub_utf8()` under UTF-8 charset.
+
+## Sources Checked
+
+- Local function history: `git log -L :seems_utf8:src/wp-includes/formatting.php`
+- Local function history: `git log -L :wp_check_invalid_utf8:src/wp-includes/formatting.php`
+- Current `seems_utf8()` deprecation and `wp_is_valid_utf8()` introduction:
+  commit `bb6ed3ba22`
+- Current `wp_check_invalid_utf8()` / `wp_scrub_utf8()` rewrite:
+  commit `d1e7f5625b`
+- Trac #9692, `seems_utf8()` cleanup:
+  https://core.trac.wordpress.org/ticket/9692
+- Trac #8767, original `wp_check_invalid_utf8()` security refactor:
+  https://core.trac.wordpress.org/ticket/8767
+- Trac #38044, RFC 3629 compliance and `wp_is_valid_utf8()`:
+  https://core.trac.wordpress.org/ticket/38044
+- Trac #63837, `wp_check_invalid_utf8()` rewrite and `wp_scrub_utf8()`:
+  https://core.trac.wordpress.org/ticket/63837
+- Trac #29717, historical PCRE behavior and caller importance:
+  https://core.trac.wordpress.org/ticket/29717
+- Trac #63863, standardizing UTF-8 handling and fallbacks:
+  https://core.trac.wordpress.org/ticket/63863

From 700d7c8c910fa44d65d097cf31048eb4078abd60 Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Wed, 10 Jun 2026 23:07:16 +0200
Subject: [PATCH 2/3] Charset: Add legacy UTF-8 survey runner

---
 .../legacy-utf8-divergence-survey-runner.php  | 365 ++++++++++++++++++
 1 file changed, 365 insertions(+)
 create mode 100644 handoffs/legacy-utf8-divergence-survey-runner.php

diff --git a/handoffs/legacy-utf8-divergence-survey-runner.php b/handoffs/legacy-utf8-divergence-survey-runner.php
new file mode 100644
index 0000000000000..12784fd6a2219
--- /dev/null
+++ b/handoffs/legacy-utf8-divergence-survey-runner.php
@@ -0,0 +1,365 @@
+<?php
+/**
+ * One-shot runner for the legacy UTF-8 helper divergence report.
+ *
+ * Usage:
+ *
+ *     php handoffs/legacy-utf8-divergence-survey-runner.php 3000000 256 > /tmp/legacy_utf8_divergence_survey_results.json
+ *
+ * The runner expects the encoding fuzzer checkout beside this repository by
+ * default, or in ENCODING_FUZZER_ROOT when provided. It deliberately loads a
+ * small subset of WordPress with stubs because the report is about byte-level
+ * helper behavior, not full WordPress bootstrap behavior.
+ */
+
+use EncodingFuzz\Generator;
+use EncodingFuzz\Oracles;
+use EncodingFuzz\Prng;
+
+function survey_repo_root(): string {
+	return dirname( __DIR__ );
+}
+
+function survey_fuzzer_root(): string {
+	$from_env = getenv( 'ENCODING_FUZZER_ROOT' );
+	if ( is_string( $from_env ) && '' !== $from_env ) {
+		return rtrim( $from_env, '/' );
+	}
+
+	return dirname( survey_repo_root() ) . '/encoding-fuzzer';
+}
+
+$fuzzer_root = survey_fuzzer_root();
+require $fuzzer_root . '/tools/encoding-fuzz/lib/autoload.php';
+
+$GLOBALS['survey_blog_charset'] = 'UTF-8';
+
+function _deprecated_function( $function_name, $version, $replacement = '' ) {}
+function _deprecated_argument( $function_name, $version, $message = '' ) {}
+function get_option( $name ) {
+	return 'blog_charset' === $name ? $GLOBALS['survey_blog_charset'] : null;
+}
+function mbstring_binary_safe_encoding( $reset = false ) {}
+function reset_mbstring_encoding() {}
+
+require survey_repo_root() . '/src/wp-includes/compat.php';
+
+function is_utf8_charset( $blog_charset = null ) {
+	return _is_utf8_charset( $blog_charset ?? get_option( 'blog_charset' ) );
+}
+
+require survey_repo_root() . '/src/wp-includes/compat-utf8.php';
+require survey_repo_root() . '/src/wp-includes/utf8.php';
+require survey_repo_root() . '/src/wp-includes/formatting.php';
+
+function hx( string $bytes ): string {
+	return strtoupper( trim( chunk_split( bin2hex( $bytes ), 2, ' ' ) ) );
+}
+
+function visible( string $bytes ): string {
+	if ( '' === $bytes ) {
+		return "''";
+	}
+
+	return hx( $bytes );
+}
+
+function check_invalid_with_charset( string $bytes, bool $strip, string $charset ): string {
+	$GLOBALS['survey_blog_charset'] = $charset;
+
+	return wp_check_invalid_utf8_uncached( $bytes, $strip );
+}
+
+function wp_check_invalid_utf8_uncached( string $text, bool $strip ): string {
+	$text = (string) $text;
+
+	if ( 0 === strlen( $text ) ) {
+		return '';
+	}
+
+	if ( ! is_utf8_charset() || wp_is_valid_utf8( $text ) ) {
+		return $text;
+	}
+
+	return $strip ? wp_scrub_utf8( $text ) : '';
+}
+
+function invalid_class( string $bytes ): string {
+	$length = strlen( $bytes );
+	for ( $i = 0; $i < $length; $i++ ) {
+		$b0 = ord( $bytes[ $i ] );
+		if ( $b0 < 0x80 ) {
+			continue;
+		}
+
+		if ( $b0 >= 0x80 && $b0 <= 0xBF ) {
+			return 'lone continuation byte';
+		}
+
+		if ( $b0 >= 0xC0 && $b0 <= 0xC1 ) {
+			return has_continuations( $bytes, $i + 1, 1 ) ? 'overlong 2-byte sequence' : 'truncated C0/C1 lead';
+		}
+
+		if ( $b0 >= 0xC2 && $b0 <= 0xDF ) {
+			if ( ! has_continuations( $bytes, $i + 1, 1 ) ) {
+				return 'truncated 2-byte sequence';
+			}
+			$i += 1;
+			continue;
+		}
+
+		if ( 0xE0 === $b0 ) {
+			if ( ! has_continuations( $bytes, $i + 1, 2 ) ) {
+				return 'truncated 3-byte sequence';
+			}
+			if ( ord( $bytes[ $i + 1 ] ) < 0xA0 ) {
+				return 'overlong 3-byte sequence';
+			}
+			$i += 2;
+			continue;
+		}
+
+		if ( $b0 >= 0xE1 && $b0 <= 0xEC ) {
+			if ( ! has_continuations( $bytes, $i + 1, 2 ) ) {
+				return 'truncated 3-byte sequence';
+			}
+			$i += 2;
+			continue;
+		}
+
+		if ( 0xED === $b0 ) {
+			if ( ! has_continuations( $bytes, $i + 1, 2 ) ) {
+				return 'truncated 3-byte sequence';
+			}
+			if ( ord( $bytes[ $i + 1 ] ) >= 0xA0 ) {
+				return 'UTF-16 surrogate sequence';
+			}
+			$i += 2;
+			continue;
+		}
+
+		if ( $b0 >= 0xEE && $b0 <= 0xEF ) {
+			if ( ! has_continuations( $bytes, $i + 1, 2 ) ) {
+				return 'truncated 3-byte sequence';
+			}
+			$i += 2;
+			continue;
+		}
+
+		if ( 0xF0 === $b0 ) {
+			if ( ! has_continuations( $bytes, $i + 1, 3 ) ) {
+				return 'truncated 4-byte sequence';
+			}
+			if ( ord( $bytes[ $i + 1 ] ) < 0x90 ) {
+				return 'overlong 4-byte sequence';
+			}
+			$i += 3;
+			continue;
+		}
+
+		if ( $b0 >= 0xF1 && $b0 <= 0xF3 ) {
+			if ( ! has_continuations( $bytes, $i + 1, 3 ) ) {
+				return 'truncated 4-byte sequence';
+			}
+			$i += 3;
+			continue;
+		}
+
+		if ( 0xF4 === $b0 ) {
+			if ( ! has_continuations( $bytes, $i + 1, 3 ) ) {
+				return 'truncated 4-byte sequence';
+			}
+			if ( ord( $bytes[ $i + 1 ] ) > 0x8F ) {
+				return 'code point above U+10FFFF';
+			}
+			$i += 3;
+			continue;
+		}
+
+		if ( $b0 >= 0xF5 && $b0 <= 0xF7 ) {
+			return has_continuations( $bytes, $i + 1, 3 ) ? 'code point above U+10FFFF' : 'invalid F5-F7 lead';
+		}
+
+		if ( $b0 >= 0xF8 && $b0 <= 0xFB ) {
+			return has_continuations( $bytes, $i + 1, 4 ) ? 'obsolete 5-byte sequence' : 'invalid F8-FB lead';
+		}
+
+		if ( $b0 >= 0xFC && $b0 <= 0xFD ) {
+			return has_continuations( $bytes, $i + 1, 5 ) ? 'obsolete 6-byte sequence' : 'invalid FC-FD lead';
+		}
+
+		return 'FE/FF invalid lead';
+	}
+
+	return 'valid';
+}
+
+function has_continuations( string $bytes, int $start, int $count ): bool {
+	for ( $i = 0; $i < $count; $i++ ) {
+		$at = $start + $i;
+		if ( $at >= strlen( $bytes ) ) {
+			return false;
+		}
+
+		$b = ord( $bytes[ $at ] );
+		if ( ( $b & 0xC0 ) !== 0x80 ) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
+function vector_row( string $name, string $bytes ): array {
+	$valid        = wp_is_valid_utf8( $bytes );
+	$seems        = seems_utf8( $bytes );
+	$scrubbed     = wp_scrub_utf8( $bytes );
+	$check_keep   = check_invalid_with_charset( $bytes, false, 'UTF-8' );
+	$check_strip  = check_invalid_with_charset( $bytes, true, 'UTF-8' );
+	$latin_keep   = check_invalid_with_charset( $bytes, false, 'ISO-8859-1' );
+	$latin_strip  = check_invalid_with_charset( $bytes, true, 'ISO-8859-1' );
+
+	return array(
+		'name'                  => $name,
+		'hex'                   => hx( $bytes ),
+		'class'                 => invalid_class( $bytes ),
+		'wp_is_valid_utf8'      => $valid,
+		'seems_utf8'            => $seems,
+		'wp_scrub_utf8_hex'     => visible( $scrubbed ),
+		'check_utf8_keep_hex'   => visible( $check_keep ),
+		'check_utf8_strip_hex'  => visible( $check_strip ),
+		'check_latin_keep_hex'  => visible( $latin_keep ),
+		'check_latin_strip_hex' => visible( $latin_strip ),
+	);
+}
+
+$vectors = array(
+	'ascii'                     => 'A',
+	'valid 2-byte lower edge'   => "\xC2\x80",
+	'valid 3-byte lower edge'   => "\xE0\xA0\x80",
+	'valid 4-byte upper edge'   => "\xF4\x8F\xBF\xBF",
+	'valid noncharacter U+FFFE' => "\xEF\xBF\xBE",
+	'valid replacement U+FFFD'  => "\xEF\xBF\xBD",
+	'lone continuation'         => "\x80",
+	'FE invalid lead'           => "\xFE",
+	'truncated 2-byte'          => "\xC2",
+	'truncated 3-byte'          => "\xE2\x8C",
+	'truncated 4-byte'          => "\xF1\x80\x80",
+	'overlong 2-byte'           => "\xC0\x80",
+	'overlong 3-byte'           => "\xE0\x80\x80",
+	'overlong 4-byte'           => "\xF0\x80\x80\x80",
+	'surrogate U+D800'          => "\xED\xA0\x80",
+	'above U+10FFFF F4'         => "\xF4\x90\x80\x80",
+	'above U+10FFFF F5'         => "\xF5\x80\x80\x80",
+	'obsolete 5-byte'           => "\xF8\x80\x80\x80\x80",
+	'obsolete 6-byte'           => "\xFC\x80\x80\x80\x80\x80",
+	'mixed invalid in text'     => "A\xC0\x80Z",
+);
+
+$rows = array();
+foreach ( $vectors as $name => $bytes ) {
+	$rows[] = vector_row( $name, $bytes );
+}
+
+$battery_rows = array();
+foreach ( Oracles::battery() as $i => $vector ) {
+	$battery_rows[] = vector_row( "battery {$i}", $vector[0] );
+}
+
+$cases     = (int) ( $argv[1] ?? 100000 );
+$max_bytes = (int) ( $argv[2] ?? 256 );
+$stats     = array(
+	'cases'                               => 0,
+	'bytes'                               => 0,
+	'strict_valid'                        => 0,
+	'strict_invalid'                      => 0,
+	'seems_accepts_strict_invalid'        => 0,
+	'seems_rejects_strict_invalid'        => 0,
+	'seems_rejects_strict_valid'          => 0,
+	'check_utf8_keep_empty_on_invalid'    => 0,
+	'check_utf8_strip_matches_scrub'      => 0,
+	'check_utf8_strip_mismatches_scrub'   => 0,
+	'check_latin1_passthrough_on_invalid' => 0,
+	'seems_accepts_invalid_by_class'      => array(),
+	'seems_rejects_invalid_by_class'      => array(),
+	'first_example_by_class'              => array(),
+	'strategy_counts'                     => array(),
+);
+
+$start = microtime( true );
+for ( $case = 0; $case < $cases; $case++ ) {
+	$prng      = new Prng( "legacy-utf8-divergence:{$case}" );
+	$generator = new Generator( $prng, $max_bytes );
+	$generated = $generator->generate();
+	$bytes     = $generated['bytes'];
+	$strategy  = $generated['strategy'];
+	$valid     = wp_is_valid_utf8( $bytes );
+	$seems     = seems_utf8( $bytes );
+	$class     = $valid ? 'valid' : invalid_class( $bytes );
+	$scrubbed  = wp_scrub_utf8( $bytes );
+
+	++$stats['cases'];
+	$stats['bytes'] += strlen( $bytes );
+	$stats['strategy_counts'][ $strategy ] = ( $stats['strategy_counts'][ $strategy ] ?? 0 ) + 1;
+
+	if ( $valid ) {
+		++$stats['strict_valid'];
+		if ( ! $seems ) {
+			++$stats['seems_rejects_strict_valid'];
+		}
+		continue;
+	}
+
+	++$stats['strict_invalid'];
+	if ( $seems ) {
+		++$stats['seems_accepts_strict_invalid'];
+		$stats['seems_accepts_invalid_by_class'][ $class ] = ( $stats['seems_accepts_invalid_by_class'][ $class ] ?? 0 ) + 1;
+	} else {
+		++$stats['seems_rejects_strict_invalid'];
+		$stats['seems_rejects_invalid_by_class'][ $class ] = ( $stats['seems_rejects_invalid_by_class'][ $class ] ?? 0 ) + 1;
+	}
+
+	if ( ! isset( $stats['first_example_by_class'][ $class ] ) ) {
+		$stats['first_example_by_class'][ $class ] = hx( strlen( $bytes ) > 24 ? substr( $bytes, 0, 24 ) : $bytes );
+	}
+
+	if ( '' === check_invalid_with_charset( $bytes, false, 'UTF-8' ) ) {
+		++$stats['check_utf8_keep_empty_on_invalid'];
+	}
+
+	if ( check_invalid_with_charset( $bytes, true, 'UTF-8' ) === $scrubbed ) {
+		++$stats['check_utf8_strip_matches_scrub'];
+	} else {
+		++$stats['check_utf8_strip_mismatches_scrub'];
+	}
+
+	if (
+		check_invalid_with_charset( $bytes, false, 'ISO-8859-1' ) === $bytes &&
+		check_invalid_with_charset( $bytes, true, 'ISO-8859-1' ) === $bytes
+	) {
+		++$stats['check_latin1_passthrough_on_invalid'];
+	}
+}
+
+ksort( $stats['seems_accepts_invalid_by_class'] );
+ksort( $stats['seems_rejects_invalid_by_class'] );
+ksort( $stats['strategy_counts'] );
+
+$stats['elapsed_sec'] = round( microtime( true ) - $start, 3 );
+
+echo json_encode(
+	array(
+		'environment' => array(
+			'php'       => PHP_VERSION,
+			'mbstring'  => extension_loaded( 'mbstring' ),
+			'intl'      => extension_loaded( 'intl' ),
+			'pcre_u'    => _wp_can_use_pcre_u(),
+			'cases'     => $cases,
+			'max_bytes' => $max_bytes,
+		),
+		'vectors'     => $rows,
+		'battery'     => $battery_rows,
+		'stats'       => $stats,
+	),
+	JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES
+);

From bce0942d855443b3d4bdfeff6925c944a2da06c1 Mon Sep 17 00:00:00 2001
From: Jon Surrell <sirreal@users.noreply.github.com>
Date: Wed, 10 Jun 2026 23:07:49 +0200
Subject: [PATCH 3/3] Charset: Remove legacy UTF-8 survey runner

---
 handoffs/legacy-utf8-divergence-report.md     |   4 +
 .../legacy-utf8-divergence-survey-runner.php  | 365 ------------------
 2 files changed, 4 insertions(+), 365 deletions(-)
 delete mode 100644 handoffs/legacy-utf8-divergence-survey-runner.php

diff --git a/handoffs/legacy-utf8-divergence-report.md b/handoffs/legacy-utf8-divergence-report.md
index 9b26c144826c0..e9519a449ec62 100644
--- a/handoffs/legacy-utf8-divergence-report.md
+++ b/handoffs/legacy-utf8-divergence-report.md
@@ -18,6 +18,10 @@ The generated pass was deterministic: case `N` used
 php /private/tmp/legacy_utf8_divergence_survey.php 3000000 256 > /private/tmp/legacy_utf8_divergence_survey_results.json
 ```
 
+For auditability, a cleaned-up copy of the throwaway runner was committed in
+`700d7c8c910f` (`Charset: Add legacy UTF-8 survey runner`) and removed again in
+the following commit, the same commit that added this provenance note.
+
 Important current-branch note: the handoff describes
 `wp_check_invalid_utf8()` as PCRE-based. That is historically correct, but this
 checkout already contains the 6.9-era rewrite from `d1e7f5625b`, so the current
diff --git a/handoffs/legacy-utf8-divergence-survey-runner.php b/handoffs/legacy-utf8-divergence-survey-runner.php
deleted file mode 100644
index 12784fd6a2219..0000000000000
--- a/handoffs/legacy-utf8-divergence-survey-runner.php
+++ /dev/null
@@ -1,365 +0,0 @@
-<?php
-/**
- * One-shot runner for the legacy UTF-8 helper divergence report.
- *
- * Usage:
- *
- *     php handoffs/legacy-utf8-divergence-survey-runner.php 3000000 256 > /tmp/legacy_utf8_divergence_survey_results.json
- *
- * The runner expects the encoding fuzzer checkout beside this repository by
- * default, or in ENCODING_FUZZER_ROOT when provided. It deliberately loads a
- * small subset of WordPress with stubs because the report is about byte-level
- * helper behavior, not full WordPress bootstrap behavior.
- */
-
-use EncodingFuzz\Generator;
-use EncodingFuzz\Oracles;
-use EncodingFuzz\Prng;
-
-function survey_repo_root(): string {
-	return dirname( __DIR__ );
-}
-
-function survey_fuzzer_root(): string {
-	$from_env = getenv( 'ENCODING_FUZZER_ROOT' );
-	if ( is_string( $from_env ) && '' !== $from_env ) {
-		return rtrim( $from_env, '/' );
-	}
-
-	return dirname( survey_repo_root() ) . '/encoding-fuzzer';
-}
-
-$fuzzer_root = survey_fuzzer_root();
-require $fuzzer_root . '/tools/encoding-fuzz/lib/autoload.php';
-
-$GLOBALS['survey_blog_charset'] = 'UTF-8';
-
-function _deprecated_function( $function_name, $version, $replacement = '' ) {}
-function _deprecated_argument( $function_name, $version, $message = '' ) {}
-function get_option( $name ) {
-	return 'blog_charset' === $name ? $GLOBALS['survey_blog_charset'] : null;
-}
-function mbstring_binary_safe_encoding( $reset = false ) {}
-function reset_mbstring_encoding() {}
-
-require survey_repo_root() . '/src/wp-includes/compat.php';
-
-function is_utf8_charset( $blog_charset = null ) {
-	return _is_utf8_charset( $blog_charset ?? get_option( 'blog_charset' ) );
-}
-
-require survey_repo_root() . '/src/wp-includes/compat-utf8.php';
-require survey_repo_root() . '/src/wp-includes/utf8.php';
-require survey_repo_root() . '/src/wp-includes/formatting.php';
-
-function hx( string $bytes ): string {
-	return strtoupper( trim( chunk_split( bin2hex( $bytes ), 2, ' ' ) ) );
-}
-
-function visible( string $bytes ): string {
-	if ( '' === $bytes ) {
-		return "''";
-	}
-
-	return hx( $bytes );
-}
-
-function check_invalid_with_charset( string $bytes, bool $strip, string $charset ): string {
-	$GLOBALS['survey_blog_charset'] = $charset;
-
-	return wp_check_invalid_utf8_uncached( $bytes, $strip );
-}
-
-function wp_check_invalid_utf8_uncached( string $text, bool $strip ): string {
-	$text = (string) $text;
-
-	if ( 0 === strlen( $text ) ) {
-		return '';
-	}
-
-	if ( ! is_utf8_charset() || wp_is_valid_utf8( $text ) ) {
-		return $text;
-	}
-
-	return $strip ? wp_scrub_utf8( $text ) : '';
-}
-
-function invalid_class( string $bytes ): string {
-	$length = strlen( $bytes );
-	for ( $i = 0; $i < $length; $i++ ) {
-		$b0 = ord( $bytes[ $i ] );
-		if ( $b0 < 0x80 ) {
-			continue;
-		}
-
-		if ( $b0 >= 0x80 && $b0 <= 0xBF ) {
-			return 'lone continuation byte';
-		}
-
-		if ( $b0 >= 0xC0 && $b0 <= 0xC1 ) {
-			return has_continuations( $bytes, $i + 1, 1 ) ? 'overlong 2-byte sequence' : 'truncated C0/C1 lead';
-		}
-
-		if ( $b0 >= 0xC2 && $b0 <= 0xDF ) {
-			if ( ! has_continuations( $bytes, $i + 1, 1 ) ) {
-				return 'truncated 2-byte sequence';
-			}
-			$i += 1;
-			continue;
-		}
-
-		if ( 0xE0 === $b0 ) {
-			if ( ! has_continuations( $bytes, $i + 1, 2 ) ) {
-				return 'truncated 3-byte sequence';
-			}
-			if ( ord( $bytes[ $i + 1 ] ) < 0xA0 ) {
-				return 'overlong 3-byte sequence';
-			}
-			$i += 2;
-			continue;
-		}
-
-		if ( $b0 >= 0xE1 && $b0 <= 0xEC ) {
-			if ( ! has_continuations( $bytes, $i + 1, 2 ) ) {
-				return 'truncated 3-byte sequence';
-			}
-			$i += 2;
-			continue;
-		}
-
-		if ( 0xED === $b0 ) {
-			if ( ! has_continuations( $bytes, $i + 1, 2 ) ) {
-				return 'truncated 3-byte sequence';
-			}
-			if ( ord( $bytes[ $i + 1 ] ) >= 0xA0 ) {
-				return 'UTF-16 surrogate sequence';
-			}
-			$i += 2;
-			continue;
-		}
-
-		if ( $b0 >= 0xEE && $b0 <= 0xEF ) {
-			if ( ! has_continuations( $bytes, $i + 1, 2 ) ) {
-				return 'truncated 3-byte sequence';
-			}
-			$i += 2;
-			continue;
-		}
-
-		if ( 0xF0 === $b0 ) {
-			if ( ! has_continuations( $bytes, $i + 1, 3 ) ) {
-				return 'truncated 4-byte sequence';
-			}
-			if ( ord( $bytes[ $i + 1 ] ) < 0x90 ) {
-				return 'overlong 4-byte sequence';
-			}
-			$i += 3;
-			continue;
-		}
-
-		if ( $b0 >= 0xF1 && $b0 <= 0xF3 ) {
-			if ( ! has_continuations( $bytes, $i + 1, 3 ) ) {
-				return 'truncated 4-byte sequence';
-			}
-			$i += 3;
-			continue;
-		}
-
-		if ( 0xF4 === $b0 ) {
-			if ( ! has_continuations( $bytes, $i + 1, 3 ) ) {
-				return 'truncated 4-byte sequence';
-			}
-			if ( ord( $bytes[ $i + 1 ] ) > 0x8F ) {
-				return 'code point above U+10FFFF';
-			}
-			$i += 3;
-			continue;
-		}
-
-		if ( $b0 >= 0xF5 && $b0 <= 0xF7 ) {
-			return has_continuations( $bytes, $i + 1, 3 ) ? 'code point above U+10FFFF' : 'invalid F5-F7 lead';
-		}
-
-		if ( $b0 >= 0xF8 && $b0 <= 0xFB ) {
-			return has_continuations( $bytes, $i + 1, 4 ) ? 'obsolete 5-byte sequence' : 'invalid F8-FB lead';
-		}
-
-		if ( $b0 >= 0xFC && $b0 <= 0xFD ) {
-			return has_continuations( $bytes, $i + 1, 5 ) ? 'obsolete 6-byte sequence' : 'invalid FC-FD lead';
-		}
-
-		return 'FE/FF invalid lead';
-	}
-
-	return 'valid';
-}
-
-function has_continuations( string $bytes, int $start, int $count ): bool {
-	for ( $i = 0; $i < $count; $i++ ) {
-		$at = $start + $i;
-		if ( $at >= strlen( $bytes ) ) {
-			return false;
-		}
-
-		$b = ord( $bytes[ $at ] );
-		if ( ( $b & 0xC0 ) !== 0x80 ) {
-			return false;
-		}
-	}
-
-	return true;
-}
-
-function vector_row( string $name, string $bytes ): array {
-	$valid        = wp_is_valid_utf8( $bytes );
-	$seems        = seems_utf8( $bytes );
-	$scrubbed     = wp_scrub_utf8( $bytes );
-	$check_keep   = check_invalid_with_charset( $bytes, false, 'UTF-8' );
-	$check_strip  = check_invalid_with_charset( $bytes, true, 'UTF-8' );
-	$latin_keep   = check_invalid_with_charset( $bytes, false, 'ISO-8859-1' );
-	$latin_strip  = check_invalid_with_charset( $bytes, true, 'ISO-8859-1' );
-
-	return array(
-		'name'                  => $name,
-		'hex'                   => hx( $bytes ),
-		'class'                 => invalid_class( $bytes ),
-		'wp_is_valid_utf8'      => $valid,
-		'seems_utf8'            => $seems,
-		'wp_scrub_utf8_hex'     => visible( $scrubbed ),
-		'check_utf8_keep_hex'   => visible( $check_keep ),
-		'check_utf8_strip_hex'  => visible( $check_strip ),
-		'check_latin_keep_hex'  => visible( $latin_keep ),
-		'check_latin_strip_hex' => visible( $latin_strip ),
-	);
-}
-
-$vectors = array(
-	'ascii'                     => 'A',
-	'valid 2-byte lower edge'   => "\xC2\x80",
-	'valid 3-byte lower edge'   => "\xE0\xA0\x80",
-	'valid 4-byte upper edge'   => "\xF4\x8F\xBF\xBF",
-	'valid noncharacter U+FFFE' => "\xEF\xBF\xBE",
-	'valid replacement U+FFFD'  => "\xEF\xBF\xBD",
-	'lone continuation'         => "\x80",
-	'FE invalid lead'           => "\xFE",
-	'truncated 2-byte'          => "\xC2",
-	'truncated 3-byte'          => "\xE2\x8C",
-	'truncated 4-byte'          => "\xF1\x80\x80",
-	'overlong 2-byte'           => "\xC0\x80",
-	'overlong 3-byte'           => "\xE0\x80\x80",
-	'overlong 4-byte'           => "\xF0\x80\x80\x80",
-	'surrogate U+D800'          => "\xED\xA0\x80",
-	'above U+10FFFF F4'         => "\xF4\x90\x80\x80",
-	'above U+10FFFF F5'         => "\xF5\x80\x80\x80",
-	'obsolete 5-byte'           => "\xF8\x80\x80\x80\x80",
-	'obsolete 6-byte'           => "\xFC\x80\x80\x80\x80\x80",
-	'mixed invalid in text'     => "A\xC0\x80Z",
-);
-
-$rows = array();
-foreach ( $vectors as $name => $bytes ) {
-	$rows[] = vector_row( $name, $bytes );
-}
-
-$battery_rows = array();
-foreach ( Oracles::battery() as $i => $vector ) {
-	$battery_rows[] = vector_row( "battery {$i}", $vector[0] );
-}
-
-$cases     = (int) ( $argv[1] ?? 100000 );
-$max_bytes = (int) ( $argv[2] ?? 256 );
-$stats     = array(
-	'cases'                               => 0,
-	'bytes'                               => 0,
-	'strict_valid'                        => 0,
-	'strict_invalid'                      => 0,
-	'seems_accepts_strict_invalid'        => 0,
-	'seems_rejects_strict_invalid'        => 0,
-	'seems_rejects_strict_valid'          => 0,
-	'check_utf8_keep_empty_on_invalid'    => 0,
-	'check_utf8_strip_matches_scrub'      => 0,
-	'check_utf8_strip_mismatches_scrub'   => 0,
-	'check_latin1_passthrough_on_invalid' => 0,
-	'seems_accepts_invalid_by_class'      => array(),
-	'seems_rejects_invalid_by_class'      => array(),
-	'first_example_by_class'              => array(),
-	'strategy_counts'                     => array(),
-);
-
-$start = microtime( true );
-for ( $case = 0; $case < $cases; $case++ ) {
-	$prng      = new Prng( "legacy-utf8-divergence:{$case}" );
-	$generator = new Generator( $prng, $max_bytes );
-	$generated = $generator->generate();
-	$bytes     = $generated['bytes'];
-	$strategy  = $generated['strategy'];
-	$valid     = wp_is_valid_utf8( $bytes );
-	$seems     = seems_utf8( $bytes );
-	$class     = $valid ? 'valid' : invalid_class( $bytes );
-	$scrubbed  = wp_scrub_utf8( $bytes );
-
-	++$stats['cases'];
-	$stats['bytes'] += strlen( $bytes );
-	$stats['strategy_counts'][ $strategy ] = ( $stats['strategy_counts'][ $strategy ] ?? 0 ) + 1;
-
-	if ( $valid ) {
-		++$stats['strict_valid'];
-		if ( ! $seems ) {
-			++$stats['seems_rejects_strict_valid'];
-		}
-		continue;
-	}
-
-	++$stats['strict_invalid'];
-	if ( $seems ) {
-		++$stats['seems_accepts_strict_invalid'];
-		$stats['seems_accepts_invalid_by_class'][ $class ] = ( $stats['seems_accepts_invalid_by_class'][ $class ] ?? 0 ) + 1;
-	} else {
-		++$stats['seems_rejects_strict_invalid'];
-		$stats['seems_rejects_invalid_by_class'][ $class ] = ( $stats['seems_rejects_invalid_by_class'][ $class ] ?? 0 ) + 1;
-	}
-
-	if ( ! isset( $stats['first_example_by_class'][ $class ] ) ) {
-		$stats['first_example_by_class'][ $class ] = hx( strlen( $bytes ) > 24 ? substr( $bytes, 0, 24 ) : $bytes );
-	}
-
-	if ( '' === check_invalid_with_charset( $bytes, false, 'UTF-8' ) ) {
-		++$stats['check_utf8_keep_empty_on_invalid'];
-	}
-
-	if ( check_invalid_with_charset( $bytes, true, 'UTF-8' ) === $scrubbed ) {
-		++$stats['check_utf8_strip_matches_scrub'];
-	} else {
-		++$stats['check_utf8_strip_mismatches_scrub'];
-	}
-
-	if (
-		check_invalid_with_charset( $bytes, false, 'ISO-8859-1' ) === $bytes &&
-		check_invalid_with_charset( $bytes, true, 'ISO-8859-1' ) === $bytes
-	) {
-		++$stats['check_latin1_passthrough_on_invalid'];
-	}
-}
-
-ksort( $stats['seems_accepts_invalid_by_class'] );
-ksort( $stats['seems_rejects_invalid_by_class'] );
-ksort( $stats['strategy_counts'] );
-
-$stats['elapsed_sec'] = round( microtime( true ) - $start, 3 );
-
-echo json_encode(
-	array(
-		'environment' => array(
-			'php'       => PHP_VERSION,
-			'mbstring'  => extension_loaded( 'mbstring' ),
-			'intl'      => extension_loaded( 'intl' ),
-			'pcre_u'    => _wp_can_use_pcre_u(),
-			'cases'     => $cases,
-			'max_bytes' => $max_bytes,
-		),
-		'vectors'     => $rows,
-		'battery'     => $battery_rows,
-		'stats'       => $stats,
-	),
-	JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES
-);