From 9ca76c85c8ce225f68d3f8e299e2f1c444c51c49 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 12:34:02 +0200 Subject: [PATCH 1/2] Charset: Limit UTF-8 ASCII scan to code point budget When _wp_scan_utf8() is called with max_code_points but no max_bytes, the ASCII fast path previously called strspn() across the entire remaining ASCII run before checking the code point limit. This made _wp_utf8_codepoint_span( large ASCII text, 0, 5 ) scan the full string. Bound strspn() by the remaining code point budget. ASCII code points are one byte, so this preserves the returned span and found count while avoiding work past the budget. Local benchmark, 10 MB ASCII _wp_utf8_codepoint_span( ..., 0, 5 ): original 3.183709 ms, patched 0.001250 ms. Differential fuzz: 189847 span/found cases and 2750 valid mb_substr sanity cases, no mismatches. --- src/wp-includes/compat-utf8.php | 2 +- .../tests/compat/wpUtf8CodePointSpan.php | 104 ++++++++++++++++++ 2 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 tests/phpunit/tests/compat/wpUtf8CodePointSpan.php diff --git a/src/wp-includes/compat-utf8.php b/src/wp-includes/compat-utf8.php index e1cab36ea3244..5fa8cde158789 100644 --- a/src/wp-includes/compat-utf8.php +++ b/src/wp-includes/compat-utf8.php @@ -65,7 +65,7 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f", $i, - $end - $i + min( $end - $i, $max_count - $count ) ); if ( $count + $ascii_byte_count >= $max_count ) { diff --git a/tests/phpunit/tests/compat/wpUtf8CodePointSpan.php b/tests/phpunit/tests/compat/wpUtf8CodePointSpan.php new file mode 100644 index 0000000000000..4464f7da824d1 --- /dev/null +++ b/tests/phpunit/tests/compat/wpUtf8CodePointSpan.php @@ -0,0 +1,104 @@ +assertSame( + $expected_span, + _wp_utf8_codepoint_span( $text, $byte_offset, $max_code_points, $found_code_points ), + 'Should have found the expected byte span.' + ); + + $this->assertSame( + $expected_found, + $found_code_points, + 'Should have reported the expected number of code points.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_codepoint_spans() { + $long_ascii_run = str_repeat( 'a', 1024 ); + + return array( + 'zero code point budget' => array( + 'abcdef', + 0, + 0, + 0, + 0, + ), + 'long ASCII run at start' => array( + $long_ascii_run, + 0, + 5, + 5, + 5, + ), + 'long ASCII run from non-zero offset' => array( + "zz{$long_ascii_run}", + 2, + 5, + 5, + 5, + ), + 'multibyte character before the boundary' => array( + "ab\u{1F170}cd", + 0, + 2, + 2, + 2, + ), + 'multibyte character at the boundary' => array( + "ab\u{1F170}cd", + 0, + 3, + strlen( "ab\u{1F170}" ), + 3, + ), + 'invalid span after the boundary' => array( + "ab\xF0\x9Fzz", + 0, + 2, + 2, + 2, + ), + 'invalid span at the boundary' => array( + "ab\xF0\x9Fzz", + 0, + 3, + 4, + 3, + ), + ); + } +} From a70518d2d6f1c63b857800aab3987117fdf5b6c5 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 13:13:40 +0200 Subject: [PATCH 2/2] Tests: Refine UTF-8 code point span docs --- tests/phpunit/tests/compat/wpUtf8CodePointSpan.php | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/phpunit/tests/compat/wpUtf8CodePointSpan.php b/tests/phpunit/tests/compat/wpUtf8CodePointSpan.php index 4464f7da824d1..da66095ce79af 100644 --- a/tests/phpunit/tests/compat/wpUtf8CodePointSpan.php +++ b/tests/phpunit/tests/compat/wpUtf8CodePointSpan.php @@ -5,8 +5,6 @@ * @package WordPress * @subpackage Charset * - * @since 6.9.0 - * * @group compat * * @covers ::_wp_utf8_codepoint_span() @@ -15,8 +13,6 @@ class Tests_Compat_wpUtf8CodePointSpan extends WP_UnitTestCase { /** * Ensures that the span accounts for the requested number of code points. * - * @ticket 63863 - * * @dataProvider data_codepoint_spans * * @param string $text @@ -44,7 +40,7 @@ public function test_finds_codepoint_spans( string $text, int $byte_offset, int /** * Data provider. * - * @return array[] + * @return array */ public static function data_codepoint_spans() { $long_ascii_run = str_repeat( 'a', 1024 );