diff --git a/src/wp-includes/compat-utf8.php b/src/wp-includes/compat-utf8.php
index e1cab36ea3244..5fa8cde158789 100644
--- a/src/wp-includes/compat-utf8.php
+++ b/src/wp-includes/compat-utf8.php
@@ -65,7 +65,7 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
 			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
 			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
 			$i,
-			$end - $i
+			min( $end - $i, $max_count - $count ) // Scan no further than the remaining bytes or the remaining count budget.
 		);
 
 		if ( $count + $ascii_byte_count >= $max_count ) {
diff --git a/tests/phpunit/tests/compat/wpUtf8CodePointSpan.php b/tests/phpunit/tests/compat/wpUtf8CodePointSpan.php
new file mode 100644
index 0000000000000..da66095ce79af
--- /dev/null
+++ b/tests/phpunit/tests/compat/wpUtf8CodePointSpan.php
@@ -0,0 +1,105 @@
+assertSame(
+			$expected_span,
+			_wp_utf8_codepoint_span( $text, $byte_offset, $max_code_points, $found_code_points ),
+			'Should have found the expected byte span.'
+		);
+
+		$this->assertSame(
+			$expected_found,
+			$found_code_points,
+			'Should have reported the expected number of code points.'
+		);
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * Each dataset appears to supply, in order: the text to scan, the starting byte
+	 * offset, the code point budget, the expected byte span, and the expected number
+	 * of code points found. The order is inferred from the values below; confirm it
+	 * against the test method's signature.
+	 *
+	 * @return array[] Datasets keyed by a description of the scenario.
+	 */
+	public static function data_codepoint_spans() {
+		$long_ascii_run = str_repeat( 'a', 1024 );
+
+		return array(
+			'zero code point budget'                 => array(
+				'abcdef',
+				0,
+				0,
+				0,
+				0,
+			),
+			'long ASCII run at start'                => array(
+				$long_ascii_run,
+				0,
+				5,
+				5,
+				5,
+			),
+			'long ASCII run from non-zero offset'    => array(
+				"zz{$long_ascii_run}",
+				2,
+				5,
+				5,
+				5,
+			),
+			'multibyte character after the boundary' => array(
+				"ab\u{1F170}cd",
+				0,
+				2,
+				2,
+				2,
+			),
+			'multibyte character at the boundary'    => array(
+				"ab\u{1F170}cd",
+				0,
+				3,
+				strlen( "ab\u{1F170}" ),
+				3,
+			),
+			'invalid span after the boundary'        => array(
+				"ab\xF0\x9Fzz",
+				0,
+				2,
+				2,
+				2,
+			),
+			'invalid span at the boundary'           => array(
+				"ab\xF0\x9Fzz",
+				0,
+				3,
+				4,
+				3,
+			),
+		);
+	}
+}