diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php
index d14009d3d9fb8..e3da947bf952f 100644
--- a/src/wp-includes/html-api/class-wp-html-decoder.php
+++ b/src/wp-includes/html-api/class-wp-html-decoder.php
@@ -378,12 +378,14 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
 		 * character reference table but the match doesn't end in `;`.
 		 * It may be allowed if it's followed by something unambiguous.
 		 */
+		$follower_byte = $after_name < $length ? ord( $text[ $after_name ] ) : null;
 		$ambiguous_follower = (
-			$after_name < $length &&
-			$name_at < $length &&
+			null !== $follower_byte &&
 			(
-				ctype_alnum( $text[ $after_name ] ) ||
-				'=' === $text[ $after_name ]
+				( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) ||
+				( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) ||
+				( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) ||
+				0x3D === $follower_byte
 			)
 		);
 
diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php
index 97954f4eb3e30..2d46ee39753be 100644
--- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php
+++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php
@@ -61,6 +61,80 @@ static function ( int $errno, string $errstr ) use ( &$errors ) {
 		$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
 	}
 
+	/**
+	 * Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes.
+	 */
+	public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower() {
+		$previous_locale = setlocale( LC_CTYPE, 0 );
+		$affected_locale = setlocale( LC_CTYPE, 'C.UTF-8', 'en_US.UTF-8', 'de_DE.UTF-8', 'fr_FR.UTF-8' );
+
+		if ( false === $affected_locale || ! ctype_alnum( "\xC2" ) ) {
+			if ( false !== $previous_locale ) {
+				setlocale( LC_CTYPE, $previous_locale );
+			}
+
+			$this->markTestSkipped( 'Requires an LC_CTYPE locale where ctype_alnum() classifies high-bit bytes as alphanumeric.' );
+		}
+
+		$raw_attribute = "&Aacute\xC2\x80";
+
+		try {
+			$this->assertSame(
+				"\xC3\x81\xC2\x80",
+				WP_HTML_Decoder::decode_attribute( $raw_attribute ),
+				'Should have decoded the semicolonless legacy reference before a multibyte follower.'
+			);
+
+			$match_byte_length = null;
+			$this->assertSame(
+				"\xC3\x81",
+				WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ),
+				'Should have matched the semicolonless legacy reference before a multibyte follower.'
+			);
+			$this->assertSame( strlen( '&Aacute' ), $match_byte_length );
+		} finally {
+			if ( false !== $previous_locale ) {
+				setlocale( LC_CTYPE, $previous_locale );
+			}
+		}
+	}
+
+	/**
+	 * Ensures semicolonless legacy references remain ambiguous before ASCII alnum or equals.
+	 *
+	 * @dataProvider data_ambiguous_ascii_attribute_followers
+	 *
+	 * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower.
+	 */
+	public function test_semicolonless_legacy_reference_before_ascii_attribute_follower_is_ambiguous( $raw_attribute ) {
+		$this->assertSame(
+			$raw_attribute,
+			WP_HTML_Decoder::decode_attribute( $raw_attribute ),
+			'Should not have decoded an ambiguous semicolonless legacy reference.'
+		);
+
+		$match_byte_length = 'sentinel';
+		$this->assertNull(
+			WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length ),
+			'Should not have matched an ambiguous semicolonless legacy reference.'
+		);
+		$this->assertSame( 'sentinel', $match_byte_length );
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * @return array[].
+	 */
+	public static function data_ambiguous_ascii_attribute_followers() {
+		return array(
+			'ASCII digit' => array( '&Aacute0' ),
+			'ASCII uppercase alpha' => array( '&AacuteA' ),
+			'ASCII lowercase alpha' => array( '&Aacutea' ),
+			'equals' => array( '&Aacute=' ),
+		);
+	}
+
 	/**
 	 * Ensures proper detection of attribute prefixes ignoring ASCII case.
 	 *