diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php
index d14009d3d9fb8..e3da947bf952f 100644
--- a/src/wp-includes/html-api/class-wp-html-decoder.php
+++ b/src/wp-includes/html-api/class-wp-html-decoder.php
@@ -378,12 +378,14 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
* character reference table but the match doesn't end in `;`.
* It may be allowed if it's followed by something unambiguous.
*/
+ $follower_byte = $after_name < $length ? ord( $text[ $after_name ] ) : null;
$ambiguous_follower = (
- $after_name < $length &&
- $name_at < $length &&
+ null !== $follower_byte &&
(
- ctype_alnum( $text[ $after_name ] ) ||
- '=' === $text[ $after_name ]
+ ( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) ||
+ ( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) ||
+ ( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) ||
+ 0x3D === $follower_byte
)
);
diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php
index 97954f4eb3e30..2d46ee39753be 100644
--- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php
+++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php
@@ -61,6 +61,80 @@ static function ( int $errno, string $errstr ) use ( &$errors ) {
$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
}
+	/**
+	 * Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes.
+	 */
+	public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower() {
+		$previous_locale = setlocale( LC_CTYPE, 0 );
+		$affected_locale = setlocale( LC_CTYPE, 'C.UTF-8', 'en_US.UTF-8', 'de_DE.UTF-8', 'fr_FR.UTF-8' );
+
+		if ( false === $affected_locale || ! ctype_alnum( "\xC2" ) ) {
+			if ( false !== $previous_locale ) {
+				setlocale( LC_CTYPE, $previous_locale );
+			}
+
+			$this->markTestSkipped( 'Requires an LC_CTYPE locale where ctype_alnum() classifies high-bit bytes as alphanumeric.' );
+		}
+
+		// The input must hold the raw `&Aacute` reference (no semicolon) directly followed by the UTF-8 bytes of U+0080.
+		$raw_attribute = "&Aacute\xC2\x80";
+
+		try {
+			$this->assertSame(
+				"\xC3\x81\xC2\x80",
+				WP_HTML_Decoder::decode_attribute( $raw_attribute ),
+				'Should have decoded the semicolonless legacy reference before a multibyte follower.'
+			);
+
+			$match_byte_length = null;
+			$replacement       = WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $match_byte_length );
+			$this->assertSame( "\xC3\x81", $replacement, 'Should have matched the semicolonless legacy reference before a multibyte follower.' );
+
+			// The reported match must cover only `&Aacute`, excluding the follower bytes.
+			$this->assertSame( strlen( '&Aacute' ), $match_byte_length );
+		} finally {
+			if ( false !== $previous_locale ) {
+				setlocale( LC_CTYPE, $previous_locale );
+			}
+		}
+	}
+
+	/**
+	 * Ensures a semicolonless legacy reference stays undecoded before an ASCII alphanumeric or `=`.
+	 *
+	 * Checks both the attribute decoder and the single-reference reader.
+	 *
+	 * @dataProvider data_ambiguous_ascii_attribute_followers
+	 *
+	 * @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower.
+	 */
+	public function test_semicolonless_legacy_reference_before_ascii_attribute_follower_is_ambiguous( $raw_attribute ) {
+		$decoded = WP_HTML_Decoder::decode_attribute( $raw_attribute );
+		$this->assertSame( $raw_attribute, $decoded, 'Should not have decoded an ambiguous semicolonless legacy reference.' );
+
+		// Seed the by-reference length with a marker so any write to it is detectable.
+		$matched_length = 'sentinel';
+		$replacement    = WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $matched_length );
+		$this->assertNull( $replacement, 'Should not have matched an ambiguous semicolonless legacy reference.' );
+
+		// A rejected match is expected to leave the length argument untouched.
+		$this->assertSame( 'sentinel', $matched_length );
+	}
+
+	/**
+	 * Data provider: `&Aacute` without a semicolon, followed by an ASCII alphanumeric or `=`.
+	 *
+	 * @return array[].
+	 */
+	public static function data_ambiguous_ascii_attribute_followers() {
+		return array(
+			'ASCII digit'           => array( '&Aacute0' ),
+			'ASCII uppercase alpha' => array( '&AacuteA' ),
+			'ASCII lowercase alpha' => array( '&Aacutea' ),
+			'equals'                => array( '&Aacute=' ),
+		);
+	}
+
/**
* Ensures proper detection of attribute prefixes ignoring ASCII case.
*