sirreal · sirreal · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php
@@ -195,6 +195,8 @@ public static function decode( $context, $text ): string {
 	 *     7    === $token_length; // `&notin;`
 	 *
 	 * @since 6.6.0
+	 * @since 7.1.0 Detects ambiguous followers of semicolon-less references
+	 *              by ASCII classification only, independent of the locale.
 	 *
 	 * @global WP_Token_Map $html5_named_character_references Mappings for HTML5 named character references.
 	 *
@@ -377,14 +379,20 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
 		 * At this point though there's a match for an entry in the named
 		 * character reference table but the match doesn't end in `;`.
 		 * It may be allowed if it's followed by something unambiguous.
+		 *
+		 * Only an ASCII alphanumeric or U+003D EQUALS SIGN is ambiguous.
+		 * `ctype_alnum()` must be avoided here: its classification of
+		 * bytes 0x80 and above depends on the process locale, but only
+		 * these specific ASCII characters prevent decoding.
+		 *
+		 * @see https://html.spec.whatwg.org/#named-character-reference-state
 		 */
+		$follower           = $after_name < $length ? $text[ $after_name ] : '';
 		$ambiguous_follower = (
-			$after_name < $length &&
-			$name_at < $length &&
-			(
-				ctype_alnum( $text[ $after_name ] ) ||
-				'=' === $text[ $after_name ]
-			)
+			( 'a' <= $follower && 'z' >= $follower ) ||
+			( 'A' <= $follower && 'Z' >= $follower ) ||
+			( '0' <= $follower && '9' >= $follower ) ||
+			'=' === $follower
 		);
 
 		// It's non-ambiguous, safe to leave it in.

diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -5254,6 +5254,8 @@ public function get_namespace(): string {
 	 *     $processor->get_tag() === null;
 	 *
 	 * @since 6.4.0
+	 * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD,
+	 *              matching the tokenizer replacement browsers apply.
 	 *
 	 * @return string|null Name of currently matched tag in input HTML, or `null` if none found.
 	 */
@@ -5315,6 +5317,8 @@ public function has_self_closing_flag(): bool {
 	 * of the document without matching a token.
 	 *
 	 * @since 6.6.0 Subclassed for the HTML Processor.
+	 * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD,
+	 *              matching the tokenizer replacement browsers apply.
 	 *
 	 * @return string|null Name of the matched token.
 	 */

diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -1174,7 +1174,13 @@ public function paused_at_incomplete_token(): bool {
 	 *     }
 	 *     // Outputs: "free <egg> lang-en "
 	 *
+	 * Class names from the input document already carry the tokenizer's
+	 * U+FFFD replacement of NULL bytes through `get_attribute()`; values
+	 * supplied through the API are returned verbatim, as `Element.classList`
+	 * does in the DOM.
+	 *
 	 * @since 6.4.0
+	 * @since 7.1.0 No longer replaces NULL bytes in API-supplied class values.
 	 *
 	 * @return Generator<int, non-empty-string>
 	 */
@@ -1208,7 +1214,7 @@ public function class_list() {
 				return;
 			}
 
-			$name = str_replace( "\x00", "\u{FFFD}", substr( $class, $at, $length ) );
+			$name = substr( $class, $at, $length );
 			if ( $is_quirks ) {
 				$name = strtolower( $name );
 			}
@@ -2231,9 +2237,16 @@ private function parse_next_attribute(): bool {
 		 * > case-insensitive match for each other.
 		 *     - HTML 5 spec
 		 *
+		 * The tokenizer would have replaced U+0000 NULL bytes in attribute
+		 * names with U+FFFD, so names which differ only by those bytes are
+		 * duplicates. The replacement applies to the comparable name — a
+		 * comparison artifact — while the raw span in the document remains
+		 * untouched.
+		 *
 		 * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
+		 * @see https://html.spec.whatwg.org/#attribute-name-state
 		 */
-		$comparable_name = strtolower( $attribute_name );
+		$comparable_name = strtolower( str_replace( "\x00", "\u{FFFD}", $attribute_name ) );
 
 		// If an attribute is listed many times, only use the first declaration and ignore the rest.
 		if ( ! isset( $this->attributes[ $comparable_name ] ) ) {
@@ -2359,13 +2372,7 @@ private function class_name_updates_to_attributes_updates(): void {
 		}
 
 		if ( false === $existing_class && isset( $this->attributes['class'] ) ) {
-			$existing_class = WP_HTML_Decoder::decode_attribute(
-				substr(
-					$this->html,
-					$this->attributes['class']->value_starts_at,
-					$this->attributes['class']->value_length
-				)
-			);
+			$existing_class = $this->get_decoded_source_attribute_value( $this->attributes['class'] );
 		}
 
 		if ( false === $existing_class ) {
@@ -2771,6 +2778,11 @@ private function get_enqueued_attribute_value( string $comparable_name ) {
 	 *     $p->get_attribute( 'class' ) === null;
 	 *
 	 * @since 6.2.0
+	 * @since 7.1.0 Applies input-stream preprocessing: newlines in the source value
+	 *              are normalized and NULL bytes are replaced with U+FFFD, as
+	 *              browsers do before decoding character references. Attributes
+	 *              whose source name contains a NULL byte are addressed by the
+	 *              name with U+FFFD in its place, as in the DOM.
 	 *
 	 * @param string $name Name of attribute whose value is requested.
 	 * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
@@ -2793,7 +2805,7 @@ public function get_attribute( $name ) {
 		 * attribute values. If any exist, those enqueued class changes must first be flushed out
 		 * into an attribute value update.
 		 */
-		if ( 'class' === $name ) {
+		if ( 'class' === $comparable ) {
 			$this->class_name_updates_to_attributes_updates();
 		}
 
@@ -2824,9 +2836,58 @@ public function get_attribute( $name ) {
 			return true;
 		}
 
+		return $this->get_decoded_source_attribute_value( $attribute );
+	}
+
+	/**
+	 * Returns the decoded value of an attribute found in the input document.
+	 *
+	 * The Tag Processor defers the HTML input-stream preprocessing and the
+	 * tokenizer's replacements while scanning, so they are applied here,
+	 * when a value is read out of the document: newlines are normalized
+	 * before character references decode, and U+0000 NULL bytes become
+	 * U+FFFD. Both replacements operate on bytes. A browser decodes the
+	 * byte stream into characters first, so its output may differ when
+	 * a NULL byte sits inside an invalid UTF-8 sequence.
+	 *
+	 * @since 7.1.0
+	 *
+	 * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
+	 * @see https://html.spec.whatwg.org/#attribute-value-(double-quoted)-state
+	 *
+	 * @param WP_HTML_Attribute_Token $attribute Attribute token from the input document.
+	 * @return string Decoded attribute value.
+	 */
+	private function get_decoded_source_attribute_value( WP_HTML_Attribute_Token $attribute ): string {
 		$raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length );
 
-		return WP_HTML_Decoder::decode_attribute( $raw_value );
+		/*
+		 * Newline normalization is part of preprocessing the input stream
+		 * and so runs before decoding: a carriage return produced by the
+		 * `&#13;` reference must survive. CRLF pairs are collapsed first so
+		 * that each becomes one newline instead of two. The `strpos()`
+		 * guard skips both replacement passes when no carriage return exists.
+		 */
+		if ( false !== strpos( $raw_value, "\r" ) ) {
+			$raw_value = str_replace( "\r\n", "\n", $raw_value );
+			$raw_value = str_replace( "\r", "\n", $raw_value );
+		}
+
+		$decoded_value = WP_HTML_Decoder::decode_attribute( $raw_value );
+
+		/*
+		 * NULL bytes are replaced only after decoding. While a reference
+		 * is being parsed the tokenizer still sees the raw NULL byte, an
+		 * unambiguous follower for a reference lacking its semicolon, and
+		 * no character reference decodes into NULL. Replacing afterwards
+		 * therefore yields the same value and cannot disturb how the
+		 * references parse.
+		 */
+		if ( false !== strpos( $decoded_value, "\x00" ) ) {
+			$decoded_value = str_replace( "\x00", "\u{FFFD}", $decoded_value );
+		}
+
+		return $decoded_value;
 	}
 
 	/**
@@ -2849,6 +2910,10 @@ public function get_attribute( $name ) {
 	 *     $p->get_attribute_names_with_prefix( 'data-' ) === null;
 	 *
 	 * @since 6.2.0
+	 * @since 7.1.0 NULL bytes in source attribute names are returned as U+FFFD,
+	 *              matching the tokenizer replacement browsers apply. The prefix
+	 *              is matched verbatim against these replaced names; a prefix
+	 *              containing a NULL byte matches nothing.
 	 *
 	 * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
 	 *
@@ -2898,6 +2963,8 @@ public function get_namespace(): string {
 	 *     $p->get_tag() === null;
 	 *
 	 * @since 6.2.0
+	 * @since 7.1.0 NULL bytes in the source tag name are returned as U+FFFD,
+	 *              matching the tokenizer replacement browsers apply.
 	 *
 	 * @return string|null Name of currently matched tag in input HTML, or `null` if none found.
 	 */
@@ -2906,7 +2973,15 @@ public function get_tag(): ?string {
 			return null;
 		}
 
-		$tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
+		/*
+		 * The tokenizer would have replaced U+0000 NULL bytes in the tag
+		 * name with U+FFFD; this is deferred to this read boundary. The
+		 * replacement never applies to internal identification, which
+		 * compares raw bytes (`scr\x00ipt` is not SCRIPT in browsers either).
+		 *
+		 * @see https://html.spec.whatwg.org/#tag-name-state
+		 */
+		$tag_name = str_replace( "\x00", "\u{FFFD}", substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ) );
 
 		if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
 			return strtoupper( $tag_name );
@@ -2927,6 +3002,8 @@ public function get_tag(): ?string {
 	 * account the current parsing context, whether HTML, SVG, or MathML.
 	 *
 	 * @since 6.7.0
+	 * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD,
+	 *              matching the tokenizer replacement browsers apply.
 	 *
 	 * @return string|null Name of current tag name.
 	 */
@@ -3425,6 +3502,8 @@ public function get_token_type(): ?string {
 	 * of the document without matching a token.
 	 *
 	 * @since 6.5.0
+	 * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD,
+	 *              matching the tokenizer replacement browsers apply.
 	 *
 	 * @return string|null Name of the matched token.
 	 */
@@ -3782,6 +3861,12 @@ public function get_modifiable_text(): string {
 	 *     // Renders as “Eggs &amp; Milk” in a browser, encoded as `<p>Eggs &amp;amp; Milk</p>`.
 	 *     $processor->set_modifiable_text( 'Eggs &amp; Milk' );
 	 *
+	 * Note: unlike attribute values set through `set_attribute()`, which read
+	 * back verbatim, text set through this method currently reads back through
+	 * `get_modifiable_text()` with newlines normalized and NULL bytes handled
+	 * as if the text had come from the input document. In the DOM, API-supplied
+	 * text round-trips verbatim; this asymmetry is a known limitation.
+	 *
 	 * @since 6.7.0
 	 * @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping.
 	 *
@@ -4770,14 +4855,37 @@ private function matches(): bool {
 		}
 
 		// Does the tag name match the requested tag name in a case-insensitive manner?
-		if (
-			isset( $this->sought_tag_name ) &&
-			(
-				strlen( $this->sought_tag_name ) !== $this->tag_name_length ||
-				0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true )
-			)
-		) {
-			return false;
+		if ( isset( $this->sought_tag_name ) ) {
+			$tag_name_matches = (
+				strlen( $this->sought_tag_name ) === $this->tag_name_length &&
+				0 === substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true )
+			);
+
+			/*
+			 * Names are matched in the same alphabet `get_tag()` exposes,
+			 * where U+0000 NULL bytes appear as U+FFFD: a sought name
+			 * containing U+FFFD matches source names with NULL bytes in
+			 * its place, and a sought name containing a NULL byte matches
+			 * nothing, since no exposed name contains one. The byte
+			 * comparison above already agrees for names without NULL
+			 * bytes, so this only resolves the rare disagreements.
+			 */
+			if ( $tag_name_matches ) {
+				$tag_name_matches = false === strpos( $this->sought_tag_name, "\x00" );
+			} elseif ( false !== strpos( $this->sought_tag_name, "\u{FFFD}" ) ) {
+				$raw_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
+				if ( false !== strpos( $raw_name, "\x00" ) ) {
+					$exposed_name     = str_replace( "\x00", "\u{FFFD}", $raw_name );
+					$tag_name_matches = (
+						strlen( $this->sought_tag_name ) === strlen( $exposed_name ) &&
+						0 === substr_compare( $exposed_name, $this->sought_tag_name, 0, strlen( $exposed_name ), true )
+					);
+				}
+			}
+
+			if ( ! $tag_name_matches ) {
+				return false;
+			}
 		}
 
 		if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {

diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php
@@ -61,6 +61,84 @@ static function ( int $errno, string $errstr ) use ( &$errors ) {
 		$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
 	}
 
+	/**
+	 * Ensures that numeric character references for U+0000 decode to U+FFFD
+	 * while raw NULL bytes pass through the decoder untransformed.
+	 *
+	 * The tokenizer, not the decoder, is responsible for replacing raw NULL
+	 * bytes, so `decode_attribute()` must return them untouched; in the Tag
+	 * Processor the methods reading values out of the document replace them.
+	 *
+	 * @ticket 65372
+	 *
+	 * @dataProvider data_null_code_points
+	 *
+	 * @param string $raw_value     Raw attribute value passed to the decoder.
+	 * @param string $decoded_value Value the decoder is expected to return.
+	 */
+	public function test_null_code_points_in_attribute_values( string $raw_value, string $decoded_value ) {
+		$this->assertSame(
+			$decoded_value,
+			WP_HTML_Decoder::decode_attribute( $raw_value ),
+			'Improperly decoded raw attribute value.'
+		);
+	}
+
+	/**
+	 * Data provider: attribute values containing NULL and their decodings.
+	 *
+	 * @return array[] Pairs of raw value and expected decoded value, keyed by label.
+	 */
+	public static function data_null_code_points() {
+		return array(
+			'Decimal zero'                 => array( 'a&#0;b', "a\u{FFFD}b" ),
+			'Hexadecimal zero'             => array( 'a&#x0;b', "a\u{FFFD}b" ),
+			'Multiple zeros'               => array( 'a&#0000;b', "a\u{FFFD}b" ),
+			'Raw NULL byte passes through' => array( "a\x00b", "a\x00b" ),
+		);
+	}
+
+	/**
+	 * Ensures that the ambiguous-follower check for character references
+	 * lacking a terminating semicolon treats only ASCII alphanumerics and
+	 * the equals sign as ambiguous, leaving non-ASCII followers decodable.
+	 *
+	 * `ctype_alnum()` may classify bytes 0x80 and above as alphanumeric,
+	 * depending on the process locale, which would wrongly suppress decodes
+	 * followed by non-ASCII bytes. This test runs under the ambient locale.
+	 *
+	 * @ticket 65372
+	 *
+	 * @see https://html.spec.whatwg.org/#named-character-reference-state
+	 *
+	 * @dataProvider data_semicolon_less_references_with_followers
+	 *
+	 * @param string $raw_value     Raw attribute value passed to the decoder.
+	 * @param string $decoded_value Value the decoder is expected to return.
+	 */
+	public function test_semicolon_less_reference_followers( string $raw_value, string $decoded_value ) {
+		$this->assertSame(
+			$decoded_value,
+			WP_HTML_Decoder::decode_attribute( $raw_value ),
+			'Improperly decoded raw attribute value.'
+		);
+	}
+
+	/**
+	 * Data provider: `&amp` without a semicolon, followed by various bytes.
+	 *
+	 * @return array[] Pairs of raw value and expected decoded value, keyed by label.
+	 */
+	public static function data_semicolon_less_references_with_followers() {
+		return array(
+			'U+FFFD follower decodes'            => array( "x&amp\u{FFFD};y", "x&\u{FFFD};y" ),
+			'Non-ASCII follower decodes'         => array( "x&amp\u{E9}y", "x&\u{E9}y" ),
+			'ASCII letter follower is ambiguous' => array( 'x&ampzy', 'x&ampzy' ),
+			'ASCII digit follower is ambiguous'  => array( 'x&amp1y', 'x&amp1y' ),
+			'Equals sign follower is ambiguous'  => array( 'x&amp=y', 'x&amp=y' ),
+		);
+	}
+
 	/**
 	 * Ensures proper detection of attribute prefixes ignoring ASCII case.
 	 *