Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
cee0661
HTML API: Add tests for attribute value input preprocessing.
sirreal Jun 11, 2026
82a26aa
HTML API: Apply input preprocessing in get_attribute().
sirreal Jun 11, 2026
48d8fb4
HTML API: Add tests for class updates over preprocessed values.
sirreal Jun 11, 2026
d1f852c
HTML API: Apply input preprocessing when flushing class updates.
sirreal Jun 11, 2026
020155a
HTML API: Add tests for NULL bytes in attribute names.
sirreal Jun 11, 2026
442e820
HTML API: Replace NULL bytes in comparable attribute names.
sirreal Jun 11, 2026
135157f
HTML API: Add tests for NULL bytes in tag names.
sirreal Jun 11, 2026
5b8ad27
HTML API: Replace NULL bytes in tag names at the read boundary.
sirreal Jun 11, 2026
f6f58fd
HTML API: Add test for NULL bytes in API-supplied class values.
sirreal Jun 11, 2026
ba93ef4
HTML API: Stop replacing NULL bytes in API-supplied class values.
sirreal Jun 11, 2026
9baceb6
HTML API: Avoid re-scanning attribute values without CR or NULL bytes.
sirreal Jun 11, 2026
3b415d1
HTML API: Add tests for character references preceding replaced bytes.
sirreal Jun 11, 2026
8f5e8b2
HTML API: Replace NULL bytes after decoding attribute values.
sirreal Jun 11, 2026
e18f389
HTML API: Detect ambiguous character reference followers by ASCII only.
sirreal Jun 11, 2026
449bf72
HTML API: Add tests for tag-name queries over replaced names.
sirreal Jun 11, 2026
5c52634
HTML API: Match tag-name queries against replaced names.
sirreal Jun 11, 2026
5292c7d
HTML API: Add test for case-insensitive class update flushing.
sirreal Jun 11, 2026
8c26adf
HTML API: Flush class updates for any case spelling of "class".
sirreal Jun 11, 2026
e41d168
HTML API: Pin edge cases of replaced names and document boundaries.
sirreal Jun 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions src/wp-includes/html-api/class-wp-html-decoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ public static function decode( $context, $text ): string {
* 7 === $token_length; // `∉`
*
* @since 6.6.0
* @since 7.1.0 Detects ambiguous followers of semicolon-less references
* by ASCII classification only, independent of the locale.
*
* @global WP_Token_Map $html5_named_character_references Mappings for HTML5 named character references.
*
Expand Down Expand Up @@ -377,14 +379,20 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
* At this point though there's a match for an entry in the named
* character reference table but the match doesn't end in `;`.
* It may be allowed if it's followed by something unambiguous.
*
* Only an ASCII alphanumeric or U+003D EQUALS SIGN is ambiguous.
* `ctype_alnum()` must be avoided here: its classification of
* bytes 0x80 and above depends on the process locale, but only
* these specific ASCII characters prevent decoding.
*
* @see https://html.spec.whatwg.org/#named-character-reference-state
*/
$follower = $after_name < $length ? $text[ $after_name ] : '';
$ambiguous_follower = (
$after_name < $length &&
$name_at < $length &&
(
ctype_alnum( $text[ $after_name ] ) ||
'=' === $text[ $after_name ]
)
( 'a' <= $follower && 'z' >= $follower ) ||
( 'A' <= $follower && 'Z' >= $follower ) ||
( '0' <= $follower && '9' >= $follower ) ||
'=' === $follower
);

// It's non-ambiguous, safe to leave it in.
Expand Down
4 changes: 4 additions & 0 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -5254,6 +5254,8 @@ public function get_namespace(): string {
* $processor->get_tag() === null;
*
* @since 6.4.0
* @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD,
* matching the tokenizer replacement browsers apply.
*
* @return string|null Name of currently matched tag in input HTML, or `null` if none found.
*/
Expand Down Expand Up @@ -5315,6 +5317,8 @@ public function has_self_closing_flag(): bool {
* of the document without matching a token.
*
* @since 6.6.0 Subclassed for the HTML Processor.
* @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD,
* matching the tokenizer replacement browsers apply.
*
* @return string|null Name of the matched token.
*/
Expand Down
148 changes: 128 additions & 20 deletions src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1174,7 +1174,13 @@ public function paused_at_incomplete_token(): bool {
* }
* // Outputs: "free <egg> lang-en "
*
* Class names from the input document already carry the tokenizer's
* U+FFFD replacement of NULL bytes through `get_attribute()`; values
* supplied through the API are returned verbatim, as `Element.classList`
* does in the DOM.
*
* @since 6.4.0
* @since 7.1.0 No longer replaces NULL bytes in API-supplied class values.
*
* @return Generator<int, non-empty-string>
*/
Expand Down Expand Up @@ -1208,7 +1214,7 @@ public function class_list() {
return;
}

$name = str_replace( "\x00", "\u{FFFD}", substr( $class, $at, $length ) );
$name = substr( $class, $at, $length );
if ( $is_quirks ) {
$name = strtolower( $name );
}
Expand Down Expand Up @@ -2231,9 +2237,16 @@ private function parse_next_attribute(): bool {
* > case-insensitive match for each other.
* - HTML 5 spec
*
* The tokenizer would have replaced U+0000 NULL bytes in attribute
* names with U+FFFD, so names which differ only by those bytes are
* duplicates. The replacement applies to the comparable name — a
* comparison artifact — while the raw span in the document remains
* untouched.
*
* @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
* @see https://html.spec.whatwg.org/#attribute-name-state
*/
$comparable_name = strtolower( $attribute_name );
$comparable_name = strtolower( str_replace( "\x00", "\u{FFFD}", $attribute_name ) );

// If an attribute is listed many times, only use the first declaration and ignore the rest.
if ( ! isset( $this->attributes[ $comparable_name ] ) ) {
Expand Down Expand Up @@ -2359,13 +2372,7 @@ private function class_name_updates_to_attributes_updates(): void {
}

if ( false === $existing_class && isset( $this->attributes['class'] ) ) {
$existing_class = WP_HTML_Decoder::decode_attribute(
substr(
$this->html,
$this->attributes['class']->value_starts_at,
$this->attributes['class']->value_length
)
);
$existing_class = $this->get_decoded_source_attribute_value( $this->attributes['class'] );
}

if ( false === $existing_class ) {
Expand Down Expand Up @@ -2771,6 +2778,11 @@ private function get_enqueued_attribute_value( string $comparable_name ) {
* $p->get_attribute( 'class' ) === null;
*
* @since 6.2.0
* @since 7.1.0 Applies input-stream preprocessing: newlines in the source value
* are normalized and NULL bytes are replaced with U+FFFD, as
* browsers do before decoding character references. Attributes
* whose source name contains a NULL byte are addressed by the
* name with U+FFFD in its place, as in the DOM.
*
* @param string $name Name of attribute whose value is requested.
* @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
Expand All @@ -2793,7 +2805,7 @@ public function get_attribute( $name ) {
* attribute values. If any exist, those enqueued class changes must first be flushed out
* into an attribute value update.
*/
if ( 'class' === $name ) {
if ( 'class' === $comparable ) {
$this->class_name_updates_to_attributes_updates();
}

Expand Down Expand Up @@ -2824,9 +2836,58 @@ public function get_attribute( $name ) {
return true;
}

return $this->get_decoded_source_attribute_value( $attribute );
}

/**
 * Returns the decoded value of an attribute found in the input document.
 *
 * The Tag Processor defers the HTML input-stream preprocessing and the
 * tokenizer's replacements while scanning, so they are applied here, at
 * the point where a value is read out of the document: newlines are
 * normalized before character references decode, and U+0000 NULL bytes
 * are replaced with U+FFFD after decoding.
 *
 * The replacements operate on bytes. NULL bytes inside invalid UTF-8
 * sequences are each replaced individually, whereas a browser, which
 * decodes the byte stream into characters first, may produce a
 * different result.
 *
 * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
 * @see https://html.spec.whatwg.org/#attribute-value-(double-quoted)-state
 *
 * @since 7.1.0
 *
 * @param WP_HTML_Attribute_Token $attribute Attribute token from the input document.
 * @return string Decoded attribute value.
 */
private function get_decoded_source_attribute_value( WP_HTML_Attribute_Token $attribute ): string {
	$raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length );

	/*
	 * Newline normalization is part of preprocessing the input stream
	 * and precedes character reference decoding: `&#13;` decodes into
	 * a carriage return which must be preserved. The check avoids
	 * scanning the value again when it contains no carriage return;
	 * most values contain none.
	 *
	 * CRLF pairs are collapsed first so that they become a single
	 * newline instead of two.
	 */
	if ( false !== strpos( $raw_value, "\r" ) ) {
		$raw_value = str_replace( "\r\n", "\n", $raw_value );
		$raw_value = str_replace( "\r", "\n", $raw_value );
	}

	$decoded_value = WP_HTML_Decoder::decode_attribute( $raw_value );

	/*
	 * The tokenizer replaces U+0000 NULL bytes as it consumes input:
	 * character references see the raw NULL byte — an unambiguous
	 * follower for references without a terminating semicolon — and
	 * no character reference decodes into NULL, so the replacement
	 * applies equivalently after decoding, where it cannot disturb
	 * how references parse.
	 */
	if ( false !== strpos( $decoded_value, "\x00" ) ) {
		$decoded_value = str_replace( "\x00", "\u{FFFD}", $decoded_value );
	}

	return $decoded_value;
}

/**
Expand All @@ -2849,6 +2910,10 @@ public function get_attribute( $name ) {
* $p->get_attribute_names_with_prefix( 'data-' ) === null;
*
* @since 6.2.0
* @since 7.1.0 NULL bytes in source attribute names are returned as U+FFFD,
* matching the tokenizer replacement browsers apply. The prefix
* is matched verbatim against these replaced names; a prefix
* containing a NULL byte matches nothing.
*
* @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
*
Expand Down Expand Up @@ -2898,6 +2963,8 @@ public function get_namespace(): string {
* $p->get_tag() === null;
*
* @since 6.2.0
* @since 7.1.0 NULL bytes in the source tag name are returned as U+FFFD,
* matching the tokenizer replacement browsers apply.
*
* @return string|null Name of currently matched tag in input HTML, or `null` if none found.
*/
Expand All @@ -2906,7 +2973,15 @@ public function get_tag(): ?string {
return null;
}

$tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
/*
* The tokenizer would have replaced U+0000 NULL bytes in the tag
* name with U+FFFD; this is deferred to this read boundary. The
* replacement never applies to internal identification, which
* compares raw bytes (`scr\x00ipt` is not SCRIPT in browsers either).
*
* @see https://html.spec.whatwg.org/#tag-name-state
*/
$tag_name = str_replace( "\x00", "\u{FFFD}", substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ) );

if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
return strtoupper( $tag_name );
Expand All @@ -2927,6 +3002,8 @@ public function get_tag(): ?string {
* account the current parsing context, whether HTML, SVG, or MathML.
*
* @since 6.7.0
* @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD,
* matching the tokenizer replacement browsers apply.
*
* @return string|null Name of current tag name.
*/
Expand Down Expand Up @@ -3425,6 +3502,8 @@ public function get_token_type(): ?string {
* of the document without matching a token.
*
* @since 6.5.0
* @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD,
* matching the tokenizer replacement browsers apply.
*
* @return string|null Name of the matched token.
*/
Expand Down Expand Up @@ -3782,6 +3861,12 @@ public function get_modifiable_text(): string {
* // Renders as “Eggs &amp; Milk” in a browser, encoded as `<p>Eggs &amp;amp; Milk</p>`.
* $processor->set_modifiable_text( 'Eggs &amp; Milk' );
*
* Note: unlike attribute values set through `set_attribute()`, which read
* back verbatim, text set through this method currently reads back through
* `get_modifiable_text()` with newlines normalized and NULL bytes handled
* as if the text had come from the input document. In the DOM, API-supplied
* text round-trips verbatim; this asymmetry is a known limitation.
*
* @since 6.7.0
* @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping.
*
Expand Down Expand Up @@ -4770,14 +4855,37 @@ private function matches(): bool {
}

// Does the tag name match the requested tag name in a case-insensitive manner?
if (
isset( $this->sought_tag_name ) &&
(
strlen( $this->sought_tag_name ) !== $this->tag_name_length ||
0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true )
)
) {
return false;
if ( isset( $this->sought_tag_name ) ) {
$tag_name_matches = (
strlen( $this->sought_tag_name ) === $this->tag_name_length &&
0 === substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true )
);

/*
* Names are matched in the same alphabet `get_tag()` exposes,
* where U+0000 NULL bytes appear as U+FFFD: a sought name
* containing U+FFFD matches source names with NULL bytes in
* its place, and a sought name containing a NULL byte matches
* nothing, since no exposed name contains one. The byte
* comparison above already agrees for names without NULL
* bytes, so this only resolves the rare disagreements.
*/
if ( $tag_name_matches ) {
$tag_name_matches = false === strpos( $this->sought_tag_name, "\x00" );
} elseif ( false !== strpos( $this->sought_tag_name, "\u{FFFD}" ) ) {
$raw_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
if ( false !== strpos( $raw_name, "\x00" ) ) {
$exposed_name = str_replace( "\x00", "\u{FFFD}", $raw_name );
$tag_name_matches = (
strlen( $this->sought_tag_name ) === strlen( $exposed_name ) &&
0 === substr_compare( $exposed_name, $this->sought_tag_name, 0, strlen( $exposed_name ), true )
);
}
}

if ( ! $tag_name_matches ) {
return false;
}
}

if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {
Expand Down
78 changes: 78 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlDecoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,84 @@ static function ( int $errno, string $errstr ) use ( &$errors ) {
$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
}

/**
 * Verifies how U+0000 is handled when decoding attribute values: numeric
 * character references to it become U+FFFD, whereas raw NULL bytes leave
 * the decoder exactly as they entered it.
 *
 * Replacing raw NULL bytes is the tokenizer's job, not the decoder's; in
 * the Tag Processor that job belongs to the methods which read values
 * out of the input document.
 *
 * @ticket 65372
 *
 * @dataProvider data_null_code_points
 *
 * @param string $raw_value     Raw attribute value.
 * @param string $decoded_value The expected decoded attribute value.
 */
public function test_null_code_points_in_attribute_values( string $raw_value, string $decoded_value ) {
	$actual_value = WP_HTML_Decoder::decode_attribute( $raw_value );

	$this->assertSame( $decoded_value, $actual_value, 'Improperly decoded raw attribute value.' );
}

/**
 * Data provider.
 *
 * Each dataset pairs a raw attribute value with the value expected
 * after decoding it.
 *
 * @return array[]
 */
public static function data_null_code_points() {
	$datasets = array();

	// Numeric character references to U+0000, in each spelling.
	$datasets['Decimal zero']     = array( 'a&#0;b', "a\u{FFFD}b" );
	$datasets['Hexadecimal zero'] = array( 'a&#x0;b', "a\u{FFFD}b" );
	$datasets['Multiple zeros']   = array( 'a&#0000;b', "a\u{FFFD}b" );

	// A raw NULL byte is expected to come back unchanged.
	$datasets['Raw NULL byte passes through'] = array( "a\x00b", "a\x00b" );

	return $datasets;
}

/**
 * Verifies that, for character references lacking a terminating
 * semicolon, only ASCII alphanumerics and the equals sign count as
 * ambiguous followers, whatever the process locale.
 *
 * `ctype_alnum()` classifies bytes 0x80 and above as alphanumeric under
 * UTF-8 locales, wrongly suppressing decodes whose follower is a
 * non-ASCII byte, such as U+FFFD produced by NULL-byte replacement.
 *
 * @ticket 65372
 *
 * @see https://html.spec.whatwg.org/#named-character-reference-state
 *
 * @dataProvider data_semicolon_less_references_with_followers
 *
 * @param string $raw_value     Raw attribute value.
 * @param string $decoded_value The expected decoded attribute value.
 */
public function test_semicolon_less_reference_followers( string $raw_value, string $decoded_value ) {
	$actual_value = WP_HTML_Decoder::decode_attribute( $raw_value );

	$this->assertSame( $decoded_value, $actual_value, 'Improperly decoded raw attribute value.' );
}

/**
 * Data provider.
 *
 * Each dataset pairs a raw attribute value with the value expected
 * after decoding it.
 *
 * @return array[]
 */
public static function data_semicolon_less_references_with_followers() {
	$datasets = array();

	// Non-ASCII followers: the reference is expected to decode.
	$datasets['U+FFFD follower decodes']    = array( "x&amp\u{FFFD};y", "x&\u{FFFD};y" );
	$datasets['Non-ASCII follower decodes'] = array( "x&amp\u{E9}y", "x&\u{E9}y" );

	// Ambiguous followers: the text is expected to stay as written.
	$datasets['ASCII letter follower is ambiguous'] = array( 'x&ampzy', 'x&ampzy' );
	$datasets['ASCII digit follower is ambiguous']  = array( 'x&amp1y', 'x&amp1y' );
	$datasets['Equals sign follower is ambiguous']  = array( 'x&amp=y', 'x&amp=y' );

	return $datasets;
}

/**
* Ensures proper detection of attribute prefixes ignoring ASCII case.
*
Expand Down
Loading
Loading