From cee066132e6064a46171e6d77da7936766ba7e93 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:17:52 +0200 Subject: [PATCH 01/19] HTML API: Add tests for attribute value input preprocessing. Red TDD step: browser-verified expectations for raw CR/CRLF/NUL in attribute values; passing pins for encoded /� and for verbatim pass-through of API-supplied values. See #65372. --- .../phpunit/tests/html-api/wpHtmlDecoder.php | 37 ++++++ ...wpHtmlTagProcessor-input-preprocessing.php | 113 ++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 97954f4eb3e30..0622563f93cb0 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -61,6 +61,43 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { $this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' ); } + /** + * Ensures that numeric character references for U+0000 decode to U+FFFD + * while raw NULL bytes pass through the decoder untransformed. + * + * The tokenizer, not the decoder, is responsible for replacing raw NULL + * bytes; in the Tag Processor that responsibility falls on the methods + * which read values out of the input document. + * + * @ticket 65372 + * + * @dataProvider data_null_code_points + * + * @param string $raw_value Raw attribute value. + * @param string $decoded_value The expected decoded attribute value. + */ + public function test_null_code_points_in_attribute_values( string $raw_value, string $decoded_value ) { + $this->assertSame( + $decoded_value, + WP_HTML_Decoder::decode_attribute( $raw_value ), + 'Improperly decoded raw attribute value.' + ); + } + + /** + * Data provider. 
+ * + * @return array[] + */ + public static function data_null_code_points() { + return array( + 'Decimal zero' => array( 'a�b', "a\u{FFFD}b" ), + 'Hexadecimal zero' => array( 'a�b', "a\u{FFFD}b" ), + 'Multiple zeros' => array( 'a�b', "a\u{FFFD}b" ), + 'Raw NULL byte passes through' => array( "a\x00b", "a\x00b" ), + ); + } + /** * Ensures proper detection of attribute prefixes ignoring ASCII case. * diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php new file mode 100644 index 0000000000000..763b0d0e3df3e --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -0,0 +1,113 @@ +assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( $expected, $processor->get_attribute( 'a' ) ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_attribute_values_with_preprocessing() { + return array( + 'Raw CR' => array( "
", "x\ny" ), + 'Raw CRLF' => array( "
", "x\ny" ), + 'Raw CR then CRLF' => array( "
", "x\n\ny" ), + 'Double-quoted raw CR' => array( "
", "x\ny" ), + 'NULL byte' => array( "
", "x\u{FFFD}y" ), + 'NULL byte unquoted' => array( "
", "x\u{FFFD}y" ), + 'Encoded CR is preserved' => array( "
", "x\ry" ), + 'Encoded NULL becomes U+FFFD' => array( "
", "x\u{FFFD}y" ), + 'Raw CR before encoded CR' => array( "
", "x\n\ry" ), + ); + } + + /** + * Ensures that values enqueued through `set_attribute()` are returned verbatim. + * + * Input-stream preprocessing applies only to the input document. API-supplied + * values are plaintext, equivalent to DOM `setAttribute()`, which performs + * no replacements. Browser-verified. + * + * @ticket 65372 + * + * @covers ::get_attribute + * + * @dataProvider data_enqueued_attribute_values + * + * @param string $value Plaintext attribute value to set and expect back unchanged. + */ + public function test_get_attribute_returns_enqueued_values_verbatim( string $value ) { + $processor = new WP_HTML_Tag_Processor( '
' ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->set_attribute( 'a', $value ), 'Should have enqueued the attribute update.' ); + $this->assertSame( $value, $processor->get_attribute( 'a' ) ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_enqueued_attribute_values() { + return array( + 'Carriage return' => array( "x\ry" ), + 'CRLF' => array( "x\r\ny" ), + 'NULL byte' => array( "x\x00y" ), + ); + } + + /** + * Ensures numeric character references for U+0000 decode to U+FFFD in text. + * + * @ticket 65372 + * + * @covers ::get_modifiable_text + */ + public function test_encoded_null_in_text_node_decodes_to_replacement_character() { + $processor = new WP_HTML_Tag_Processor( 'a�b' ); + + $this->assertTrue( $processor->next_token(), 'Should have found the text node.' ); + $this->assertSame( "a\u{FFFD}b", $processor->get_modifiable_text() ); + } +} From 82a26aa76c8f1696d138593adb13ef9a0d901827 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:19:41 +0200 Subject: [PATCH 02/19] HTML API: Apply input preprocessing in get_attribute(). Attribute values read from the input document now normalize newlines (CRLF/CR to LF) and replace U+0000 NULL bytes with U+FFFD before decoding character references, matching what browsers produce for the same markup. Values enqueued through set_attribute() are plaintext API values and continue to pass through unchanged. See #65372. 
--- .../html-api/class-wp-html-tag-processor.php | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 77c1a471db5b1..de4bf7133228d 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2771,6 +2771,9 @@ private function get_enqueued_attribute_value( string $comparable_name ) { * $p->get_attribute( 'class' ) === null; * * @since 6.2.0 + * @since 7.1.0 Applies input-stream preprocessing: newlines in the source value + * are normalized and NULL bytes are replaced with U+FFFD, as + * browsers do before decoding character references. * * @param string $name Name of attribute whose value is requested. * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. @@ -2824,8 +2827,33 @@ public function get_attribute( $name ) { return true; } + return $this->get_decoded_source_attribute_value( $attribute ); + } + + /** + * Returns the decoded value of an attribute found in the input document. + * + * The Tag Processor defers the HTML input-stream preprocessing and the + * tokenizer's replacements while scanning; they must be applied when + * reading a value out of the document: newlines are normalized and + * U+0000 NULL bytes are replaced with U+FFFD, both before character + * references are decoded. + * + * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream + * @see https://html.spec.whatwg.org/#attribute-value-(double-quoted)-state + * + * @since 7.1.0 + * + * @param WP_HTML_Attribute_Token $attribute Attribute token from the input document. + * @return string Decoded attribute value. 
+ */ + private function get_decoded_source_attribute_value( WP_HTML_Attribute_Token $attribute ): string { $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); + $raw_value = str_replace( "\r\n", "\n", $raw_value ); + $raw_value = str_replace( "\r", "\n", $raw_value ); + $raw_value = str_replace( "\x00", "\u{FFFD}", $raw_value ); + return WP_HTML_Decoder::decode_attribute( $raw_value ); } From 48d8fb41e6e75d219b41afc0d9ee04bdda5713a1 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:20:26 +0200 Subject: [PATCH 03/19] HTML API: Add tests for class updates over preprocessed values. Red TDD step: flushing add_class()/remove_class() updates must read the existing class attribute through the same input preprocessing as get_attribute(), normalizing newlines and replacing NULL bytes. See #65372. --- ...wpHtmlTagProcessor-input-preprocessing.php | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php index 763b0d0e3df3e..b460eeafe347b 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -97,6 +97,40 @@ public static function data_enqueued_attribute_values() { ); } + /** + * Ensures the existing class attribute value is preprocessed when enqueued + * class updates are flushed into an attribute update. + * + * @ticket 65372 + * + * @covers ::add_class + * + * @dataProvider data_class_updates_with_preprocessing + * + * @param string $html HTML containing a tag with a class attribute. + * @param string $expected_html Expected document after adding a class. 
+ */ + public function test_class_updates_apply_input_preprocessing_to_existing_value( string $html, string $expected_html ) { + $processor = new WP_HTML_Tag_Processor( $html ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->add_class( 'added' ), 'Should have enqueued the class addition.' ); + $this->assertSame( $expected_html, $processor->get_updated_html() ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_class_updates_with_preprocessing() { + return array( + 'Raw CR' => array( "
", "
" ), + 'Raw CRLF' => array( "
", "
" ), + 'NULL byte' => array( "
", "
" ), + ); + } + /** * Ensures numeric character references for U+0000 decode to U+FFFD in text. * From d1f852ce32ab9f497d1a84db55628ba0c26f2fe4 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:21:04 +0200 Subject: [PATCH 04/19] HTML API: Apply input preprocessing when flushing class updates. class_name_updates_to_attributes_updates() reads the existing class value through the same preprocessing helper as get_attribute(), so add_class()/remove_class() no longer rebuild the attribute from raw source bytes containing CR or NULL. See #65372. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index de4bf7133228d..d1503eb04d0c7 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2359,13 +2359,7 @@ private function class_name_updates_to_attributes_updates(): void { } if ( false === $existing_class && isset( $this->attributes['class'] ) ) { - $existing_class = WP_HTML_Decoder::decode_attribute( - substr( - $this->html, - $this->attributes['class']->value_starts_at, - $this->attributes['class']->value_length - ) - ); + $existing_class = $this->get_decoded_source_attribute_value( $this->attributes['class'] ); } if ( false === $existing_class ) { From 020155a53857190d5ab47c2e7b942feaee5c59ef Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:21:48 +0200 Subject: [PATCH 05/19] HTML API: Add tests for NULL bytes in attribute names. Red TDD step: browser-verified expectations that attribute names are exposed and addressed with U+FFFD replacing NULL bytes, that names collapsing after replacement behave as duplicates of one attribute, and that attribute updates target the replaced name. See #65372. 
--- ...wpHtmlTagProcessor-input-preprocessing.php | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php index b460eeafe347b..c192c25922699 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -131,6 +131,73 @@ public static function data_class_updates_with_preprocessing() { ); } + /** + * Ensures attribute names containing NULL bytes are exposed with U+FFFD and + * are addressable only by their replaced name, as browsers expose them. + * + * Browser-verified: `getAttribute("da\u{FFFD}ta")` finds the attribute + * parsed from `da\x00ta`; `getAttribute("da\x00ta")` does not. + * + * @ticket 65372 + * + * @covers ::get_attribute + * @covers ::get_attribute_names_with_prefix + */ + public function test_attribute_names_replace_null_bytes() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( array( "da\u{FFFD}ta" ), $processor->get_attribute_names_with_prefix( '' ) ); + $this->assertSame( '1', $processor->get_attribute( "da\u{FFFD}ta" ), 'Should have found the attribute by its replaced name.' ); + $this->assertNull( $processor->get_attribute( "da\x00ta" ), 'Should not have found the attribute by its raw source name.' ); + + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( array( "da\u{FFFD}ta" ), $processor->get_attribute_names_with_prefix( '' ), 'Should have lowercased the name around the replacement character.' ); + } + + /** + * Ensures attribute names which collapse to the same name after NULL-byte + * replacement are duplicates of one attribute: the first in document order + * provides the value and removal removes every collapsed copy. + * + * Browser-verified: `
` produces a single + * attribute `da\u{FFFD}ta` with value "1". + * + * @ticket 65372 + * + * @covers ::get_attribute + * @covers ::remove_attribute + */ + public function test_attribute_names_collapsing_after_null_replacement_are_duplicates() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( array( "da\u{FFFD}ta" ), $processor->get_attribute_names_with_prefix( '' ) ); + $this->assertSame( '1', $processor->get_attribute( "da\u{FFFD}ta" ), 'First duplicate should provide the value.' ); + + $this->assertTrue( $processor->remove_attribute( "da\u{FFFD}ta" ), 'Should have removed the attribute.' ); + $this->assertSame( '
', $processor->get_updated_html(), 'Should have removed all duplicates of the attribute.' ); + } + + /** + * Ensures setting an attribute by its U+FFFD-replaced name updates the + * source attribute whose raw name contains a NULL byte instead of adding + * a second attribute. + * + * @ticket 65372 + * + * @covers ::set_attribute + */ + public function test_set_attribute_updates_attribute_with_null_byte_in_source_name() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->set_attribute( "da\u{FFFD}ta", 'new' ), 'Should have set the attribute.' ); + $this->assertSame( "
", $processor->get_updated_html() ); + } + /** * Ensures numeric character references for U+0000 decode to U+FFFD in text. * From 442e82051429e925fed77561e8f92df532fb3a64 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:23:09 +0200 Subject: [PATCH 06/19] HTML API: Replace NULL bytes in comparable attribute names. Attribute lookup keys are normalized where they are created, in parse_next_attribute(): NULL bytes are replaced with U+FFFD before lowercasing, as the tokenizer does in browsers. Names which collapse to the same replaced name are duplicates of one attribute (first one wins), lookups by the raw NULL spelling no longer match, and updates or removals by the replaced name target the source attribute. Raw document spans are untouched. See #65372. --- .../html-api/class-wp-html-tag-processor.php | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index d1503eb04d0c7..b64d07689e64b 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2231,9 +2231,16 @@ private function parse_next_attribute(): bool { * > case-insensitive match for each other. * - HTML 5 spec * + * The tokenizer would have replaced U+0000 NULL bytes in attribute + * names with U+FFFD, so names which differ only by those bytes are + * duplicates. The replacement applies to the comparable name — a + * comparison artifact — while the raw span in the document remains + * untouched. 
+ * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive + * @see https://html.spec.whatwg.org/#attribute-name-state */ - $comparable_name = strtolower( $attribute_name ); + $comparable_name = strtolower( str_replace( "\x00", "\u{FFFD}", $attribute_name ) ); // If an attribute is listed many times, only use the first declaration and ignore the rest. if ( ! isset( $this->attributes[ $comparable_name ] ) ) { @@ -2767,7 +2774,9 @@ private function get_enqueued_attribute_value( string $comparable_name ) { * @since 6.2.0 * @since 7.1.0 Applies input-stream preprocessing: newlines in the source value * are normalized and NULL bytes are replaced with U+FFFD, as - * browsers do before decoding character references. + * browsers do before decoding character references. Attributes + * whose source name contains a NULL byte are addressed by the + * name with U+FFFD in its place, as in the DOM. * * @param string $name Name of attribute whose value is requested. * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. @@ -2871,6 +2880,8 @@ private function get_decoded_source_attribute_value( WP_HTML_Attribute_Token $at * $p->get_attribute_names_with_prefix( 'data-' ) === null; * * @since 6.2.0 + * @since 7.1.0 NULL bytes in source attribute names are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive * From 135157fbfb5bfef42edf87b3a797305a19e3a46e Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:23:58 +0200 Subject: [PATCH 07/19] HTML API: Add tests for NULL bytes in tag names. Red TDD step: tag names are exposed with U+FFFD replacing NULL bytes; passing pins confirm NULL bytes never select rawtext parsing and never appear in PI-lookalike comment tag names. See #65372. 
--- ...wpHtmlTagProcessor-input-preprocessing.php | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php index c192c25922699..29a07e7ac01d1 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -198,6 +198,65 @@ public function test_set_attribute_updates_attribute_with_null_byte_in_source_na $this->assertSame( "
", $processor->get_updated_html() ); } + /** + * Ensures tag names containing NULL bytes are exposed with U+FFFD, + * matching the tokenizer's tag-name-state replacement in browsers. + * + * @ticket 65372 + * + * @covers ::get_tag + * @covers ::get_token_name + */ + public function test_get_tag_replaces_null_bytes() { + $processor = new WP_HTML_Tag_Processor( "x" ); + + $this->assertTrue( $processor->next_token(), 'Should have found the tag opener.' ); + $this->assertSame( "DI\u{FFFD}V", $processor->get_tag() ); + $this->assertSame( "DI\u{FFFD}V", $processor->get_token_name() ); + + $this->assertTrue( $processor->next_token(), 'Should have found the text node.' ); + $this->assertSame( 'x', $processor->get_modifiable_text() ); + + $this->assertTrue( $processor->next_token(), 'Should have found the tag closer.' ); + $this->assertTrue( $processor->is_tag_closer(), 'Should have matched the tag closer.' ); + $this->assertSame( "DI\u{FFFD}V", $processor->get_tag() ); + } + + /** + * Ensures NULL bytes in tag names do not affect special-element detection: + * `` is not SCRIPT and does not switch into rawtext parsing, + * in browsers or here. Internal identification uses raw bytes. + * + * @ticket 65372 + * + * @covers ::get_tag + */ + public function test_null_byte_in_tag_name_does_not_select_rawtext_parsing() { + $processor = new WP_HTML_Tag_Processor( "" ); + + $this->assertTrue( $processor->next_token(), 'Should have found the tag opener.' ); + $this->assertSame( "SCR\u{FFFD}IPT", $processor->get_tag() ); + + $this->assertTrue( $processor->next_token(), 'Should have found the B tag, not raw text.' ); + $this->assertSame( 'B', $processor->get_tag() ); + } + + /** + * Ensures NULL bytes cannot appear in PI-lookalike comment tag names, + * whose targets are restricted to ASCII name characters. 
+ * + * @ticket 65372 + * + * @covers ::get_tag + */ + public function test_pi_lookalike_target_stops_before_null_byte() { + $processor = new WP_HTML_Tag_Processor( "" ); + + $this->assertTrue( $processor->next_token(), 'Should have found the comment.' ); + $this->assertSame( WP_HTML_Tag_Processor::COMMENT_AS_PI_NODE_LOOKALIKE, $processor->get_comment_type() ); + $this->assertSame( 'px', $processor->get_tag() ); + } + /** * Ensures numeric character references for U+0000 decode to U+FFFD in text. * From 5b8ad27855e0431baad1d8e880c7ab26245c46eb Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:27:27 +0200 Subject: [PATCH 08/19] HTML API: Replace NULL bytes in tag names at the read boundary. get_tag() (and get_token_name(), which delegates to it) returns tag names with U+0000 NULL bytes replaced by U+FFFD, as the tokenizer does in browsers. Internal token identification continues to compare raw bytes: a NULL byte in a tag name already prevents rawtext detection, matching browsers, where the replaced name likewise never equals SCRIPT or the other special names. See #65372. --- .../html-api/class-wp-html-tag-processor.php | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index b64d07689e64b..fe28911c525be 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2931,6 +2931,8 @@ public function get_namespace(): string { * $p->get_tag() === null; * * @since 6.2.0 + * @since 7.1.0 NULL bytes in the source tag name are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @return string|null Name of currently matched tag in input HTML, or `null` if none found. 
*/ @@ -2939,7 +2941,15 @@ public function get_tag(): ?string { return null; } - $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); + /* + * The tokenizer would have replaced U+0000 NULL bytes in the tag + * name with U+FFFD; this is deferred to this read boundary. The + * replacement never applies to internal identification, which + * compares raw bytes (`scr\x00ipt` is not SCRIPT in browsers either). + * + * @see https://html.spec.whatwg.org/#tag-name-state + */ + $tag_name = str_replace( "\x00", "\u{FFFD}", substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ) ); if ( self::STATE_MATCHED_TAG === $this->parser_state ) { return strtoupper( $tag_name ); @@ -3458,6 +3468,8 @@ public function get_token_type(): ?string { * of the document without matching a token. * * @since 6.5.0 + * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @return string|null Name of the matched token. */ From f6f58fdd15aff13f562da949f52ba16024a27a8b Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:27:57 +0200 Subject: [PATCH 09/19] HTML API: Add test for NULL bytes in API-supplied class values. Red TDD step: browser-verified expectation that classList-equivalent reads preserve NULL bytes in values set through the API; the U+0000 replacement belongs to the tokenizer, and document-sourced values already receive it in get_attribute(). See #65372. 
--- ...wpHtmlTagProcessor-input-preprocessing.php | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php index 29a07e7ac01d1..8a5c34717efda 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -257,6 +257,28 @@ public function test_pi_lookalike_target_stops_before_null_byte() { $this->assertSame( 'px', $processor->get_tag() ); } + /** + * Ensures class_list does not replace NULL bytes in API-supplied values. + * + * Browser-verified: `setAttribute('class', "a\x00b")` then reading + * `classList` yields the token "a\x00b" with the NULL byte preserved; + * U+0000 replacement happens only in the tokenizer, and values from the + * input document already receive it through `get_attribute()`. + * + * @ticket 65372 + * + * @covers ::class_list + * @covers ::has_class + */ + public function test_class_list_preserves_null_bytes_in_enqueued_values() { + $processor = new WP_HTML_Tag_Processor( '
' ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->set_attribute( 'class', "a\x00b c\u{FFFD}d" ), 'Should have set the class attribute.' ); + $this->assertSame( array( "a\x00b", "c\u{FFFD}d" ), iterator_to_array( $processor->class_list(), false ), 'Should have preserved the NULL byte in the API-supplied class.' ); + $this->assertTrue( $processor->has_class( "a\x00b" ) ); + } + /** * Ensures numeric character references for U+0000 decode to U+FFFD in text. * From ba93ef4e4acca87233e0dff939757a4680d13fe2 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:28:40 +0200 Subject: [PATCH 10/19] HTML API: Stop replacing NULL bytes in API-supplied class values. class_list() received its NULL-byte replacement when reading raw class values; that replacement now happens in get_attribute() for values from the input document. Performing it on API-supplied values diverged from browsers, where classList preserves NULL bytes in values set via setAttribute(). See #65372. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index fe28911c525be..9bc45dba69ac3 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1174,7 +1174,13 @@ public function paused_at_incomplete_token(): bool { * } * // Outputs: "free lang-en " * + * Class names from the input document already carry the tokenizer's + * U+FFFD replacement of NULL bytes through `get_attribute()`; values + * supplied through the API are returned verbatim, as `Element.classList` + * does in the DOM. + * * @since 6.4.0 + * @since 7.1.0 No longer replaces NULL bytes in API-supplied class values. 
* * @return Generator */ @@ -1208,7 +1214,7 @@ public function class_list() { return; } - $name = str_replace( "\x00", "\u{FFFD}", substr( $class, $at, $length ) ); + $name = substr( $class, $at, $length ); if ( $is_quirks ) { $name = strtolower( $name ); } From 9baceb6e2945372d2df46c2bb85293f20ec3a702 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:31:45 +0200 Subject: [PATCH 11/19] HTML API: Avoid re-scanning attribute values without CR or NULL bytes. Benchmark-guided: reading an attribute value applies up to three str_replace passes which doubled read cost for long values containing no bytes needing replacement. Guarding with strpos keeps the common case at two fast scans; values are typically free of CR and NULL. Benchmark (PHP 8.4, medians of 3): scanning 100-tag documents reading 3 attributes each, 2000 iterations: trunk 667ms, unguarded 714ms, guarded 699ms. Reading a 10.8KB clean attribute value 200k times: trunk 147ms, unguarded 313ms, guarded 258ms. The remaining cost is the unavoidable byte inspection. See #65372. 
--- .../html-api/class-wp-html-tag-processor.php | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 9bc45dba69ac3..dab4dcdef514e 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2859,9 +2859,19 @@ public function get_attribute( $name ) { private function get_decoded_source_attribute_value( WP_HTML_Attribute_Token $attribute ): string { $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); - $raw_value = str_replace( "\r\n", "\n", $raw_value ); - $raw_value = str_replace( "\r", "\n", $raw_value ); - $raw_value = str_replace( "\x00", "\u{FFFD}", $raw_value ); + /* + * The checks before each replacement avoid scanning the value + * multiple times when it contains none of the rare bytes which + * require replacing; most values contain neither. + */ + if ( false !== strpos( $raw_value, "\r" ) ) { + $raw_value = str_replace( "\r\n", "\n", $raw_value ); + $raw_value = str_replace( "\r", "\n", $raw_value ); + } + + if ( false !== strpos( $raw_value, "\x00" ) ) { + $raw_value = str_replace( "\x00", "\u{FFFD}", $raw_value ); + } return WP_HTML_Decoder::decode_attribute( $raw_value ); } From 3b415d1b3634742aa7fa99c435ccd119af311acd Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:02:23 +0200 Subject: [PATCH 12/19] HTML API: Add tests for character references preceding replaced bytes. Red TDD step from adversarial review: a named character reference without a terminating semicolon must decode when followed by a NULL byte or any non-ASCII byte. Replacing NULL with U+FFFD before decoding fed the decoder a multi-byte follower whose classification by ctype_alnum() depends on the process locale, suppressing valid decodes in attribute values, diverging from browsers and from trunk. 
See #65372. --- .../phpunit/tests/html-api/wpHtmlDecoder.php | 41 +++++++++++++++++++ ...wpHtmlTagProcessor-input-preprocessing.php | 10 +++-- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 0622563f93cb0..9527739edd23b 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -98,6 +98,47 @@ public static function data_null_code_points() { ); } + /** + * Ensures that the ambiguous-follower check for character references + * lacking a terminating semicolon treats only ASCII alphanumerics and + * the equals sign as ambiguous, regardless of the process locale. + * + * `ctype_alnum()` classifies bytes 0x80 and above as alphanumeric under + * UTF-8 locales, wrongly suppressing decodes whose follower is a + * non-ASCII byte, such as U+FFFD produced by NULL-byte replacement. + * + * @ticket 65372 + * + * @see https://html.spec.whatwg.org/#named-character-reference-state + * + * @dataProvider data_semicolon_less_references_with_followers + * + * @param string $raw_value Raw attribute value. + * @param string $decoded_value The expected decoded attribute value. + */ + public function test_semicolon_less_reference_followers( string $raw_value, string $decoded_value ) { + $this->assertSame( + $decoded_value, + WP_HTML_Decoder::decode_attribute( $raw_value ), + 'Improperly decoded raw attribute value.' + ); + } + + /** + * Data provider. 
+ * + * @return array[] + */ + public static function data_semicolon_less_references_with_followers() { + return array( + 'U+FFFD follower decodes' => array( "x&amp\u{FFFD};y", "x&\u{FFFD};y" ), + 'Non-ASCII follower decodes' => array( "x&amp\u{E9}y", "x&\u{E9}y" ), + 'ASCII letter follower is ambiguous' => array( 'x&ampzy', 'x&ampzy' ), + 'ASCII digit follower is ambiguous' => array( 'x&amp1y', 'x&amp1y' ), + 'Equals sign follower is ambiguous' => array( 'x&amp=y', 'x&amp=y' ), + ); + } + /** * Ensures proper detection of attribute prefixes ignoring ASCII case. * diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php index 8a5c34717efda..e0103e5949e09 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -58,6 +58,9 @@ public static function data_attribute_values_with_preprocessing() { 'Encoded CR is preserved' => array( "
", "x\ry" ), 'Encoded NULL becomes U+FFFD' => array( "
", "x\u{FFFD}y" ), 'Raw CR before encoded CR' => array( "
", "x\n\ry" ), + 'Raw CR and NULL byte' => array( "
", "x\n\u{FFFD}y" ), + 'Named reference before NULL' => array( "
", "x&\u{FFFD};y" ), + 'Named reference before CR' => array( "
", "x&\ny" ), ); } @@ -125,9 +128,10 @@ public function test_class_updates_apply_input_preprocessing_to_existing_value( */ public static function data_class_updates_with_preprocessing() { return array( - 'Raw CR' => array( "
", "
" ), - 'Raw CRLF' => array( "
", "
" ), - 'NULL byte' => array( "
", "
" ), + 'Raw CR' => array( "
", "
" ), + 'Raw CRLF' => array( "
", "
" ), + 'NULL byte' => array( "
", "
" ), + 'Named reference before NULL' => array( "
", "
" ), ); } From 8f5e8b2f23c6b5e5d0b6e826cd14c5284cdb5e65 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:03:41 +0200 Subject: [PATCH 13/19] HTML API: Replace NULL bytes after decoding attribute values. The tokenizer replaces U+0000 NULL bytes as it consumes input, so a character reference without a terminating semicolon sees the raw NULL byte as its follower, which is unambiguous, and the reference decodes. Replacing before decoding handed the decoder U+FFFD's lead byte, whose ctype_alnum() classification depends on the process locale, wrongly suppressing the decode under UTF-8 locales. No character reference decodes into NULL, so replacing after decoding is equivalent for the value's own bytes and faithful to the tokenizer's order. See #65372. --- .../html-api/class-wp-html-tag-processor.php | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index dab4dcdef514e..8d2fc58ab03ea 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2844,9 +2844,11 @@ public function get_attribute( $name ) { * * The Tag Processor defers the HTML input-stream preprocessing and the * tokenizer's replacements while scanning; they must be applied when - * reading a value out of the document: newlines are normalized and - * U+0000 NULL bytes are replaced with U+FFFD, both before character - * references are decoded. + * reading a value out of the document: newlines are normalized before + * character references decode, and U+0000 NULL bytes are replaced + * with U+FFFD. The replacements operate on bytes; NULL bytes inside + * invalid UTF-8 sequences are replaced individually where a browser, + * decoding the byte stream into characters first, may differ. 
* * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream * @see https://html.spec.whatwg.org/#attribute-value-(double-quoted)-state @@ -2860,20 +2862,32 @@ private function get_decoded_source_attribute_value( WP_HTML_Attribute_Token $at $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); /* - * The checks before each replacement avoid scanning the value - * multiple times when it contains none of the rare bytes which - * require replacing; most values contain neither. + * Newline normalization is part of preprocessing the input stream + * and precedes character reference decoding: `&#13;` decodes into + * a carriage return which must be preserved. The check avoids + * scanning the value again when it contains no carriage return; + * most values contain none. */ if ( false !== strpos( $raw_value, "\r" ) ) { $raw_value = str_replace( "\r\n", "\n", $raw_value ); $raw_value = str_replace( "\r", "\n", $raw_value ); } - if ( false !== strpos( $raw_value, "\x00" ) ) { - $raw_value = str_replace( "\x00", "\u{FFFD}", $raw_value ); + $decoded_value = WP_HTML_Decoder::decode_attribute( $raw_value ); + + /* + * The tokenizer replaces U+0000 NULL bytes as it consumes input: + * character references see the raw NULL byte — an unambiguous + * follower for references without a terminating semicolon — and + * no character reference decodes into NULL, so the replacement + * applies equivalently after decoding, where it cannot disturb + * how references parse. + */ + if ( false !== strpos( $decoded_value, "\x00" ) ) { + $decoded_value = str_replace( "\x00", "\u{FFFD}", $decoded_value ); } - return WP_HTML_Decoder::decode_attribute( $raw_value ); + return $decoded_value; } /** From e18f389c249bf3d603fa182837903544abeb0c59 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:05:11 +0200 Subject: [PATCH 14/19] HTML API: Detect ambiguous character reference followers by ASCII only.
Per the named-character-reference state, a semicolon-less reference is ambiguous only when followed by an ASCII alphanumeric or equals sign. ctype_alnum() classifies bytes 0x80 and above as alphanumeric under UTF-8 locales, wrongly suppressing decodes followed by any non-ASCII byte and making decoding depend on the process locale. See #65372. --- .../html-api/class-wp-html-decoder.php | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index d902f4b7cabc4..9f33056de0c14 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -195,6 +195,8 @@ public static function decode( $context, $text ): string { * 7 === $token_length; // `∉` * * @since 6.6.0 + * @since 7.1.0 Detects ambiguous followers of semicolon-less references + * by ASCII classification only, independent of the locale. * * @global WP_Token_Map $html5_named_character_references Mappings for HTML5 named character references. * @@ -377,14 +379,20 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat * At this point though there's a match for an entry in the named * character reference table but the match doesn't end in `;`. * It may be allowed if it's followed by something unambiguous. + * + * Only an ASCII alphanumeric or U+003D EQUALS SIGN is ambiguous. + * `ctype_alnum()` must be avoided here: its classification of + * bytes 0x80 and above depends on the process locale, but only + * these specific ASCII characters prevent decoding. + * + * @see https://html.spec.whatwg.org/#named-character-reference-state */ + $follower = $after_name < $length ? 
$text[ $after_name ] : ''; $ambiguous_follower = ( - $after_name < $length && - $name_at < $length && - ( - ctype_alnum( $text[ $after_name ] ) || - '=' === $text[ $after_name ] - ) + ( 'a' <= $follower && 'z' >= $follower ) || + ( 'A' <= $follower && 'Z' >= $follower ) || + ( '0' <= $follower && '9' >= $follower ) || + '=' === $follower ); // It's non-ambiguous, safe to leave it in. From 449bf722ad8b9c85193fa3b3a70644e90176ba51 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:05:52 +0200 Subject: [PATCH 15/19] HTML API: Add tests for tag-name queries over replaced names. Red TDD step from adversarial review: next_tag() must match tag names in the same U+FFFD-replaced alphabet that get_tag() exposes, so the getter round-trips into queries, raw NULL spellings match nothing, and the Tag Processor agrees with the HTML Processor, whose queries already compare against the replaced token name. See #65372. --- ...wpHtmlTagProcessor-input-preprocessing.php | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php index e0103e5949e09..6996ad9aee9fa 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -261,6 +261,43 @@ public function test_pi_lookalike_target_stops_before_null_byte() { $this->assertSame( 'px', $processor->get_tag() ); } + /** + * Ensures tag-name queries match in the same replaced alphabet that + * `get_tag()` exposes: a sought name containing U+FFFD matches source + * names whose raw bytes contain NULL in its place, a sought name + * containing a raw NULL byte matches nothing, and the value returned + * by `get_tag()` round-trips into a successful query. 
+ * + * This is also how WP_HTML_Processor::next_tag() matches, since it + * compares sought names against the token name. + * + * @ticket 65372 + * + * @covers ::next_tag + */ + public function test_tag_name_queries_match_replaced_names() { + $processor = new WP_HTML_Tag_Processor( "" ); + $this->assertTrue( $processor->next_tag( "DI\u{FFFD}V" ), 'Should have matched the tag by its replaced name.' ); + + $processor = new WP_HTML_Tag_Processor( "" ); + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $tag_name = $processor->get_tag(); + $processor = new WP_HTML_Tag_Processor( "" ); + $this->assertTrue( $processor->next_tag( array( 'tag_name' => $tag_name ) ), 'The name returned by get_tag() should match in a query.' ); + + $processor = new WP_HTML_Tag_Processor( "" ); + $this->assertFalse( $processor->next_tag( "DI\x00V" ), 'Should not have matched the tag by its raw source name.' ); + + $processor = new WP_HTML_Tag_Processor( "" ); + $this->assertTrue( $processor->next_tag( "DI\u{FFFD}V" ), 'Should have matched a raw U+FFFD name.' ); + + $processor = WP_HTML_Processor::create_full_parser( "" ); + $this->assertTrue( $processor->next_tag( array( 'tag_name' => "DI\u{FFFD}V" ) ), 'The HTML Processor should match the replaced name.' ); + + $processor = WP_HTML_Processor::create_full_parser( "" ); + $this->assertFalse( $processor->next_tag( array( 'tag_name' => "DI\x00V" ) ), 'The HTML Processor should not match the raw source name.' ); + } + /** * Ensures class_list does not replace NULL bytes in API-supplied values. * From 5c52634556dc03c221d5cb2b639eeb93f15d5513 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:06:37 +0200 Subject: [PATCH 16/19] HTML API: Match tag-name queries against replaced names. next_tag() compared sought tag names against raw document bytes while get_tag() returns names with NULL bytes replaced by U+FFFD, breaking the getter-to-query round trip and disagreeing with the HTML Processor's queries. 
Matching now happens in the exposed alphabet; the existing byte comparison is unchanged for names without NULL bytes, so the hot path costs the same. See #65372. --- .../html-api/class-wp-html-tag-processor.php | 39 +++++++++++++++---- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 8d2fc58ab03ea..b33ba3f681036 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -4845,14 +4845,37 @@ private function matches(): bool { } // Does the tag name match the requested tag name in a case-insensitive manner? - if ( - isset( $this->sought_tag_name ) && - ( - strlen( $this->sought_tag_name ) !== $this->tag_name_length || - 0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) - ) - ) { - return false; + if ( isset( $this->sought_tag_name ) ) { + $tag_name_matches = ( + strlen( $this->sought_tag_name ) === $this->tag_name_length && + 0 === substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) + ); + + /* + * Names are matched in the same alphabet `get_tag()` exposes, + * where U+0000 NULL bytes appear as U+FFFD: a sought name + * containing U+FFFD matches source names with NULL bytes in + * its place, and a sought name containing a NULL byte matches + * nothing, since no exposed name contains one. The byte + * comparison above already agrees for names without NULL + * bytes, so this only resolves the rare disagreements. 
+ */ + if ( $tag_name_matches ) { + $tag_name_matches = false === strpos( $this->sought_tag_name, "\x00" ); + } elseif ( false !== strpos( $this->sought_tag_name, "\u{FFFD}" ) ) { + $raw_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); + if ( false !== strpos( $raw_name, "\x00" ) ) { + $exposed_name = str_replace( "\x00", "\u{FFFD}", $raw_name ); + $tag_name_matches = ( + strlen( $this->sought_tag_name ) === strlen( $exposed_name ) && + 0 === substr_compare( $exposed_name, $this->sought_tag_name, 0, strlen( $exposed_name ), true ) + ); + } + } + + if ( ! $tag_name_matches ) { + return false; + } } if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) { From 5292c7deba317c2b16dfb690aae3bc58b6d8be98 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:07:10 +0200 Subject: [PATCH 17/19] HTML API: Add test for case-insensitive class update flushing. Red TDD step from adversarial review: get_attribute( 'CLASS' ) returned a stale value when class updates were pending, because the flush guard compared the attribute name case-sensitively. See #65372. --- .../wpHtmlTagProcessor-input-preprocessing.php | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php index 6996ad9aee9fa..cabb011f2314f 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -320,6 +320,23 @@ public function test_class_list_preserves_null_bytes_in_enqueued_values() { $this->assertTrue( $processor->has_class( "a\x00b" ) ); } + /** + * Ensures pending class updates are flushed for any case spelling of + * the "class" attribute name, since attribute names are matched + * ASCII-case-insensitively. 
+ * + * @ticket 65372 + * + * @covers ::get_attribute + */ + public function test_get_attribute_flushes_class_updates_case_insensitively() { + $processor = new WP_HTML_Tag_Processor( '
' ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->add_class( 'b' ), 'Should have enqueued the class addition.' ); + $this->assertSame( 'a b', $processor->get_attribute( 'CLASS' ), 'Should have included pending class updates for an uppercase lookup.' ); + } + /** * Ensures numeric character references for U+0000 decode to U+FFFD in text. * From 8c26adf528f2a061125b82feab7ff13aed7a3a12 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:07:38 +0200 Subject: [PATCH 18/19] HTML API: Flush class updates for any case spelling of "class". Attribute lookups are ASCII-case-insensitive, but the pending-class flush in get_attribute() compared the requested name case-sensitively, returning a stale value for spellings like "CLASS". See #65372. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index b33ba3f681036..9e49f0e39399e 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2805,7 +2805,7 @@ public function get_attribute( $name ) { * attribute values. If any exist, those enqueued class changes must first be flushed out * into an attribute value update. */ - if ( 'class' === $name ) { + if ( 'class' === $comparable ) { $this->class_name_updates_to_attributes_updates(); } From e41d168ff7b2da4c68479df1b142f6f9460e10c4 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 19:10:17 +0200 Subject: [PATCH 19/19] HTML API: Pin edge cases of replaced names and document boundaries. 
From adversarial review: pins for class helpers over replaced source values, boolean attributes with NULL-byte names, verbatim prefix matching in get_attribute_names_with_prefix(), and HTML Processor end-tag matching across NULL and U+FFFD spellings (browser-verified: both spellings tokenize to the same name). Documents the @since 7.1.0 behavior on indirectly-affected getters and the known asymmetry of set_modifiable_text(), whose value reads back normalized unlike attribute values, which round-trip verbatim. See #65372. --- .../html-api/class-wp-html-processor.php | 4 + .../html-api/class-wp-html-tag-processor.php | 12 ++- ...wpHtmlTagProcessor-input-preprocessing.php | 75 +++++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 35d91fad3129c..c46151f05d9be 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5254,6 +5254,8 @@ public function get_namespace(): string { * $processor->get_tag() === null; * * @since 6.4.0 + * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @return string|null Name of currently matched tag in input HTML, or `null` if none found. */ @@ -5315,6 +5317,8 @@ public function has_self_closing_flag(): bool { * of the document without matching a token. * * @since 6.6.0 Subclassed for the HTML Processor. + * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @return string|null Name of the matched token. 
*/ diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 9e49f0e39399e..b73c837af0077 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2911,7 +2911,9 @@ private function get_decoded_source_attribute_value( WP_HTML_Attribute_Token $at * * @since 6.2.0 * @since 7.1.0 NULL bytes in source attribute names are returned as U+FFFD, - * matching the tokenizer replacement browsers apply. + * matching the tokenizer replacement browsers apply. The prefix + * is matched verbatim against these replaced names; a prefix + * containing a NULL byte matches nothing. * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive * @@ -3000,6 +3002,8 @@ public function get_tag(): ?string { * account the current parsing context, whether HTML, SVG, or MathML. * * @since 6.7.0 + * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @return string|null Name of current tag name. */ @@ -3857,6 +3861,12 @@ public function get_modifiable_text(): string { * // Renders as “Eggs & Milk” in a browser, encoded as `

Eggs &amp; Milk

`. * $processor->set_modifiable_text( 'Eggs & Milk' ); * + * Note: unlike attribute values set through `set_attribute()`, which read + * back verbatim, text set through this method currently reads back through + * `get_modifiable_text()` with newlines normalized and NULL bytes handled + * as if the text had come from the input document. In the DOM, API-supplied + * text round-trips verbatim; this asymmetry is a known limitation. + * * @since 6.7.0 * @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping. * diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php index cabb011f2314f..1180fa7110c88 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -320,6 +320,81 @@ public function test_class_list_preserves_null_bytes_in_enqueued_values() { $this->assertTrue( $processor->has_class( "a\x00b" ) ); } + /** + * Ensures the class helpers operate on the replaced source value: + * a class containing a NULL byte in the document is exposed, matched, + * and queried by its U+FFFD spelling only. + * + * @ticket 65372 + * + * @covers ::class_list + * @covers ::has_class + * @covers ::next_tag + */ + public function test_class_helpers_use_replaced_source_values() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( array( "a\u{FFFD}b" ), iterator_to_array( $processor->class_list(), false ), 'Should have exposed the replaced class name.' ); + $this->assertTrue( $processor->has_class( "a\u{FFFD}b" ), 'Should have matched the replaced class name.' ); + $this->assertFalse( $processor->has_class( "a\x00b" ), 'Should not have matched the raw source class name.' ); + + $processor = new WP_HTML_Tag_Processor( "
" ); + $this->assertTrue( $processor->next_tag( array( 'class_name' => "a\u{FFFD}b" ) ), 'Should have matched a class_name query by the replaced name.' ); + } + + /** + * Ensures boolean attributes whose names contain NULL bytes are + * addressable by their replaced name. + * + * @ticket 65372 + * + * @covers ::get_attribute + */ + public function test_boolean_attribute_with_null_byte_in_name() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->get_attribute( "da\u{FFFD}ta" ), 'Should have reported the boolean attribute by its replaced name.' ); + } + + /** + * Ensures attribute-name prefixes are matched verbatim against the + * replaced names: a prefix spelled with U+FFFD matches, and a prefix + * containing a raw NULL byte matches nothing. + * + * @ticket 65372 + * + * @covers ::get_attribute_names_with_prefix + */ + public function test_attribute_name_prefixes_match_replaced_names() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( array( "da\u{FFFD}ta" ), $processor->get_attribute_names_with_prefix( "da\u{FFFD}" ), 'A replaced-name prefix should match.' ); + $this->assertSame( array(), $processor->get_attribute_names_with_prefix( "da\x00" ), 'A raw NULL prefix should match nothing.' ); + } + + /** + * Ensures the replaced tag names flow through HTML Processor tree + * construction: an end tag spelled with U+FFFD closes an element + * whose start tag was spelled with a raw NULL byte, as in browsers, + * where both spellings tokenize to the same name. + * + * @ticket 65372 + */ + public function test_html_processor_matches_end_tags_across_null_byte_spellings() { + $this->assertSame( + "xy", + WP_HTML_Processor::normalize( "xy" ), + 'The U+FFFD-spelled end tag should have closed the NULL-spelled element.' + ); + + $processor = WP_HTML_Processor::create_full_parser( "xy" ); + $this->assertTrue( $processor->next_tag( array( 'tag_name' => "DI\u{FFFD}V" ) ), 'Should have found the element by its replaced name.' ); + $this->assertSame( array( 'HTML', 'BODY', "DI\u{FFFD}V" ), $processor->get_breadcrumbs(), 'Should have built breadcrumbs from replaced names.' ); + } + /** * Ensures pending class updates are flushed for any case spelling of * the "class" attribute name, since attribute names are matched