diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index d902f4b7cabc4..9f33056de0c14 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -195,6 +195,8 @@ public static function decode( $context, $text ): string { * 7 === $token_length; // `∉` * * @since 6.6.0 + * @since 7.1.0 Detects ambiguous followers of semicolon-less references + * by ASCII classification only, independent of the locale. * * @global WP_Token_Map $html5_named_character_references Mappings for HTML5 named character references. * @@ -377,14 +379,20 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat * At this point though there's a match for an entry in the named * character reference table but the match doesn't end in `;`. * It may be allowed if it's followed by something unambiguous. + * + * Only an ASCII alphanumeric or U+003D EQUALS SIGN is ambiguous. + * `ctype_alnum()` must be avoided here: its classification of + * bytes 0x80 and above depends on the process locale, but only + * these specific ASCII characters prevent decoding. + * + * @see https://html.spec.whatwg.org/#named-character-reference-state */ + $follower = $after_name < $length ? $text[ $after_name ] : ''; $ambiguous_follower = ( - $after_name < $length && - $name_at < $length && - ( - ctype_alnum( $text[ $after_name ] ) || - '=' === $text[ $after_name ] - ) + ( 'a' <= $follower && 'z' >= $follower ) || + ( 'A' <= $follower && 'Z' >= $follower ) || + ( '0' <= $follower && '9' >= $follower ) || + '=' === $follower ); // It's non-ambiguous, safe to leave it in. 
diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 35d91fad3129c..c46151f05d9be 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5254,6 +5254,8 @@ public function get_namespace(): string { * $processor->get_tag() === null; * * @since 6.4.0 + * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @return string|null Name of currently matched tag in input HTML, or `null` if none found. */ @@ -5315,6 +5317,8 @@ public function has_self_closing_flag(): bool { * of the document without matching a token. * * @since 6.6.0 Subclassed for the HTML Processor. + * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @return string|null Name of the matched token. */ diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 77c1a471db5b1..b73c837af0077 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1174,7 +1174,13 @@ public function paused_at_incomplete_token(): bool { * } * // Outputs: "free lang-en " * + * Class names from the input document already carry the tokenizer's + * U+FFFD replacement of NULL bytes through `get_attribute()`; values + * supplied through the API are returned verbatim, as `Element.classList` + * does in the DOM. + * * @since 6.4.0 + * @since 7.1.0 No longer replaces NULL bytes in API-supplied class values. 
* * @return Generator */ @@ -1208,7 +1214,7 @@ public function class_list() { return; } - $name = str_replace( "\x00", "\u{FFFD}", substr( $class, $at, $length ) ); + $name = substr( $class, $at, $length ); if ( $is_quirks ) { $name = strtolower( $name ); } @@ -2231,9 +2237,16 @@ private function parse_next_attribute(): bool { * > case-insensitive match for each other. * - HTML 5 spec * + * The tokenizer would have replaced U+0000 NULL bytes in attribute + * names with U+FFFD, so names which differ only by those bytes are + * duplicates. The replacement applies to the comparable name — a + * comparison artifact — while the raw span in the document remains + * untouched. + * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive + * @see https://html.spec.whatwg.org/#attribute-name-state */ - $comparable_name = strtolower( $attribute_name ); + $comparable_name = strtolower( str_replace( "\x00", "\u{FFFD}", $attribute_name ) ); // If an attribute is listed many times, only use the first declaration and ignore the rest. if ( ! isset( $this->attributes[ $comparable_name ] ) ) { @@ -2359,13 +2372,7 @@ private function class_name_updates_to_attributes_updates(): void { } if ( false === $existing_class && isset( $this->attributes['class'] ) ) { - $existing_class = WP_HTML_Decoder::decode_attribute( - substr( - $this->html, - $this->attributes['class']->value_starts_at, - $this->attributes['class']->value_length - ) - ); + $existing_class = $this->get_decoded_source_attribute_value( $this->attributes['class'] ); } if ( false === $existing_class ) { @@ -2771,6 +2778,11 @@ private function get_enqueued_attribute_value( string $comparable_name ) { * $p->get_attribute( 'class' ) === null; * * @since 6.2.0 + * @since 7.1.0 Applies input-stream preprocessing: newlines in the source value + * are normalized and NULL bytes are replaced with U+FFFD, as + * browsers do before decoding character references. 
Attributes + * whose source name contains a NULL byte are addressed by the + * name with U+FFFD in its place, as in the DOM. * * @param string $name Name of attribute whose value is requested. * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. @@ -2793,7 +2805,7 @@ public function get_attribute( $name ) { * attribute values. If any exist, those enqueued class changes must first be flushed out * into an attribute value update. */ - if ( 'class' === $name ) { + if ( 'class' === $comparable ) { $this->class_name_updates_to_attributes_updates(); } @@ -2824,9 +2836,58 @@ public function get_attribute( $name ) { return true; } + return $this->get_decoded_source_attribute_value( $attribute ); + } + + /** + * Returns the decoded value of an attribute found in the input document. + * + * The Tag Processor defers the HTML input-stream preprocessing and the + * tokenizer's replacements while scanning; they must be applied when + * reading a value out of the document: newlines are normalized before + * character references decode, and U+0000 NULL bytes are replaced + * with U+FFFD. The replacements operate on bytes; NULL bytes inside + * invalid UTF-8 sequences are replaced individually where a browser, + * decoding the byte stream into characters first, may differ. + * + * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream + * @see https://html.spec.whatwg.org/#attribute-value-(double-quoted)-state + * + * @since 7.1.0 + * + * @param WP_HTML_Attribute_Token $attribute Attribute token from the input document. + * @return string Decoded attribute value. 
+ */ + private function get_decoded_source_attribute_value( WP_HTML_Attribute_Token $attribute ): string { $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); - return WP_HTML_Decoder::decode_attribute( $raw_value ); + /* + * Newline normalization is part of preprocessing the input stream + * and precedes character reference decoding: `&#x0D;` decodes into + * a carriage return which must be preserved. The check avoids + * scanning the value again when it contains no carriage return; + * most values contain none. + */ + if ( false !== strpos( $raw_value, "\r" ) ) { + $raw_value = str_replace( "\r\n", "\n", $raw_value ); + $raw_value = str_replace( "\r", "\n", $raw_value ); + } + + $decoded_value = WP_HTML_Decoder::decode_attribute( $raw_value ); + + /* + * The tokenizer replaces U+0000 NULL bytes as it consumes input: + * character references see the raw NULL byte — an unambiguous + * follower for references without a terminating semicolon — and + * no character reference decodes into NULL, so the replacement + * applies equivalently after decoding, where it cannot disturb + * how references parse. + */ + if ( false !== strpos( $decoded_value, "\x00" ) ) { + $decoded_value = str_replace( "\x00", "\u{FFFD}", $decoded_value ); + } + + return $decoded_value; } /** @@ -2849,6 +2910,10 @@ public function get_attribute( $name ) { * $p->get_attribute_names_with_prefix( 'data-' ) === null; * * @since 6.2.0 + * @since 7.1.0 NULL bytes in source attribute names are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. The prefix + * is matched verbatim against these replaced names; a prefix + * containing a NULL byte matches nothing. 
* * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive * @@ -2898,6 +2963,8 @@ public function get_namespace(): string { * $p->get_tag() === null; * * @since 6.2.0 + * @since 7.1.0 NULL bytes in the source tag name are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @return string|null Name of currently matched tag in input HTML, or `null` if none found. */ @@ -2906,7 +2973,15 @@ public function get_tag(): ?string { return null; } - $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); + /* + * The tokenizer would have replaced U+0000 NULL bytes in the tag + * name with U+FFFD; this is deferred to this read boundary. The + * replacement never applies to internal identification, which + * compares raw bytes (`scr\x00ipt` is not SCRIPT in browsers either). + * + * @see https://html.spec.whatwg.org/#tag-name-state + */ + $tag_name = str_replace( "\x00", "\u{FFFD}", substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ) ); if ( self::STATE_MATCHED_TAG === $this->parser_state ) { return strtoupper( $tag_name ); @@ -2927,6 +3002,8 @@ public function get_tag(): ?string { * account the current parsing context, whether HTML, SVG, or MathML. * * @since 6.7.0 + * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @return string|null Name of current tag name. */ @@ -3425,6 +3502,8 @@ public function get_token_type(): ?string { * of the document without matching a token. * * @since 6.5.0 + * @since 7.1.0 NULL bytes in source tag names are returned as U+FFFD, + * matching the tokenizer replacement browsers apply. * * @return string|null Name of the matched token. */ @@ -3782,6 +3861,12 @@ public function get_modifiable_text(): string { * // Renders as “Eggs & Milk” in a browser, encoded as `

<p>Eggs &amp; Milk</p>

`. * $processor->set_modifiable_text( 'Eggs & Milk' ); * + * Note: unlike attribute values set through `set_attribute()`, which read + * back verbatim, text set through this method currently reads back through + * `get_modifiable_text()` with newlines normalized and NULL bytes handled + * as if the text had come from the input document. In the DOM, API-supplied + * text round-trips verbatim; this asymmetry is a known limitation. + * * @since 6.7.0 * @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping. * @@ -4770,14 +4855,37 @@ private function matches(): bool { } // Does the tag name match the requested tag name in a case-insensitive manner? - if ( - isset( $this->sought_tag_name ) && - ( - strlen( $this->sought_tag_name ) !== $this->tag_name_length || - 0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) - ) - ) { - return false; + if ( isset( $this->sought_tag_name ) ) { + $tag_name_matches = ( + strlen( $this->sought_tag_name ) === $this->tag_name_length && + 0 === substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) + ); + + /* + * Names are matched in the same alphabet `get_tag()` exposes, + * where U+0000 NULL bytes appear as U+FFFD: a sought name + * containing U+FFFD matches source names with NULL bytes in + * its place, and a sought name containing a NULL byte matches + * nothing, since no exposed name contains one. The byte + * comparison above already agrees for names without NULL + * bytes, so this only resolves the rare disagreements. 
+ */ + if ( $tag_name_matches ) { + $tag_name_matches = false === strpos( $this->sought_tag_name, "\x00" ); + } elseif ( false !== strpos( $this->sought_tag_name, "\u{FFFD}" ) ) { + $raw_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); + if ( false !== strpos( $raw_name, "\x00" ) ) { + $exposed_name = str_replace( "\x00", "\u{FFFD}", $raw_name ); + $tag_name_matches = ( + strlen( $this->sought_tag_name ) === strlen( $exposed_name ) && + 0 === substr_compare( $exposed_name, $this->sought_tag_name, 0, strlen( $exposed_name ), true ) + ); + } + } + + if ( ! $tag_name_matches ) { + return false; + } } if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) { diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 97954f4eb3e30..9527739edd23b 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -61,6 +61,84 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { $this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' ); } + /** + * Ensures that numeric character references for U+0000 decode to U+FFFD + * while raw NULL bytes pass through the decoder untransformed. + * + * The tokenizer, not the decoder, is responsible for replacing raw NULL + * bytes; in the Tag Processor that responsibility falls on the methods + * which read values out of the input document. + * + * @ticket 65372 + * + * @dataProvider data_null_code_points + * + * @param string $raw_value Raw attribute value. + * @param string $decoded_value The expected decoded attribute value. + */ + public function test_null_code_points_in_attribute_values( string $raw_value, string $decoded_value ) { + $this->assertSame( + $decoded_value, + WP_HTML_Decoder::decode_attribute( $raw_value ), + 'Improperly decoded raw attribute value.' + ); + } + + /** + * Data provider. 
+ * + * @return array[] + */ + public static function data_null_code_points() { + return array( + 'Decimal zero' => array( 'a&#0;b', "a\u{FFFD}b" ), + 'Hexadecimal zero' => array( 'a&#x0;b', "a\u{FFFD}b" ), + 'Multiple zeros' => array( 'a&#000;b', "a\u{FFFD}b" ), + 'Raw NULL byte passes through' => array( "a\x00b", "a\x00b" ), + ); + } + + /** + * Ensures that the ambiguous-follower check for character references + * lacking a terminating semicolon treats only ASCII alphanumerics and + * the equals sign as ambiguous, regardless of the process locale. + * + * `ctype_alnum()` classifies bytes 0x80 and above as alphanumeric under + * UTF-8 locales, wrongly suppressing decodes whose follower is a + * non-ASCII byte, such as U+FFFD produced by NULL-byte replacement. + * + * @ticket 65372 + * + * @see https://html.spec.whatwg.org/#named-character-reference-state + * + * @dataProvider data_semicolon_less_references_with_followers + * + * @param string $raw_value Raw attribute value. + * @param string $decoded_value The expected decoded attribute value. + */ + public function test_semicolon_less_reference_followers( string $raw_value, string $decoded_value ) { + $this->assertSame( + $decoded_value, + WP_HTML_Decoder::decode_attribute( $raw_value ), + 'Improperly decoded raw attribute value.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_semicolon_less_references_with_followers() { + return array( + 'U+FFFD follower decodes' => array( "x&amp\u{FFFD};y", "x&\u{FFFD};y" ), + 'Non-ASCII follower decodes' => array( "x&amp\u{E9}y", "x&\u{E9}y" ), + 'ASCII letter follower is ambiguous' => array( 'x&ampzy', 'x&ampzy' ), + 'ASCII digit follower is ambiguous' => array( 'x&amp1y', 'x&amp1y' ), + 'Equals sign follower is ambiguous' => array( 'x&amp=y', 'x&amp=y' ), + ); + } + /** * Ensures proper detection of attribute prefixes ignoring ASCII case. 
* diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php new file mode 100644 index 0000000000000..1180fa7110c88 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -0,0 +1,428 @@ +assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( $expected, $processor->get_attribute( 'a' ) ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_attribute_values_with_preprocessing() { + return array( + 'Raw CR' => array( "
", "x\ny" ), + 'Raw CRLF' => array( "
", "x\ny" ), + 'Raw CR then CRLF' => array( "
", "x\n\ny" ), + 'Double-quoted raw CR' => array( "
", "x\ny" ), + 'NULL byte' => array( "
", "x\u{FFFD}y" ), + 'NULL byte unquoted' => array( "
", "x\u{FFFD}y" ), + 'Encoded CR is preserved' => array( "
", "x\ry" ), + 'Encoded NULL becomes U+FFFD' => array( "
", "x\u{FFFD}y" ), + 'Raw CR before encoded CR' => array( "
", "x\n\ry" ), + 'Raw CR and NULL byte' => array( "
", "x\n\u{FFFD}y" ), + 'Named reference before NULL' => array( "
", "x&\u{FFFD};y" ), + 'Named reference before CR' => array( "
", "x&\ny" ), + ); + } + + /** + * Ensures that values enqueued through `set_attribute()` are returned verbatim. + * + * Input-stream preprocessing applies only to the input document. API-supplied + * values are plaintext, equivalent to DOM `setAttribute()`, which performs + * no replacements. Browser-verified. + * + * @ticket 65372 + * + * @covers ::get_attribute + * + * @dataProvider data_enqueued_attribute_values + * + * @param string $value Plaintext attribute value to set and expect back unchanged. + */ + public function test_get_attribute_returns_enqueued_values_verbatim( string $value ) { + $processor = new WP_HTML_Tag_Processor( '
' ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->set_attribute( 'a', $value ), 'Should have enqueued the attribute update.' ); + $this->assertSame( $value, $processor->get_attribute( 'a' ) ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_enqueued_attribute_values() { + return array( + 'Carriage return' => array( "x\ry" ), + 'CRLF' => array( "x\r\ny" ), + 'NULL byte' => array( "x\x00y" ), + ); + } + + /** + * Ensures the existing class attribute value is preprocessed when enqueued + * class updates are flushed into an attribute update. + * + * @ticket 65372 + * + * @covers ::add_class + * + * @dataProvider data_class_updates_with_preprocessing + * + * @param string $html HTML containing a tag with a class attribute. + * @param string $expected_html Expected document after adding a class. + */ + public function test_class_updates_apply_input_preprocessing_to_existing_value( string $html, string $expected_html ) { + $processor = new WP_HTML_Tag_Processor( $html ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->add_class( 'added' ), 'Should have enqueued the class addition.' ); + $this->assertSame( $expected_html, $processor->get_updated_html() ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_class_updates_with_preprocessing() { + return array( + 'Raw CR' => array( "
", "
" ), + 'Raw CRLF' => array( "
", "
" ), + 'NULL byte' => array( "
", "
" ), + 'Named reference before NULL' => array( "
", "
" ), + ); + } + + /** + * Ensures attribute names containing NULL bytes are exposed with U+FFFD and + * are addressable only by their replaced name, as browsers expose them. + * + * Browser-verified: `getAttribute("da\u{FFFD}ta")` finds the attribute + * parsed from `da\x00ta`; `getAttribute("da\x00ta")` does not. + * + * @ticket 65372 + * + * @covers ::get_attribute + * @covers ::get_attribute_names_with_prefix + */ + public function test_attribute_names_replace_null_bytes() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( array( "da\u{FFFD}ta" ), $processor->get_attribute_names_with_prefix( '' ) ); + $this->assertSame( '1', $processor->get_attribute( "da\u{FFFD}ta" ), 'Should have found the attribute by its replaced name.' ); + $this->assertNull( $processor->get_attribute( "da\x00ta" ), 'Should not have found the attribute by its raw source name.' ); + + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( array( "da\u{FFFD}ta" ), $processor->get_attribute_names_with_prefix( '' ), 'Should have lowercased the name around the replacement character.' ); + } + + /** + * Ensures attribute names which collapse to the same name after NULL-byte + * replacement are duplicates of one attribute: the first in document order + * provides the value and removal removes every collapsed copy. + * + * Browser-verified: `
` produces a single + * attribute `da\u{FFFD}ta` with value "1". + * + * @ticket 65372 + * + * @covers ::get_attribute + * @covers ::remove_attribute + */ + public function test_attribute_names_collapsing_after_null_replacement_are_duplicates() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( array( "da\u{FFFD}ta" ), $processor->get_attribute_names_with_prefix( '' ) ); + $this->assertSame( '1', $processor->get_attribute( "da\u{FFFD}ta" ), 'First duplicate should provide the value.' ); + + $this->assertTrue( $processor->remove_attribute( "da\u{FFFD}ta" ), 'Should have removed the attribute.' ); + $this->assertSame( '
', $processor->get_updated_html(), 'Should have removed all duplicates of the attribute.' ); + } + + /** + * Ensures setting an attribute by its U+FFFD-replaced name updates the + * source attribute whose raw name contains a NULL byte instead of adding + * a second attribute. + * + * @ticket 65372 + * + * @covers ::set_attribute + */ + public function test_set_attribute_updates_attribute_with_null_byte_in_source_name() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->set_attribute( "da\u{FFFD}ta", 'new' ), 'Should have set the attribute.' ); + $this->assertSame( "
", $processor->get_updated_html() ); + } + + /** + * Ensures tag names containing NULL bytes are exposed with U+FFFD, + * matching the tokenizer's tag-name-state replacement in browsers. + * + * @ticket 65372 + * + * @covers ::get_tag + * @covers ::get_token_name + */ + public function test_get_tag_replaces_null_bytes() { + $processor = new WP_HTML_Tag_Processor( "x" ); + + $this->assertTrue( $processor->next_token(), 'Should have found the tag opener.' ); + $this->assertSame( "DI\u{FFFD}V", $processor->get_tag() ); + $this->assertSame( "DI\u{FFFD}V", $processor->get_token_name() ); + + $this->assertTrue( $processor->next_token(), 'Should have found the text node.' ); + $this->assertSame( 'x', $processor->get_modifiable_text() ); + + $this->assertTrue( $processor->next_token(), 'Should have found the tag closer.' ); + $this->assertTrue( $processor->is_tag_closer(), 'Should have matched the tag closer.' ); + $this->assertSame( "DI\u{FFFD}V", $processor->get_tag() ); + } + + /** + * Ensures NULL bytes in tag names do not affect special-element detection: + * `` is not SCRIPT and does not switch into rawtext parsing, + * in browsers or here. Internal identification uses raw bytes. + * + * @ticket 65372 + * + * @covers ::get_tag + */ + public function test_null_byte_in_tag_name_does_not_select_rawtext_parsing() { + $processor = new WP_HTML_Tag_Processor( "" ); + + $this->assertTrue( $processor->next_token(), 'Should have found the tag opener.' ); + $this->assertSame( "SCR\u{FFFD}IPT", $processor->get_tag() ); + + $this->assertTrue( $processor->next_token(), 'Should have found the B tag, not raw text.' ); + $this->assertSame( 'B', $processor->get_tag() ); + } + + /** + * Ensures NULL bytes cannot appear in PI-lookalike comment tag names, + * whose targets are restricted to ASCII name characters. 
+ * + * @ticket 65372 + * + * @covers ::get_tag + */ + public function test_pi_lookalike_target_stops_before_null_byte() { + $processor = new WP_HTML_Tag_Processor( "" ); + + $this->assertTrue( $processor->next_token(), 'Should have found the comment.' ); + $this->assertSame( WP_HTML_Tag_Processor::COMMENT_AS_PI_NODE_LOOKALIKE, $processor->get_comment_type() ); + $this->assertSame( 'px', $processor->get_tag() ); + } + + /** + * Ensures tag-name queries match in the same replaced alphabet that + * `get_tag()` exposes: a sought name containing U+FFFD matches source + * names whose raw bytes contain NULL in its place, a sought name + * containing a raw NULL byte matches nothing, and the value returned + * by `get_tag()` round-trips into a successful query. + * + * This is also how WP_HTML_Processor::next_tag() matches, since it + * compares sought names against the token name. + * + * @ticket 65372 + * + * @covers ::next_tag + */ + public function test_tag_name_queries_match_replaced_names() { + $processor = new WP_HTML_Tag_Processor( "" ); + $this->assertTrue( $processor->next_tag( "DI\u{FFFD}V" ), 'Should have matched the tag by its replaced name.' ); + + $processor = new WP_HTML_Tag_Processor( "" ); + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $tag_name = $processor->get_tag(); + $processor = new WP_HTML_Tag_Processor( "" ); + $this->assertTrue( $processor->next_tag( array( 'tag_name' => $tag_name ) ), 'The name returned by get_tag() should match in a query.' ); + + $processor = new WP_HTML_Tag_Processor( "" ); + $this->assertFalse( $processor->next_tag( "DI\x00V" ), 'Should not have matched the tag by its raw source name.' ); + + $processor = new WP_HTML_Tag_Processor( "" ); + $this->assertTrue( $processor->next_tag( "DI\u{FFFD}V" ), 'Should have matched a raw U+FFFD name.' 
); + + $processor = WP_HTML_Processor::create_full_parser( "" ); + $this->assertTrue( $processor->next_tag( array( 'tag_name' => "DI\u{FFFD}V" ) ), 'The HTML Processor should match the replaced name.' ); + + $processor = WP_HTML_Processor::create_full_parser( "" ); + $this->assertFalse( $processor->next_tag( array( 'tag_name' => "DI\x00V" ) ), 'The HTML Processor should not match the raw source name.' ); + } + + /** + * Ensures class_list does not replace NULL bytes in API-supplied values. + * + * Browser-verified: `setAttribute('class', "a\x00b")` then reading + * `classList` yields the token "a\x00b" with the NULL byte preserved; + * U+0000 replacement happens only in the tokenizer, and values from the + * input document already receive it through `get_attribute()`. + * + * @ticket 65372 + * + * @covers ::class_list + * @covers ::has_class + */ + public function test_class_list_preserves_null_bytes_in_enqueued_values() { + $processor = new WP_HTML_Tag_Processor( '
' ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->set_attribute( 'class', "a\x00b c\u{FFFD}d" ), 'Should have set the class attribute.' ); + $this->assertSame( array( "a\x00b", "c\u{FFFD}d" ), iterator_to_array( $processor->class_list(), false ), 'Should have preserved the NULL byte in the API-supplied class.' ); + $this->assertTrue( $processor->has_class( "a\x00b" ) ); + } + + /** + * Ensures the class helpers operate on the replaced source value: + * a class containing a NULL byte in the document is exposed, matched, + * and queried by its U+FFFD spelling only. + * + * @ticket 65372 + * + * @covers ::class_list + * @covers ::has_class + * @covers ::next_tag + */ + public function test_class_helpers_use_replaced_source_values() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( array( "a\u{FFFD}b" ), iterator_to_array( $processor->class_list(), false ), 'Should have exposed the replaced class name.' ); + $this->assertTrue( $processor->has_class( "a\u{FFFD}b" ), 'Should have matched the replaced class name.' ); + $this->assertFalse( $processor->has_class( "a\x00b" ), 'Should not have matched the raw source class name.' ); + + $processor = new WP_HTML_Tag_Processor( "
" ); + $this->assertTrue( $processor->next_tag( array( 'class_name' => "a\u{FFFD}b" ) ), 'Should have matched a class_name query by the replaced name.' ); + } + + /** + * Ensures boolean attributes whose names contain NULL bytes are + * addressable by their replaced name. + * + * @ticket 65372 + * + * @covers ::get_attribute + */ + public function test_boolean_attribute_with_null_byte_in_name() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->get_attribute( "da\u{FFFD}ta" ), 'Should have reported the boolean attribute by its replaced name.' ); + } + + /** + * Ensures attribute-name prefixes are matched verbatim against the + * replaced names: a prefix spelled with U+FFFD matches, and a prefix + * containing a raw NULL byte matches nothing. + * + * @ticket 65372 + * + * @covers ::get_attribute_names_with_prefix + */ + public function test_attribute_name_prefixes_match_replaced_names() { + $processor = new WP_HTML_Tag_Processor( "
" ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( array( "da\u{FFFD}ta" ), $processor->get_attribute_names_with_prefix( "da\u{FFFD}" ), 'A replaced-name prefix should match.' ); + $this->assertSame( array(), $processor->get_attribute_names_with_prefix( "da\x00" ), 'A raw NULL prefix should match nothing.' ); + } + + /** + * Ensures the replaced tag names flow through HTML Processor tree + * construction: an end tag spelled with U+FFFD closes an element + * whose start tag was spelled with a raw NULL byte, as in browsers, + * where both spellings tokenize to the same name. + * + * @ticket 65372 + */ + public function test_html_processor_matches_end_tags_across_null_byte_spellings() { + $this->assertSame( + "xy", + WP_HTML_Processor::normalize( "xy" ), + 'The U+FFFD-spelled end tag should have closed the NULL-spelled element.' + ); + + $processor = WP_HTML_Processor::create_full_parser( "xy" ); + $this->assertTrue( $processor->next_tag( array( 'tag_name' => "DI\u{FFFD}V" ) ), 'Should have found the element by its replaced name.' ); + $this->assertSame( array( 'HTML', 'BODY', "DI\u{FFFD}V" ), $processor->get_breadcrumbs(), 'Should have built breadcrumbs from replaced names.' ); + } + + /** + * Ensures pending class updates are flushed for any case spelling of + * the "class" attribute name, since attribute names are matched + * ASCII-case-insensitively. + * + * @ticket 65372 + * + * @covers ::get_attribute + */ + public function test_get_attribute_flushes_class_updates_case_insensitively() { + $processor = new WP_HTML_Tag_Processor( '
' ); + + $this->assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertTrue( $processor->add_class( 'b' ), 'Should have enqueued the class addition.' ); + $this->assertSame( 'a b', $processor->get_attribute( 'CLASS' ), 'Should have included pending class updates for an uppercase lookup.' ); + } + + /** + * Ensures numeric character references for U+0000 decode to U+FFFD in text. + * + * @ticket 65372 + * + * @covers ::get_modifiable_text + */ + public function test_encoded_null_in_text_node_decodes_to_replacement_character() { + $processor = new WP_HTML_Tag_Processor( 'a�b' ); + + $this->assertTrue( $processor->next_token(), 'Should have found the text node.' ); + $this->assertSame( "a\u{FFFD}b", $processor->get_modifiable_text() ); + } +}