Eggs & Milk
`. * $processor->set_modifiable_text( 'Eggs & Milk' ); * + * Note: unlike attribute values set through `set_attribute()`, which read + * back verbatim, text set through this method currently reads back through + * `get_modifiable_text()` with newlines normalized and NULL bytes handled + * as if the text had come from the input document. In the DOM, API-supplied + * text round-trips verbatim; this asymmetry is a known limitation. + * * @since 6.7.0 * @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping. * @@ -4770,14 +4855,37 @@ private function matches(): bool { } // Does the tag name match the requested tag name in a case-insensitive manner? - if ( - isset( $this->sought_tag_name ) && - ( - strlen( $this->sought_tag_name ) !== $this->tag_name_length || - 0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) - ) - ) { - return false; + if ( isset( $this->sought_tag_name ) ) { + $tag_name_matches = ( + strlen( $this->sought_tag_name ) === $this->tag_name_length && + 0 === substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) + ); + + /* + * Names are matched in the same alphabet `get_tag()` exposes, + * where U+0000 NULL bytes appear as U+FFFD: a sought name + * containing U+FFFD matches source names with NULL bytes in + * its place, and a sought name containing a NULL byte matches + * nothing, since no exposed name contains one. The byte + * comparison above already agrees for names without NULL + * bytes, so this only resolves the rare disagreements. + */ + if ( $tag_name_matches ) { + $tag_name_matches = false === strpos( $this->sought_tag_name, "\x00" ); + } elseif ( false !== strpos( $this->sought_tag_name, "\u{FFFD}" ) ) { + $raw_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); + if ( false !== strpos( $raw_name, "\x00" ) ) { + $exposed_name = str_replace( "\x00", "\u{FFFD}", $raw_name ); + $tag_name_matches = ( + strlen( $this->sought_tag_name ) === strlen( $exposed_name ) && + 0 === substr_compare( $exposed_name, $this->sought_tag_name, 0, strlen( $exposed_name ), true ) + ); + } + } + + if ( ! $tag_name_matches ) { + return false; + } } if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) { diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 97954f4eb3e30..9527739edd23b 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -61,6 +61,84 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { $this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' ); } + /** + * Ensures that numeric character references for U+0000 decode to U+FFFD + * while raw NULL bytes pass through the decoder untransformed. + * + * The tokenizer, not the decoder, is responsible for replacing raw NULL + * bytes; in the Tag Processor that responsibility falls on the methods + * which read values out of the input document. + * + * @ticket 65372 + * + * @dataProvider data_null_code_points + * + * @param string $raw_value Raw attribute value. + * @param string $decoded_value The expected decoded attribute value. + */ + public function test_null_code_points_in_attribute_values( string $raw_value, string $decoded_value ) { + $this->assertSame( + $decoded_value, + WP_HTML_Decoder::decode_attribute( $raw_value ), + 'Improperly decoded raw attribute value.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_null_code_points() { + return array( + 'Decimal zero' => array( 'ab', "a\u{FFFD}b" ), + 'Hexadecimal zero' => array( 'ab', "a\u{FFFD}b" ), + 'Multiple zeros' => array( 'ab', "a\u{FFFD}b" ), + 'Raw NULL byte passes through' => array( "a\x00b", "a\x00b" ), + ); + } + + /** + * Ensures that the ambiguous-follower check for character references + * lacking a terminating semicolon treats only ASCII alphanumerics and + * the equals sign as ambiguous, regardless of the process locale. + * + * `ctype_alnum()` classifies bytes 0x80 and above as alphanumeric under + * UTF-8 locales, wrongly suppressing decodes whose follower is a + * non-ASCII byte, such as U+FFFD produced by NULL-byte replacement. + * + * @ticket 65372 + * + * @see https://html.spec.whatwg.org/#named-character-reference-state + * + * @dataProvider data_semicolon_less_references_with_followers + * + * @param string $raw_value Raw attribute value. + * @param string $decoded_value The expected decoded attribute value. + */ + public function test_semicolon_less_reference_followers( string $raw_value, string $decoded_value ) { + $this->assertSame( + $decoded_value, + WP_HTML_Decoder::decode_attribute( $raw_value ), + 'Improperly decoded raw attribute value.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_semicolon_less_references_with_followers() { + return array( + 'U+FFFD follower decodes' => array( "x&\u{FFFD};y", "x&\u{FFFD};y" ), + 'Non-ASCII follower decodes' => array( "x&\u{E9}y", "x&\u{E9}y" ), + 'ASCII letter follower is ambiguous' => array( 'x&zy', 'x&zy' ), + 'ASCII digit follower is ambiguous' => array( 'x&1y', 'x&1y' ), + 'Equals sign follower is ambiguous' => array( 'x&=y', 'x&=y' ), + ); + } + /** * Ensures proper detection of attribute prefixes ignoring ASCII case. * diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index e516addb6c314..d09cada99ed50 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -509,6 +509,161 @@ public static function data_provider_normalized_fuzzer_cases_that_should_be_idem ); } + /** + * Ensures that decoded carriage returns are serialized as character references. + * + * @ticket 65372 + * + * @dataProvider data_provider_decoded_carriage_returns + * + * @param string $input HTML input containing a decoded carriage return. + * @param string $expected Expected normalized output. + */ + public function test_normalize_serializes_decoded_carriage_returns_as_character_references( string $input, string $expected ) { + $normalized = WP_HTML_Processor::normalize( $input ); + + $this->assertSame( $expected, $normalized, 'Should have serialized the carriage return as a character reference.' ); + $this->assertSame( + $expected, + WP_HTML_Processor::normalize( $normalized ), + 'Normalizing already-normalized HTML should not change the serialized carriage return.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_provider_decoded_carriage_returns() { + return array( + 'Regular text' => array( 'a b
', 'a b
' ), + 'Regular text with non-canonical character reference' => array( 'a b
', 'a b
' ), + 'RCDATA title' => array( '| x |
| x |
a b
', 'a b
' ), + ); + } + + /** + * Ensures that raw carriage returns in attribute values are serialized as line feeds. + * + * @ticket 65372 + * + * @dataProvider data_provider_raw_attribute_carriage_returns + * + * @param string $input HTML input containing raw carriage returns. + * @param string $expected Expected normalized output. + */ + public function test_normalize_serializes_raw_attribute_carriage_returns_as_line_feeds( string $input, string $expected ) { + $normalized = WP_HTML_Processor::normalize( $input ); + + $this->assertSame( $expected, $normalized, 'Should have serialized raw attribute carriage returns as line feeds.' ); + $this->assertSame( + $expected, + WP_HTML_Processor::normalize( $normalized ), + 'Normalizing already-normalized HTML should not change raw attribute newlines.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_provider_raw_attribute_carriage_returns() { + return array( + 'Raw carriage return' => array( "", "" ), + 'Raw CRLF pair' => array( "", "" ), + ); + } + + /** + * Ensures that raw carriage returns are normalized before class updates are serialized. + * + * @ticket 65372 + */ + public function test_serialize_token_normalizes_raw_class_carriage_returns_before_class_updates() { + $processor = WP_HTML_Processor::create_fragment( "" ); + + $this->assertTrue( $processor->next_tag( 'P' ), 'Should find the P element.' ); + + $processor->add_class( 'c' ); + + $serialized = $processor->serialize_token(); + $this->assertSame( + "", + $serialized, + 'Should have serialized raw class carriage returns as line feeds before adding classes.' + ); + + $reparsed = WP_HTML_Processor::create_fragment( $serialized ); + $this->assertTrue( $reparsed->next_tag( 'P' ), 'Should find the reparsed P element.' ); + $this->assertSame( "a\nb c", $reparsed->get_attribute( 'class' ), 'The serialized class should parse back to the same value.' ); + } + + /** + * Ensures rawtext element contents serialize without escaping: + * character references do not decode inside SCRIPT and STYLE, so + * escaping their contents or emitting ` ` there would corrupt them. + * + * @ticket 65372 + * + * @dataProvider data_provider_rawtext_contents + * + * @param string $html HTML whose rawtext contents must serialize unchanged. + */ + public function test_normalize_preserves_rawtext_contents( string $html ) { + $this->assertSame( + $html, + WP_HTML_Processor::normalize( $html ), + 'Should have serialized the rawtext contents unchanged.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_provider_rawtext_contents() { + return array( + 'SCRIPT with character references' => array( '' ), + 'STYLE with character references' => array( '' ), + ); + } + + /** + * Ensures NULL bytes in attribute values set through the API serialize + * as U+FFFD so that serialized output parses back to the same value. + * + * Browsers serialize the raw NULL byte in innerHTML, which does not + * round-trip: re-parsing replaces it with U+FFFD. Serializing U+FFFD + * directly is a benign deviation which keeps output idempotent, like + * serializing decoded carriage returns as . + * + * @ticket 65372 + */ + public function test_serialize_token_replaces_null_bytes_in_enqueued_attribute_values() { + $processor = WP_HTML_Processor::create_fragment( '
' ); + + $this->assertTrue( $processor->next_tag( 'P' ), 'Should find the P element.' ); + $this->assertTrue( $processor->set_attribute( 'title', "a\x00b" ), 'Should have set the attribute.' ); + + $serialized = $processor->serialize_token(); + $this->assertSame( + "", + $serialized, + 'Should have serialized the NULL byte as U+FFFD.' + ); + + $reparsed = WP_HTML_Processor::create_fragment( $serialized ); + $this->assertTrue( $reparsed->next_tag( 'P' ), 'Should find the reparsed P element.' ); + $this->assertSame( "a\u{FFFD}b", $reparsed->get_attribute( 'title' ), 'The serialized title should parse back to the same value.' ); + } + /** * Data provider. * diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php new file mode 100644 index 0000000000000..1180fa7110c88 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -0,0 +1,428 @@ +assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( $expected, $processor->get_attribute( 'a' ) ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_attribute_values_with_preprocessing() { + return array( + 'Raw CR' => array( "