Eggs & Milk
`. * $processor->set_modifiable_text( 'Eggs & Milk' ); * + * Note: unlike attribute values set through `set_attribute()`, which read + * back verbatim, text set through this method currently reads back through + * `get_modifiable_text()` with newlines normalized and NULL bytes handled + * as if the text had come from the input document. In the DOM, API-supplied + * text round-trips verbatim; this asymmetry is a known limitation. + * * @since 6.7.0 * @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping. * @@ -4770,14 +4855,37 @@ private function matches(): bool { } // Does the tag name match the requested tag name in a case-insensitive manner? - if ( - isset( $this->sought_tag_name ) && - ( - strlen( $this->sought_tag_name ) !== $this->tag_name_length || - 0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) - ) - ) { - return false; + if ( isset( $this->sought_tag_name ) ) { + $tag_name_matches = ( + strlen( $this->sought_tag_name ) === $this->tag_name_length && + 0 === substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) + ); + + /* + * Names are matched in the same alphabet `get_tag()` exposes, + * where U+0000 NULL bytes appear as U+FFFD: a sought name + * containing U+FFFD matches source names with NULL bytes in + * its place, and a sought name containing a NULL byte matches + * nothing, since no exposed name contains one. The byte + * comparison above already agrees for names without NULL + * bytes, so this only resolves the rare disagreements. + */ + if ( $tag_name_matches ) { + $tag_name_matches = false === strpos( $this->sought_tag_name, "\x00" ); + } elseif ( false !== strpos( $this->sought_tag_name, "\u{FFFD}" ) ) { + $raw_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); + if ( false !== strpos( $raw_name, "\x00" ) ) { + $exposed_name = str_replace( "\x00", "\u{FFFD}", $raw_name ); + $tag_name_matches = ( + strlen( $this->sought_tag_name ) === strlen( $exposed_name ) && + 0 === substr_compare( $exposed_name, $this->sought_tag_name, 0, strlen( $exposed_name ), true ) + ); + } + } + + if ( ! $tag_name_matches ) { + return false; + } } if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) { diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 97954f4eb3e30..9527739edd23b 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -61,6 +61,84 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { $this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' ); } + /** + * Ensures that numeric character references for U+0000 decode to U+FFFD + * while raw NULL bytes pass through the decoder untransformed. + * + * The tokenizer, not the decoder, is responsible for replacing raw NULL + * bytes; in the Tag Processor that responsibility falls on the methods + * which read values out of the input document. + * + * @ticket 65372 + * + * @dataProvider data_null_code_points + * + * @param string $raw_value Raw attribute value. + * @param string $decoded_value The expected decoded attribute value. + */ + public function test_null_code_points_in_attribute_values( string $raw_value, string $decoded_value ) { + $this->assertSame( + $decoded_value, + WP_HTML_Decoder::decode_attribute( $raw_value ), + 'Improperly decoded raw attribute value.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_null_code_points() { + return array( + 'Decimal zero' => array( 'ab', "a\u{FFFD}b" ), + 'Hexadecimal zero' => array( 'ab', "a\u{FFFD}b" ), + 'Multiple zeros' => array( 'ab', "a\u{FFFD}b" ), + 'Raw NULL byte passes through' => array( "a\x00b", "a\x00b" ), + ); + } + + /** + * Ensures that the ambiguous-follower check for character references + * lacking a terminating semicolon treats only ASCII alphanumerics and + * the equals sign as ambiguous, regardless of the process locale. + * + * `ctype_alnum()` classifies bytes 0x80 and above as alphanumeric under + * UTF-8 locales, wrongly suppressing decodes whose follower is a + * non-ASCII byte, such as U+FFFD produced by NULL-byte replacement. + * + * @ticket 65372 + * + * @see https://html.spec.whatwg.org/#named-character-reference-state + * + * @dataProvider data_semicolon_less_references_with_followers + * + * @param string $raw_value Raw attribute value. + * @param string $decoded_value The expected decoded attribute value. + */ + public function test_semicolon_less_reference_followers( string $raw_value, string $decoded_value ) { + $this->assertSame( + $decoded_value, + WP_HTML_Decoder::decode_attribute( $raw_value ), + 'Improperly decoded raw attribute value.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_semicolon_less_references_with_followers() { + return array( + 'U+FFFD follower decodes' => array( "x&\u{FFFD};y", "x&\u{FFFD};y" ), + 'Non-ASCII follower decodes' => array( "x&\u{E9}y", "x&\u{E9}y" ), + 'ASCII letter follower is ambiguous' => array( 'x&zy', 'x&zy' ), + 'ASCII digit follower is ambiguous' => array( 'x&1y', 'x&1y' ), + 'Equals sign follower is ambiguous' => array( 'x&=y', 'x&=y' ), + ); + } + /** * Ensures proper detection of attribute prefixes ignoring ASCII case. * diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php new file mode 100644 index 0000000000000..1180fa7110c88 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-input-preprocessing.php @@ -0,0 +1,428 @@ +assertTrue( $processor->next_tag(), 'Should have found the tag.' ); + $this->assertSame( $expected, $processor->get_attribute( 'a' ) ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_attribute_values_with_preprocessing() { + return array( + 'Raw CR' => array( "