diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 35d91fad3129c..947a866551040 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1347,6 +1347,8 @@ public function serialize(): ?string { * * @since 6.7.0 * @since 6.9.0 Converted from protected to public method. + * @since 7.1.0 Contents of IFRAME, NOEMBED, NOFRAMES, and XMP elements are + * serialized literally instead of being dropped or escaped. * * @return string Serialization of token, or empty string if no serialization exists. */ @@ -1490,16 +1492,38 @@ public function serialize_token(): string { $text = $this->get_modifiable_text(); switch ( $tag_name ) { + /* + * The contents of these elements are emitted literally to preserve + * the document's contents, following the HTML serialization spec: + * + * > If the parent of current node is a style, script, xmp, iframe, + * > noembed, noframes, or plaintext element, or if the parent of + * > current node is a noscript element and scripting is enabled for + * > the node, then append the value of current node's data literally. + * + * This is safe because character references are never decoded in + * their contents. RAWTEXT contents (IFRAME, NOEMBED, NOFRAMES, + * STYLE, XMP) cannot contain their own closing tag, so the closer + * appended below cannot be matched early. SCRIPT data may contain + * escaped closers (e.g. within `<!-- <script> </script> -->`), but re-parsing the + * identical bytes follows the same tokenization rules that produced + * this text, terminating at the appended closer all the same. 
+ * + * @see https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments + */ case 'IFRAME': case 'NOEMBED': case 'NOFRAMES': - $text = ''; - break; - case 'SCRIPT': case 'STYLE': + case 'XMP': break; + /* + * The contents of TEXTAREA and TITLE are parsed as RCDATA, in which + * character references are decoded, so the decoded modifiable text + * must be re-escaped to preserve the document's contents. + */ default: $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); } diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index e516addb6c314..a08d5bb0a6fa9 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -134,6 +134,118 @@ public function test_style_contents_are_not_escaped() { ); } + /** + * Ensures that XMP contents are not escaped, as they are not parsed like text nodes are. + * + * XMP contents are parsed as raw text: character references are never decoded. + * Escaping the contents would change the document, e.g. a "<" would be replaced + * by the literal text "&lt;" after serializing and re-parsing. + * + * @ticket 65372 + */ + public function test_xmp_contents_are_not_escaped() { + $this->assertSame( + "1 < 2 &amp; apples > or\u{FFFD}anges", + WP_HTML_Processor::normalize( "1 < 2 &amp; apples > or\x00anges" ), + 'Should have preserved text inside an XMP element, except for replacing NULL bytes.' + ); + } + + /** + * Ensures that the contents of IFRAME, NOEMBED, and NOFRAMES elements are + * preserved when serializing. + * + * These elements contain raw text which is part of the parsed document. + * Dropping it would change the document's contents across a serialize and + * re-parse cycle. 
+ * + * @ticket 65372 + * + * @dataProvider data_rawtext_elements_with_contents + * + * @param string $html Normalized HTML containing a rawtext element with contents. + */ + public function test_rawtext_element_contents_are_preserved_when_normalizing( string $html ) { + $this->assertSame( + $html, + WP_HTML_Processor::normalize( $html ), + 'Should have preserved the rawtext element contents.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_rawtext_elements_with_contents() { + return array( + 'IFRAME with following text' => array( 'y' ), + 'NOEMBED with following text' => array( 'xy' ), + 'NOFRAMES with following text' => array( '
xy
' ), + 'NOFRAMES before comment' => array( '
x
' ), + 'IFRAME with markup-like contents' => array( '' ), + 'NOEMBED with character reference' => array( '&amp;' ), + 'IFRAME in foreign content' => array( '' ), + ); + } + + /** + * Ensures that the contents of IFRAME, NOEMBED, and NOFRAMES elements are + * preserved when serializing full documents, including NOFRAMES elements + * in the HEAD or after a FRAMESET. + * + * @ticket 65372 + * + * @dataProvider data_full_documents_with_rawtext_elements + * + * @param string $html Input HTML document. + * @param string $expected Expected serialization of the full document. + */ + public function test_rawtext_element_contents_are_preserved_in_full_documents( string $html, string $expected ) { + $processor = WP_HTML_Processor::create_full_parser( $html ); + + $this->assertSame( + $expected, + $processor->serialize(), + 'Should have preserved the rawtext element contents.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_full_documents_with_rawtext_elements() { + return array( + 'IFRAME in BODY' => array( + 'y', + 'y', + ), + 'NOEMBED in BODY' => array( + 'ax', + 'ax', + ), + 'NOFRAMES in BODY' => array( + 'ax', + 'ax', + ), + 'NOFRAMES in HEAD' => array( + 'xz', + 'xz', + ), + 'NOFRAMES in FRAMESET' => array( + 'x', + 'x', + ), + 'IFRAME before a comment' => array( + '

', + '

', + ), + ); + } + public function test_unexpected_closing_tags_are_removed() { $this->assertSame( WP_HTML_Processor::normalize( 'one
twothree' ), @@ -281,6 +393,10 @@ public static function data_tokens_with_null_bytes() { 'Foreign content text' => array( "one\x00two", "one\u{FFFD}two" ), 'SCRIPT content' => array( "", "" ), 'STYLE content' => array( "", "" ), + 'IFRAME content' => array( "", "" ), + 'NOEMBED content' => array( "a\x00b", "a\u{FFFD}b" ), + 'NOFRAMES content' => array( "a\x00b", "a\u{FFFD}b" ), + 'XMP content' => array( "a\x00b", "a\u{FFFD}b" ), 'Comment text' => array( "", "" ), ); }