From b2abcaa6a30a2a94614edea4e8ca98f89c9532c5 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 17:37:46 +0200 Subject: [PATCH 1/2] HTML API: Preserve IFRAME, NOEMBED, and NOFRAMES contents when serializing. The serializer dropped the raw text contents of these elements, removing parsed document content across a serialize/re-parse cycle. Per the HTML fragment serialization algorithm, their contents are emitted literally, as is already done for SCRIPT and STYLE. Raw text cannot contain its own closing tag, so literal emission cannot terminate the element early when re-parsing. See #65372. --- .../html-api/class-wp-html-processor.php | 24 ++++- .../html-api/wpHtmlProcessor-serialize.php | 98 +++++++++++++++++++ 2 files changed, 119 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 35d91fad3129c..5ce892af0024a 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1347,6 +1347,8 @@ public function serialize(): ?string { * * @since 6.7.0 * @since 6.9.0 Converted from protected to public method. + * @since 7.1.0 Contents of IFRAME, NOEMBED, and NOFRAMES elements are + * serialized literally instead of being dropped. * * @return string Serialization of token, or empty string if no serialization exists. */ @@ -1490,12 +1492,28 @@ public function serialize_token(): string { $text = $this->get_modifiable_text(); switch ( $tag_name ) { + /* + * The contents of these elements are emitted literally to preserve + * the document's contents, following the HTML serialization spec: + * + * > If the parent of current node is a style, script, xmp, iframe, + * > noembed, noframes, or plaintext element, or if the parent of + * > current node is a noscript element and scripting is enabled for + * > the node, then append the value of current node's data literally. 
+ * + * This is safe because character references are never decoded in + * their contents. RAWTEXT contents (IFRAME, NOEMBED, NOFRAMES, + * STYLE) cannot contain their own closing tag, so the closer + * appended below cannot be matched early. SCRIPT data may contain + * escaped closers (e.g. within `<!-- <script>`), but re-parsing the + * identical bytes follows the same tokenization rules that produced + * this text, terminating at the appended closer all the same. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments + */ case 'IFRAME': case 'NOEMBED': case 'NOFRAMES': - $text = ''; - break; - case 'SCRIPT': case 'STYLE': break; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index e516addb6c314..7deca5b5da715 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -134,6 +134,101 @@ public function test_style_contents_are_not_escaped() { ); } + /** + * Ensures that the contents of IFRAME, NOEMBED, and NOFRAMES elements are + * preserved when serializing. + * + * These elements contain raw text which is part of the parsed document. + * Dropping it would change the document's contents across a serialize and + * re-parse cycle. + * + * @ticket 65372 + * + * @dataProvider data_rawtext_elements_with_contents + * + * @param string $html Normalized HTML containing a rawtext element with contents. + */ + public function test_rawtext_element_contents_are_preserved_when_normalizing( string $html ) { + $this->assertSame( + $html, + WP_HTML_Processor::normalize( $html ), + 'Should have preserved the rawtext element contents.' + ); + } + + /** + * Data provider. 
+ * + * @return array[] + */ + public static function data_rawtext_elements_with_contents() { + return array( + 'IFRAME with following text' => array( 'y' ), + 'NOEMBED with following text' => array( 'xy' ), + 'NOFRAMES with following text' => array( '
xy
' ), + 'NOFRAMES before comment' => array( '
x
' ), + 'IFRAME with markup-like contents' => array( '' ), + 'NOEMBED with character reference' => array( '&' ), + 'IFRAME in foreign content' => array( '' ), + ); + } + + /** + * Ensures that the contents of IFRAME, NOEMBED, and NOFRAMES elements are + * preserved when serializing full documents, including NOFRAMES elements + * in the HEAD or after a FRAMESET. + * + * @ticket 65372 + * + * @dataProvider data_full_documents_with_rawtext_elements + * + * @param string $html Input HTML document. + * @param string $expected Expected serialization of the full document. + */ + public function test_rawtext_element_contents_are_preserved_in_full_documents( string $html, string $expected ) { + $processor = WP_HTML_Processor::create_full_parser( $html ); + + $this->assertSame( + $expected, + $processor->serialize(), + 'Should have preserved the rawtext element contents.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_full_documents_with_rawtext_elements() { + return array( + 'IFRAME in BODY' => array( + 'y', + 'y', + ), + 'NOEMBED in BODY' => array( + 'ax', + 'ax', + ), + 'NOFRAMES in BODY' => array( + 'ax', + 'ax', + ), + 'NOFRAMES in HEAD' => array( + 'xz', + 'xz', + ), + 'NOFRAMES in FRAMESET' => array( + 'x', + 'x', + ), + 'IFRAME before a comment' => array( + '

', + '

', + ), + ); + } + public function test_unexpected_closing_tags_are_removed() { $this->assertSame( WP_HTML_Processor::normalize( 'one
twothree' ), @@ -281,6 +376,9 @@ public static function data_tokens_with_null_bytes() { 'Foreign content text' => array( "one\x00two", "one\u{FFFD}two" ), 'SCRIPT content' => array( "", "" ), 'STYLE content' => array( "", "" ), + 'IFRAME content' => array( "", "" ), + 'NOEMBED content' => array( "a\x00b", "a\u{FFFD}b" ), + 'NOFRAMES content' => array( "a\x00b", "a\u{FFFD}b" ), 'Comment text' => array( "", "" ), ); } From ae3a8bee4e26ddee4027df048390150d518307a9 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 17:37:46 +0200 Subject: [PATCH 2/2] HTML API: Stop escaping XMP contents when serializing. XMP contents are raw text in which character references are never decoded. Escaping them changed document contents across a serialize/re-parse cycle: `1 < 2` serialized as `1 &lt; 2`, which re-parses as the literal text "1 &lt; 2". XMP contents now serialize literally like the other raw text elements, following the HTML fragment serialization algorithm. See #65372. --- .../html-api/class-wp-html-processor.php | 12 +++++++++--- .../html-api/wpHtmlProcessor-serialize.php | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 5ce892af0024a..947a866551040 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1347,8 +1347,8 @@ public function serialize(): ?string { * * @since 6.7.0 * @since 6.9.0 Converted from protected to public method. - * @since 7.1.0 Contents of IFRAME, NOEMBED, and NOFRAMES elements are - * serialized literally instead of being dropped. + * @since 7.1.0 Contents of IFRAME, NOEMBED, NOFRAMES, and XMP elements are + * serialized literally instead of being dropped or escaped. * * @return string Serialization of token, or empty string if no serialization exists. 
 */ @@ -1503,7 +1503,7 @@ public function serialize_token(): string { * * This is safe because character references are never decoded in * their contents. RAWTEXT contents (IFRAME, NOEMBED, NOFRAMES, - * STYLE) cannot contain their own closing tag, so the closer + * STYLE, XMP) cannot contain their own closing tag, so the closer * appended below cannot be matched early. SCRIPT data may contain * escaped closers (e.g. within `<!-- <script>`), but re-parsing the * identical bytes follows the same tokenization rules that produced @@ -1516,8 +1516,14 @@ public function serialize_token(): string { case 'NOFRAMES': case 'SCRIPT': case 'STYLE': + case 'XMP': break; + /* + * The contents of TEXTAREA and TITLE are parsed as RCDATA, in which + * character references are decoded, so the decoded modifiable text + * must be re-escaped to preserve the document's contents. + */ default: $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); } diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index 7deca5b5da715..a08d5bb0a6fa9 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -134,6 +134,23 @@ public function test_style_contents_are_not_escaped() { ); } + /** + * Ensures that XMP contents are not escaped, as they are not parsed like text nodes are. + * + * XMP contents are parsed as raw text: character references are never decoded. + * Escaping the contents would change the document, e.g. a "<" would be replaced + * by the literal text "&lt;" after serializing and re-parsing. + * + * @ticket 65372 + */ + public function test_xmp_contents_are_not_escaped() { + $this->assertSame( + "1 < 2 &amp; apples > or\u{FFFD}anges", + WP_HTML_Processor::normalize( "1 < 2 &amp; apples > or\x00anges" ), + 'Should have preserved text inside an XMP element, except for replacing NULL bytes.' 
+ ); + } + /** * Ensures that the contents of IFRAME, NOEMBED, and NOFRAMES elements are * preserved when serializing. @@ -379,6 +396,7 @@ public static function data_tokens_with_null_bytes() { 'IFRAME content' => array( "", "" ), 'NOEMBED content' => array( "a\x00b", "a\u{FFFD}b" ), 'NOFRAMES content' => array( "a\x00b", "a\u{FFFD}b" ), + 'XMP content' => array( "a\x00b", "a\u{FFFD}b" ), 'Comment text' => array( "", "" ), ); }