Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1347,6 +1347,8 @@ public function serialize(): ?string {
*
* @since 6.7.0
* @since 6.9.0 Converted from protected to public method.
* @since 7.1.0 Contents of IFRAME, NOEMBED, NOFRAMES, and XMP elements are
* serialized literally instead of being dropped or escaped.
*
* @return string Serialization of token, or empty string if no serialization exists.
*/
Expand Down Expand Up @@ -1490,16 +1492,38 @@ public function serialize_token(): string {
$text = $this->get_modifiable_text();

switch ( $tag_name ) {
	/*
	 * The contents of these elements are emitted literally to preserve
	 * the document's contents, following the HTML serialization spec:
	 *
	 * > If the parent of current node is a style, script, xmp, iframe,
	 * > noembed, noframes, or plaintext element, or if the parent of
	 * > current node is a noscript element and scripting is enabled for
	 * > the node, then append the value of current node's data literally.
	 *
	 * This is safe because character references are never decoded in
	 * their contents. RAWTEXT contents (IFRAME, NOEMBED, NOFRAMES,
	 * STYLE, XMP) cannot contain their own closing tag, so the closer
	 * appended below cannot be matched early. SCRIPT data may contain
	 * escaped closers (e.g. within `<!-- -->`), but re-parsing the
	 * identical bytes follows the same tokenization rules that produced
	 * this text, terminating at the appended closer all the same.
	 *
	 * IFRAME, NOEMBED, and NOFRAMES must share this branch: resetting
	 * their text to an empty string would drop document contents and
	 * change the document across a serialize and re-parse cycle.
	 *
	 * @see https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments
	 */
	case 'IFRAME':
	case 'NOEMBED':
	case 'NOFRAMES':
	case 'SCRIPT':
	case 'STYLE':
	case 'XMP':
		// Leave $text untouched so that it is emitted byte-for-byte.
		break;

	/*
	 * The contents of TEXTAREA and TITLE are parsed as RCDATA, in which
	 * character references are decoded, so the decoded modifiable text
	 * must be re-escaped to preserve the document's contents.
	 */
	default:
		$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
}
Expand Down
116 changes: 116 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,118 @@ public function test_style_contents_are_not_escaped() {
);
}

/**
 * Verifies that normalization leaves XMP contents unescaped.
 *
 * XMP contents are parsed as raw text, so character references inside them
 * are never decoded. If the serializer escaped the contents, a "<" would
 * come back as the literal text "&lt;" after serializing and re-parsing,
 * which changes the document. The only expected difference between input
 * and output is the replacement of NULL bytes with U+FFFD.
 *
 * @ticket 65372
 */
public function test_xmp_contents_are_not_escaped() {
	$expected = "<xmp>1 < 2 &amp; apples > or\u{FFFD}anges</xmp>";
	$input    = "<xmp>1 < 2 &amp; apples > or\x00anges</xmp>";

	$normalized = WP_HTML_Processor::normalize( $input );

	$this->assertSame(
		$expected,
		$normalized,
		'Should have preserved text inside an XMP element, except for replacing NULL bytes.'
	);
}

/**
 * Verifies that normalizing keeps the contents of IFRAME, NOEMBED, and
 * NOFRAMES elements.
 *
 * The raw text inside these elements belongs to the parsed document, so
 * discarding it would alter the document across a serialize and re-parse
 * cycle. Every dataset is already in normalized form, which means the
 * output of normalization must equal the input exactly.
 *
 * @ticket 65372
 *
 * @dataProvider data_rawtext_elements_with_contents
 *
 * @param string $html Normalized HTML containing a rawtext element with contents.
 */
public function test_rawtext_element_contents_are_preserved_when_normalizing( string $html ) {
	$normalized = WP_HTML_Processor::normalize( $html );

	$this->assertSame( $html, $normalized, 'Should have preserved the rawtext element contents.' );
}

/**
 * Data provider.
 *
 * Each dataset holds a single already-normalized HTML string, keyed by a
 * description of the scenario it covers.
 *
 * @return array[]
 */
public static function data_rawtext_elements_with_contents() {
	$datasets = array();

	// Rawtext element followed by a sibling node.
	$datasets['IFRAME with following text']   = array( '<iframe>x</iframe>y' );
	$datasets['NOEMBED with following text']  = array( '<noembed>x</noembed>y' );
	$datasets['NOFRAMES with following text'] = array( '<section><noframes>x</noframes>y</section>' );
	$datasets['NOFRAMES before comment']      = array( '<section><noframes>x</noframes><!----></section>' );

	// Contents which look like markup or character references.
	$datasets['IFRAME with markup-like contents'] = array( '<iframe><div>inert</div></iframe>' );
	$datasets['NOEMBED with character reference'] = array( '<noembed>&amp;</noembed>' );

	// IFRAME nested within an SVG element.
	$datasets['IFRAME in foreign content'] = array( '<svg><iframe>1 &lt; 2</iframe></svg>' );

	return $datasets;
}

/**
 * Verifies that serializing a full document keeps the contents of IFRAME,
 * NOEMBED, and NOFRAMES elements.
 *
 * The datasets include NOFRAMES elements found in the HEAD and within a
 * FRAMESET in addition to elements found in the BODY.
 *
 * @ticket 65372
 *
 * @dataProvider data_full_documents_with_rawtext_elements
 *
 * @param string $html     Input HTML document.
 * @param string $expected Expected serialization of the full document.
 */
public function test_rawtext_element_contents_are_preserved_in_full_documents( string $html, string $expected ) {
	$serialized = WP_HTML_Processor::create_full_parser( $html )->serialize();

	$this->assertSame( $expected, $serialized, 'Should have preserved the rawtext element contents.' );
}

/**
 * Data provider.
 *
 * Each dataset pairs an input HTML document with the serialization expected
 * from the full parser, keyed by a description of where the rawtext element
 * appears.
 *
 * @return array[]
 */
public static function data_full_documents_with_rawtext_elements() {
	$datasets = array();

	$datasets['IFRAME in BODY'] = array(
		'<iframe>x</iframe>y',
		'<html><head></head><body><iframe>x</iframe>y</body></html>',
	);

	$datasets['NOEMBED in BODY'] = array(
		'a<noembed>x</noembed>',
		'<html><head></head><body>a<noembed>x</noembed></body></html>',
	);

	$datasets['NOFRAMES in BODY'] = array(
		'a<noframes>x</noframes>',
		'<html><head></head><body>a<noframes>x</noframes></body></html>',
	);

	$datasets['NOFRAMES in HEAD'] = array(
		'<head><noframes>x</noframes></head>z',
		'<html><head><noframes>x</noframes></head><body>z</body></html>',
	);

	$datasets['NOFRAMES in FRAMESET'] = array(
		'<html><frameset><noframes>x</noframes>',
		'<html><head></head><frameset><noframes>x</noframes></frameset></html>',
	);

	$datasets['IFRAME before a comment'] = array(
		'<h3><div><small><dd><iframe>x</iframe><!---->',
		'<html><head></head><body><h3><div><small><dd><iframe>x</iframe><!----></dd></small></div></h3></body></html>',
	);

	return $datasets;
}

public function test_unexpected_closing_tags_are_removed() {
$this->assertSame(
WP_HTML_Processor::normalize( 'one</div>two</span>three' ),
Expand Down Expand Up @@ -281,6 +393,10 @@ public static function data_tokens_with_null_bytes() {
'Foreign content text' => array( "<svg>one\x00two</svg>", "<svg>one\u{FFFD}two</svg>" ),
'SCRIPT content' => array( "<script>alert(\x00)</script>", "<script>alert(\u{FFFD})</script>" ),
'STYLE content' => array( "<style>\x00 {}</style>", "<style>\u{FFFD} {}</style>" ),
'IFRAME content' => array( "<iframe>a\x00b</iframe>", "<iframe>a\u{FFFD}b</iframe>" ),
'NOEMBED content' => array( "<noembed>a\x00b</noembed>", "<noembed>a\u{FFFD}b</noembed>" ),
'NOFRAMES content' => array( "<noframes>a\x00b</noframes>", "<noframes>a\u{FFFD}b</noframes>" ),
'XMP content' => array( "<xmp>a\x00b</xmp>", "<xmp>a\u{FFFD}b</xmp>" ),
'Comment text' => array( "<!-- \x00 -->", "<!-- \u{FFFD} -->" ),
);
}
Expand Down
Loading