diff --git a/.gitignore b/.gitignore index 15876fa47fee8..dbec517727bbc 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,7 @@ wp-tests-config.php /packagehash.txt /.gutenberg-hash /artifacts +/tools/html-api-fuzz/oracles/lexbor/build /setup.log /coverage diff --git a/merged-prs-2026-06-11.md b/merged-prs-2026-06-11.md new file mode 100644 index 0000000000000..e86fd913c3179 --- /dev/null +++ b/merged-prs-2026-06-11.md @@ -0,0 +1,29 @@ +# PR merges for html-api-fuzz + +Date: 2026-06-11 + +All included PRs were merged into `html-api-fuzz` with merge commits. Trunk was checked before and after these merges; `origin/trunk` is already an ancestor of this branch, so Git had no trunk merge commit to create. + +## Merged + +- PR #53, `origin/spec-compliant-getters` + - Merge commit: `5a3cbf19df` + - Why: Aligns HTML API input preprocessing and getter behavior with the spec, reducing fuzzer/oracle noise around NULL bytes, carriage returns, and decoded source values. This also gives the rebuilt #42 branch the helper behavior it expects. + +- PR #42, `origin/html-api-fuzz-fiz/decoded-cr` + - Merge commit: `ce2af0eff6` + - Why: Updates the earlier #42 merge to the current PR head. It preserves decoded carriage returns as a numeric character reference (e.g. `&#13;`) during serialization, normalizes NULL bytes through the shared serializer path, and removes the old `get_attribute_for_serialization()` workaround that #53 makes unnecessary. + +- PR #51, `origin/html-api-normalize-restore-missing-text-content` + - Merge commit: `ce08149069` + - Why: Addresses issue #50 and the latest fuzzer `normalize-tree-changed` failures where raw text was dropped from `IFRAME`, `NOEMBED`, `NOFRAMES`, and related rawtext serialization paths. + +- PR #17, `origin/copilot/add-script-data-filter` + - Merge commit: `adbe354c94` + - Why: Addresses issue #16 by adding the classic-script `script_data_{$handle}` JSON data hook. This is separate from the HTML API fuzzer stack, so it was kept in its own merge commit. 
+ - Follow-up: `57d458df91` corrects the new test's script-tag assertion to match WordPress's emitted attribute order. + +## Not merged + +- Trunk: already present; no-op. +- Other open PRs: either already contained in this branch or not tied to the existing issues identified in this pass. diff --git a/src/wp-includes/class-wp-scripts.php b/src/wp-includes/class-wp-scripts.php index 6f633d465bb2c..6eb95febe1c02 100644 --- a/src/wp-includes/class-wp-scripts.php +++ b/src/wp-includes/class-wp-scripts.php @@ -480,7 +480,108 @@ public function do_item( $handle, $group = false ) { $attr['data-wp-fetchpriority'] = $original_fetchpriority; } - $tag = $translations . $before_script; + /** + * Filters data associated with a given script. + * + * Scripts may require data that is required for initialization or is essential + * to have immediately available on page load. These are suitable use cases for + * this data. + * + * The dynamic portion of the hook name, `$handle`, refers to the script handle. + * + * This is best suited to pass essential data that must be available to the script for + * initialization or immediately on page load. It does not replace the REST API or + * fetching data from the client. + * + * Example: + * + * add_filter( + * 'script_data_my-script-handle', + * function ( array $data ): array { + * $data['myConfig'] = array( 'key' => 'value' ); + * return $data; + * } + * ); + * + * If the filter returns no data (an empty array), nothing will be embedded in the page. + * + * The data for a given script, if provided, will be JSON serialized in a script + * tag with an ID of the form `wp-script-data-{$handle}` and type `application/json`. 
+ * + * The data can be read on the client with a pattern like this: + * + * Example: + * + * const dataContainer = document.getElementById( 'wp-script-data-my-script-handle' ); + * let data = {}; + * if ( dataContainer ) { + * try { + * data = JSON.parse( dataContainer.textContent ); + * } catch {} + * } + * // data.myConfig.key === 'value'; + * initMyScriptWithData( data ); + * + * @since 7.1.0 + * + * @param array $data The data associated with the script. + */ + $script_data = apply_filters( "script_data_{$handle}", array() ); + + $script_data_tag = ''; + if ( ! empty( $script_data ) ) { + /* + * This data will be printed as JSON inside a script tag like this: + * <script type="application/json" id="wp-script-data-{$handle}">{json}</script> + * + * A script tag must be closed by a sequence beginning with `</`. It's impossible to + * close a script tag without using `<`. We ensure that `<` is escaped and `/` can + * remain unescaped, so `</script>` will be printed as `\u003C/script>`. + * + * - JSON_HEX_TAG: All < and > are converted to \u003C and \u003E. + * - JSON_UNESCAPED_SLASHES: Don't escape /. + * - JSON_INVALID_UTF8_SUBSTITUTE: Substitute invalid UTF-8 sequences with U+FFFD (�) + * instead of failing. This avoids the overhead of `wp_json_encode()`'s fallback + * re-encoding and ensures consistent handling with the standard replacement character. + * + * If the page will use UTF-8 encoding, it's safe to print unescaped unicode: + * + * - JSON_UNESCAPED_UNICODE: Encode multibyte Unicode characters literally (instead of as `\uXXXX`). + * - JSON_UNESCAPED_LINE_TERMINATORS: The line terminators are kept unescaped when + * JSON_UNESCAPED_UNICODE is supplied. It uses the same behaviour as it was + * before PHP 7.1 without this constant. Available as of PHP 7.1.0. + * + * The JSON specification requires encoding in UTF-8, so if the generated HTML page + * is not encoded in UTF-8 then it's not safe to include those literals. They must + * be escaped to avoid encoding issues. + * + * @see https://www.rfc-editor.org/rfc/rfc8259.html for details on encoding requirements. + * @see https://www.php.net/manual/en/json.constants.php for details on these constants. 
+ * @see https://html.spec.whatwg.org/#script-data-state for details on script tag parsing. + */ + $json_encode_flags = JSON_HEX_TAG | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_LINE_TERMINATORS | JSON_INVALID_UTF8_SUBSTITUTE; + if ( ! is_utf8_charset() ) { + $json_encode_flags = JSON_HEX_TAG | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE; + } + + /* + * Build the data script tag as a string with wp_get_inline_script_tag() rather than + * echoing it: wp_print_inline_script_tag() prints immediately and takes no "return" flag. + * This allows it to be included with the script tag in the concatenated output. + */ + $script_data_tag = wp_get_inline_script_tag( + wp_json_encode( + $script_data, + $json_encode_flags + ), + array( + 'type' => 'application/json', + 'id' => "wp-script-data-{$handle}", + ) + ); + } + + $tag = $translations . $before_script . $script_data_tag; $tag .= wp_get_script_tag( $attr ); $tag .= $after_script; diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index d902f4b7cabc4..9f33056de0c14 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -195,6 +195,8 @@ public static function decode( $context, $text ): string { * 7 === $token_length; // `∉` * * @since 6.6.0 + * @since 7.1.0 Detects ambiguous followers of semicolon-less references + * by ASCII classification only, independent of the locale. * * @global WP_Token_Map $html5_named_character_references Mappings for HTML5 named character references. * @@ -377,14 +379,20 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat * At this point though there's a match for an entry in the named * character reference table but the match doesn't end in `;`. * It may be allowed if it's followed by something unambiguous. + * + * Only an ASCII alphanumeric or U+003D EQUALS SIGN is ambiguous. 
+ * `ctype_alnum()` must be avoided here: its classification of + * bytes 0x80 and above depends on the process locale, but only + * these specific ASCII characters prevent decoding. + * + * @see https://html.spec.whatwg.org/#named-character-reference-state */ + $follower = $after_name < $length ? $text[ $after_name ] : ''; $ambiguous_follower = ( - $after_name < $length && - $name_at < $length && - ( - ctype_alnum( $text[ $after_name ] ) || - '=' === $text[ $after_name ] - ) + ( 'a' <= $follower && 'z' >= $follower ) || + ( 'A' <= $follower && 'Z' >= $follower ) || + ( '0' <= $follower && '9' >= $follower ) || + '=' === $follower ); // It's non-ambiguous, safe to leave it in. diff --git a/src/wp-includes/html-api/class-wp-html-open-elements.php b/src/wp-includes/html-api/class-wp-html-open-elements.php index 0cd1f0fc45e07..8941e4b8b181a 100644 --- a/src/wp-includes/html-api/class-wp-html-open-elements.php +++ b/src/wp-includes/html-api/class-wp-html-open-elements.php @@ -281,6 +281,7 @@ public function has_element_in_specific_scope( string $tag_name, $termination_li * > - th * > - marquee * > - object + * > - select * > - template * > - MathML mi * > - MathML mo @@ -312,6 +313,7 @@ public function has_element_in_scope( string $tag_name ): bool { 'TH', 'MARQUEE', 'OBJECT', + 'SELECT', 'TEMPLATE', 'math MI', @@ -362,6 +364,7 @@ public function has_element_in_list_item_scope( string $tag_name ): bool { 'MARQUEE', 'OBJECT', 'OL', + 'SELECT', 'TEMPLATE', 'UL', @@ -410,6 +413,7 @@ public function has_element_in_button_scope( string $tag_name ): bool { 'TH', 'MARQUEE', 'OBJECT', + 'SELECT', 'TEMPLATE', 'math MI', @@ -459,9 +463,8 @@ public function has_element_in_table_scope( string $tag_name ): bool { /** * Returns whether a particular element is in select scope. * - * This test differs from the others like it, in that its rules are inverted. 
- * Instead of arriving at a match when one of any tag in a termination group - * is reached, this one terminates if any other tag is reached. + * The "select scope" concept was removed from the HTML standard along with the + * customizable ` #errors -33: Stray start tag “menuitem”. +1:34: ERROR: End tag 'select' isn't allowed here. Currently open tags: html, body, select, menuitem. #document | | | | |