diff --git a/src/wp-includes/css-api/class-wp-css-builder.php b/src/wp-includes/css-api/class-wp-css-builder.php new file mode 100644 index 0000000000000..56ae7a5ba009a --- /dev/null +++ b/src/wp-includes/css-api/class-wp-css-builder.php @@ -0,0 +1,241 @@ += 0x80 ) { + $result .= $value[ $i ]; + continue; + } + + // ASCII letters and underscore: always valid in idents. + if ( + ( $byte >= 0x41 && $byte <= 0x5A ) || // A-Z + ( $byte >= 0x61 && $byte <= 0x7A ) || // a-z + 0x5F === $byte // _ + ) { + $result .= $value[ $i ]; + continue; + } + + // Hyphen: valid in idents, but check for hyphen-digit at start. + if ( 0x2D === $byte ) { + // Hyphen at position 0 followed by a digit at position 1: escape the digit. + if ( 0 === $i && $i + 1 < $length && ord( $value[ $i + 1 ] ) >= 0x30 && ord( $value[ $i + 1 ] ) <= 0x39 ) { + $result .= '-'; + ++$i; + $result .= sprintf( '\\%X ', ord( $value[ $i ] ) ); + continue; + } + $result .= '-'; + continue; + } + + // Digits: valid except at position 0. + if ( $byte >= 0x30 && $byte <= 0x39 ) { + if ( 0 === $i ) { + $result .= sprintf( '\\%X ', $byte ); + } else { + $result .= $value[ $i ]; + } + continue; + } + + // Everything else: hex-escape. + $result .= sprintf( '\\%X ', $byte ); + } + + return $result; + } + + /** + * Create a quoted CSS string from a plain PHP string value. + * + * Example: + * $value = 'CSS & a ""; + * + * CSS strings are quoted many characters that are problematic in HTML + * or may be complicated for rudimentary CSS or HTML processors to handle + * are encoded using Unicode escape sequences. + * + * @see https://www.w3.org/TR/css-syntax-3/#escaping + */ + public static function string( string $value ): string { + $value = wp_scrub_utf8( $value ); + $escaped = strtr( + $value, + array( + // Escape existing backslashes to prevent unintentional escapes in result. + '\\' => '\\5C ', + + // Pre-processing replaces NULLs and some newlines. Replace and escape as necessary. 
+ "\0" => "\u{FFFD}", + + // Normalize and replace newlines. https://www.w3.org/TR/css-syntax-3/#input-preprocessing + "\r\n" => '\\A ', + "\r" => '\\A ', + "\f" => '\\A ', + + // Newlines must be escaped in CSS strings. + "\n" => '\\A ', + + // Arbitrary characters for Unicode escaping: + + // HTML syntax may be problematic. + '<' => '\\3C ', + '>' => '\\3E ', + '&' => '\\26 ', + + // CSS syntax may be problematic. + ',' => '\\2C ', + ';' => '\\3B ', + '{' => '\\7B ', + '}' => '\\7D ', + '"' => '\\22 ', + "'" => '\\27 ', + ) + ); + return "\"{$escaped}\""; + } + + public static function normalize_and_escape_css( string $css ): string { + $css = wp_scrub_utf8( $css ); + $processor = WP_CSS_Token_Processor::create( $css ); + if ( null === $processor ) { + return ''; + } + + $normalized_css = ''; + + while ( $processor->next_token() ) { + switch ( $processor->get_token_type() ) { + + // Basic punctuation: + case WP_CSS_Token_Processor::TOKEN_SEMICOLON: $normalized_css .= ';'; break; + case WP_CSS_Token_Processor::TOKEN_COMMA: $normalized_css .= ','; break; + case WP_CSS_Token_Processor::TOKEN_WHITESPACE: $normalized_css .= ' '; break; + case WP_CSS_Token_Processor::TOKEN_COLON: $normalized_css .= ':'; break; + + // Paired punctuation: + case WP_CSS_Token_Processor::TOKEN_LEFT_BRACE: $normalized_css .= '{'; break; + case WP_CSS_Token_Processor::TOKEN_RIGHT_BRACE: $normalized_css .= '}'; break; + case WP_CSS_Token_Processor::TOKEN_LEFT_PAREN: $normalized_css .= '('; break; + case WP_CSS_Token_Processor::TOKEN_RIGHT_PAREN: $normalized_css .= ')'; break; + case WP_CSS_Token_Processor::TOKEN_LEFT_BRACKET: $normalized_css .= '['; break; + case WP_CSS_Token_Processor::TOKEN_RIGHT_BRACKET: $normalized_css .= ']'; break; + + // "@" + ident + case WP_CSS_Token_Processor::TOKEN_AT_KEYWORD: + $normalized_css .= '@' . 
self::ident( $processor->get_token_value() ); + break; + + // ident + "(" + case WP_CSS_Token_Processor::TOKEN_FUNCTION: + $normalized_css .= self::ident( $processor->get_token_value() ) . '('; + break; + + /* + * Hash tokens are not idents but their value can be escaped as such. + * + * ‖→ "#" →─┐ ┌──────────────────────────────┐ ┌─→‖ + * ├─→─┤ a-z A-Z 0-9 _ - or non-ASCII ├─→─┤ + * │ └──────────────────────────────┘ │ + * │ ┌──────────────────────────────┐ │ + * ├─→─┤ escape ├─→─┤ + * │ └──────────────────────────────┘ │ + * └──────────────────←───────────────────┘ + */ + case WP_CSS_Token_Processor::TOKEN_HASH: + $normalized_css .= '#' . self::ident( $processor->get_token_value() ); + break; + + // Value first, then the unit. + case WP_CSS_Token_Processor::TOKEN_DIMENSION: + $normalized_css .= $processor->get_token_value() . $processor->get_token_unit(); + break; + + // The percent sign follows the number ("50%"); assumes get_token_value() excludes the "%" - TODO confirm. + case WP_CSS_Token_Processor::TOKEN_PERCENTAGE: + $normalized_css .= "{$processor->get_token_value()}%"; + break; + + case WP_CSS_Token_Processor::TOKEN_NUMBER: + $normalized_css .= $processor->get_token_value(); + break; + + case WP_CSS_Token_Processor::TOKEN_DELIM: + $normalized_css .= $processor->get_token_value(); + break; + + case WP_CSS_Token_Processor::TOKEN_IDENT: + $normalized_css .= self::ident( $processor->get_token_value() ); + break; + + case WP_CSS_Token_Processor::TOKEN_STRING: + // Re-quote and escape the string value; this method must only build its return value, never print. + $normalized_css .= self::string( $processor->get_token_value() ); + break; + + // Keep or strip comments? + case WP_CSS_Token_Processor::TOKEN_COMMENT: + $normalized_css .= substr( $css, $processor->get_token_start(), $processor->get_token_length() ); + break; + + /** + * A <bad-string-token> is an open string that reaches a newline. 
+ * + * @see https://www.w3.org/TR/css-syntax-3/#consume-string-token + * + * @see https://www.w3.org/TR/css-syntax-3/#preserved-tokens + * > Note: The tokens <}-token>s, <)-token>s, <]-token>, , and are always parse errors, but they are preserved in the token stream by this specification to allow other specs, such as Media Queries, to define more fine-grained error-handling than just dropping an entire declaration or block. + */ + case WP_CSS_Token_Processor::TOKEN_BAD_STRING: + $normalized_css .= substr( $css, $processor->get_token_start(), $processor->get_token_length() ) . "\n"; + break; + + case WP_CSS_Token_Processor::TOKEN_URL: + case WP_CSS_Token_Processor::TOKEN_BAD_URL: + case WP_CSS_Token_Processor::TOKEN_CDC: + case WP_CSS_Token_Processor::TOKEN_CDO: + default: + throw new Error( 'unhandled token type ' . $processor->get_token_type() . ' with value ' . var_export( $processor->get_token_value(), true ) ); + } + } + + return strtr( + $normalized_css, + array( + ' ' => '␠', + "\t" => "␉\t", + "\n" => "␊\n", + ) + ); + } +} diff --git a/src/wp-includes/css-api/class-wp-css-token-processor.php b/src/wp-includes/css-api/class-wp-css-token-processor.php new file mode 100644 index 0000000000000..f61775fdf6228 --- /dev/null +++ b/src/wp-includes/css-api/class-wp-css-token-processor.php @@ -0,0 +1,1845 @@ + Replace any U+000D CARRIAGE RETURN (CR) code points, U+000C FORM FEED (FF) + * > code points, or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE + * > FEED (LF) in input by a single U+000A LINE FEED (LF) code point. + * > Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT + * > CHARACTER (�). + * + * This processor delays normalization as much as possible. That keeps the raw byte + * positions intact for accurate rewrites while still letting consumers ask for a + * normalized token when they need one. + * + * ### No EOF token + * + * The EOF token is a CSS parsing concept, not CSS tokenization concept. 
Therefore, + * this processor does not produce it. + * + * ### UTF-8 handling + * + * Only UTF-8 strings are supported. Invalid sequences are replaced with U+FFFD (�) + * using the maximal subpart approach described in + * https://www.unicode.org/versions/Unicode9.0.0/ch03.pdf, section 3.9 Best Practices + * for Using U+FFFD. + * + * ## Usage + * + * Basic iteration: + * + * $css = 'width: 10px;'; + * $processor = WP_CSS_Token_Processor::create( $css ); + * while ( $processor->next_token() ) { + * echo $processor->get_normalized_token(); + * } + * // Outputs: + * // width: 10px; + * + * Rewriting a URL while keeping the rest of the stylesheet intact: + * + * $css = 'background: url(old.jpg) center / cover;'; + * $processor = WP_CSS_Token_Processor::create( $css ); + * while ( $processor->next_token() ) { + * if ( WP_CSS_Token_Processor::TOKEN_URL === $processor->get_token_type() ) { + * $processor->set_value( 'uploads/new.jpg' ); + * } + * } + * $result = $processor->get_updated_css(); + * // background: url(uploads/new.jpg) center / cover; + * + * Gathering diagnostics with byte offsets: + * + * $css = "color: red;\ncolor: re\nd;"; + * $processor = WP_CSS_Token_Processor::create( $css ); + * $bad_strings = array(); + * while ( $processor->next_token() ) { + * if ( WP_CSS_Token_Processor::TOKEN_BAD_STRING === $processor->get_token_type() ) { + * $bad_strings[] = array( + * 'start' => $processor->get_token_start(), + * 'length' => $processor->get_token_length(), + * 'value' => $processor->get_unnormalized_token(), + * ); + * } + * } + * + * @see https://www.w3.org/TR/css-syntax-3/#tokenization + */ +class WP_CSS_Token_Processor { + /** + * Token type constants matching the CSS Syntax Level 3 specification. 
+ * + * @see https://www.w3.org/TR/css-syntax-3/#tokenization + */ + public const TOKEN_WHITESPACE = 'whitespace-token'; + public const TOKEN_COMMENT = 'comment'; + public const TOKEN_STRING = 'string-token'; + + /** + * BAD-STRING tokens occur when a string contains an unescaped newline. + * + * Valid strings: "hello", 'world', "line1\Aline2" (escaped newline) + * Invalid (produces bad-string): "hello + * world" (literal newline breaks the string) + * + * The processor stops at the newline and produces a bad-string token for error recovery. + * + * @see https://www.w3.org/TR/css-syntax-3/#typedef-bad-string-token + */ + public const TOKEN_BAD_STRING = 'bad-string-token'; + public const TOKEN_HASH = 'hash-token'; + public const HASH_TOKEN_ID = 'id'; + public const HASH_TOKEN_UNRESTRICTED = 'unrestricted'; + public const TOKEN_DELIM = 'delim-token'; + public const TOKEN_NUMBER = 'number-token'; + public const TOKEN_PERCENTAGE = 'percentage-token'; + public const TOKEN_DIMENSION = 'dimension-token'; + public const TOKEN_AT_KEYWORD = 'at-keyword-token'; + public const TOKEN_COLON = 'colon-token'; + public const TOKEN_SEMICOLON = 'semicolon-token'; + public const TOKEN_COMMA = 'comma-token'; + public const TOKEN_LEFT_PAREN = '(-token'; + public const TOKEN_RIGHT_PAREN = ')-token'; + public const TOKEN_LEFT_BRACKET = '[-token'; + public const TOKEN_RIGHT_BRACKET = ']-token'; + public const TOKEN_LEFT_BRACE = '{-token'; + public const TOKEN_RIGHT_BRACE = '}-token'; + public const TOKEN_FUNCTION = 'function-token'; + + /** + * URL tokens represent unquoted URLs in url() notation. + * + * For example, `url(image.jpg)` is a URL token. + * + * Quoted URLs like `url( "https://example.com" )` are handled as a function + * token, _not_ a URL token. + * + * Bad URL tokens are created when invalid characters are encountered in + * a URL token. 
+ * + * @see https://www.w3.org/TR/css-syntax-3/#typedef-url-token + */ + public const TOKEN_URL = 'url-token'; + + /** + * BAD-URL tokens occur when a URL contains invalid characters. + * + * Invalid characters: quotes ("), apostrophes ('), parentheses (() + * Example invalid: url(image(.jpg) or url(image".jpg) + * + * When detected, the processor consumes everything up to ) or EOF. + * This prevents the bad URL from breaking subsequent tokens. + * + * @see https://www.w3.org/TR/css-syntax-3/#typedef-bad-url-token + */ + public const TOKEN_BAD_URL = 'bad-url-token'; + + /** + * Identifier tokens, such as `color`, `margin-top`, `red`, + * `inherit`, `--my-var`, `\x-escaped`, `über` (Unicode), etc. + * + * There are restrictions on the codepoints that start or are contained in + * an identifier, and identifiers may contain escape sequences. + * + * @see https://www.w3.org/TR/css-syntax-3/#typedef-ident-token + */ + public const TOKEN_IDENT = 'ident-token'; + + /** + * CDC (Comment Delimiter Close) token: --> + * + * Legacy token from when CSS was embedded in HTML + * + * Modern CSS no longer needs these, but they're preserved for compatibility. + * In stylesheets, they're typically treated like whitespace. + * + * @see https://www.w3.org/TR/css-syntax-3/#typedef-CDC-token + */ + public const TOKEN_CDC = 'CDC-token'; + + /** + * CDO (Comment Delimiter Open) token: ) + * + * Comment Delimiter Close - legacy HTML comment syntax in CSS. + * + * @see https://www.w3.org/TR/css-syntax-3/#CDC-token-diagram + */ + if ( + $this->at + 2 < $this->length && + '-' === $this->css[ $this->at + 1 ] && + '>' === $this->css[ $this->at + 2 ] + ) { + // Consume them and return a . + $this->at += 3; + $this->token_type = self::TOKEN_CDC; + $this->token_length = 3; + return true; + } + + // Otherwise, if the input stream starts with an ident sequence, + // reconsume the current input code point, consume an ident-like + // token, and return it. 
+ if ( $this->check_if_3_code_points_start_an_ident_sequence( $this->at ) ) { + return $this->consume_ident_like(); + } + + // Otherwise, return a with its value set to the current input code point. + ++$this->at; + $this->token_type = self::TOKEN_DELIM; + $this->token_length = 1; + return true; + } + + /* + * U+003C LESS-THAN SIGN (<) + * If followed by !--, this is a CDO token (\n", + "tokens": [ + { + "type": "CDC-token", + "raw": "-->", + "startIndex": 0, + "endIndex": 3, + "normalized": "-->", + "value": null + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 3, + "endIndex": 4, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident/0001": { + "css": "foo\n", + "tokens": [ + { + "type": "ident-token", + "raw": "foo", + "startIndex": 0, + "endIndex": 3, + "normalized": "foo", + "value": "foo" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 3, + "endIndex": 4, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident/0002": { + "css": "--\n", + "tokens": [ + { + "type": "ident-token", + "raw": "--", + "startIndex": 0, + "endIndex": 2, + "normalized": "--", + "value": "--" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 2, + "endIndex": 3, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident/0003": { + "css": "--0\n", + "tokens": [ + { + "type": "ident-token", + "raw": "--0", + "startIndex": 0, + "endIndex": 3, + "normalized": "--0", + "value": "--0" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 3, + "endIndex": 4, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident/0004": { + "css": "-\\\n", + "tokens": [ + { + "type": "delim-token", + "raw": "-", + "startIndex": 0, + "endIndex": 1, + "normalized": "-", + "value": "-" + }, + { + "type": "delim-token", + "raw": "\\", + "startIndex": 1, + "endIndex": 2, + "normalized": "\\", + "value": "\\" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 2, + "endIndex": 3, + 
"normalized": "\n", + "value": null + } + ] + }, + "tests/ident/0005": { + "css": "-\\ \n", + "tokens": [ + { + "type": "ident-token", + "raw": "-\\ ", + "startIndex": 0, + "endIndex": 3, + "normalized": "- ", + "value": "- " + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 3, + "endIndex": 4, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident/0006": { + "css": "--💅\n", + "tokens": [ + { + "type": "ident-token", + "raw": "--💅", + "startIndex": 0, + "endIndex": 6, + "normalized": "--💅", + "value": "--💅" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 6, + "endIndex": 7, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident/0007": { + "css": "-§\n", + "tokens": [ + { + "type": "ident-token", + "raw": "-§", + "startIndex": 0, + "endIndex": 3, + "normalized": "-§", + "value": "-§" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 3, + "endIndex": 4, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident/0008": { + "css": "-×\n", + "tokens": [ + { + "type": "ident-token", + "raw": "-×", + "startIndex": 0, + "endIndex": 3, + "normalized": "-×", + "value": "-×" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 3, + "endIndex": 4, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident/0009": { + "css": "--a𐀀\n", + "tokens": [ + { + "type": "ident-token", + "raw": "--a𐀀", + "startIndex": 0, + "endIndex": 7, + "normalized": "--a𐀀", + "value": "--a𐀀" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 7, + "endIndex": 8, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident-like/0001": { + "css": "url(foo)\n", + "tokens": [ + { + "type": "url-token", + "raw": "url(foo)", + "startIndex": 0, + "endIndex": 8, + "normalized": "url(foo)", + "value": "foo" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 8, + "endIndex": 9, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident-like/0002": { 
+ "css": "\\75 Rl(foo)\n", + "tokens": [ + { + "type": "url-token", + "raw": "\\75 Rl(foo)", + "startIndex": 0, + "endIndex": 11, + "normalized": "uRl(foo)", + "value": "foo" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 11, + "endIndex": 12, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident-like/0003": { + "css": "uR\\6c (foo)\n", + "tokens": [ + { + "type": "url-token", + "raw": "uR\\6c (foo)", + "startIndex": 0, + "endIndex": 11, + "normalized": "uRl(foo)", + "value": "foo" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 11, + "endIndex": 12, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident-like/0004": { + "css": "url('foo')\n", + "tokens": [ + { + "type": "function-token", + "raw": "url(", + "startIndex": 0, + "endIndex": 4, + "normalized": "url(", + "value": "url" + }, + { + "type": "string-token", + "raw": "'foo'", + "startIndex": 4, + "endIndex": 9, + "normalized": "'foo'", + "value": "foo" + }, + { + "type": ")-token", + "raw": ")", + "startIndex": 9, + "endIndex": 10, + "normalized": ")", + "value": null + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 10, + "endIndex": 11, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident-like/0005": { + "css": "url( 'foo')\n", + "tokens": [ + { + "type": "function-token", + "raw": "url(", + "startIndex": 0, + "endIndex": 4, + "normalized": "url(", + "value": "url" + }, + { + "type": "whitespace-token", + "raw": " ", + "startIndex": 4, + "endIndex": 5, + "normalized": " ", + "value": null + }, + { + "type": "string-token", + "raw": "'foo'", + "startIndex": 5, + "endIndex": 10, + "normalized": "'foo'", + "value": "foo" + }, + { + "type": ")-token", + "raw": ")", + "startIndex": 10, + "endIndex": 11, + "normalized": ")", + "value": null + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 11, + "endIndex": 12, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident-like/0006": { + 
"css": "url( 'foo')\n", + "tokens": [ + { + "type": "function-token", + "raw": "url(", + "startIndex": 0, + "endIndex": 4, + "normalized": "url(", + "value": "url" + }, + { + "type": "whitespace-token", + "raw": " ", + "startIndex": 4, + "endIndex": 6, + "normalized": " ", + "value": null + }, + { + "type": "string-token", + "raw": "'foo'", + "startIndex": 6, + "endIndex": 11, + "normalized": "'foo'", + "value": "foo" + }, + { + "type": ")-token", + "raw": ")", + "startIndex": 11, + "endIndex": 12, + "normalized": ")", + "value": null + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 12, + "endIndex": 13, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident-like/0007": { + "css": "url( 'foo')\n", + "tokens": [ + { + "type": "function-token", + "raw": "url(", + "startIndex": 0, + "endIndex": 4, + "normalized": "url(", + "value": "url" + }, + { + "type": "whitespace-token", + "raw": " ", + "startIndex": 4, + "endIndex": 7, + "normalized": " ", + "value": null + }, + { + "type": "string-token", + "raw": "'foo'", + "startIndex": 7, + "endIndex": 12, + "normalized": "'foo'", + "value": "foo" + }, + { + "type": ")-token", + "raw": ")", + "startIndex": 12, + "endIndex": 13, + "normalized": ")", + "value": null + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 13, + "endIndex": 14, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident-like/0008": { + "css": "not-url( 'foo')\n", + "tokens": [ + { + "type": "function-token", + "raw": "not-url(", + "startIndex": 0, + "endIndex": 8, + "normalized": "not-url(", + "value": "not-url" + }, + { + "type": "whitespace-token", + "raw": " ", + "startIndex": 8, + "endIndex": 11, + "normalized": " ", + "value": null + }, + { + "type": "string-token", + "raw": "'foo'", + "startIndex": 11, + "endIndex": 16, + "normalized": "'foo'", + "value": "foo" + }, + { + "type": ")-token", + "raw": ")", + "startIndex": 16, + "endIndex": 17, + "normalized": ")", + "value": null + }, + { + 
"type": "whitespace-token", + "raw": "\n", + "startIndex": 17, + "endIndex": 18, + "normalized": "\n", + "value": null + } + ] + }, + "tests/ident-like/0009": { + "css": "url( foo)\n", + "tokens": [ + { + "type": "url-token", + "raw": "url( foo)", + "startIndex": 0, + "endIndex": 11, + "normalized": "url( foo)", + "value": "foo" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 11, + "endIndex": 12, + "normalized": "\n", + "value": null + } + ] + }, + "tests/left-curly-bracket/0001": { + "css": "{\n", + "tokens": [ + { + "type": "{-token", + "raw": "{", + "startIndex": 0, + "endIndex": 1, + "normalized": "{", + "value": null + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 1, + "endIndex": 2, + "normalized": "\n", + "value": null + } + ] + }, + "tests/left-parenthesis/0001": { + "css": "(\n", + "tokens": [ + { + "type": "(-token", + "raw": "(", + "startIndex": 0, + "endIndex": 1, + "normalized": "(", + "value": null + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 1, + "endIndex": 2, + "normalized": "\n", + "value": null + } + ] + }, + "tests/left-square-bracket/0001": { + "css": "[\n", + "tokens": [ + { + "type": "[-token", + "raw": "[", + "startIndex": 0, + "endIndex": 1, + "normalized": "[", + "value": null + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 1, + "endIndex": 2, + "normalized": "\n", + "value": null + } + ] + }, + "tests/less-than/0001": { + "css": "<\n", + "tokens": [ + { + "type": "delim-token", + "raw": "<", + "startIndex": 0, + "endIndex": 1, + "normalized": "<", + "value": "<" + }, + { + "type": "whitespace-token", + "raw": "\n", + "startIndex": 1, + "endIndex": 2, + "normalized": "\n", + "value": null + } + ] + }, + "tests/less-than/0002": { + "css": "' ); + $html = ''; + foreach ( $bits as $bit ) { + if ( $this->prng->chance( 35 ) ) { + $html .= $this->prng->choice( $filler ); + } + $html .= $bit; + } + + foreach ( $this->pools as $key => $values ) { + 
$this->pools[ $key ] = array_values( array_unique( $values ) ); + } + + return array( + 'model' => null, + 'children' => $children, + 'html' => $html, + 'context' => '', + 'fragment' => true, + 'quirks' => false, + 'pools' => $this->pools, + ); + } + + /** + * Rows ( TreeCapture shape ) for a ``-context fragment: the + * top-level children flattened with the implicit HTML/BODY ancestors the + * fragment parser reports. + */ + public static function rows_from_fragment( array $children ): array { + $html_root = array( 'tag' => 'html', 'fid' => '(html)', 'attrs' => array(), 'children' => array() ); + $body_root = array( 'tag' => 'body', 'fid' => '(body)', 'attrs' => array(), 'children' => $children ); + + $rows = array(); + foreach ( $children as $child ) { + foreach ( self::flatten_with_ancestors( $child, array( $body_root, $html_root ) ) as $pair ) { + list( $element, $ancestors ) = $pair; + + $attrs = array(); + $seen = array(); + foreach ( $element['attrs'] as $attr ) { + $lower = ascii_strtolower( $attr[0] ); + if ( isset( $seen[ $lower ] ) ) { + continue; + } + $seen[ $lower ] = true; + $attrs[] = array( $lower, $attr[1] ); + } + + $ancestor_tags = array(); + foreach ( $ancestors as $ancestor ) { + $ancestor_tags[] = strtoupper( ascii_strtolower( $ancestor['tag'] ) ); + } + + $rows[] = array( + 'tag' => strtoupper( ascii_strtolower( $element['tag'] ) ), + 'fid' => $element['fid'], + 'attrs' => $attrs, + 'ancestorTags' => $ancestor_tags, + ); + } + } + return $rows; + } + + private function build(): array { + $has_doctype = $this->prng->chance( 85 ); + + $head_children = array(); + if ( $this->prng->chance( 60 ) ) { + $head_children[] = $this->make_element( 'title', array(), array() ); + } + if ( $this->prng->chance( 30 ) ) { + $head_children[] = $this->make_element( 'meta', $this->random_attrs(), array() ); + } + + $body_children = array(); + $child_budget = $this->prng->int( 1, 6 ); + for ( $i = 0; $i < $child_budget && $this->element_count < 
$this->max_elements; $i++ ) { + $body_children[] = $this->random_subtree( 0 ); + } + + $head = $this->make_element( 'head', array(), $head_children ); + $body = $this->make_element( 'body', $this->prng->chance( 30 ) ? $this->random_attrs() : array(), $body_children ); + $html = $this->make_element( 'html', $this->prng->chance( 20 ) ? $this->random_attrs() : array(), array( $head, $body ) ); + + // Emit a doctype whenever one was rolled, so the 'quirks' flag returned below matches the rendered markup. + $rendered = ( $has_doctype ? '<!DOCTYPE html>' : '' ) . $this->render_element( $html ); + + foreach ( $this->pools as $key => $values ) { + $this->pools[ $key ] = array_values( array_unique( $values ) ); + } + + return array( + 'model' => $html, + 'html' => $rendered, + 'quirks' => ! $has_doctype, + 'pools' => $this->pools, + ); + } + + /** + * Build a random element subtree rooted at the given depth. + * + * The element is counted first. A childless element is returned (a tag from + * VOID_TAGS when the inner `chance( 25 )` roll succeeds, otherwise one from + * SAFE_TAGS) once depth reaches 7, the element budget is exhausted, or the + * outer `chance( 25 )` roll succeeds. Otherwise 1 to 4 children are generated + * recursively for as long as the element budget allows. + * + * @param int $depth Nesting depth of the element being created; body children start at 0. + * @return array Element model as produced by make_element(). + */ + private function random_subtree( int $depth ): array { + ++$this->element_count; + + if ( $depth >= 7 || $this->element_count >= $this->max_elements || $this->prng->chance( 25 ) ) { + // Leaf. + if ( $this->prng->chance( 25 ) ) { + return $this->make_element( $this->prng->choice( self::VOID_TAGS ), $this->random_attrs(), array(), true ); + } + return $this->make_element( $this->prng->choice( self::SAFE_TAGS ), $this->random_attrs(), array() ); + } + + $children = array(); + $child_count = $this->prng->int( 1, 4 ); + for ( $i = 0; $i < $child_count && $this->element_count < $this->max_elements; $i++ ) { + $children[] = $this->random_subtree( $depth + 1 ); + } + + return $this->make_element( $this->prng->choice( self::SAFE_TAGS ), $this->random_attrs(), $children ); + } + + private function make_element( string $tag, array $attrs, array $children, bool $is_void = false ): array { + $fid = 'e' . $this->fid_counter++; + + $written_tag = $this->prng->chance( 15 ) ? 
$this->random_case( $tag ) : $tag; + + $this->pools['tags'][] = $tag; + + return array( + 'tag' => $written_tag, + 'fid' => $fid, + 'attrs' => $attrs, + 'children' => $children, + 'void' => $is_void || in_array( strtolower( $tag ), array( 'meta', 'br', 'hr', 'img', 'wbr', 'input', 'embed' ), true ), + ); + } + + /** @return array name/value pairs in source order. */ + private function random_attrs(): array { + $attrs = array(); + $count = $this->prng->weighted( + array( + 0 => 15, + 1 => 30, + 2 => 30, + 3 => 15, + 4 => 10, + ) + ); + + $used_names = array(); + for ( $i = 0; $i < $count; $i++ ) { + $name = $this->prng->choice( self::ATTR_NAMES ); + + // Occasionally repeat an attribute name: the processor keeps the first. + $is_duplicate = isset( $used_names[ ascii_strtolower( $name ) ] ); + if ( $is_duplicate && ! $this->prng->chance( 20 ) ) { + continue; + } + $used_names[ ascii_strtolower( $name ) ] = true; + + if ( $this->prng->chance( 12 ) ) { + $name = $this->random_case( $name ); + } + + $lower = ascii_strtolower( $name ); + if ( 'class' === $lower ) { + $value = $this->random_class_value(); + } elseif ( 'id' === $lower ) { + $value = $this->prng->chance( 85 ) ? $this->random_id_value() : ( $this->prng->chance( 50 ) ? '' : true ); + } elseif ( in_array( $lower, array( 'disabled', 'hidden' ), true ) ) { + $value = $this->prng->chance( 70 ) ? true : $this->prng->choice( array( '', 'disabled', 'true' ) ); + } else { + $value = $this->prng->chance( 12 ) ? 
true : $this->random_attr_value(); + } + + $this->pools['attrNames'][] = ascii_strtolower( $name ); + if ( is_string( $value ) && 'class' !== $lower ) { + $this->pools['attrValues'][] = $value; + } + + $attrs[] = array( $name, $value ); + } + + return $attrs; + } + + private function random_class_value(): string { + $count = $this->prng->int( 1, 4 ); + $classes = array(); + for ( $i = 0; $i < $count; $i++ ) { + $class = $this->random_word( true ); + $raw_class = $this->maybe_inject_class_nul( $class ); + $classes[] = $raw_class; + foreach ( self::class_tokens( $raw_class ) as $token ) { + $this->pools['classes'][] = $token; + } + } + + $ws = array( ' ', ' ', ' ', "\t", "\n", "\f", ' ' ); + $value = $this->prng->chance( 20 ) ? $this->prng->choice( $ws ) : ''; + foreach ( $classes as $i => $class ) { + if ( $i > 0 ) { + $value .= $this->prng->choice( $ws ); + } + $value .= $class; + } + if ( $this->prng->chance( 20 ) ) { + $value .= $this->prng->choice( $ws ); + } + return $value; + } + + private function maybe_inject_class_nul( string $class ): string { + if ( '' === $class || ! $this->prng->chance( 12 ) ) { + return $class; + } + + $points = utf8_codepoints( $class ); + $at = $this->prng->int( 0, count( $points ) ); + $out = ''; + foreach ( $points as $i => $point ) { + if ( $i === $at ) { + $out .= "\0"; + } + $out .= $point[0]; + } + return $at === count( $points ) ? $out . 
"\0" : $out; + } + + private function random_id_value(): string { + $id = $this->random_word( true ); + $this->pools['ids'][] = $id; + return $id; + } + + private function random_attr_value(): string { + $kind = $this->prng->weighted( + array( + 'word' => 35, + 'words' => 20, + 'hyphenated' => 15, + 'empty' => 8, + 'spicy' => 12, + 'unicode' => 10, + ) + ); + + switch ( $kind ) { + case 'word': + return $this->random_word( true ); + case 'words': + $parts = array(); + $n = $this->prng->int( 2, 4 ); + for ( $i = 0; $i < $n; $i++ ) { + $parts[] = $this->random_word( true ); + } + return implode( $this->prng->choice( array( ' ', ' ', "\t", "\n" ) ), $parts ); + case 'hyphenated': + return $this->random_word( false ) . '-' . $this->random_word( false ); + case 'empty': + return ''; + case 'spicy': + $spice = array( 'a"b', "a'b", 'a&b', 'ab', 'a=b', 'a b c', '&', '"x', '100%', 'semi;colon', 'a,b' ); + return $this->prng->choice( $spice ); + case 'unicode': + $unicode = array( 'héllo', 'ÄÖÜ', '✓done', 'naïve', 'Ωmega', '\u{1F600}smile' ); + $value = $this->prng->choice( $unicode ); + return str_replace( '\u{1F600}', "\u{1F600}", $value ); + } + return 'fallback'; + } + + private function random_word( bool $allow_mixed_case ): string { + $stems = array( 'alpha', 'beta', 'gamma', 'delta', 'box', 'col', 'item', 'note', 'wide', 'main-item', 'x', 'a', '-lead', '--var', '_under', 'Über', 'mixedCase' ); + $word = $this->prng->choice( $stems ); + if ( $this->prng->chance( 30 ) ) { + $word .= (string) $this->prng->int( 0, 99 ); + } + if ( $allow_mixed_case && $this->prng->chance( 15 ) ) { + $word = $this->random_case( $word ); + } + return $word; + } + + private function random_case( string $input ): string { + $out = ''; + for ( $i = 0; $i < strlen( $input ); $i++ ) { + $c = $input[ $i ]; + $out .= $this->prng->chance( 50 ) ? 
strtoupper( $c ) : strtolower( $c ); + } + return $out; + } + + /* + * --------- + * Rendering + * --------- + */ + + /** + * Serialize an element model and its subtree as HTML source. + * + * Every start tag carries a `data-fid` attribute holding the element's fid, + * followed by its own attributes in source order. Void elements end after the + * start tag (sometimes written with a trailing ` />`). Other elements render + * their children, optionally separated by filler text or comments. + * + * @param array $element Element model with 'tag', 'fid', 'attrs', 'children' and 'void' keys. + * @return string HTML source for the element. + */ + private function render_element( array $element ): string { + $out = '<' . $element['tag']; + + $rendered_attrs = array( ' data-fid="' . $element['fid'] . '"' ); + foreach ( $element['attrs'] as $attr ) { + $rendered_attrs[] = ' ' . $this->render_attr( $attr[0], $attr[1] ); + } + $out .= implode( '', $rendered_attrs ); + + if ( $element['void'] ) { + $out .= $this->prng->chance( 25 ) ? ' />' : '>'; + return $out; + } + + $out .= '>'; + + $child_bits = array(); + foreach ( $element['children'] as $child ) { + $child_bits[] = $this->render_element( $child ); + } + + /* + * Sprinkle text and comments between children — but never directly + * inside `html` or `head`, where character tokens would trigger + * insertion-mode changes (early body creation, head popping) that + * desynchronize the model from the parsed tree. + * + * Filler must never contain a raw `<` that opens a tag: it would add an + * element to the parsed tree that the model does not know about. + */ + $lower_tag = strtolower( $element['tag'] ); + $may_have_filler = ! in_array( $lower_tag, array( 'html', 'head' ), true ); + $filler_options = array( + '', + 'text', + ' more text ', + "\n ", + '&amp; &lt;escaped&gt;', + // NOTE(review): comment filler promised by the comment above; confirm the exact literal. + '<!-- comment -->', + 'café ✓', + ); + $content = ''; + foreach ( $child_bits as $bit ) { + if ( $may_have_filler && $this->prng->chance( 40 ) ) { + $content .= $this->prng->choice( $filler_options ); + } + $content .= $bit; + } + if ( $may_have_filler && $this->prng->chance( 40 ) ) { + $content .= $this->prng->choice( $filler_options ); + } + if ( 'title' === $lower_tag ) { + // RCDATA: keep it plain. + $content = $this->prng->chance( 60 ) ? 'Fuzz Title' : ''; + } + + return $out . $content . 
''; + } + + /** @param string|true $value */ + private function render_attr( string $name, $value ): string { + if ( true === $value ) { + return $name; + } + + $style = $this->prng->weighted( + array( + 'double' => 60, + 'single' => 20, + 'unquoted' => 20, + ) + ); + + if ( 'unquoted' === $style && ( '' === $value || strlen( $value ) !== strspn( $value, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._:-' ) ) ) { + $style = 'double'; + } + + switch ( $style ) { + case 'unquoted': + return $name . '=' . $value; + case 'single': + return $name . "='" . str_replace( array( '&', "'", '<' ), array( '&', ''', '<' ), $value ) . "'"; + default: + return $name . '="' . str_replace( array( '&', '"', '<' ), array( '&', '"', '<' ), $value ) . '"'; + } + } + + /* + * ---------------- + * Model utilities + * ---------------- + */ + + /** Pre-order (document order) list of elements. */ + public static function flatten( array $element ): array { + $out = array( $element ); + foreach ( $element['children'] as $child ) { + foreach ( self::flatten( $child ) as $descendant ) { + $out[] = $descendant; + } + } + return $out; + } + + /** + * Pre-order list of ( element, ancestors ) pairs where ancestors is the + * chain from nearest ancestor to root — the same orientation as + * WP_HTML_Processor::get_breadcrumbs() reversed past self. + */ + public static function flatten_with_ancestors( array $element, array $ancestors = array() ): array { + $out = array( array( $element, $ancestors ) ); + $next_ancestors = array_merge( array( $element ), $ancestors ); + foreach ( $element['children'] as $child ) { + foreach ( self::flatten_with_ancestors( $child, $next_ancestors ) as $pair ) { + $out[] = $pair; + } + } + return $out; + } + + /** + * Flat element rows ( the TreeCapture row shape ) derived from a model: + * pre-order, tags uppercased, attribute names lowercased with the first + * of duplicates winning — directly comparable to a TreeCapture of the + * rendered document. 
*/
	public static function rows_from_model( array $model ): array {
		$rows = array();

		foreach ( self::flatten_with_ancestors( $model ) as $pair ) {
			list( $element, $ancestors ) = $pair;

			// Attribute names are lowercased; a repeated name keeps only its first value.
			$attrs = array();
			$seen  = array();
			foreach ( $element['attrs'] as $attr ) {
				$lower = ascii_strtolower( $attr[0] );
				if ( isset( $seen[ $lower ] ) ) {
					continue;
				}
				$seen[ $lower ] = true;
				$attrs[]        = array( $lower, $attr[1] );
			}

			/*
			 * Tag names are uppercased with the ASCII-only helper rather than
			 * strtoupper(): before PHP 8.2 strtoupper() follows the current
			 * locale and may rewrite bytes >= 0x80, which would corrupt a
			 * UTF-8 tag name and break comparison against captured rows.
			 */
			$ancestor_tags = array();
			foreach ( $ancestors as $ancestor ) {
				$ancestor_tags[] = ascii_strtoupper( $ancestor['tag'] );
			}

			$rows[] = array(
				'tag'          => ascii_strtoupper( $element['tag'] ),
				'fid'          => $element['fid'],
				'attrs'        => $attrs,
				'ancestorTags' => $ancestor_tags,
			);
		}

		return $rows;
	}

	/**
	 * First attribute value for a name, ASCII case-insensitive; null if absent.
	 *
	 * @param array  $element Element whose 'attrs' entry is a list of ( name, value ) pairs.
	 * @param string $name    Attribute name to look up; compared after ASCII lowercasing.
	 * @return string|true|null Stored value of the first matching pair, or null when no pair matches.
	 */
	public static function get_attribute_value( array $element, string $name ) {
		$comparable = ascii_strtolower( $name );
		foreach ( $element['attrs'] as $attr ) {
			if ( ascii_strtolower( $attr[0] ) === $comparable ) {
				return $attr[1];
			}
		}
		return null;
	}

	/**
	 * Class tokens as seen by selector matching: ASCII whitespace separates
	 * tokens, and NUL inside a token is exposed as U+FFFD by class_list().
*
	 * @return string[]
	 */
	public static function class_tokens( string $class_value ): array {
		$separators = " \t\r\n\f";
		$end        = strlen( $class_value );
		$found      = array();
		$cursor     = 0;

		while ( $cursor < $end ) {
			// Step over the whitespace run that precedes the next token, if any.
			$cursor += strspn( $class_value, $separators, $cursor );

			if ( $cursor < $end ) {
				// Everything up to the next separator is one token; NUL bytes surface as U+FFFD.
				$span    = strcspn( $class_value, $separators, $cursor );
				$found[] = str_replace( "\0", "\u{FFFD}", substr( $class_value, $cursor, $span ) );
				$cursor += $span;
			}
		}

		return $found;
	}
}
diff --git a/tools/css-selector-fuzz/lib/LexborOracle.php b/tools/css-selector-fuzz/lib/LexborOracle.php
new file mode 100644
index 0000000000000..afb508ddc1278
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/LexborOracle.php
@@ -0,0 +1,215 @@
 fuzzer-oracle problem ( investigate
 * the fuzzer, 'lexbor-divergence' ).
 * reference == lexbor != WP => high-confidence WP finding ( the
 * regular match-mismatch-html failure
 * with no accompanying divergence ).
 *
 * Known bug compensated for: lexbor #368 — class and #id selectors match
 * ASCII case-insensitively even in no-quirks mode ( attribute selectors
 * like [id=x] are correctly case-sensitive ). Detected by probe at startup;
 * when present, lexbor is compared against the reference matcher run with
 * quirks-style class/ID folding. Quirks documents are compared only when
 * the probe also confirms class and #id selectors fold in quirks mode.
 */
class LexborOracle {

	const READ_TIMEOUT_SECONDS = 5;

	/** @var resource|null */
	private static $process = null;
	/** @var array|null */
	private static $pipes = null;
	/** @var bool|null */
	private static $available = null;
	/** @var bool */
	private static $issue368 = false;
	/** @var bool */
	private static $quirks_class_id_reliable = false;

	/** Path of the harness executable: `lexbor/harness` under the parent of this file's directory. */
	public static function harness_path(): string {
		$tool_root = dirname( __DIR__ );
		return $tool_root . '/lexbor/harness';
	}

	/** Whether the harness is built, starts, and answered the probes.
*/ + public static function available(): bool { + if ( null !== self::$available ) { + return self::$available; + } + + self::$available = false; + if ( ! is_executable( self::harness_path() ) || ! self::start() ) { + return false; + } + + // Probe: sanity plus class/#id case-sensitivity behavior. + $sane = self::query( '
', 'div.a' ); + if ( null === $sane || array( 'x' ) !== $sane['matches'] ) { + self::stop(); + return false; + } + + $no_quirks_class = self::query( '
', '.A' ); + $no_quirks_id = self::query( '
', '#A' ); + $quirks_class = self::query( '
', '.A' ); + $quirks_id = self::query( '
', '#A' ); + foreach ( array( $no_quirks_class, $no_quirks_id, $quirks_class, $quirks_id ) as $probe ) { + if ( null === $probe || null !== $probe['error'] ) { + self::stop(); + return false; + } + } + + self::$issue368 = array( 'x' ) === $no_quirks_class['matches'] + || array( 'x' ) === $no_quirks_id['matches']; + self::$quirks_class_id_reliable = ! self::$issue368 + && array() === $no_quirks_class['matches'] + && array() === $no_quirks_id['matches'] + && array( 'x' ) === $quirks_class['matches'] + && array( 'x' ) === $quirks_id['matches']; + self::$available = true; + return true; + } + + /** Whether the built lexbor exhibits issue #368 ( class/ID case folding ). */ + public static function has_issue_368(): bool { + return self::$issue368; + } + + /** Whether lexbor can be trusted on quirks class/#id case folding. */ + public static function quirks_class_id_reliable(): bool { + return self::$quirks_class_id_reliable; + } + + /** + * Runs one case through lexbor. + * + * @return array{ + * rows: array, + * matches: string[], + * error: string|null, + * }|null Null when the harness is unavailable or misbehaved ( the + * harness is stopped; the caller should skip the differential ). + */ + public static function query( string $html, string $selector ): ?array { + if ( null === self::$process && ! self::start() ) { + return null; + } + + $line = base64_encode( $html ) . "\t" . base64_encode( $selector ) . "\n"; + $written = fwrite( self::$pipes[0], $line ); + fflush( self::$pipes[0] ); + if ( strlen( $line ) !== $written ) { + self::stop(); + self::$available = false; + return null; + } + + $rows = array(); + $matches = array(); + $error = null; + + while ( true ) { + $response = self::read_line(); + if ( null === $response ) { + self::stop(); + self::$available = false; + return null; + } + if ( 'D' === $response ) { + break; + } + + $parts = explode( "\t", $response ); + switch ( $parts[0] ) { + case 'R': + $rows[] = array( + 'tag' => $parts[1] ?? 
'', + 'fid' => $parts[2] ?? '', + 'ancestorTags' => '' === ( $parts[3] ?? '' ) ? array() : explode( ',', $parts[3] ), + ); + break; + case 'M': + $matches[] = $parts[1] ?? ''; + break; + case 'X': + $error = $parts[1] ?? 'unknown'; + break; + } + } + + return array( + 'rows' => $rows, + 'matches' => $matches, + 'error' => $error, + ); + } + + private static function start(): bool { + $descriptors = array( + 0 => array( 'pipe', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'file', '/dev/null', 'w' ), + ); + + $process = proc_open( array( self::harness_path() ), $descriptors, $pipes ); + if ( ! is_resource( $process ) ) { + return false; + } + + self::$process = $process; + self::$pipes = $pipes; + stream_set_blocking( $pipes[1], false ); + return true; + } + + private static function stop(): void { + if ( null === self::$process ) { + return; + } + @fclose( self::$pipes[0] ); + @fclose( self::$pipes[1] ); + @proc_terminate( self::$process, 9 ); + @proc_close( self::$process ); + self::$process = null; + self::$pipes = null; + } + + /** Reads one newline-terminated line with a timeout; null on failure. 
*/
	private static function read_line(): ?string {
		$stdout   = self::$pipes[1];
		$buffer   = '';
		$deadline = microtime( true ) + self::READ_TIMEOUT_SECONDS;

		do {
			// The deadline covers the whole line, not each individual read.
			$remaining = $deadline - microtime( true );
			if ( $remaining <= 0 ) {
				return null;
			}

			// stream_select() modifies the arrays it is given, so rebuild them on every pass.
			$readable = array( $stdout );
			$writable = null;
			$errored  = null;
			$ready    = stream_select( $readable, $writable, $errored, 0, (int) ( $remaining * 1e6 ) );
			if ( false === $ready || 0 === $ready ) {
				return null;
			}

			$chunk = fgets( $stdout );
			if ( false === $chunk ) {
				return null;
			}
			$buffer .= $chunk;
		} while ( ! str_ends_with( $buffer, "\n" ) );

		// Drop the terminating newline.
		return substr( $buffer, 0, -1 );
	}
}
diff --git a/tools/css-selector-fuzz/lib/Metamorph.php b/tools/css-selector-fuzz/lib/Metamorph.php
new file mode 100644
index 0000000000000..159d8e8bedd52
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/Metamorph.php
@@ -0,0 +1,150 @@

	 */
	public static function variants( array $list_ast, Prng $prng ): array {
		/*
		 * from_selectors() scrubs invalid UTF-8 to U+FFFD before parsing, so
		 * parsed AST names are always valid UTF-8 and this guard should be
		 * unreachable. It stays as defense in depth: the renderer can only
		 * round-trip valid UTF-8 names, and a future AST source that skips
		 * normalization would otherwise corrupt the variants silently.
		 */
		if ( ! ast_strings_are_utf8( $list_ast ) ) {
			return array();
		}

		// The same AST re-rendered with boosted escaping is always emitted.
		$variants = array(
			array(
				'name'         => 'rerender',
				'selector'     => SelectorGenerator::render( $prng->fork( 'rerender' ), $list_ast, true ),
				'ast'          => $list_ast,
				'astMustMatch' => true,
			),
		);

		// Randomly flip the case of every byte of each type name; `*` is left alone.
		$typecase = self::map_types(
			$list_ast,
			static function ( string $type ) use ( $prng ): string {
				if ( '*' === $type ) {
					return $type;
				}
				$flipped = '';
				$length  = strlen( $type );
				for ( $i = 0; $i < $length; $i++ ) {
					$byte     = $type[ $i ];
					$flipped .= $prng->chance( 50 ) ? strtoupper( $byte ) : strtolower( $byte );
				}
				return $flipped;
			}
		);

		/*
		 * Transformations that may leave the AST untouched; a variant is only
		 * emitted when the AST actually changed. Only the typecase mapping
		 * above draws from $prng, so the forks below happen in the same order
		 * as when each candidate was built and rendered one at a time.
		 */
		$candidates = array(
			'typecase'     => $typecase,
			'subs-reorder' => self::rotate_subs( $list_ast ),
			'universal'    => self::explicit_universal( $list_ast ),
		);
		foreach ( $candidates as $name => $ast ) {
			if ( $ast === $list_ast ) {
				continue;
			}
			$variants[] = array(
				'name'         => $name,
				'selector'     => SelectorGenerator::render( $prng->fork( $name ), $ast ),
				'ast'          => $ast,
				'astMustMatch' => true,
			);
		}

		/*
		 * Duplicate one randomly chosen branch. An empty list has no branch to
		 * copy: indexing it would raise an undefined-offset warning and append
		 * a null branch, so the variant is skipped in that case.
		 */
		if ( array() !== $list_ast ) {
			$duplicated   = $list_ast;
			$duplicated[] = $list_ast[ $prng->int( 0, count( $list_ast ) - 1 ) ];
			$variants[]   = array(
				'name'         => 'dup-branch',
				'selector'     => SelectorGenerator::render( $prng->fork( 'dup-branch' ), $duplicated ),
				'ast'          => $duplicated,
				'astMustMatch' => true,
			);
		}

		return $variants;
	}

	/** Applies $fn to every type-selector name: compound types and context types. */
	private static function map_types( array $list_ast, callable $fn ): array {
		// $fn may be stateful, so call it in a fixed order: context pairs first, then the compound type.
		foreach ( $list_ast as $branch => $complex ) {
			foreach ( $complex['context'] as $position => $pair ) {
				$list_ast[ $branch ]['context'][ $position ][0] = $fn( $pair[0] );
			}
			if ( null !== $complex['self']['type'] ) {
				$list_ast[ $branch ]['self']['type'] = $fn( $complex['self']['type'] );
			}
		}
		return $list_ast;
	}

	/** Rotates the subclass list of every compound that has two or more.
*/
	private static function rotate_subs( array $list_ast ): array {
		foreach ( $list_ast as $branch => $complex ) {
			$subs = $complex['self']['subs'];
			if ( ! is_array( $subs ) || count( $subs ) < 2 ) {
				continue;
			}

			// Move the first subclass selector to the end.
			$first  = array_shift( $subs );
			$subs[] = $first;

			$list_ast[ $branch ]['self']['subs'] = $subs;
		}
		return $list_ast;
	}

	/** Writes an explicit `*` wherever a compound omitted its type selector. */
	private static function explicit_universal( array $list_ast ): array {
		foreach ( $list_ast as $branch => $complex ) {
			$compound = $complex['self'];
			if ( null === $compound['type'] && null !== $compound['subs'] ) {
				$list_ast[ $branch ]['self']['type'] = '*';
			}
		}
		return $list_ast;
	}
}
diff --git a/tools/css-selector-fuzz/lib/Prng.php b/tools/css-selector-fuzz/lib/Prng.php
new file mode 100644
index 0000000000000..b8d8737304277
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/Prng.php
@@ -0,0 +1,65 @@
key = $seed . "\x1f" . $label;
	}

	/** Derives an independent child stream; consuming it does not affect this stream. */
	public function fork( string $label ): Prng {
		// Drawing the suffix advances this stream once, at fork time.
		$child_label = $label . ':' . $this->uint32();
		return new Prng( $this->key, $child_label );
	}

	/** Returns the next $length bytes of the stream, hashing further counter blocks as needed. */
	public function bytes( int $length ): string {
		while ( strlen( $this->buffer ) < $length ) {
			$this->buffer .= hash( 'sha256', $this->key . ':' . $this->counter, true );
			++$this->counter;
		}

		$taken        = substr( $this->buffer, 0, $length );
		$this->buffer = substr( $this->buffer, $length );
		return $taken;
	}

	/** Next four stream bytes read as a big-endian unsigned integer. */
	public function uint32(): int {
		$unpacked = unpack( 'Nvalue', $this->bytes( 4 ) );
		return (int) $unpacked['value'];
	}

	/** Integer in [ $min, $max ]; returns $min without drawing when the range is empty or a single value. */
	public function int( int $min, int $max ): int {
		if ( $max <= $min ) {
			return $min;
		}
		$span = $max - $min + 1;
		return $min + ( $this->uint32() % $span );
	}

	/** True when a draw from 1..$denominator is at most $numerator. */
	public function chance( int $numerator, int $denominator = 100 ): bool {
		$roll = $this->int( 1, $denominator );
		return $roll <= $numerator;
	}

	/** Picks one element of a zero-indexed list. */
	public function choice( array $values ) {
		$index = $this->int( 0, count( $values ) - 1 );
		return $values[ $index ];
	}

	/** @param array $weights value => weight */
	public function weighted( array $weights ) {
		$total     = array_sum( $weights );
		$remaining = $this->int( 1, max( 1, (int) $total ) );

		// Walk the weights in order until the drawn amount is used up.
		foreach ( $weights as $value => $weight ) {
			$remaining -= $weight;
			if ( $remaining <= 0 ) {
				return $value;
			}
		}

		return array_key_first( $weights );
	}
}
diff --git a/tools/css-selector-fuzz/lib/ReferenceMatcher.php b/tools/css-selector-fuzz/lib/ReferenceMatcher.php
new file mode 100644
index 0000000000000..ea422078e9893
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/ReferenceMatcher.php
@@ -0,0 +1,323 @@
 true,
		'accept-charset' => true,
		'align'          => true,
		'alink'          => true,
		'axis'           => true,
		'bgcolor'        => true,
		'charset'        => true,
		'checked'        => true,
		'clear'          => true,
		'codetype'       => true,
		'color'          => true,
		'compact'        => true,
		'declare'        => true,
		'defer'          => true,
		'dir'            => true,
		'direction'      => true,
		'disabled'       => true,
		'enctype'        => true,
		'face'           => true,
		'frame'          => true,
		'hreflang'       => true,
		'http-equiv'     => true,
		'lang'           => true,
		'language'       => true,
		'link'           => true,
		'media'          => true,
		'method'         => true,
		'multiple'       => true,
		'nohref'         => true,
		'noresize'       => true,
		'noshade'        => true,
		'nowrap'         => true,
		'readonly'       => true,
		'rel'            => true,
		'rev'            => true,
		'rules'          => true,
		'scope'          => true,
		'scrolling'      => true,
		'selected'       => true,
		'shape'          => true,
		'target'         => true,
		'text'           => true,
		'type'           => true,
		'valign'         => true,
		'valuetype'      => true,
		'vlink'          => true,
	);

	/**
	 * Expected match list for WP_HTML_Processor::select().
	 *
	 * @param array $list_ast     Canonical complex selector list AST.
	 * @param array $rows         Element rows in visit order, with ancestorTags.
	 * @param bool  $quirks       Whether the document parses in quirks mode.
	 * @param bool  $html_attr_ci Whether HTML's case-insensitive attribute value
	 *                            list applies. True models WP/browsers; false
	 *                            models an engine without the rule ( lexbor ).
	 * @return string[] data-fid values in visit order.
	 */
	public static function expected_html_matches_rows( array $list_ast, array $rows, bool $quirks, bool $html_attr_ci = true ): array {
		$fids = array();
		foreach ( $rows as $row ) {
			if ( ! self::list_matches_row( $list_ast, $row, $quirks, $html_attr_ci ) ) {
				continue;
			}
			$fids[] = $row['fid'];
		}
		return $fids;
	}

	/**
	 * Expected match list for WP_HTML_Tag_Processor::select() over the same
	 * markup. The tag processor never enters quirks mode on its own and a
	 * compound selector list never inspects ancestors.
	 *
	 * @param array $list_ast Canonical complex selector list AST ( contexts must be empty ).
	 * @param array $rows     Tag-view element rows in token order.
	 * @return string[] data-fid values in token order.
	 */
	public static function expected_tag_matches_rows( array $list_ast, array $rows ): array {
		$fids = array();
		foreach ( $rows as $row ) {
			foreach ( $list_ast as $complex ) {
				if ( self::compound_matches( $complex['self'], $row, false, true ) ) {
					// One matching branch is enough; record the row and move to the next one.
					$fids[] = $row['fid'];
					continue 2;
				}
			}
		}
		return $fids;
	}

	/** Back-compat: expected html-processor matches from a generated model.
*/
	public static function expected_html_processor_matches( array $list_ast, array $model, bool $quirks ): array {
		$rows = DocumentGenerator::rows_from_model( $model );
		return self::expected_html_matches_rows( $list_ast, $rows, $quirks );
	}

	/** Back-compat: expected tag-processor matches from a generated model. */
	public static function expected_tag_processor_matches( array $list_ast, array $model ): array {
		$rows = DocumentGenerator::rows_from_model( $model );
		return self::expected_tag_matches_rows( $list_ast, $rows );
	}

	/** Whether any branch of the list matches the row: its compound on the row and its context on the row's ancestors. */
	public static function list_matches_row( array $list_ast, array $row, bool $quirks, bool $html_attr_ci = true ): bool {
		foreach ( $list_ast as $complex ) {
			if ( ! self::compound_matches( $complex['self'], $row, $quirks, $html_attr_ci ) ) {
				continue;
			}
			if ( self::explore_context( $complex['context'], $row['ancestorTags'] ) ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * @param array    $context       Right-to-left ( type, combinator ) pairs.
	 * @param string[] $ancestor_tags Nearest-ancestor-first tag names.
	 */
	private static function explore_context( array $context, array $ancestor_tags ): bool {
		if ( array() === $context ) {
			return true;
		}
		if ( array() === $ancestor_tags ) {
			return false;
		}

		list( $wanted_type, $combinator ) = $context[0];
		$remaining_context                = array_slice( $context, 1 );

		// A child combinator may only bind the nearest ancestor; a descendant combinator may bind any.
		$candidates = '>' === $combinator ? 1 : count( $ancestor_tags );

		for ( $depth = 0; $depth < $candidates; $depth++ ) {
			if (
				self::type_matches( $wanted_type, $ancestor_tags[ $depth ] ) &&
				self::explore_context( $remaining_context, array_slice( $ancestor_tags, $depth + 1 ) )
			) {
				return true;
			}
		}
		return false;
	}

	/** Whether the row satisfies the compound's type selector ( if any ) and every subclass selector. */
	public static function compound_matches( array $compound, array $row, bool $quirks, bool $html_attr_ci = true ): bool {
		$type = $compound['type'];
		if ( null !== $type && ! self::type_matches( $type, $row['tag'] ) ) {
			return false;
		}

		// A null subs entry casts to an empty list.
		foreach ( (array) $compound['subs'] as $sub ) {
			if ( ! self::sub_matches( $sub, $row, $quirks, $html_attr_ci ) ) {
				return false;
			}
		}
		return true;
	}

	/** `*` matches any tag; otherwise names compare ASCII case-insensitively. */
	private static function type_matches( string $type, string $tag ): bool {
		if ( '*' === $type ) {
			return true;
		}
		return ascii_strtolower( $type ) === ascii_strtolower( $tag );
	}

	/** Dispatches one subclass selector to its class, id, or attribute matcher; unknown kinds never match. */
	private static function sub_matches( array $sub, array $row, bool $quirks, bool $html_attr_ci ): bool {
		switch ( $sub['kind'] ) {
			case 'class':
				return self::class_matches( $sub['name'], $row, $quirks );
			case 'id':
				return self::id_matches( $sub['name'], $row, $quirks );
			case 'attr':
				return self::attr_matches( $sub, $row, $html_attr_ci );
			default:
				return false;
		}
	}

	/** Whether any class token of the row equals $wanted ( ASCII case-folded in quirks mode ). */
	private static function class_matches( string $wanted, array $row, bool $quirks ): bool {
		$class_value = DocumentGenerator::get_attribute_value( $row, 'class' );
		if ( ! is_string( $class_value ) ) {
			return false;
		}

		$needle = $quirks ? ascii_strtolower( $wanted ) : $wanted;
		foreach ( DocumentGenerator::class_tokens( $class_value ) as $token ) {
			$candidate = $quirks ? ascii_strtolower( $token ) : $token;
			if ( $candidate === $needle ) {
				return true;
			}
		}
		return false;
	}

	/** Whether the row's id attribute equals $wanted ( ASCII case-folded in quirks mode ). */
	private static function id_matches( string $wanted, array $row, bool $quirks ): bool {
		$id = DocumentGenerator::get_attribute_value( $row, 'id' );
		if ( ! is_string( $id ) ) {
			return false;
		}
		if ( $quirks ) {
			return ascii_strtolower( $id ) === ascii_strtolower( $wanted );
		}
		return $id === $wanted;
	}

	/** Evaluates one attribute selector ( presence or value matcher ) against the row. */
	private static function attr_matches( array $sub, array $row, bool $html_attr_ci ): bool {
		$actual = DocumentGenerator::get_attribute_value( $row, $sub['name'] );
		if ( null === $actual ) {
			return false;
		}
		// A bare [name] selector only requires the attribute to exist.
		if ( null === $sub['matcher'] ) {
			return true;
		}
		// An attribute stored as `true` compares as the empty string.
		if ( true === $actual ) {
			$actual = '';
		}

		$wanted = (string) $sub['value'];

		if ( 'case-insensitive' === $sub['modifier'] ) {
			$fold_case = true;
		} else {
			// Without a modifier, listed attributes on HTML-namespace rows fold case when the rule is enabled.
			$fold_case = $html_attr_ci
				&& null === $sub['modifier']
				&& 'html' === ( $row['namespace'] ?? 'html' )
				&& isset( self::HTML_CASE_INSENSITIVE_ATTRIBUTES[ ascii_strtolower( $sub['name'] ) ] );
		}
		if ( $fold_case ) {
			$actual = ascii_strtolower( $actual );
			$wanted = ascii_strtolower( $wanted );
		}

		$wanted_length = strlen( $wanted );

		switch ( $sub['matcher'] ) {
			case 'exact':
				return $actual === $wanted;

			case 'one-of':
				// An empty needle, or one containing whitespace, can never be a single word.
				if ( '' === $wanted || $wanted_length !== strcspn( $wanted, self::WHITESPACE ) ) {
					return false;
				}
				$end    = strlen( $actual );
				$cursor = 0;
				while ( $cursor < $end ) {
					$cursor += strspn( $actual, self::WHITESPACE, $cursor );
					if ( $cursor >= $end ) {
						break;
					}
					$word_length = strcspn( $actual, self::WHITESPACE, $cursor );
					if ( substr( $actual, $cursor, $word_length ) === $wanted ) {
						return true;
					}
					$cursor += $word_length;
				}
				return false;

			case 'exact-or-hyphen-suffixed':
				return $actual === $wanted
					|| 0 === strncmp( $actual, $wanted . '-', $wanted_length + 1 );

			case 'prefixed':
				if ( '' === $wanted ) {
					return false;
				}
				return 0 === strncmp( $actual, $wanted, $wanted_length );

			case 'suffixed':
				if ( '' === $wanted ) {
					return false;
				}
				return strlen( $actual ) >= $wanted_length
					&& substr( $actual, -$wanted_length ) === $wanted;

			case 'contains':
				if ( '' === $wanted ) {
					return false;
				}
				return false !== strpos( $actual, $wanted );
		}

		return false;
	}
}
diff --git a/tools/css-selector-fuzz/lib/SelectorGenerator.php b/tools/css-selector-fuzz/lib/SelectorGenerator.php
new file mode 100644
index 0000000000000..a41a3525878c2
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/SelectorGenerator.php
@@ -0,0 +1,1736 @@
 array( "\x80", 1 ),
		'truncated-2-byte' => array( "\xC3", 1 ),
		'truncated-3-byte' => array( "\xE2\x8C", 1 ),
		'truncated-4-byte' => array( "\xF0\x9F\x82", 1 ),
		'invalid-lead-f5'  => array( "\xF5", 1 ),
		'invalid-lead-ff'  => array( "\xFF", 1 ),
		'overlong-min'     => array( "\xC0\x80", 2 ),
		'overlong-max'     => array( "\xC1\xBF", 2 ),
		'surrogate-half'   => array( "\xED\xA0\x80", 3 ),
		'beyond-max'       => array( "\xF4\x90\x80\x80", 4 ),
	);

	/** @var Prng */
	private $prng;
	/** @var array */
	private $pools;
	/** @var bool Escape ident codepoints aggressively when rendering. */
	private $escape_boost = false;

	private function __construct( Prng $prng, array $pools ) {
		$this->prng  = $prng;
		$this->pools = $pools;
	}

	/**
	 * Renders a canonical complex-list AST to a selector string. Parsing the
	 * result must yield exactly the given AST. With $escape_boost, idents are
	 * escaped far more often (exercises the escape decoder on no-op escapes).
+ */ + public static function render( Prng $prng, array $list_ast, bool $escape_boost = false ): string { + $generator = new self( $prng, array() ); + $generator->escape_boost = $escape_boost; + return $generator->render_complex_list( $list_ast ); + } + + /** + * Renders a canonical complex-list AST deterministically with minimal + * escaping: single spaces around combinators, `, ` between branches, + * double-quoted attribute values, lowercase `i`/`s` modifiers, and all + * non-ASCII codepoints hex-escaped. Used to hand a semantically-identical + * selector to external engines: lexbor rejects some byte-level forms WP + * correctly accepts ( uppercase I/S attribute modifiers; raw non-ASCII + * ident codepoints in U+00B7, U+00C0-U+00F6 — its non-ASCII ident table + * starts at U+00F8 ). Escaping sidesteps codepoint classification. + */ + public static function render_canonical( array $list_ast ): string { + $branches = array(); + foreach ( $list_ast as $complex ) { + $out = ''; + foreach ( array_reverse( $complex['context'] ) as $pair ) { + list( $type, $combinator ) = $pair; + $out .= '*' === $type ? '*' : self::canonical_ident( $type ); + $out .= '>' === $combinator ? ' > ' : ' '; + } + + $compound = $complex['self']; + if ( null !== $compound['type'] ) { + $out .= '*' === $compound['type'] ? '*' : self::canonical_ident( $compound['type'] ); + } + foreach ( (array) $compound['subs'] as $sub ) { + switch ( $sub['kind'] ) { + case 'class': + $out .= '.' . self::canonical_ident( $sub['name'] ); + break; + case 'id': + $out .= '#' . self::canonical_ident( $sub['name'] ); + break; + case 'attr': + $out .= '[' . self::canonical_ident( $sub['name'] ); + if ( null !== $sub['matcher'] ) { + $matchers = array( + 'exact' => '=', + 'one-of' => '~=', + 'exact-or-hyphen-suffixed' => '|=', + 'prefixed' => '^=', + 'suffixed' => '$=', + 'contains' => '*=', + ); + $out .= $matchers[ $sub['matcher'] ] . 
self::canonical_string( (string) $sub['value'] ); + if ( 'case-insensitive' === $sub['modifier'] ) { + $out .= ' i'; + } elseif ( 'case-sensitive' === $sub['modifier'] ) { + $out .= ' s'; + } + } + $out .= ']'; + break; + } + } + $branches[] = $out; + } + return implode( ', ', $branches ); + } + + private static function canonical_ident( string $name ): string { + $points = utf8_codepoints( $name ); + $count = count( $points ); + $out = ''; + + foreach ( $points as $i => $point ) { + list( $char, $cp ) = $point; + + $is_digit = $cp >= 0x30 && $cp <= 0x39; + $is_ident_char = ( + '-' === $char || + '_' === $char || + $is_digit || + ( $cp >= 0x41 && $cp <= 0x5A ) || + ( $cp >= 0x61 && $cp <= 0x7A ) + ); + + $must_escape = ! $is_ident_char + || ( 0 === $i && $is_digit ) + || ( 1 === $i && '-' === $points[0][0] && $is_digit ) + || ( 1 === $count && '-' === $char ); + + $out .= $must_escape ? '\\' . dechex( $cp ) . ' ' : $char; + } + + return $out; + } + + private static function canonical_string( string $value ): string { + $out = '"'; + foreach ( utf8_codepoints( $value ) as $point ) { + list( $char, $cp ) = $point; + if ( '"' === $char || '\\' === $char || $cp < 0x20 || $cp > 0x7E ) { + $out .= '\\' . dechex( $cp ) . ' '; + } else { + $out .= $char; + } + } + return $out . '"'; + } + + /** + * @param array $pools Pools from DocumentGenerator ( tags, classes, ids, attrNames, attrValues ). + * @param array|null $rows Element rows ( TreeCapture shape ) with real + * fids; enables the path-directed bucket. 
+ * @return array{ + * bucket: string, + * selector: string, + * expectCompound: bool|null, + * expectComplex: bool|null, + * ast: array|null, + * mustMatchFid: string|null, + * mustNotMatchFid: string|null, + * } + */ + public static function generate( Prng $prng, array $pools, ?array $rows = null, ?string $bucket = null ): array { + $generator = new self( $prng, $pools ); + + if ( null === $bucket ) { + $bucket = $prng->weighted( + null === $rows || array() === $rows + ? array( + 'supported-compound' => 28, + 'supported-complex' => 24, + 'unsupported' => 14, + 'invalid' => 11, + 'invalid-utf8' => 5, + 'chaos' => 8, + 'mutated' => 10, + 'edge-escape' => 5, + ) + : array( + 'supported-compound' => 23, + 'supported-complex' => 19, + 'path-directed' => 21, + 'unsupported' => 11, + 'invalid' => 9, + 'invalid-utf8' => 5, + 'chaos' => 6, + 'mutated' => 6, + 'edge-escape' => 5, + ) + ); + } + + if ( 'path-directed' === $bucket && ( null === $rows || array() === $rows ) ) { + $bucket = 'supported-complex'; + } + + switch ( $bucket ) { + case 'supported-compound': + $ast = $generator->gen_complex_list( false ); + return array( + 'bucket' => $bucket, + 'selector' => $generator->render_complex_list( $ast ), + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => $ast, + ); + + case 'supported-complex': + $ast = $generator->gen_complex_list( true ); + return array( + 'bucket' => $bucket, + 'selector' => $generator->render_complex_list( $ast ), + 'expectCompound' => false, + 'expectComplex' => true, + 'ast' => $ast, + ); + + case 'path-directed': + return $generator->gen_path_directed( $rows ); + + case 'edge-escape': + return $generator->gen_edge_escape(); + + case 'invalid-utf8': + return $generator->gen_invalid_utf8(); + + case 'unsupported': + return array( + 'bucket' => $bucket, + 'selector' => $generator->gen_unsupported(), + 'expectCompound' => false, + 'expectComplex' => false, + 'ast' => null, + ); + + case 'invalid': + return array( + 'bucket' => $bucket, + 
'selector' => $generator->gen_invalid(), + 'expectCompound' => false, + 'expectComplex' => false, + 'ast' => null, + ); + + case 'chaos': + return array( + 'bucket' => $bucket, + 'selector' => $generator->gen_chaos(), + 'expectCompound' => null, + 'expectComplex' => null, + 'ast' => null, + ); + + case 'mutated': + default: + $ast = $generator->gen_complex_list( $generator->prng->chance( 50 ) ); + $rendered = $generator->render_complex_list( $ast ); + return array( + 'bucket' => 'mutated', + 'selector' => $generator->mutate( $rendered ), + 'expectCompound' => null, + 'expectComplex' => null, + 'ast' => null, + ); + } + } + + /* + * -------------- + * AST generation + * -------------- + * + * Canonical AST shapes (matching what AstExtractor produces from + * parsed WP_CSS_* objects): + * + * list: array of complex + * complex: array( 'context' => array( array( type, combinator ) ... right-to-left ), 'self' => compound ) + * compound: array( 'type' => string|null, 'subs' => array|null ) + * sub: array( 'kind' => 'class'|'id', 'name' => string ) + * | array( 'kind' => 'attr', 'name' => string, 'matcher' => string|null, + * 'value' => string|null, 'modifier' => string|null ) + */ + + private function gen_complex_list( bool $require_combinator ): array { + $count = $this->prng->weighted( + array( + 1 => 55, + 2 => 30, + 3 => 15, + ) + ); + + $list = array(); + $combinator_at = $require_combinator ? $this->prng->int( 0, $count - 1 ) : -1; + for ( $i = 0; $i < $count; $i++ ) { + $wants_combinators = $i === $combinator_at || ( $require_combinator && $this->prng->chance( 30 ) ); + $list[] = $this->gen_complex( $require_combinator ? $wants_combinators : false ); + } + return $list; + } + + private function gen_complex( bool $with_combinators ): array { + $context = array(); + if ( $with_combinators ) { + $context_count = $this->prng->int( 1, 3 ); + for ( $i = 0; $i < $context_count; $i++ ) { + $context[] = array( + $this->gen_type_name( true ), + $this->prng->chance( 50 ) ? 
' ' : '>', + ); + } + } + + return array( + 'context' => $context, + 'self' => $this->gen_compound(), + ); + } + + private function gen_compound(): array { + $has_type = $this->prng->chance( 65 ); + $sub_count = $this->prng->weighted( + array( + 0 => 30, + 1 => 40, + 2 => 20, + 3 => 10, + ) + ); + if ( ! $has_type && 0 === $sub_count ) { + if ( $this->prng->chance( 50 ) ) { + $has_type = true; + } else { + $sub_count = 1; + } + } + + $subs = array(); + for ( $i = 0; $i < $sub_count; $i++ ) { + $subs[] = $this->gen_subclass(); + } + + return array( + 'type' => $has_type ? $this->gen_type_name( false ) : null, + 'subs' => array() === $subs ? null : $subs, + ); + } + + private function gen_type_name( bool $for_context ): string { + if ( $this->prng->chance( $for_context ? 25 : 12 ) ) { + return '*'; + } + $pool = $this->pools['tags'] ?? array(); + if ( array() !== $pool && $this->prng->chance( 70 ) ) { + $name = $this->prng->choice( $pool ); + return $this->prng->chance( 25 ) ? $this->random_case( $name ) : $name; + } + return $this->prng->choice( array( 'video', 'table', 'x-absent', 'object', 'span' ) ); + } + + private function gen_subclass(): array { + $kind = $this->prng->weighted( + array( + 'class' => 40, + 'id' => 25, + 'attr' => 35, + ) + ); + + switch ( $kind ) { + case 'class': + return array( + 'kind' => 'class', + 'name' => $this->pick_name( 'classes' ), + ); + case 'id': + return array( + 'kind' => 'id', + 'name' => $this->pick_name( 'ids' ), + ); + default: + return $this->gen_attr_selector(); + } + } + + private function gen_attr_selector(): array { + $name = $this->pick_name( 'attrNames' ); + + $matcher = $this->prng->weighted( + array( + '' => 25, + 'exact' => 20, + 'one-of' => 12, + 'exact-or-hyphen-suffixed' => 11, + 'prefixed' => 11, + 'suffixed' => 11, + 'contains' => 10, + ) + ); + $matcher = '' === $matcher ? 
null : $matcher; + + if ( null === $matcher ) { + return array( + 'kind' => 'attr', + 'name' => $name, + 'matcher' => null, + 'value' => null, + 'modifier' => null, + ); + } + + $modifier = $this->prng->weighted( + array( + '' => 70, + 'case-insensitive' => 18, + 'case-sensitive' => 12, + ) + ); + + $value = $this->gen_attr_value(); + + /* + * HTML's case-insensitive attribute value list: with no modifier, + * the values of listed attributes ( type, rel, lang, dir, ... ) + * match ASCII case-insensitively on HTML elements. Sometimes flip + * the case of the selector value for a listed attribute so the + * differential exercises that rule rather than relying on sampled + * values happening to differ in case. + */ + if ( + '' === $modifier && + isset( ReferenceMatcher::HTML_CASE_INSENSITIVE_ATTRIBUTES[ ascii_strtolower( $name ) ] ) && + $this->prng->chance( 40 ) + ) { + $value = $this->prng->chance( 50 ) ? ascii_strtoupper( $value ) : str_shuffle_case( $value, $this->prng ); + } + + return array( + 'kind' => 'attr', + 'name' => $name, + 'matcher' => $matcher, + 'value' => $value, + 'modifier' => '' === $modifier ? null : $modifier, + ); + } + + private function gen_attr_value(): string { + $pool = $this->pools['attrValues'] ?? 
array(); + + $kind = $this->prng->weighted( + array( + 'pool' => 35, + 'pool-part' => 20, + 'pool-case' => 10, + 'empty' => 10, + 'word' => 15, + 'tricky' => 10, + ) + ); + + if ( in_array( $kind, array( 'pool', 'pool-part', 'pool-case' ), true ) && array() === $pool ) { + $kind = 'word'; + } + + switch ( $kind ) { + case 'pool': + return $this->prng->choice( $pool ); + + case 'pool-part': + $value = $this->prng->choice( $pool ); + if ( '' === $value ) { + return ''; + } + $points = utf8_codepoints( $value ); + $total = count( $points ); + $start = $this->prng->int( 0, max( 0, $total - 1 ) ); + $length = $this->prng->int( 1, $total - $start ); + $part = ''; + for ( $i = $start; $i < $start + $length; $i++ ) { + $part .= $points[ $i ][0]; + } + return $part; + + case 'pool-case': + return $this->random_case( $this->prng->choice( $pool ) ); + + case 'empty': + return ''; + + case 'word': + return $this->prng->choice( array( 'alpha', 'beta9', 'value', 'main-item', 'Z', 'i', 's', 'one two', 'x-y-z' ) ); + + case 'tricky': + default: + return $this->prng->choice( + array( + 'a b', + " lead", + "trail ", + "tab\there", + "line\nbreak", + 'quote"inside', + "apos'inside", + 'back\\slash', + '-', + '--', + '0digit', + 'ünïcode', + ) + ); + } + } + + private function pick_name( string $pool_key ): string { + $pool = $this->pools[ $pool_key ] ?? 
array(); + if ( array() !== $pool && $this->prng->chance( 65 ) ) { + $name = $this->prng->choice( $pool ); + if ( '' !== $name && $this->prng->chance( 20 ) ) { + $name = $this->random_case( $name ); + } + if ( '' !== $name ) { + return $name; + } + } + return $this->prng->choice( + array( + 'absent', + 'no-such-thing', + 'x', + '-lead', + '--double', + '_under', + 'Ünïcode', + 'with space', + '9starts-with-digit', + '-9hyphen-digit', + 'mixedCase', + ) + ); + } + + /* + * --------------------------- + * Edge-case escapes and input + * --------------------------- + * + * Targets parser branches the structural generators can't reach: + * - hex escapes whose codepoint is NUL / a surrogate / over-max, which + * the tokenizer must decode to U+FFFD; + * - raw NUL / CR / CRLF / FF bytes in the selector input, which + * the selector token stream preprocesses ( NUL→U+FFFD, the rest→LF ). + * + * These carry a known intended AST: the decoded ident is the U+FFFD + * replacement character ( or, for input normalization, the same selector + * with whitespace normalized ), so the AST round-trip still applies. + */ + private function gen_edge_escape(): array { + $kind = $this->prng->weighted( + array( + 'fffd-ident' => 35, + 'eof-escape' => 20, + 'eof-truncated' => 15, + 'nul-input' => 15, + 'ws-input' => 15, + ) + ); + + if ( 'eof-truncated' === $kind ) { + /* + * The end of input auto-closes an unterminated attribute selector + * block ( and an unterminated string inside it ): `[a=b` is the + * same selector as `[a=b]`. + * + * https://www.w3.org/TR/css-syntax-3/#consume-simple-block + */ + $matcher = $this->prng->choice( array( null, 'exact', 'one-of', 'exact-or-hyphen-suffixed', 'prefixed', 'suffixed', 'contains' ) ); + $value = null === $matcher ? null : $this->prng->choice( array( 'v' . $this->prng->int( 0, 99 ), 'a b', '', 'x,y', "caf\u{E9}" ) ); + $modifier = null !== $matcher && $this->prng->chance( 30 ) + ? 
$this->prng->choice( array( 'case-insensitive', 'case-sensitive' ) ) + : null; + $compound = array( + 'type' => $this->prng->chance( 50 ) ? 'div' : null, + 'subs' => array( + array( + 'kind' => 'attr', + 'name' => 'a' . $this->prng->int( 0, 99 ), + 'matcher' => $matcher, + 'value' => $value, + 'modifier' => $modifier, + ), + ), + ); + + // The attribute selector is the final rendered unit, so the render always ends with ']'. + $rendered = $this->render_compound( $compound ); + $truncated = substr( $rendered, 0, -1 ); + + // Sometimes also drop a closing string quote: EOF terminates the string, then closes the block. + $last_byte = substr( $truncated, -1 ); + if ( ( '"' === $last_byte || "'" === $last_byte ) && $this->prng->chance( 50 ) ) { + $truncated = substr( $truncated, 0, -1 ); + + // A backslash at the end of an unterminated string "does nothing": the value is unchanged. + if ( $this->prng->chance( 40 ) ) { + $truncated .= '\\'; + } + } + + return array( + 'bucket' => 'edge-escape', + 'selector' => $truncated, + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => array( + array( + 'context' => array(), + 'self' => $compound, + ), + ), + ); + } + + if ( 'eof-escape' === $kind ) { + /* + * A backslash at the end of input is a valid escape ( EOF is not + * a newline ) and decodes to U+FFFD, in ident context only: + * `.foo\` is the class `foo\u{FFFD}`. + * + * https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point + */ + $name = $this->prng->chance( 30 ) ? '' : 'a' . $this->prng->int( 0, 99 ); + list( $selector, $self ) = $this->prng->choice( + array( + array( + '.' . $name . '\\', + array( + 'type' => null, + 'subs' => array( array( 'kind' => 'class', 'name' => $name . "\u{FFFD}" ) ), + ), + ), + array( + '#' . $name . '\\', + array( + 'type' => null, + 'subs' => array( array( 'kind' => 'id', 'name' => $name . "\u{FFFD}" ) ), + ), + ), + array( + $name . '\\', + array( + 'type' => $name . 
"\u{FFFD}", + 'subs' => null, + ), + ), + ) + ); + return array( + 'bucket' => 'edge-escape', + 'selector' => $selector, + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => array( + array( + 'context' => array(), + 'self' => $self, + ), + ), + ); + } + + if ( 'fffd-ident' === $kind ) { + // A class selector whose name is a single U+FFFD, produced by a + // hex escape for an out-of-range codepoint. + $hex = $this->prng->choice( + array( + '0', + '00', + '000000', + dechex( $this->prng->int( 0xD800, 0xDFFF ) ), // surrogate + dechex( $this->prng->int( 0x110000, 0xFFFFFF ) ), // over-max + ) + ); + if ( $this->prng->chance( 40 ) ) { + $hex = strtoupper( $hex ); + } + $selector = '.\\' . $hex . ' '; + $ast = array( + array( + 'context' => array(), + 'self' => array( + 'type' => null, + 'subs' => array( array( 'kind' => 'class', 'name' => "\u{FFFD}" ) ), + ), + ), + ); + return array( + 'bucket' => 'edge-escape', + 'selector' => $selector, + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => $ast, + ); + } + + /* + * Raw control bytes in the selector input. A small fixed compound + * keeps the case focused on selector input preprocessing and avoids + * entangling with unrelated attribute-selector edge cases. + */ + $compound = array( + 'type' => $this->prng->chance( 50 ) ? 'span' : null, + 'subs' => array( + array( 'kind' => 'class', 'name' => 'foo' ), + array( 'kind' => 'id', 'name' => 'bar' ), + ), + ); + if ( null === $compound['type'] && $this->prng->chance( 50 ) ) { + array_pop( $compound['subs'] ); + } + $rendered = $this->render_compound( $compound ); + + if ( 'nul-input' === $kind ) { + // A NUL between a class dot's selectors becomes part of an ident + // only in limited spots; simplest reliable case: a class whose + // name contains a NUL ( → U+FFFD ). 
+ $ast = array( + array( + 'context' => array(), + 'self' => array( + 'type' => null, + 'subs' => array( array( 'kind' => 'class', 'name' => "a\u{FFFD}b" ) ), + ), + ), + ); + return array( + 'bucket' => 'edge-escape', + 'selector' => ".a\0b", + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => $ast, + ); + } + + // ws-input: wrap/insert CR, CRLF, FF as insignificant whitespace. + $lead = $this->prng->choice( array( "\r", "\f", "\r\n", "\r\r", "\f\f" ) ); + $trail = $this->prng->choice( array( "\r", "\f", "\r\n", '' ) ); + return array( + 'bucket' => 'edge-escape', + 'selector' => $lead . $rendered . $trail, + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => array( + array( + 'context' => array(), + 'self' => $compound, + ), + ), + ); + } + + /* + * ----------------------- + * Invalid-UTF-8 injection + * ----------------------- + * + * Raw ill-formed UTF-8 byte sequences in the selector input, mirroring + * the nul-input pattern: a small fixed simple selector keeps the case + * focused on the selector token stream scrub. Each maximal subpart + * of the injected sequence decodes to one U+FFFD ( per-class counts + * pinned in INVALID_UTF8_CLASSES ), and U+FFFD is a valid ident + * codepoint — including in start position — so the scrubbed selector + * must parse and the post-scrub AST is known by construction. + */ + private function gen_invalid_utf8(): array { + list( $bytes, $subparts ) = $this->prng->choice( array_values( self::INVALID_UTF8_CLASSES ) ); + + $position = $this->prng->choice( array( 'lead', 'mid', 'trail', 'whole' ) ); + $prefix = in_array( $position, array( 'lead', 'whole' ), true ) ? '' : 'a' . $this->prng->int( 0, 9 ); + $suffix = in_array( $position, array( 'trail', 'whole' ), true ) ? '' : 'z' . $this->prng->int( 0, 9 ); + $raw = $prefix . $bytes . $suffix; + $decoded = $prefix . str_repeat( "\u{FFFD}", $subparts ) . 
$suffix; + + switch ( $this->prng->choice( array( 'class', 'id', 'attr-name', 'attr-value' ) ) ) { + case 'class': + $rendered = '.' . $raw; + $sub = array( + 'kind' => 'class', + 'name' => $decoded, + ); + break; + + case 'id': + $rendered = '#' . $raw; + $sub = array( + 'kind' => 'id', + 'name' => $decoded, + ); + break; + + case 'attr-name': + $rendered = '[' . $raw . ']'; + $sub = array( + 'kind' => 'attr', + 'name' => $decoded, + 'matcher' => null, + 'value' => null, + 'modifier' => null, + ); + break; + + case 'attr-value': + default: + $name = 'a' . $this->prng->int( 0, 99 ); + $quote = $this->prng->chance( 50 ) ? '"' : "'"; + $rendered = '[' . $name . '=' . $quote . $raw . $quote . ']'; + $sub = array( + 'kind' => 'attr', + 'name' => $name, + 'matcher' => 'exact', + 'value' => $decoded, + 'modifier' => null, + ); + break; + } + + $type = $this->prng->chance( 40 ) ? 'span' : null; + + return array( + 'bucket' => 'invalid-utf8', + 'selector' => ( null === $type ? '' : $type ) . $rendered, + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => array( + array( + 'context' => array(), + 'self' => array( + 'type' => $type, + 'subs' => array( $sub ), + ), + ), + ), + ); + } + + /* + * ------------------------ + * Path-directed generation + * ------------------------ + * + * Synthesizes a selector from a real element of the model tree so that + * the selector is guaranteed (by construction) to match that element: + * the type comes from its tag, subclasses from its actual classes / id / + * attributes, and the context chain from its actual ancestor tags with + * combinators consistent with the real nesting. Optionally one feature + * is then flipped into a "near-miss" that is guaranteed NOT to match + * the element ( or, for combinator loosening, still guaranteed to ). + */ + + private function gen_path_directed( array $rows ): array { + // Bias toward elements deep enough for a meaningful context chain. 
+ $deep = array(); + foreach ( $rows as $row ) { + if ( count( $row['ancestorTags'] ) >= 2 ) { + $deep[] = $row; + } + } + $element = array() !== $deep && $this->prng->chance( 75 ) + ? $this->prng->choice( $deep ) + : $this->prng->choice( $rows ); + + $compound = $this->path_compound_for( $element ); + $context = array() !== $element['ancestorTags'] && $this->prng->chance( 75 ) + ? $this->path_context_for( $element['ancestorTags'] ) + : array(); + + $list = array( + array( + 'context' => $context, + 'self' => $compound, + ), + ); + + $must_match = $element['fid']; + $must_not_match = null; + + if ( $this->prng->chance( 40 ) ) { + list( $list, $must_match, $must_not_match ) = $this->path_near_miss( $list, $element ); + } elseif ( $this->prng->chance( 20 ) ) { + // Extra unrelated branch: a list union can only add matches. + $list[] = $this->gen_complex( $this->prng->chance( 30 ) ); + } + + $has_context = false; + foreach ( $list as $complex ) { + if ( array() !== $complex['context'] ) { + $has_context = true; + break; + } + } + + return array( + 'bucket' => 'path-directed', + 'selector' => $this->render_complex_list( $list ), + 'expectCompound' => ! $has_context, + 'expectComplex' => true, + 'ast' => $list, + 'mustMatchFid' => $must_match, + 'mustNotMatchFid' => $must_not_match, + ); + } + + /** A compound selector built only from features the element row really has. 
*/
+	private function path_compound_for( array $element ): array {
+		$tag = ascii_strtolower( $element['tag'] );
+
+		// Candidate subclass selectors, each derived from something the element row actually carries.
+		$features = array();
+
+		// One class feature per token of the class attribute, when that attribute has a string value.
+		$class_value = DocumentGenerator::get_attribute_value( $element, 'class' );
+		if ( is_string( $class_value ) ) {
+			foreach ( DocumentGenerator::class_tokens( $class_value ) as $word ) {
+				$features[] = array( 'kind' => 'class', 'name' => $word );
+			}
+		}
+
+		// An id feature only for a non-empty string id value.
+		$id_value = DocumentGenerator::get_attribute_value( $element, 'id' );
+		if ( is_string( $id_value ) && '' !== $id_value ) {
+			$features[] = array( 'kind' => 'id', 'name' => $id_value );
+		}
+
+		// One attribute feature per distinct ASCII-lowercased attribute name; later duplicates are skipped.
+		$seen_attrs = array();
+		foreach ( $element['attrs'] as $attr ) {
+			$lower = ascii_strtolower( $attr[0] );
+			if ( isset( $seen_attrs[ $lower ] ) ) {
+				continue;
+			}
+			$seen_attrs[ $lower ] = true;
+			// A class attribute whose string value contains a NUL byte yields no attribute feature.
+			// NOTE(review): presumably because NUL is rewritten by selector input preprocessing — confirm.
+			if ( 'class' === $lower && is_string( $attr[1] ) && false !== strpos( $attr[1], "\0" ) ) {
+				continue;
+			}
+			// A row without a 'namespace' key is treated as html-namespace.
+			$features[] = $this->path_attr_feature( $lower, $attr[1], 'html' === ( $element['namespace'] ?? 'html' ) );
+		}
+
+		// Draw between zero and three features without replacement ( capped by how many exist ).
+		$subs = array();
+		$available = count( $features );
+		if ( $available > 0 ) {
+			$want = min( $available, $this->prng->weighted( array( 0 => 25, 1 => 40, 2 => 25, 3 => 10 ) ) );
+			for ( $i = 0; $i < $want; $i++ ) {
+				$at = $this->prng->int( 0, count( $features ) - 1 );
+				$subs[] = $features[ $at ];
+				array_splice( $features, $at, 1 );
+			}
+		}
+
+		/*
+		 * A type selector is forced when no subclass selector was drawn ( the
+		 * compound would otherwise be empty ); in that case the chance( 70 )
+		 * draw is skipped by short-circuit. The type is `*`, the lowercased
+		 * tag, or a randomly re-cased tag.
+		 */
+		$type = null;
+		if ( array() === $subs || $this->prng->chance( 70 ) ) {
+			$type = $this->prng->chance( 12 ) ? '*' : ( $this->prng->chance( 30 ) ? $this->random_case( $tag ) : $tag );
+		}
+
+		return array(
+			'type' => $type,
+			'subs' => array() === $subs ? null : $subs,
+		);
+	}
+
+	/**
+	 * An attribute selector that the (name, value) pair satisfies.
+	 *
+	 * Returns either a bare presence selector or one of the matcher forms
+	 * with an operand derived from the value itself.
+	 *
+	 * @param string $name              Attribute name; the visible caller passes it ASCII-lowercased.
+	 * @param mixed  $value             Attribute value: a string, or `true` for a boolean attribute.
+	 *                                  Any other non-string value yields a presence selector.
+	 * @param bool   $is_html_namespace Whether the element is in the html namespace; gates the
+	 *                                  modifier-less case flip of the operand.
+	 * @return array Sub-selector array with 'kind', 'name', 'matcher', 'value' and 'modifier' keys.
+	 */
+	private function path_attr_feature( string $name, $value, bool $is_html_namespace = true ): array {
+		// The presence form `[name]`; also the fallback whenever no safe operand exists.
+		$presence = array(
+			'kind' => 'attr',
+			'name' => $this->prng->chance( 15 ) ? $this->random_case( $name ) : $name,
+			'matcher' => null,
+			'value' => null,
+			'modifier' => null,
+		);
+
+		if ( true === $value ) {
+			// A boolean attribute has the empty string as its value.
+			$value = '';
+		}
+		// For a non-string value the chance( 30 ) draw is skipped by short-circuit.
+		if ( ! is_string( $value ) || $this->prng->chance( 30 ) ) {
+			return $presence;
+		}
+
+		$points = utf8_codepoints( $value );
+		$total = count( $points );
+
+		// Each candidate is a ( matcher, operand ) pair built from the value.
+		$candidates = array( array( 'exact', $value ) );
+
+		// Only the first whitespace-separated word is offered as a `one-of` operand.
+		foreach ( preg_split( '/[ \t\n\f\r]+/', $value, -1, PREG_SPLIT_NO_EMPTY ) as $word ) {
+			$candidates[] = array( 'one-of', $word );
+			break;
+		}
+
+		// The text before the first hyphen, or the whole value when there is no hyphen.
+		$hyphen_at = strpos( $value, '-' );
+		$candidates[] = array( 'exact-or-hyphen-suffixed', false === $hyphen_at ? $value : substr( $value, 0, $hyphen_at ) );
+
+		if ( $total > 0 ) {
+			// Concatenates the text of $length entries of $points starting at $start.
+			$slice = static function ( array $points, int $start, int $length ): string {
+				$out = '';
+				for ( $i = $start; $i < $start + $length; $i++ ) {
+					$out .= $points[ $i ][0];
+				}
+				return $out;
+			};
+
+			// Non-empty prefix, suffix and infix operands, cut between entries of $points.
+			$candidates[] = array( 'prefixed', $slice( $points, 0, $this->prng->int( 1, $total ) ) );
+			$length = $this->prng->int( 1, $total );
+			$candidates[] = array( 'suffixed', $slice( $points, $total - $length, $length ) );
+			$start = $this->prng->int( 0, $total - 1 );
+			$candidates[] = array( 'contains', $slice( $points, $start, $this->prng->int( 1, $total - $start ) ) );
+		}
+
+		list( $matcher, $operand ) = $this->prng->choice( $candidates );
+
+		/*
+		 * `|=` with an operand cut at a hyphen only matches when the operand
+		 * is non-empty and actually a value prefix; an operand equal to the
+		 * value always matches. Guard the degenerate empty-operand cases.
+		 */
+		if ( 'exact-or-hyphen-suffixed' === $matcher && '' === $operand && '' !== $value ) {
+			$matcher = 'exact';
+			$operand = $value;
+		}
+		// An empty operand for these matchers falls back to the presence selector.
+		if ( in_array( $matcher, array( 'one-of', 'prefixed', 'suffixed', 'contains' ), true ) && '' === $operand ) {
+			return $presence;
+		}
+
+		$modifier = null;
+		if ( $this->prng->chance( 25 ) ) {
+			if ( $this->prng->chance( 60 ) ) {
+				// The operand is re-cased only when the case-insensitive modifier is emitted.
+				$modifier = 'case-insensitive';
+				$operand = $this->random_case( $operand );
+			} else {
+				$modifier = 'case-sensitive';
+			}
+		} elseif (
+			$is_html_namespace &&
+			isset( ReferenceMatcher::HTML_CASE_INSENSITIVE_ATTRIBUTES[ $name ] ) &&
+			$this->prng->chance( 50 )
+		) {
+			/*
+			 * HTML's case-insensitive attribute value list: with no modifier
+			 * the flipped operand still satisfies the (name, value) pair on
+			 * an html-namespace element, which makes the folding rule
+			 * load-bearing for the mustMatchFid invariant — name and value
+			 * here come from the same real element, unlike the independent
+			 * pools in gen_attr_selector.
+			 */
+			$operand = $this->random_case( $operand );
+		}
+
+		return array(
+			'kind' => 'attr',
+			'name' => $presence['name'],
+			'matcher' => $matcher,
+			'value' => $operand,
+			'modifier' => $modifier,
+		);
+	}
+
+	/**
+	 * A context chain ( right-to-left ( type, combinator ) pairs ) drawn from
+	 * the element's real ancestors so the chain is satisfied by construction:
+	 * `>` is only used for the immediately-next ancestor, descendant
+	 * combinators may skip generations.
+	 *
+	 * @param string[] $ancestor_tags Nearest-first ancestor tag names.
+	 * @return array List of array( type, combinator ) pairs, nearest ancestor first.
+	 */
+	private function path_context_for( array $ancestor_tags ): array {
+		$chain = array();
+		$pos = 0;
+		$count = count( $ancestor_tags );
+
+		// Always emit one pair when an ancestor exists; each further pair is added with chance( 45 ).
+		while ( $pos < $count && ( array() === $chain || $this->prng->chance( 45 ) ) ) {
+			// $jump is the number of ancestors skipped before the one used for this pair.
+			$jump = $this->prng->chance( 65 ) ? 0 : $this->prng->int( 0, $count - 1 - $pos );
+			$at = $pos + $jump;
+
+			// A child combinator is only allowed when nothing was skipped.
+			$combinator = ( 0 === $jump && $this->prng->chance( 55 ) ) ? '>' : ' ';
+			$tag = ascii_strtolower( $ancestor_tags[ $at ] );
+			$type = $this->prng->chance( 12 )
+				? '*'
+				: ( $this->prng->chance( 25 ) ? $this->random_case( $tag ) : $tag );
+
+			$chain[] = array( $type, $combinator );
+			// The next pair, if any, starts from the ancestor just above the one used here.
+			$pos = $at + 1;
+		}
+
+		return $chain;
+	}
+
+	/**
+	 * Flips one feature of the guaranteed-match selector. Most flips
+	 * guarantee the element no longer matches; loosening a `>` to a
+	 * descendant combinator must keep it matching.
+	 *
+	 * @return array{0: array, 1: string|null, 2: string|null} list, mustMatchFid, mustNotMatchFid.
+	 */
+	private function path_near_miss( array $list, array $element ): array {
+		$complex = $list[0];
+		$compound = $complex['self'];
+		$fid = $element['fid'];
+
+		$flips = array( 'wrong-class', 'wrong-attr' );
+		if ( null !== $compound['type'] && '*' !== $compound['type'] ) {
+			$flips[] = 'wrong-type';
+		}
+		/*
+		 * The unconditional break means this body runs at most once: only
+		 * the first context pair decides whether 'loosen-combinator' is
+		 * offered, while any non-empty context offers 'tighten-combinator'.
+		 * NOTE(review): a `>` that appears only in a later pair never enables
+		 * 'loosen-combinator' — confirm that is intended.
+		 */
+		foreach ( $complex['context'] as $pair ) {
+			if ( '>' === $pair[1] ) {
+				$flips[] = 'loosen-combinator';
+			}
+			$flips[] = 'tighten-combinator';
+			break;
+		}
+
+		switch ( $this->prng->choice( $flips ) ) {
+			case 'wrong-type':
+				// Redraw until the tag differs from the element's own lowercased tag.
+				$tag = ascii_strtolower( $element['tag'] );
+				do {
+					$other = $this->prng->choice( DocumentGenerator::SAFE_TAGS );
+				} while ( $other === $tag );
+				$complex['self']['type'] = $this->prng->chance( 25 ) ? $this->random_case( $other ) : $other;
+				return array( array( $complex ), null, $fid );
+
+			case 'wrong-attr':
+				// The ( array ) cast turns a null 'subs' into an empty list before appending.
+				$subs = (array) $complex['self']['subs'];
+				$subs[] = array(
+					'kind' => 'attr',
+					'name' => 'zz-no-such-attr',
+					'matcher' => null,
+					'value' => null,
+					'modifier' => null,
+				);
+				$complex['self']['subs'] = $subs;
+				return array( array( $complex ), null, $fid );
+
+			case 'loosen-combinator':
+				// Replacing every `>` with a descendant combinator can only
+				// widen the context; the element must still match.
+				foreach ( $complex['context'] as &$pair ) {
+					$pair[1] = ' ';
+				}
+				// Drop the dangling reference left behind by the by-reference loop.
+				unset( $pair );
+				$list[0] = $complex;
+				return array( $list, $fid, null );
+
+			case 'tighten-combinator':
+				// May or may not still match; no membership expectation.
+				$at = $this->prng->int( 0, count( $complex['context'] ) - 1 );
+				$complex['context'][ $at ][1] = '>';
+				$list[0] = $complex;
+				return array( $list, null, null );
+
+			case 'wrong-class':
+			default:
+				$subs = (array) $complex['self']['subs'];
+				$subs[] = array(
+					'kind' => 'class',
+					'name' => 'zz-no-such-class',
+				);
+				$complex['self']['subs'] = $subs;
+				return array( array( $complex ), null, $fid );
+		}
+	}
+
+	/*
+	 * ---------
+	 * Rendering
+	 * ---------
+	 */
+
+	/**
+	 * Renders a list of complex selectors as comma-separated selector text,
+	 * with optional whitespace around the list and around each comma.
+	 */
+	private function render_complex_list( array $list ): string {
+		// Every complex selector is rendered first, before any list-level whitespace is drawn.
+		$bits = array();
+		foreach ( $list as $complex ) {
+			$bits[] = $this->render_complex( $complex );
+		}
+
+		$out = $this->maybe_ws( 25 );
+		foreach ( $bits as $i => $bit ) {
+			if ( $i > 0 ) {
+				$out .= $this->maybe_ws( 40 ) . ',' . $this->maybe_ws( 60 );
+			}
+			$out .= $bit;
+		}
+		return $out . $this->maybe_ws( 25 );
+	}
+
+	/**
+	 * Renders one complex selector: its context chain, left-to-right, followed
+	 * by its subject compound.
+	 */
+	private function render_complex( array $complex ): string {
+		$out = '';
+		// Context selectors are stored right-to-left; render left-to-right.
+		$reversed = array_reverse( $complex['context'] );
+		foreach ( $reversed as $pair ) {
+			list( $type, $combinator ) = $pair;
+			$out .= '*' === $type ? '*' : $this->render_ident( $type );
+			if ( '>' === $combinator ) {
+				$out .= $this->maybe_ws( 50 ) . '>' . $this->maybe_ws( 50 );
+			} else {
+				// Any other combinator is rendered as mandatory whitespace.
+				$out .= $this->ws();
+			}
+		}
+		return $out . $this->render_compound( $complex['self'] );
+	}
+
+	/**
+	 * Renders a compound selector: the optional type selector followed by its
+	 * subclass selectors in order. A null 'subs' renders no subclass selectors.
+	 */
+	private function render_compound( array $compound ): string {
+		$out = '';
+		if ( null !== $compound['type'] ) {
+			$out .= '*' === $compound['type'] ? '*' : $this->render_ident( $compound['type'] );
+		}
+		foreach ( (array) $compound['subs'] as $sub ) {
+			switch ( $sub['kind'] ) {
+				case 'class':
+					$out .= '.' . $this->render_ident( $sub['name'] );
+					break;
+				case 'id':
+					$out .= '#' . $this->render_ident( $sub['name'] );
+					break;
+				case 'attr':
+					$out .= $this->render_attr_selector( $sub );
+					break;
+			}
+		}
+		return $out;
+	}
+
+	/**
+	 * Renders an attribute sub-selector as `[name]` or `[name<matcher>value modifier]`,
+	 * with optional whitespace inside the brackets. The result always ends with `]`.
+	 */
+	private function render_attr_selector( array $sub ): string {
+		$out = '[' . $this->maybe_ws( 20 ) . $this->render_ident( $sub['name'] ) . $this->maybe_ws( 20 );
+
+		if ( null === $sub['matcher'] ) {
+			return $out . ']';
+		}
+
+		// Matcher name to the operator text emitted for it.
+		$matcher_strings = array(
+			'exact' => '=',
+			'one-of' => '~=',
+			'exact-or-hyphen-suffixed' => '|=',
+			'prefixed' => '^=',
+			'suffixed' => '$=',
+			'contains' => '*=',
+		);
+		$out .= $matcher_strings[ $sub['matcher'] ] . $this->maybe_ws( 25 );
+
+		// An empty value is always rendered as a string; for it the chance( 45 ) draw is skipped by short-circuit.
+		$value = $sub['value'];
+		$value_as_ident = '' !== $value && $this->can_render_as_ident( $value ) && $this->prng->chance( 45 );
+		if ( $value_as_ident ) {
+			$out .= $this->render_ident( $value );
+		} else {
+			$out .= $this->render_string( $value );
+		}
+
+		if ( null !== $sub['modifier'] ) {
+			// After an ident value, whitespace is mandatory before the modifier.
+			$out .= $value_as_ident ? $this->ws() : $this->maybe_ws( 60 );
+
+			// Any modifier other than 'case-insensitive' is rendered as `s` / `S`.
+			if ( 'case-insensitive' === $sub['modifier'] ) {
+				$out .= $this->prng->chance( 70 ) ? 'i' : 'I';
+			} else {
+				$out .= $this->prng->chance( 70 ) ? 's' : 'S';
+			}
+		}
+
+		return $out . $this->maybe_ws( 25 ) . ']';
+	}
+
+	/**
+	 * Whether this renderer is willing to put a value in an ident token.
+	 *
+	 * The current implementation accepts every non-empty value: it does not
+	 * inspect individual codepoints, so a value ending in whitespace is
+	 * accepted too.
+	 * NOTE(review): an earlier description said such values are left to
+	 * strings — confirm whether a stricter check was intended.
+	 */
+	private function can_render_as_ident( string $value ): bool {
+		return '' !== $value;
+	}
+
+	/**
+	 * Renders a name as a CSS ident token, escaping wherever required and
+	 * sometimes where merely allowed. Parsing the result must yield $name.
+	 */
+	private function render_ident( string $name ): string {
+		// Each entry is a ( text, codepoint ) pair for one character of $name.
+		$codepoints = utf8_codepoints( $name );
+		$total = count( $codepoints );
+		$rendered = '';
+
+		foreach ( $codepoints as $index => $entry ) {
+			list( $text, $code ) = $entry;
+
+			$digit = $code >= 0x30 && $code <= 0x39;
+			$upper = $code >= 0x41 && $code <= 0x5A;
+			$lower = $code >= 0x61 && $code <= 0x7A;
+
+			// Characters that may appear unescaped somewhere in an ident.
+			$plain = $digit || $upper || $lower || $code > 0x7F || '-' === $text || '_' === $text;
+
+			if ( ! $plain ) {
+				// Anything outside the ident repertoire is always escaped.
+				$forced = true;
+			} elseif ( $digit ) {
+				// A digit may not open an ident, nor directly follow an opening hyphen.
+				$forced = 0 === $index || ( 1 === $index && '-' === $codepoints[0][0] );
+			} else {
+				// A name that is nothing but a single hyphen has that hyphen escaped.
+				$forced = 1 === $total && '-' === $text;
+			}
+
+			// The optional-escape draw only happens when no escape is forced.
+			$escape = $forced || $this->prng->chance( $this->escape_boost ? 50 : 8 );
+
+			$rendered .= $escape ? $this->render_escape( $text, $code ) : $text;
+		}
+
+		return $rendered;
+	}
+
+	/**
+	 * Builds a backslash escape for one codepoint: either an identity escape
+	 * ( backslash followed by the character itself ) or a hex escape, which is
+	 * optionally zero-padded and optionally upper-cased.
+	 *
+	 * @param string $char The character text.
+	 * @param int    $cp   Its codepoint.
+	 * @return string Escape sequence intended to decode back to the character.
+	 */
+	private function render_escape( string $char, int $cp ): string {
+		$newline_like = in_array( $char, array( "\n", "\r", "\f" ), true );
+		$hex_digit = ( $cp >= 0x30 && $cp <= 0x39 )
+			|| ( $cp >= 0x41 && $cp <= 0x46 )
+			|| ( $cp >= 0x61 && $cp <= 0x66 );
+
+		/*
+		 * An identity escape is allowed for any codepoint from U+0020 up,
+		 * except hex digits (they would start a hex escape) and newlines
+		 * (backslash-newline is not a valid escape).
+		 */
+		$identity_allowed = $cp >= 0x20 && ! $hex_digit && ! $newline_like;
+		if ( $identity_allowed && $this->prng->chance( 35 ) ) {
+			return '\\' . $char;
+		}
+
+		$digits = dechex( $cp );
+
+		// Sometimes left-pad with zeros, to a width of at most six hex digits.
+		if ( $this->prng->chance( 25 ) && strlen( $digits ) < 6 ) {
+			$width = $this->prng->int( strlen( $digits ), 6 );
+			$digits = str_pad( $digits, $width, '0', STR_PAD_LEFT );
+		}
+
+		// Sometimes use upper-case hex digits.
+		if ( $this->prng->chance( 30 ) ) {
+			$digits = strtoupper( $digits );
+		}
+
+		// The trailing space is always emitted; it is consumed by the escape.
+		return '\\' . $digits . ' ';
+	}
+
+	/**
+	 * Renders a value as a CSS string token. Parsing must yield $value.
+	 */
+	private function render_string( string $value ): string {
+		$quote = $this->prng->chance( 60 ) ? '"' : "'";
+		$out = $quote;
+		$points = utf8_codepoints( $value );
+
+		foreach ( $points as $point ) {
+			list( $char, $cp ) = $point;
+
+			if ( "\n" === $char || "\r" === $char || "\f" === $char ) {
+				// Literal newlines end (break) the string; always hex-escape.
+				$out .= '\\' . dechex( $cp ) . ' ';
+				continue;
+			}
+			// The active quote character and the backslash are always escaped, one way or the other.
+			if ( $char === $quote || '\\' === $char ) {
+				$out .= $this->prng->chance( 60 ) ? '\\' . $char : '\\' . dechex( $cp ) . ' ';
+				continue;
+			}
+			// Occasionally escape a character that did not need it.
+			if ( $this->prng->chance( 5 ) ) {
+				$out .= $this->render_escape( $char, $cp );
+				continue;
+			}
+			$out .= $char;
+		}
+
+		// Rarely add a backslash-newline line continuation (decodes to nothing).
+		if ( $this->prng->chance( 4 ) ) {
+			$out .= "\\\n";
+		}
+
+		return $out . $quote;
+	}
+
+	/**
+	 * Returns one randomly chosen run of whitespace; never the empty string.
+	 */
+	private function ws(): string {
+		$options = array( ' ', ' ', ' ', "\t", "\n", "\f", "\r", ' ', " \t " );
+		return $this->prng->choice( $options );
+	}
+
+	/**
+	 * Returns whitespace from ws() with the given percent chance, otherwise ''.
+	 *
+	 * @param int $percent Chance, in percent, of emitting whitespace.
+	 */
+	private function maybe_ws( int $percent ): string {
+		return $this->prng->chance( $percent ) ? $this->ws() : '';
+	}
+
+	/**
+	 * Randomly upper- or lower-cases every byte of the input.
+	 *
+	 * Exactly one PRNG draw is made per byte. Case mapping goes through the
+	 * ASCII-only helpers used elsewhere in this class rather than
+	 * strtoupper() / strtolower(), which are locale-dependent before
+	 * PHP 8.2 and could rewrite bytes of a multibyte character.
+	 */
+	private function random_case( string $input ): string {
+		$out = '';
+		$length = strlen( $input );
+		for ( $i = 0; $i < $length; $i++ ) {
+			$c = $input[ $i ];
+			$out .= $this->prng->chance( 50 ) ? ascii_strtoupper( $c ) : ascii_strtolower( $c );
+		}
+		return $out;
+	}
+
+	/*
+	 * ---------------------
+	 * Unsupported selectors
+	 * ---------------------
+	 */
+
+	/**
+	 * Builds selector text for one of the weighted "unsupported" kinds:
+	 * pseudo-classes, pseudo-elements, sibling and column combinators,
+	 * namespace prefixes, and context selectors that are not a bare type.
+	 */
+	private function gen_unsupported(): string {
+		$kind = $this->prng->weighted(
+			array(
+				'pseudo-class' => 25,
+				'pseudo-element' => 15,
+				'sibling-combinator' => 20,
+				'column-combinator' => 8,
+				'namespace-type' => 12,
+				'namespace-attr' => 8,
+				'non-type-context' => 12,
+			)
+		);
+
+		switch ( $kind ) {
+			case 'pseudo-class':
+				$pseudo = $this->prng->choice(
+					array(
+						':hover',
+						':focus',
+						':first-child',
+						':last-child',
+						':nth-child(2n+1)',
+						':nth-of-type(3)',
+						':not(.excluded)',
+						':is(div, span)',
+						':where(*)',
+						':root',
+						':empty',
+						':checked',
+						':lang(en)',
+						':has(> img)',
+					)
+				);
+				return $this->render_compound( $this->gen_compound() ) . $pseudo;
+
+			case 'pseudo-element':
+				$pseudo = $this->prng->choice( array( '::before', '::after', '::first-line', '::first-letter', '::marker', '::placeholder' ) );
+				return $this->render_compound( $this->gen_compound() ) . $pseudo;
+
+			case 'sibling-combinator':
+				$combinator = $this->prng->choice( array( '+', '~' ) );
+				return $this->render_compound( $this->gen_compound() )
+					. $this->maybe_ws( 60 ) . $combinator . $this->maybe_ws( 60 )
+					. $this->render_compound( $this->gen_compound() );
+
+			case 'column-combinator':
+				return $this->gen_type_name( true )
+					. $this->maybe_ws( 50 ) . '||' . $this->maybe_ws( 50 )
+					. $this->gen_type_name( true );
+
+			case 'namespace-type':
+				$ns = $this->prng->choice( array( 'svg', 'html', '*', '' ) );
+				return $ns . '|' . $this->prng->choice( array( 'title', 'a', 'circle', 'div' ) );
+
+			case 'namespace-attr':
+				// `[ns|name]` — must not be confused with the `|=` matcher,
+				// so the char after `|` must not be `=`.
+				$ns = $this->prng->choice( array( 'xlink', 'svg', 'xml' ) );
+				return '[' . $ns . '|href]';
+
+			case 'non-type-context':
+			default:
+				// A context selector that is not a bare type selector.
+				$context = $this->prng->choice( array( '.ctx', '#ctx', '[ctx]', 'div.ctx', 'div#ctx', 'div[ctx]', '*.ctx' ) );
+				$joiner = $this->prng->chance( 50 )
+					? $this->ws()
+					: $this->maybe_ws( 50 ) . '>' . $this->maybe_ws( 50 );
+				return $context . $joiner . $this->render_compound( $this->gen_compound() );
+		}
+	}
+
+	/*
+	 * -----------------
+	 * Invalid selectors
+	 * -----------------
+	 */
+
+	/**
+	 * Builds selector text for one of the weighted "invalid" kinds: a fixed
+	 * template, a rendered compound with garbage before or after it, or a
+	 * malformed comma-separated list.
+	 */
+	private function gen_invalid(): string {
+		$kind = $this->prng->weighted(
+			array(
+				'template' => 45,
+				'trailing-garbage' => 25,
+				'leading-garbage' => 15,
+				'comma-trouble' => 15,
+			)
+		);
+
+		switch ( $kind ) {
+			case 'template':
+				return $this->prng->choice(
+					array(
+						'',
+						' ',
+						"\t\n\f ",
+						'.',
+						'a.',
+						'#',
+						'[',
+						']',
+						'[]',
+						'[ ]',
+						'.5x',
+						'#5',
+						'. x',
+						'..a',
+						'.#a',
+						/*
+						 * EOF auto-closes an open attribute selector block
+						 * ( '[a', '[a=b', '[a="b]', '[a=b i' are valid ), but
+						 * grammar-level truncation is still invalid.
+						 */
+						'[a=',
+						'[a= ',
+						'[a~',
+						'[a^',
+						'[a=]',
+						'[=b]',
+						'[a==b]',
+						'[a~b]',
+						'[a!=b]',
+						"[a=\"b\nc\"]",
+						"[a=\"b\nc",
+						'[a=b x]',
+						'[a=b x',
+						'[a=b ix]',
+						'[a=b ix',
+						'[a=b i x',
+						'[5=b]',
+						'[5=b',
+						'a >',
+						'> a',
+						'a > > b',
+						'a >> b',
+						'>',
+						'-',
+						// A lone '\' is a valid escape at EOF ( type selector U+FFFD );
+						// '\' before a newline is not a valid escape.
+						"\\\n",
+						"a\\\nb",
+						'a/**/b',
+						'!important',
+						'@media screen',
+						'{}',
+						';',
+						'a;b',
+						'a{color:red}',
+						'()',
+						'a()',
+						'*5',
+						'%',
+						'a%',
+					)
+				);
+
+			case 'trailing-garbage':
+				$garbage = $this->prng->choice( array( ':', '(', ')', '{', '}', ';', '!', '@', '%', '/', '=', '|', '^', '$' ) );
+				return $this->render_compound( $this->gen_compound() ) . $garbage;
+
+			case 'leading-garbage':
+				$garbage = $this->prng->choice( array( '%', ';', ')', '}', '=', '~', '+', '/', ',' ) );
+				return $garbage . $this->render_compound( $this->gen_compound() );
+
+			case 'comma-trouble':
+			default:
+				$compound = $this->render_compound( $this->gen_compound() );
+				return $this->prng->choice(
+					array(
+						$compound . ',',
+						',' . $compound,
+						$compound . ',,' . $compound,
+						$compound . ', ,' . $compound,
+						$compound . ' , ',
+					)
+				);
+		}
+	}
+
+	/*
+	 * -----
+	 * Chaos
+	 * -----
+	 */
+
+	/**
+	 * Builds a random string from one of several weighted alphabets.
+	 *
+	 * The ASCII alphabets are sampled byte by byte ( up to 40 bytes ). The
+	 * 'unicode' alphabet is sampled codepoint by codepoint ( up to 24 ),
+	 * mixed with a little selector punctuation, so multibyte characters are
+	 * never split.
+	 */
+	private function gen_chaos(): string {
+		$alphabets = array(
+			// Tab and newline need a double-quoted segment: single quotes would yield a literal backslash plus letter.
+			'css' => '.#[]=~|^$*>+,:()"\'\\ ' . "\t\n" . '-_',
+			'ident' => 'abcXYZ019-_',
+			'mixed' => '.#[]=~|^$*>+,:()"\'\\ abcXYZ019-_iIsS',
+			'unicode' => '✓Ωé🙂',
+		);
+
+		$alphabet_key = $this->prng->weighted(
+			array(
+				'css' => 25,
+				'ident' => 15,
+				'mixed' => 45,
+				'unicode' => 15,
+			)
+		);
+		$alphabet = $alphabets[ $alphabet_key ];
+
+		// Branch on the selected key: the alphabet string itself never equals 'unicode'.
+		if ( 'unicode' === $alphabet_key ) {
+			$points = utf8_codepoints( $alphabet . '.#[]= aZ9' );
+			$length = $this->prng->int( 0, 24 );
+			$out = '';
+			for ( $i = 0; $i < $length; $i++ ) {
+				$out .= $this->prng->choice( $points )[0];
+			}
+			return $out;
+		}
+
+		$length = $this->prng->int( 0, 40 );
+		$out = '';
+		for ( $i = 0; $i < $length; $i++ ) {
+			$out .= $alphabet[ $this->prng->int( 0, strlen( $alphabet ) - 1 ) ];
+		}
+		return $out;
+	}
+
+	/*
+	 * --------
+	 * Mutation
+	 * --------
+	 */
+
+	/**
+	 * Applies between one and four random byte-level mutations to a selector:
+	 * insert, delete, replace, duplicate a short span, flip the case of one
+	 * byte, or splice in an ill-formed UTF-8 sequence. All offsets are byte
+	 * offsets, so a mutation may split a multibyte character.
+	 */
+	private function mutate( string $selector ): string {
+		$mutation_count = $this->prng->int( 1, 4 );
+		// Tab and newline need a double-quoted segment: single quotes would yield a literal backslash plus letter.
+		$alphabet = '.#[]=~|^$*>+,:()"\'\\ ' . "\t\n" . 'iIsSabcXYZ019-_';
+
+		for ( $m = 0; $m < $mutation_count; $m++ ) {
+			// Re-measured every round because earlier mutations change the length.
+			$length = strlen( $selector );
+			$kind = $this->prng->weighted(
+				array(
+					'insert' => 30,
+					'delete' => 25,
+					'replace' => 25,
+					'duplicate' => 10,
+					'case-flip' => 10,
+					'invalid-utf8' => 12,
+				)
+			);
+
+			switch ( $kind ) {
+				case 'insert':
+					$at = $this->prng->int( 0, $length );
+					$char = $alphabet[ $this->prng->int( 0, strlen( $alphabet ) - 1 ) ];
+					$selector = substr( $selector, 0, $at ) . $char . substr( $selector, $at );
+					break;
+
+				case 'delete':
+					if ( $length > 0 ) {
+						$at = $this->prng->int( 0, $length - 1 );
+						$selector = substr( $selector, 0, $at ) . substr( $selector, $at + 1 );
+					}
+					break;
+
+				case 'replace':
+					if ( $length > 0 ) {
+						$at = $this->prng->int( 0, $length - 1 );
+						$char = $alphabet[ $this->prng->int( 0, strlen( $alphabet ) - 1 ) ];
+						$selector = substr( $selector, 0, $at ) . $char . substr( $selector, $at + 1 );
+					}
+					break;
+
+				case 'duplicate':
+					// Repeat a span of one to six bytes immediately after itself.
+					if ( $length > 0 ) {
+						$start = $this->prng->int( 0, $length - 1 );
+						$span = $this->prng->int( 1, min( 6, $length - $start ) );
+						$selector = substr( $selector, 0, $start + $span )
+							. substr( $selector, $start, $span )
+							. substr( $selector, $start + $span );
+					}
+					break;
+
+				case 'case-flip':
+					if ( $length > 0 ) {
+						$at = $this->prng->int( 0, $length - 1 );
+						$char = $selector[ $at ];
+						$flip = ctype_lower( $char ) ? strtoupper( $char ) : strtolower( $char );
+						$selector = substr( $selector, 0, $at ) . $flip . substr( $selector, $at + 1 );
+					}
+					break;
+
+				case 'invalid-utf8':
+					// Splice a raw ill-formed sequence at an arbitrary byte
+					// offset — possibly splitting an existing multibyte
+					// character or landing before a continuation byte that
+					// completes a truncated lead. No expectations here; these
+					// exercise crash / scrub-notice / differential paths.
+					$bytes = $this->prng->choice( array_column( self::INVALID_UTF8_CLASSES, 0 ) );
+					$at = $this->prng->int( 0, $length );
+					$selector = substr( $selector, 0, $at ) . $bytes . substr( $selector, $at );
+					break;
+			}
+		}
+
+		return $selector;
+	}
+}
diff --git a/tools/css-selector-fuzz/lib/TreeCapture.php b/tools/css-selector-fuzz/lib/TreeCapture.php
new file mode 100644
index 0000000000000..2350db024c8ad
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/TreeCapture.php
@@ -0,0 +1,145 @@
+,
+ * ancestorTags: string[] nearest-first )
+ * tag row: same without ancestorTags.
+ */ +class TreeCapture { + + const CAPTURE_ITERATION_LIMIT = 20000; + + /** + * Captures the processor's view of a document or a fragment. + * + * @param string $html The markup ( full document or fragment ). + * @param string|null $context When set, parse as a fragment in this + * context ( e.g. '' ); the tag + * processor has no fragment mode, so tagRows + * is null in that case. + * @return array{ + * htmlRows: array|null, + * tagRows: array|null, + * quirks: bool, + * error: string|null, + * } + */ + public static function capture( string $html, ?string $context = null ): array { + $out = array( + 'htmlRows' => null, + 'tagRows' => null, + 'quirks' => false, + 'error' => null, + ); + + $processor = null === $context + ? \WP_HTML_Processor::create_full_parser( $html ) + : \WP_HTML_Processor::create_fragment( $html, $context ); + if ( null === $processor ) { + $out['error'] = 'fragment-context-unsupported'; + return $out; + } + $rows = array(); + $iterations = 0; + while ( $processor->next_tag() ) { + if ( ++$iterations > self::CAPTURE_ITERATION_LIMIT ) { + $out['error'] = 'html-capture-iteration-limit'; + return $out; + } + $breadcrumbs = $processor->get_breadcrumbs(); + array_pop( $breadcrumbs ); + $rows[] = array( + 'tag' => (string) $processor->get_tag(), + 'fid' => self::fid_of( $processor ), + 'attrs' => self::attrs_of( $processor ), + 'ancestorTags' => array_reverse( $breadcrumbs ), + 'namespace' => $processor->get_namespace(), + ); + } + + if ( null !== $processor->get_last_error() ) { + $out['error'] = 'html-processor-error: ' . $processor->get_last_error(); + return $out; + } + if ( null !== $processor->get_unsupported_exception() ) { + $out['error'] = 'html-processor-unsupported: ' . $processor->get_unsupported_exception()->getMessage(); + return $out; + } + + $out['htmlRows'] = $rows; + $out['quirks'] = $processor->is_quirks_mode(); + + // The tag processor has no fragment mode; a fragment case exercises + // the html processor's select() only. 
+ if ( null !== $context ) { + return $out; + } + + $tag_processor = new \WP_HTML_Tag_Processor( $html ); + $tag_rows = array(); + $iterations = 0; + while ( $tag_processor->next_tag() ) { + if ( ++$iterations > self::CAPTURE_ITERATION_LIMIT ) { + $out['error'] = 'tag-capture-iteration-limit'; + return $out; + } + $tag_rows[] = array( + 'tag' => (string) $tag_processor->get_tag(), + 'fid' => self::fid_of( $tag_processor ), + 'attrs' => self::attrs_of( $tag_processor ), + ); + } + $out['tagRows'] = $tag_rows; + + return $out; + } + + /** The element's data-fid, or the same placeholder collect_matches() uses. */ + private static function fid_of( $processor ): string { + $fid = $processor->get_attribute( 'data-fid' ); + return is_string( $fid ) ? self::sanitize_fid( $fid ) : '(missing-fid:' . $processor->get_tag() . ')'; + } + + /** + * Replaces the lexbor protocol framing bytes ( TAB / LF / CR ) in a fid + * with '?'. Generated fids never contain these, but the lexbor harness + * applies the same replacement, so matching this here keeps the two trees + * comparable even for a hypothetical control-char fid ( the worst case is + * a benign tree-gated skip, never a false divergence ). + */ + public static function sanitize_fid( string $fid ): string { + return strtr( $fid, "\t\n\r", '???' ); + } + + /** + * All attributes as ( lowercase name, decoded value ) pairs, excluding + * data-fid ( stored separately, mirroring the generated model's shape ). + * + * @return array + */ + private static function attrs_of( $processor ): array { + $attrs = array(); + foreach ( (array) $processor->get_attribute_names_with_prefix( '' ) as $name ) { + if ( 'data-fid' === $name ) { + continue; + } + $value = $processor->get_attribute( $name ); + $attrs[] = array( $name, true === $value ? 
true : (string) $value ); + } + return $attrs; + } +} diff --git a/tools/css-selector-fuzz/lib/WildDocumentGenerator.php b/tools/css-selector-fuzz/lib/WildDocumentGenerator.php new file mode 100644 index 0000000000000..3cd7dada17cc6 --- /dev/null +++ b/tools/css-selector-fuzz/lib/WildDocumentGenerator.php @@ -0,0 +1,381 @@ + '', + 'html' => '', + 'legacy-compat' => '', + 'quirky' => '', + 'limited' => '', + ); + + /** @var Prng */ + private $prng; + private $fid_counter = 0; + private $pools; + + private function __construct( Prng $prng ) { + $this->prng = $prng; + $this->pools = array( + 'tags' => array( 'html', 'head', 'body' ), + 'classes' => array(), + 'ids' => array(), + 'attrNames' => array(), + 'attrValues' => array(), + ); + } + + /** + * @return array{model: null, html: string, pools: array, wild: true, doctype: string} + */ + public static function generate( Prng $prng ): array { + $generator = new self( $prng ); + return $generator->build(); + } + + private function build(): array { + $doctype_kind = $this->prng->weighted( + array( + 'none' => 25, + 'html' => 45, + 'legacy-compat' => 10, + 'quirky' => 12, + 'limited' => 8, + ) + ); + + $out = self::DOCTYPES[ $doctype_kind ]; + + if ( $this->prng->chance( 15 ) ) { + $out .= 'render_attrs( $this->random_attrs() ) . '>'; + } + if ( $this->prng->chance( 10 ) ) { + $out .= 'render_attrs( $this->random_attrs() ) . '>'; + } + + $max_elements = $this->prng->int( 4, 35 ); + $token_budget = $this->prng->int( 8, 70 ); + $open = array(); + + for ( $i = 0; $i < $token_budget; $i++ ) { + $in_table = $this->in_table_context( $open ); + + $kind = $this->prng->weighted( + array( + 'start' => 42, + 'void' => $in_table ? 0 : 8, + 'end' => 24, + 'text' => 16, + 'comment' => 5, + 'stray' => $in_table ? 0 : 5, + ) + ); + + switch ( $kind ) { + case 'start': + if ( $this->fid_counter >= $max_elements ) { + break; + } + $tag = $in_table + ? 
$this->prng->choice( array( 'caption', 'colgroup', 'thead', 'tbody', 'tfoot', 'tr', 'tr', 'td', 'td', 'th' ) ) + : $this->prng->choice( self::TAGS ); + if ( 'a' === $tag && in_array( 'a', $open, true ) ) { + // A nested immediately runs the adoption agency. + $tag = 'span'; + } + $this->pools['tags'][] = $tag; + $out .= '<' . $this->maybe_case( $tag ) + . ' data-fid="w' . $this->fid_counter++ . '"' + . $this->render_attrs( $this->random_attrs() ) . '>'; + $open[] = $tag; + break; + + case 'void': + if ( $this->fid_counter >= $max_elements ) { + break; + } + $tag = $this->prng->choice( self::VOID_TAGS ); + $this->pools['tags'][] = $tag; + $out .= '<' . $this->maybe_case( $tag ) + . ' data-fid="w' . $this->fid_counter++ . '"' + . $this->render_attrs( $this->random_attrs() ) + . ( $this->prng->chance( 25 ) ? ' />' : '>' ); + break; + + case 'end': + if ( array() === $open ) { + break; + } + $pick = $this->prng->weighted( + array( + 'top' => 60, + 'random' => 40, + ) + ); + if ( 'top' === $pick ) { + $tag = array_pop( $open ); + } else { + /* + * Close a non-top open element: misnesting. Never + * across a formatting element — the processor only + * supports the trivial adoption-agency cases and + * bails on the rest ( "any other end tag" / + * "common ancestor" / reconstruction-with-rewind ). + */ + $formatting = array( 'a', 'b', 'i', 'em', 'strong', 'u', 's', 'code', 'small' ); + $lowest = count( $open ) - 1; + while ( $lowest > 0 && ! in_array( $open[ $lowest ], $formatting, true ) ) { + $lowest--; + } + if ( in_array( $open[ $lowest ], $formatting, true ) ) { + $lowest++; + } + if ( $lowest > count( $open ) - 1 ) { + $tag = array_pop( $open ); + } else { + $at = $this->prng->int( $lowest, count( $open ) - 1 ); + $tag = $open[ $at ]; + array_splice( $open, $at, 1 ); + } + } + $out .= 'maybe_case( $tag ) . '>'; + break; + + case 'text': + // Non-whitespace text in table context is unsupported + // (pending-table-character-tokens), keep it whitespace. 
+ $out .= $in_table + ? "\n " + : $this->prng->choice( + array( + 'text', + ' wild text ', + "\n", + '& <x>', + 'café ✓', + 'a < b', + ) + ); + break; + + case 'comment': + $out .= ''; + break; + + case 'stray': + // An end tag for something that is not open. + // No formatting tags here: a stray formatting end tag + // runs the adoption agency's unsupported branches. + $out .= 'prng->choice( array( 'div', 'p', 'table', 'tr', 'li', 'span', 'x-wild' ) ) . '>'; + break; + } + } + + // Leave roughly half of the still-open elements unclosed. + foreach ( array_reverse( $open ) as $tag ) { + if ( $this->prng->chance( 50 ) ) { + $out .= 'maybe_case( $tag ) . '>'; + } + } + + foreach ( $this->pools as $key => $values ) { + $this->pools[ $key ] = array_values( array_unique( $values ) ); + } + + return array( + 'model' => null, + 'html' => $out, + 'pools' => $this->pools, + 'wild' => true, + 'doctype' => $doctype_kind, + ); + } + + /** + * Whether the insertion point is in table context outside any cell or + * caption — where arbitrary content would foster-parent (unsupported). 
+ */ + private function in_table_context( array $open ): bool { + for ( $i = count( $open ) - 1; $i >= 0; $i-- ) { + $tag = $open[ $i ]; + if ( in_array( $tag, array( 'td', 'th', 'caption' ), true ) ) { + return false; + } + if ( in_array( $tag, array( 'table', 'thead', 'tbody', 'tfoot', 'tr', 'colgroup' ), true ) ) { + return true; + } + } + return false; + } + + /** @return array */ + private function random_attrs(): array { + $attrs = array(); + $count = $this->prng->weighted( array( 0 => 30, 1 => 35, 2 => 25, 3 => 10 ) ); + + for ( $i = 0; $i < $count; $i++ ) { + $name = $this->prng->choice( DocumentGenerator::ATTR_NAMES ); + + $lower = ascii_strtolower( $name ); + if ( 'class' === $lower ) { + $words = array(); + $n = $this->prng->int( 1, 3 ); + for ( $j = 0; $j < $n; $j++ ) { + $word = $this->maybe_inject_class_nul( $this->random_word() ); + $words[] = $word; + foreach ( DocumentGenerator::class_tokens( $word ) as $token ) { + $this->pools['classes'][] = $token; + } + } + $value = implode( ' ', $words ); + } elseif ( 'id' === $lower ) { + $value = $this->random_word(); + $this->pools['ids'][] = $value; + } elseif ( $this->prng->chance( 15 ) ) { + $value = true; + } else { + $value = $this->random_word(); + if ( $this->prng->chance( 20 ) ) { + $value .= ' ' . $this->random_word(); + } + } + + $this->pools['attrNames'][] = $lower; + if ( is_string( $value ) && 'class' !== $lower ) { + $this->pools['attrValues'][] = $value; + } + $attrs[] = array( $name, $value ); + } + + return $attrs; + } + + private function maybe_inject_class_nul( string $class ): string { + if ( '' === $class || ! $this->prng->chance( 12 ) ) { + return $class; + } + + $points = utf8_codepoints( $class ); + $at = $this->prng->int( 0, count( $points ) ); + $out = ''; + foreach ( $points as $i => $point ) { + if ( $i === $at ) { + $out .= "\0"; + } + $out .= $point[0]; + } + return $at === count( $points ) ? $out . 
"\0" : $out; + } + + private function render_attrs( array $attrs ): string { + $out = ''; + foreach ( $attrs as $attr ) { + list( $name, $value ) = $attr; + if ( true === $value ) { + $out .= ' ' . $name; + continue; + } + $out .= ' ' . $name . '="' . str_replace( array( '&', '"', '<' ), array( '&', '"', '<' ), $value ) . '"'; + } + return $out; + } + + private function random_word(): string { + $stems = array( 'wild', 'soup', 'alpha', 'beta', 'item', 'note', 'x', 'mixedCase', 'Über', 'main-thing', '--var', '_u' ); + $word = $this->prng->choice( $stems ); + if ( $this->prng->chance( 30 ) ) { + $word .= (string) $this->prng->int( 0, 99 ); + } + return $word; + } + + private function maybe_case( string $tag ): string { + if ( ! $this->prng->chance( 15 ) ) { + return $tag; + } + $out = ''; + for ( $i = 0; $i < strlen( $tag ); $i++ ) { + $c = $tag[ $i ]; + $out .= $this->prng->chance( 50 ) ? strtoupper( $c ) : strtolower( $c ); + } + return $out; + } +} diff --git a/tools/css-selector-fuzz/lib/Worker.php b/tools/css-selector-fuzz/lib/Worker.php new file mode 100644 index 0000000000000..f701d87974657 --- /dev/null +++ b/tools/css-selector-fuzz/lib/Worker.php @@ -0,0 +1,1308 @@ +chance( 30 ); + $is_fragment = ! $is_wild && $prng->chance( 20 ); + + $failures = array(); + $record = static function ( string $invariant, array $detail ) use ( &$failures ) { + $failures[] = array( + 'invariant' => $invariant, + 'detail' => $detail, + ); + }; + $match_stats = array(); + + /* + * The processor's own parse is the matching oracle's ground truth. + * For safe (model-built) documents the model must agree with the + * capture — that soundness check is what lets the capture be trusted + * on wild documents, where no model exists. 
+ * + * Wild documents that hit one of the processor's unsupported + * constructs (it bails on foster parenting, complex adoption-agency + * runs, …) are deterministically regenerated a bounded number of + * times so nearly every wild case carries a usable ground truth. + */ + $document = null; + $capture = null; + $capture_error = null; + $attempts = $is_wild ? 8 : 1; + for ( $attempt = 0; $attempt < $attempts; $attempt++ ) { + if ( $is_wild ) { + $document = WildDocumentGenerator::generate( $prng->fork( "wild-document:{$attempt}" ) ); + } elseif ( $is_fragment ) { + $document = DocumentGenerator::generate_fragment( $prng->fork( 'fragment' ) ); + } else { + $document = DocumentGenerator::generate( $prng->fork( 'document' ) ); + } + + $context = ( $document['fragment'] ?? false ) ? $document['context'] : null; + list( $capture, $capture_error ) = self::guard( + static function () use ( $document, $context ) { + return TreeCapture::capture( $document['html'], $context ); + } + ); + + if ( null === $capture_error && null === $capture['error'] ) { + break; + } + } + + $rows = null; + $tag_rows = null; + $quirks = false; + + if ( null !== $capture_error ) { + $record( 'model-desync', array( 'phase' => 'capture', 'error' => self::describe_throwable( $capture_error ) ) ); + } elseif ( null !== $capture['error'] ) { + if ( ! $is_wild ) { + $record( 'model-desync', array( 'phase' => 'capture', 'error' => $capture['error'] ) ); + } + // Wild markup the processor cannot fully visit is skipped: + // parsing invariants still run, matching has no ground truth. + } else { + $rows = $capture['htmlRows']; + $tag_rows = $capture['tagRows']; + $quirks = $capture['quirks']; + + if ( $is_fragment ) { + self::check_fragment_capture_against_model( $document, $capture, $record ); + } elseif ( ! 
$is_wild ) { + self::check_capture_against_model( $document, $capture, $record ); + } + } + + $path_rows = null; + if ( null !== $rows ) { + $path_rows = array(); + foreach ( $rows as $row ) { + if ( 0 !== strpos( $row['fid'], '(missing-fid:' ) ) { + $path_rows[] = $row; + } + } + } + + $selector = SelectorGenerator::generate( $prng->fork( 'selector' ), $document['pools'], $path_rows ); + + $selector_string = $selector['selector']; + + // --- Parse phase ------------------------------------------------- + + list( $compound_list, $compound_error ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Compound_Selector_List::from_selectors( $selector_string ); + } + ); + list( $complex_list, $complex_error ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Complex_Selector_List::from_selectors( $selector_string ); + } + ); + + if ( null !== $compound_error ) { + $record( 'parse-error', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $compound_error ) ) ); + } + if ( null !== $complex_error ) { + $record( 'parse-error', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $complex_error ) ) ); + } + + if ( null === $compound_error && null !== $selector['expectCompound'] && $selector['expectCompound'] !== ( null !== $compound_list ) ) { + $record( + 'parse-expectation', + array( + 'grammar' => 'compound', + 'expected' => $selector['expectCompound'] ? 'parse' : 'null', + 'actual' => null !== $compound_list ? 'parse' : 'null', + ) + ); + } + if ( null === $complex_error && null !== $selector['expectComplex'] && $selector['expectComplex'] !== ( null !== $complex_list ) ) { + $record( + 'parse-expectation', + array( + 'grammar' => 'complex', + 'expected' => $selector['expectComplex'] ? 'parse' : 'null', + 'actual' => null !== $complex_list ? 
'parse' : 'null', + ) + ); + } + + if ( null !== $compound_list && null === $complex_list && null === $complex_error ) { + $record( 'compound-implies-complex', array() ); + } + + // Parse determinism: a second parse must agree with the first. + list( $compound_again, ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Compound_Selector_List::from_selectors( $selector_string ); + } + ); + list( $complex_again, ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Complex_Selector_List::from_selectors( $selector_string ); + } + ); + if ( ( null === $compound_list ) !== ( null === $compound_again ) || ( null === $complex_list ) !== ( null === $complex_again ) ) { + $record( 'parse-determinism', array( 'note' => 'null-ness changed between identical parses' ) ); + } + + // --- AST extraction ---------------------------------------------- + + $compound_ast = null; + $complex_ast = null; + + if ( null !== $compound_list ) { + list( $compound_ast, $shape_error ) = self::guard( + static function () use ( $compound_list ) { + return AstExtractor::from_compound_list( $compound_list ); + } + ); + if ( null !== $shape_error ) { + $record( 'ast-shape', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $shape_error ) ) ); + } + } + if ( null !== $complex_list ) { + list( $complex_ast, $shape_error ) = self::guard( + static function () use ( $complex_list ) { + return AstExtractor::from_complex_list( $complex_list ); + } + ); + if ( null !== $shape_error ) { + $record( 'ast-shape', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $shape_error ) ) ); + } + } + + if ( null !== $compound_ast && null !== $complex_ast && $compound_ast !== $complex_ast ) { + $record( + 'ast-cross-grammar', + array( + 'compoundAst' => $compound_ast, + 'complexAst' => $complex_ast, + ) + ); + } + + if ( null !== $selector['ast'] && null !== $complex_ast && $selector['ast'] !== $complex_ast ) { + $record( + 
'ast-mismatch', + array( + 'generatedAst' => $selector['ast'], + 'parsedAst' => $complex_ast, + ) + ); + } + + // --- Match phase --------------------------------------------------- + + $html_matches = null; + // 'n/a' = the lexbor differential does not apply to this case + // ( unparseable selector, fragment, no captured tree ). Distinct from + // 'unavailable', which check_lexbor_differential reports only when the + // harness itself is missing or died — so a silently-dropped third + // oracle shows up in the per-batch tally instead of hiding in 'off'. + $lexbor_state = 'n/a'; + if ( null !== $complex_ast && null !== $rows ) { + $expected = ReferenceMatcher::expected_html_matches_rows( $complex_ast, $rows, $quirks ); + + /* + * Path-directed selectors are guaranteed by construction to match + * ( or, for near-misses, not to match ) a specific element. The + * reference matcher disagreeing means the generator or the + * reference matcher itself is wrong — a fuzzer-side defect. + */ + $must_match = $selector['mustMatchFid'] ?? null; + $must_not_match = $selector['mustNotMatchFid'] ?? null; + if ( null !== $must_match && ! in_array( $must_match, $expected, true ) ) { + $record( + 'path-expectation', + array( + 'expectation' => 'must-match', + 'fid' => $must_match, + 'expected' => $expected, + ) + ); + } + if ( null !== $must_not_match && in_array( $must_not_match, $expected, true ) ) { + $record( + 'path-expectation', + array( + 'expectation' => 'must-not-match', + 'fid' => $must_not_match, + 'expected' => $expected, + ) + ); + } + + $html_matches = self::check_select_matches( 'html', $selector_string, $document, $expected, $record ); + if ( null !== $html_matches ) { + self::note_match_assertion( $match_stats, 'html', $expected, $html_matches ); + } + + // lexbor parses full documents only; fragments skip it. + if ( ! ( $document['fragment'] ?? 
false ) ) { + $lexbor_state = self::check_lexbor_differential( $complex_ast, $selector_string, $document, $rows, $quirks, $expected, $record ); + } + } elseif ( null === $complex_list && null === $complex_error ) { + self::check_select_rejection( 'html', $selector_string, $document, $record ); + } + + if ( null !== $compound_ast && null !== $tag_rows ) { + $expected = ReferenceMatcher::expected_tag_matches_rows( $compound_ast, $tag_rows ); + $tag_matches = self::check_select_matches( 'tag', $selector_string, $document, $expected, $record ); + if ( null !== $tag_matches ) { + self::note_match_assertion( $match_stats, 'tag', $expected, $tag_matches ); + } + } elseif ( null === $compound_list && null === $compound_error ) { + self::check_select_rejection( 'tag', $selector_string, $document, $record ); + } + + // --- Metamorphic phase ---------------------------------------------- + // Oracle-free relations: meaning-preserving transforms of the selector + // must select exactly the same elements. Run only on otherwise-clean + // cases so a single root cause does not multiply into noise. 
+ + if ( null !== $complex_ast && null !== $html_matches && array() === $failures ) { + self::check_metamorphic( $complex_ast, $html_matches, $document, $prng->fork( 'metamorph' ), $record ); + } + + $digest = sha1( + json_encode_safe( + array( + $selector_string, + $document['html'], + null !== $compound_list, + null !== $complex_list, + $compound_ast, + $complex_ast, + array_map( + static function ( $failure ) { + return $failure['invariant']; + }, + $failures + ), + ) + ) + ); + + $signatures = array(); + foreach ( $failures as $failure ) { + $signatures[] = self::signature( $failure ); + } + + return array( + 'seed' => $seed, + 'bucket' => $selector['bucket'], + 'digest' => $digest, + 'failures' => $failures, + 'signatures' => array_values( array_unique( $signatures ) ), + 'selector' => $selector_string, + 'html' => $document['html'], + 'lexbor' => $lexbor_state, + 'matchStats' => $match_stats, + ); + } + + /** + * Runs the SELF-CONTAINED invariants on an explicit ( selector, html ) + * pair — no generated model, intended AST, or parse expectation. This is + * what the minimizer drives: every checked property is computable from + * the pair alone ( WP select() vs the reference matcher over WP's own + * parsed AST and the captured tree; metamorphic relations; the lexbor + * differential; parse/shape/cross-grammar invariants; rejection + * bookkeeping for unparseable selectors ). + * + * Bug 1 surfaces here as metamorphic-ast, Bug 2 as match-mismatch-*, + * Bug 3 as metamorphic-parse — so all three known bugs are minimizable + * without the generator. 
+ * + * @return array{ + * failures: array, + * signatures: string[], + * } + */ + public static function run_pair( string $selector_string, string $html, ?string $target = null ): array { + Bootstrap::load(); + + $failures = array(); + $record = static function ( string $invariant, array $detail ) use ( &$failures ) { + $failures[] = array( + 'invariant' => $invariant, + 'detail' => $detail, + ); + }; + + // When the minimizer fixes a target signature, the metamorphic loop + // ( the only expensive, multi-draw stage ) is only worth running if + // the target is itself a metamorphic signature. + $target_invariant = null === $target ? null : substr( strrchr( $target, ':' ), 1 ); + $target_is_metamorph = null !== $target_invariant && 0 === strpos( $target_invariant, 'metamorphic' ); + $has_target_signature = static function () use ( &$failures, $target ) { + if ( null === $target ) { + return false; + } + foreach ( $failures as $failure ) { + if ( self::signature( $failure ) === $target ) { + return true; + } + } + return false; + }; + + list( $capture, $capture_error ) = self::guard( + static function () use ( $html ) { + return TreeCapture::capture( $html ); + } + ); + + $rows = null; + $tag_rows = null; + $quirks = false; + if ( null === $capture_error && null === $capture['error'] ) { + $rows = $capture['htmlRows']; + $tag_rows = $capture['tagRows']; + $quirks = $capture['quirks']; + } + + $document = array( 'html' => $html ); + + list( $compound_list, $compound_error ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Compound_Selector_List::from_selectors( $selector_string ); + } + ); + list( $complex_list, $complex_error ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Complex_Selector_List::from_selectors( $selector_string ); + } + ); + + if ( null !== $compound_error ) { + $record( 'parse-error', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $compound_error ) ) ); + } + if 
( null !== $complex_error ) { + $record( 'parse-error', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $complex_error ) ) ); + } + if ( null !== $compound_list && null === $complex_list && null === $complex_error ) { + $record( 'compound-implies-complex', array() ); + } + + $compound_ast = null; + $complex_ast = null; + if ( null !== $compound_list ) { + list( $compound_ast, $shape_error ) = self::guard( + static function () use ( $compound_list ) { + return AstExtractor::from_compound_list( $compound_list ); + } + ); + if ( null !== $shape_error ) { + $record( 'ast-shape', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $shape_error ) ) ); + } + } + if ( null !== $complex_list ) { + list( $complex_ast, $shape_error ) = self::guard( + static function () use ( $complex_list ) { + return AstExtractor::from_complex_list( $complex_list ); + } + ); + if ( null !== $shape_error ) { + $record( 'ast-shape', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $shape_error ) ) ); + } + } + if ( null !== $compound_ast && null !== $complex_ast && $compound_ast !== $complex_ast ) { + $record( 'ast-cross-grammar', array( 'compoundAst' => $compound_ast, 'complexAst' => $complex_ast ) ); + } + + $html_matches = null; + if ( null !== $complex_ast && null !== $rows ) { + $expected = ReferenceMatcher::expected_html_matches_rows( $complex_ast, $rows, $quirks ); + $html_matches = self::check_select_matches( 'html', $selector_string, $document, $expected, $record ); + self::check_lexbor_differential( $complex_ast, $selector_string, $document, $rows, $quirks, $expected, $record ); + } elseif ( null === $complex_list && null === $complex_error && null !== $rows ) { + self::check_select_rejection( 'html', $selector_string, $document, $record ); + } + + if ( null !== $compound_ast && null !== $tag_rows ) { + $expected = ReferenceMatcher::expected_tag_matches_rows( $compound_ast, $tag_rows ); + self::check_select_matches( 'tag', 
$selector_string, $document, $expected, $record ); + } elseif ( null === $compound_list && null === $compound_error && null !== $tag_rows ) { + self::check_select_rejection( 'tag', $selector_string, $document, $record ); + } + + $run_metamorph = ( null === $target || $target_is_metamorph ) + && null !== $complex_ast && null !== $html_matches && array() === $failures; + if ( $run_metamorph ) { + /* + * Metamorphic transforms randomize escapes / case / order, so a + * transform-sensitive bug ( e.g. Bug 1 and Bug 3 ) only fires for + * some PRNG draws. run_case sees one draw; here several fixed + * draws are tried so minimization can reliably preserve such a + * signature regardless of which draw first exposed it. With a + * target fixed, stop at the first draw that reproduces it. + */ + for ( $i = 0; $i < self::PAIR_METAMORPH_DRAWS && array() === $failures; $i++ ) { + // A FIXED draw seed ( not derived from the pair ) keeps the + // test monotonic under shrinking: the same coin-flips apply to + // whatever AST survives, so a smaller selector that still has + // the bug reproduces the same transform signature. + $metamorph_prng = new Prng( 'css-selector-fuzz-minimize', "metamorph:{$i}" ); + self::check_metamorphic( $complex_ast, $html_matches, $document, $metamorph_prng, $record ); + if ( $has_target_signature() ) { + break; + } + } + } + + $signatures = array(); + foreach ( $failures as $failure ) { + $signatures[] = self::signature( $failure ); + } + + return array( + 'failures' => $failures, + 'signatures' => array_values( array_unique( $signatures ) ), + ); + } + + /** + * Fragment analogue of check_capture_against_model: the ``-context + * fragment capture must equal the model rows built from the body-level + * children ( with the implicit HTML/BODY ancestors ). 
+ */ + private static function check_fragment_capture_against_model( array $document, array $capture, callable $record ): void { + $model_rows = DocumentGenerator::rows_from_fragment( $document['children'] ); + + $normalize = static function ( array $rows ): array { + $out = array(); + foreach ( $rows as $row ) { + $attrs = array(); + foreach ( $row['attrs'] as $attr ) { + $attrs[ $attr[0] ] = $attr[1]; + } + ksort( $attrs ); + $out[] = array( + 'tag' => $row['tag'], + 'fid' => $row['fid'], + 'attrs' => $attrs, + 'ancestorTags' => $row['ancestorTags'], + ); + } + return $out; + }; + + $expected = $normalize( $model_rows ); + $actual = $normalize( $capture['htmlRows'] ); + if ( $expected !== $actual ) { + $record( + 'model-desync', + array( + 'processor' => 'fragment', + 'expected' => $expected, + 'actual' => $actual, + ) + ); + } + } + + /** + * Verifies that the processor's captured view of a safe (model-built) + * document agrees with the generated model — this guards the oracle + * itself against renderer/model drift, and is what justifies trusting + * the capture on wild documents. 
+ */ + private static function check_capture_against_model( array $document, array $capture, callable $record ): void { + $model_rows = DocumentGenerator::rows_from_model( $document['model'] ); + + $normalize = static function ( array $rows, bool $with_ancestors ): array { + $out = array(); + foreach ( $rows as $row ) { + $attrs = array(); + foreach ( $row['attrs'] as $attr ) { + $attrs[ $attr[0] ] = $attr[1]; + } + ksort( $attrs ); + $normalized = array( + 'tag' => $row['tag'], + 'fid' => $row['fid'], + 'attrs' => $attrs, + ); + if ( $with_ancestors ) { + $normalized['ancestorTags'] = $row['ancestorTags']; + } + $out[] = $normalized; + } + return $out; + }; + + $expected = $normalize( $model_rows, true ); + $actual = $normalize( $capture['htmlRows'], true ); + if ( $expected !== $actual ) { + $record( + 'model-desync', + array( + 'processor' => 'html', + 'expected' => $expected, + 'actual' => $actual, + ) + ); + } + + $expected_tags = $normalize( $model_rows, false ); + $actual_tags = $normalize( $capture['tagRows'], false ); + if ( $expected_tags !== $actual_tags ) { + $record( + 'model-desync', + array( + 'processor' => 'tag', + 'expected' => $expected_tags, + 'actual' => $actual_tags, + ) + ); + } + + if ( $document['quirks'] !== $capture['quirks'] ) { + $record( + 'model-desync', + array( + 'processor' => 'quirks', + 'expected' => $document['quirks'], + 'actual' => $capture['quirks'], + ) + ); + } + } + + /** + * Runs a select() loop over the document, collecting matched data-fids. + * + * @param string $target 'html' or 'tag'. + * @param array $document The case document ( may request fragment mode ). + * @return array{0: string[]|null, 1: \Throwable|null} + */ + private static function collect_matches( string $target, string $selector_string, array $document ): array { + $html = $document['html']; + $context = ( $document['fragment'] ?? false ) ? 
$document['context'] : null; + return self::guard( + static function () use ( $target, $selector_string, $html, $context ) { + if ( 'tag' === $target ) { + $processor = new \WP_HTML_Tag_Processor( $html ); + } elseif ( null !== $context ) { + $processor = \WP_HTML_Processor::create_fragment( $html, $context ); + } else { + $processor = \WP_HTML_Processor::create_full_parser( $html ); + } + + $matches = array(); + $iterations = 0; + while ( $processor->select( $selector_string ) ) { + $fid = $processor->get_attribute( 'data-fid' ); + // Sanitize identically to TreeCapture/lexbor so a fid with + // a control char can never produce a false divergence on + // the match path ( unreachable today: fids are integers ). + $matches[] = is_string( $fid ) ? TreeCapture::sanitize_fid( $fid ) : '(missing-fid:' . $processor->get_tag() . ')'; + if ( ++$iterations > self::SELECT_ITERATION_LIMIT ) { + throw new \RuntimeException( 'select() did not terminate within the iteration limit.' ); + } + } + + if ( $processor instanceof \WP_HTML_Processor ) { + if ( null !== $processor->get_last_error() ) { + throw new \RuntimeException( 'Processor error state: ' . $processor->get_last_error() ); + } + if ( null !== $processor->get_unsupported_exception() ) { + throw new \RuntimeException( 'Processor unsupported state: ' . $processor->get_unsupported_exception()->getMessage() ); + } + } + + return $matches; + } + ); + } + + /** + * Flushes the select() parse caches. + * + * Both select() implementations memoize the most recently parsed selector + * string in a function-static cache, so whether a select() call re-parses + * — and therefore whether parse-time notices ( the invalid-UTF-8 scrub + * notice from from_selectors() ) fire — depends on what the worker + * happened to parse before. 
Parsing a sentinel selector first makes the + * next select() call for the case selector deterministic: it always + * re-parses, so exactly one parse happens inside each notice-assertion + * window regardless of worker history or case re-runs. + */ + private static function flush_select_parse_caches(): void { + ( new \WP_HTML_Tag_Processor( '' ) )->select( '#-fuzz-cache-flush-' ); + \WP_HTML_Processor::create_full_parser( '' )->select( '#-fuzz-cache-flush-' ); + } + + /** + * The _doing_it_wrong() name under which from_selectors() reports that an + * invalid-UTF-8 selector string was scrubbed to U+FFFD before parsing. + * + * @param string $target 'html' or 'tag'. + */ + private static function scrub_notice_name( string $target ): string { + return ( 'tag' === $target ? 'WP_CSS_Compound_Selector_List' : 'WP_CSS_Complex_Selector_List' ) . '::from_selectors'; + } + + /** + * Runs a select() loop on a parseable selector and compares the match set + * against the reference matcher. + * + * @param string $target 'html' or 'tag'. + * @return string[]|null The actual match set, or null when matching failed. + */ + private static function check_select_matches( string $target, string $selector_string, array $document, array $expected, callable $record ): ?array { + self::flush_select_parse_caches(); + Bootstrap::reset_doing_it_wrong(); + + list( $actual, $error ) = self::collect_matches( $target, $selector_string, $document ); + + if ( null !== $error ) { + $record( + 'match-error', + array( + 'target' => $target, + 'error' => self::describe_throwable( $error ), + ) + ); + return null; + } + + /* + * A selector string containing invalid UTF-8 is scrubbed to U+FFFD by + * from_selectors(), which reports the replacement with exactly one + * notice on the (single, cache-flushed) parse. Anything else is + * unexpected for a selector that parses. + */ + $expected_calls = \wp_is_valid_utf8( $selector_string ) + ? 
array() + : array( + array( + 'function' => self::scrub_notice_name( $target ), + ), + ); + + $doing_it_wrong = Bootstrap::doing_it_wrong_calls(); + if ( ! self::notices_match( $expected_calls, $doing_it_wrong ) ) { + $record( + 'doing-it-wrong-unexpected', + array( + 'target' => $target, + 'expectedCalls' => $expected_calls, + 'calls' => $doing_it_wrong, + ) + ); + } + + if ( $actual !== $expected ) { + $record( + 'match-mismatch-' . $target, + array( + 'expected' => $expected, + 'actual' => $actual, + ) + ); + } + + return $actual; + } + + private static function note_match_assertion( array &$match_stats, string $target, array $expected, array $actual ): void { + if ( ! isset( $match_stats[ $target ] ) ) { + $match_stats[ $target ] = array( + 'assertions' => 0, + 'nonVacuous' => 0, + ); + } + + ++$match_stats[ $target ]['assertions']; + if ( array() !== $expected || array() !== $actual ) { + ++$match_stats[ $target ]['nonVacuous']; + } + } + + private static function finalize_match_stats( array $match_stats ): array { + foreach ( $match_stats as $bucket => $targets ) { + foreach ( $targets as $target => $counts ) { + $assertions = (int) ( $counts['assertions'] ?? 0 ); + $non_vacuous = (int) ( $counts['nonVacuous'] ?? 0 ); + $vacuous = max( 0, $assertions - $non_vacuous ); + + $match_stats[ $bucket ][ $target ]['vacuous'] = $vacuous; + $match_stats[ $bucket ][ $target ]['nonVacuousRate'] = $assertions > 0 ? round( $non_vacuous / $assertions, 4 ) : 0.0; + $match_stats[ $bucket ][ $target ]['vacuousRate'] = $assertions > 0 ? round( $vacuous / $assertions, 4 ) : 0.0; + } + } + return $match_stats; + } + + /** + * Runs the lexbor differential — the THIRD, independent matching opinion. + * + * Quirks-mode documents are excluded unless the startup probe confirms + * lexbor has reliable class/#id case folding in both no-quirks and quirks + * mode. 
The comparison only runs when lexbor built the same element tree + * as WP ( fid/tag/ancestry multiset ), so it tests the selector layer, + * not tree construction. + * + * Verdict triage: + * - 'lexbor-divergence' lexbor != reference: a fuzzer-oracle problem + * ( or an un-compensated lexbor bug ) — never a + * WP verdict on its own. + * - 'lexbor-parse-reject' lexbor refused a selector WP accepted. + * - match-mismatch-html with NO lexbor-divergence on the same case + * means reference == lexbor != WP: a + * high-confidence WP finding. + * + * @return string Tally state: + * unavailable|skipped-quirks|skipped-utf8|error|tree-gated|compared. + */ + private static function check_lexbor_differential( array $complex_ast, string $selector_string, array $document, array $rows, bool $quirks, array $expected, callable $record ): string { + if ( ! LexborOracle::available() ) { + return 'unavailable'; + } + if ( $quirks && ! LexborOracle::quirks_class_id_reliable() ) { + return 'skipped-quirks'; + } + + /* + * lexbor receives a canonical re-render of the (already verified) + * AST rather than the original byte form: the differential targets + * matching semantics, while byte-level parsing (escapes, whitespace, + * modifier case — lexbor e.g. rejects uppercase I/S modifiers) is + * covered by the AST round-trip and metamorphic invariants. ASTs + * containing invalid UTF-8 cannot be re-rendered; since + * from_selectors() scrubs input to U+FFFD before parsing, none should + * exist and this skip is defensive ( a nonzero skipped-utf8 tally + * indicates a normalization bypass ). + */ + if ( ! 
ast_strings_are_utf8( $complex_ast ) ) { + return 'skipped-utf8'; + } + $canonical = SelectorGenerator::render_canonical( $complex_ast ); + + $lex = LexborOracle::query( $document['html'], $canonical ); + if ( null === $lex ) { + return 'error'; + } + + if ( 'parse' === $lex['error'] ) { + $record( + 'lexbor-parse-reject', + array( + 'note' => 'lexbor rejected the canonical form of a selector the WP parser accepted', + 'canonical' => printable_bytes( $canonical ), + ) + ); + return 'compared'; + } + if ( null !== $lex['error'] ) { + return 'error'; + } + + if ( ! self::trees_agree( $rows, $lex['rows'] ) ) { + return 'tree-gated'; + } + + /* + * Two known lexbor deviations are compensated for so the rest of the + * semantics still get differential coverage; WP itself is still held + * to the strict expectation: + * + * - lexbor #368: class/#id match ASCII case-insensitively even in + * no-quirks documents. Compare lexbor against the reference run + * with quirks-style class/ID folding. + * - lexbor does not implement HTML's case-insensitive attribute + * value list ( [rel=NOFOLLOW] does not match rel="nofollow" ), + * where browsers and WP do. Compare lexbor against the reference + * run with that list disabled. + */ + $expected_for_lexbor = ReferenceMatcher::expected_html_matches_rows( + $complex_ast, + $rows, + LexborOracle::has_issue_368() ? true : $quirks, + false + ); + + // lexbor reports in document order, WP/reference in visit order — + // compare as multisets. + $lex_matches = $lex['matches']; + sort( $lex_matches ); + sort( $expected_for_lexbor ); + + if ( $lex_matches !== $expected_for_lexbor ) { + $record( + 'lexbor-divergence', + array( + 'reference' => $expected_for_lexbor, + 'lexbor' => $lex_matches, + 'issue368' => LexborOracle::has_issue_368(), + ) + ); + } + + return 'compared'; + } + + /** Multiset equality of ( tag, fid, ancestry ) between WP and lexbor rows. 
	 */
	private static function trees_agree( array $wp_rows, array $lexbor_rows ): bool {
		// Flatten every row to "tag|fid|ancestor,ancestor,…" and sort, so the
		// comparison ignores row order but still counts duplicates.
		$serialize = static function ( array $rows ): array {
			$out = array();
			foreach ( $rows as $row ) {
				$out[] = $row['tag'] . '|' . $row['fid'] . '|' . implode( ',', $row['ancestorTags'] );
			}
			sort( $out );
			return $out;
		};

		return $serialize( $wp_rows ) === $serialize( $lexbor_rows );
	}

	/**
	 * Checks the metamorphic relations: each meaning-preserving transform of
	 * the parsed selector must parse, must (for AST-preserving transforms)
	 * parse to exactly the transformed AST, and must select exactly the same
	 * elements the original selector selected.
	 *
	 * A variant stops at its first failed stage ( parse error, null parse,
	 * AST extraction error, AST mismatch, match error ); the match-set
	 * comparison is only reached when every earlier stage passed.
	 *
	 * @param array    $complex_ast  Canonical AST of the original selector.
	 * @param string[] $html_matches The original's WP_HTML_Processor match set.
	 * @param array    $document     The case document the variants are matched against.
	 * @param Prng     $prng         Passed through to Metamorph::variants().
	 * @param callable $record       Failure sink, called as $record( invariant, detail ).
	 */
	private static function check_metamorphic( array $complex_ast, array $html_matches, array $document, Prng $prng, callable $record ): void {
		foreach ( Metamorph::variants( $complex_ast, $prng ) as $variant ) {
			$transform        = $variant['name'];
			$variant_selector = $variant['selector'];

			// Stage 1: the variant must parse without throwing…
			list( $variant_list, $parse_error ) = self::guard(
				static function () use ( $variant_selector ) {
					return \WP_CSS_Complex_Selector_List::from_selectors( $variant_selector );
				}
			);

			if ( null !== $parse_error ) {
				$record(
					'metamorphic-error',
					array(
						'transform' => $transform,
						'selector'  => printable_bytes( $variant_selector ),
						'error'     => self::describe_throwable( $parse_error ),
					)
				);
				continue;
			}

			// …and must not be rejected ( a null list ).
			if ( null === $variant_list ) {
				$record(
					'metamorphic-parse',
					array(
						'transform' => $transform,
						'selector'  => printable_bytes( $variant_selector ),
					)
				);
				continue;
			}

			// Stage 2 ( AST-preserving transforms only ): the parsed AST must be
			// strictly identical to the AST the transform predicted.
			if ( $variant['astMustMatch'] ) {
				list( $variant_ast, $shape_error ) = self::guard(
					static function () use ( $variant_list ) {
						return AstExtractor::from_complex_list( $variant_list );
					}
				);
				if ( null !== $shape_error ) {
					$record(
						'metamorphic-error',
						array(
							'transform' => $transform,
							'selector'  => printable_bytes( $variant_selector ),
							'error'     => self::describe_throwable( $shape_error ),
						)
					);
					continue;
				}
				if ( $variant_ast !== $variant['ast'] ) {
					$record(
						'metamorphic-ast',
						array(
							'transform'   => $transform,
							'selector'    => printable_bytes( $variant_selector ),
							'expectedAst' => $variant['ast'],
							'parsedAst'   => $variant_ast,
						)
					);
					continue;
				}
			}

			// Stage 3: the variant must select exactly what the original selected.
			// Recorded notices are cleared first; they are not inspected here.
			Bootstrap::reset_doing_it_wrong();
			list( $variant_matches, $match_error ) = self::collect_matches( 'html', $variant_selector, $document );

			if ( null !== $match_error ) {
				$record(
					'metamorphic-error',
					array(
						'transform' => $transform,
						'selector'  => printable_bytes( $variant_selector ),
						'error'     => self::describe_throwable( $match_error ),
					)
				);
				continue;
			}

			if ( $variant_matches !== $html_matches ) {
				$record(
					'metamorphic-mismatch',
					array(
						'transform' => $transform,
						'selector'  => printable_bytes( $variant_selector ),
						'expected'  => $html_matches,
						'actual'    => $variant_matches,
					)
				);
			}
		}
	}

	/**
	 * For unparseable selectors: select() must return false, leave the
	 * processor usable, and report misuse exactly once per call.
	 *
	 * Records 'match-error' ( and stops ) when either select() call throws,
	 * 'select-on-null' when the two results are not both false, and
	 * 'doing-it-wrong-missing' when the recorded notices differ from the
	 * expected sequence.
	 *
	 * @param string $target 'html' or 'tag'.
	 */
	private static function check_select_rejection( string $target, string $selector_string, array $document, callable $record ): void {
		// Clean notice window: force the first select() below to re-parse.
		self::flush_select_parse_caches();
		Bootstrap::reset_doing_it_wrong();

		$context                 = ( $document['fragment'] ?? false ) ? $document['context'] : null;
		list( $results, $error ) = self::guard(
			static function () use ( $target, $selector_string, $document, $context ) {
				if ( 'tag' === $target ) {
					$processor = new \WP_HTML_Tag_Processor( $document['html'] );
				} elseif ( null !== $context ) {
					$processor = \WP_HTML_Processor::create_fragment( $document['html'], $context );
				} else {
					$processor = \WP_HTML_Processor::create_full_parser( $document['html'] );
				}

				// Two calls: the second exercises the parse cache.
				return array( $processor->select( $selector_string ), $processor->select( $selector_string ) );
			}
		);

		if ( null !== $error ) {
			$record(
				'match-error',
				array(
					'target'   => $target,
					'rejected' => true,
					'error'    => self::describe_throwable( $error ),
				)
			);
			return;
		}

		// Strict comparison: any non-false return ( even a falsy one ) is reported.
		if ( array( false, false ) !== $results ) {
			$record(
				'select-on-null',
				array(
					'target'  => $target,
					'results' => $results,
				)
			);
		}

		/*
		 * Two select() calls report the unparseable selector once each; the
		 * parse cache only skips re-parsing, never the per-call notice. An
		 * invalid-UTF-8 selector additionally reports the U+FFFD scrub once,
		 * on the first call ( the only one that parses after the flush ).
		 */
		$select_notice_name = ( 'tag' === $target ? 'WP_HTML_Tag_Processor' : 'WP_HTML_Processor' ) . '::select';
		$expected_calls     = array(
			array( 'function' => $select_notice_name ),
			array( 'function' => $select_notice_name ),
		);
		if ( ! \wp_is_valid_utf8( $selector_string ) ) {
			// The scrub notice is expected FIRST, ahead of both select() notices.
			array_unshift( $expected_calls, array( 'function' => self::scrub_notice_name( $target ) ) );
		}

		$doing_it_wrong = Bootstrap::doing_it_wrong_calls();
		if ( ! self::notices_match( $expected_calls, $doing_it_wrong ) ) {
			$record(
				'doing-it-wrong-missing',
				array(
					'target'        => $target,
					'expectedCalls' => $expected_calls,
					'calls'         => $doing_it_wrong,
				)
			);
		}
	}

	/**
	 * Compares recorded _doing_it_wrong() calls against expectations: same
	 * count, in order, matching on every key the expectation specifies
	 * ( recorded calls also carry 'message', which expectations omit ).
	 *
	 * @param array[] $expected_calls Expected calls, each a subset of record keys.
	 * @param array[] $actual_calls   Recorded calls.
	 * @return bool True when the counts agree and every expected key/value is strictly equal.
	 */
	private static function notices_match( array $expected_calls, array $actual_calls ): bool {
		if ( count( $expected_calls ) !== count( $actual_calls ) ) {
			return false;
		}
		foreach ( $expected_calls as $i => $expected_call ) {
			foreach ( $expected_call as $key => $value ) {
				// A key missing from the recorded call reads as null and so fails
				// against any non-null expected value.
				if ( ( $actual_calls[ $i ][ $key ] ?? null ) !== $value ) {
					return false;
				}
			}
		}
		return true;
	}

	/*
	 * -------------
	 * Batch running
	 * -------------
	 */

	/**
	 * Runs a batch of sequential seeds.
	 *
	 * @return array Summary.
+ */ + public static function run_batch( array $options ): array { + Bootstrap::load(); + + $start_seed = option_int( $options, 'start-seed', 1 ); + $count = option_int( $options, 'count', 100 ); + $failures_out = option_string( $options, 'failures-out', null ); + $progress_file = option_string( $options, 'progress-file', null ); + $determinism_every = option_int( $options, 'determinism-every', 16 ); + $max_failures = option_int( $options, 'max-failures', 200 ); + + $started_at = microtime( true ); + $failures = 0; + $buckets = array(); + $signatures = array(); + $lexbor = array(); + $match_stats = array(); + $last_seed = null; + $stop_reason = 'completed'; + + for ( $seed = $start_seed; $seed < $start_seed + $count; $seed++ ) { + if ( $max_failures > 0 && $failures >= $max_failures ) { + $stop_reason = 'max-failures'; + break; + } + if ( null !== $progress_file ) { + file_put_contents( $progress_file, (string) $seed ); + } + + $result = self::run_case( $seed ); + + if ( $determinism_every > 0 && 0 === $seed % $determinism_every ) { + $repeat = self::run_case( $seed ); + if ( $repeat['digest'] !== $result['digest'] ) { + $result['failures'][] = array( + 'invariant' => 'case-determinism', + 'detail' => array( + 'firstDigest' => $result['digest'], + 'secondDigest' => $repeat['digest'], + ), + ); + } + } + + $buckets[ $result['bucket'] ] = ( $buckets[ $result['bucket'] ] ?? 0 ) + 1; + $lexbor[ $result['lexbor'] ] = ( $lexbor[ $result['lexbor'] ] ?? 0 ) + 1; + $last_seed = $seed; + foreach ( $result['matchStats'] as $target => $stats ) { + if ( ! 
isset( $match_stats[ $result['bucket'] ][ $target ] ) ) { + $match_stats[ $result['bucket'] ][ $target ] = array( + 'assertions' => 0, + 'nonVacuous' => 0, + ); + } + $match_stats[ $result['bucket'] ][ $target ]['assertions'] += $stats['assertions']; + $match_stats[ $result['bucket'] ][ $target ]['nonVacuous'] += $stats['nonVacuous']; + } + + foreach ( $result['failures'] as $failure ) { + ++$failures; + $signature = self::signature( $failure ); + $signatures[ $signature ] = ( $signatures[ $signature ] ?? 0 ) + 1; + + $entry = array( + 'kind' => 'css-selector-fuzz-failure', + 'seed' => $result['seed'], + 'bucket' => $result['bucket'], + 'invariant' => $failure['invariant'], + 'signature' => $signature, + 'selector' => printable_bytes( $result['selector'] ), + 'selectorBase64' => base64_encode( $result['selector'] ), + 'htmlBase64' => base64_encode( $result['html'] ), + 'detail' => $failure['detail'], + ); + if ( null !== $failures_out ) { + append_ndjson( $failures_out, $entry ); + } else { + fwrite( STDERR, json_encode_safe( $entry ) . "\n" ); + } + } + } + + return array( + 'kind' => 'css-selector-fuzz-batch-summary', + 'startSeed' => $start_seed, + 'count' => $count, + 'lastSeed' => $last_seed, + 'failures' => $failures, + 'buckets' => $buckets, + 'signatures' => $signatures, + 'lexbor' => $lexbor, + 'matchStats' => self::finalize_match_stats( $match_stats ), + 'stopReason' => $stop_reason, + 'durationMs' => (int) round( 1000 * ( microtime( true ) - $started_at ) ), + ); + } + + /** Stable identity for de-duplicating equivalent failures. 
*/ + private static function signature( array $failure ): string { + $parts = array( $failure['invariant'] ); + if ( isset( $failure['detail']['grammar'] ) ) { + $parts[] = $failure['detail']['grammar']; + } + if ( isset( $failure['detail']['target'] ) ) { + $parts[] = $failure['detail']['target']; + } + if ( isset( $failure['detail']['transform'] ) ) { + $parts[] = $failure['detail']['transform']; + } + if ( isset( $failure['detail']['error']['class'] ) ) { + $parts[] = $failure['detail']['error']['class']; + $parts[] = preg_replace( '/[0-9]+/', 'N', (string) ( $failure['detail']['error']['message'] ?? '' ) ); + } + return substr( sha1( implode( '|', $parts ) ), 0, 12 ) . ':' . $failure['invariant']; + } + + /* + * ------- + * Helpers + * ------- + */ + + /** + * Calls $fn with PHP warnings/notices converted to exceptions. + * + * @return array{0: mixed, 1: \Throwable|null} + */ + private static function guard( callable $fn ): array { + set_error_handler( + static function ( $severity, $message, $file, $line ) { + if ( E_DEPRECATED === $severity || E_USER_DEPRECATED === $severity ) { + return true; + } + throw new \ErrorException( $message, 0, $severity, $file, $line ); + } + ); + try { + return array( $fn(), null ); + } catch ( \Throwable $e ) { + return array( null, $e ); + } finally { + restore_error_handler(); + } + } + + public static function describe_throwable( \Throwable $e ): array { + $root = repo_root() . DIRECTORY_SEPARATOR; + return array( + 'class' => get_class( $e ), + 'message' => $e->getMessage(), + 'at' => str_replace( $root, '', $e->getFile() ) . ':' . $e->getLine(), + 'trace' => array_slice( + array_map( + static function ( $frame ) use ( $root ) { + $location = isset( $frame['file'] ) + ? str_replace( $root, '', $frame['file'] ) . ':' . ( $frame['line'] ?? '?' ) + : '[internal]'; + $callable = ( $frame['class'] ?? '' ) . ( $frame['type'] ?? '' ) . ( $frame['function'] ?? '' ); + return $location . ' ' . 
$callable; + }, + $e->getTrace() + ), + 0, + 6 + ), + ); + } +} diff --git a/tools/css-selector-fuzz/lib/autoload.php b/tools/css-selector-fuzz/lib/autoload.php new file mode 100644 index 0000000000000..6ebdcbc75d6c3 --- /dev/null +++ b/tools/css-selector-fuzz/lib/autoload.php @@ -0,0 +1,13 @@ + array() ); + $count = count( $argv ); + for ( $i = 1; $i < $count; $i++ ) { + $arg = $argv[ $i ]; + if ( 0 === strpos( $arg, '--' ) ) { + $name = substr( $arg, 2 ); + if ( false !== strpos( $name, '=' ) ) { + list( $name, $value ) = explode( '=', $name, 2 ); + $options[ $name ] = $value; + } elseif ( $i + 1 < $count && 0 !== strpos( $argv[ $i + 1 ], '--' ) ) { + $options[ $name ] = $argv[ ++$i ]; + } else { + $options[ $name ] = true; + } + } else { + $options['_'][] = $arg; + } + } + return $options; +} + +function option_string( array $options, string $name, ?string $default = null ): ?string { + if ( ! array_key_exists( $name, $options ) || true === $options[ $name ] ) { + return $default; + } + return (string) $options[ $name ]; +} + +function option_int( array $options, string $name, int $default ): int { + $value = option_string( $options, $name, null ); + return null === $value ? $default : (int) $value; +} + +function option_float( array $options, string $name, float $default ): float { + $value = option_string( $options, $name, null ); + return null === $value ? $default : (float) $value; +} + +function option_bool( array $options, string $name, bool $default ): bool { + if ( ! array_key_exists( $name, $options ) ) { + return $default; + } + $value = $options[ $name ]; + if ( true === $value ) { + return true; + } + return in_array( strtolower( (string) $value ), array( '1', 'true', 'yes', 'on' ), true ); +} + +function ensure_dir( string $dir ): void { + if ( ! is_dir( $dir ) && ! mkdir( $dir, 0777, true ) && ! 
is_dir( $dir ) ) { + throw new \RuntimeException( "Could not create directory: {$dir}" ); + } +} + +function json_encode_safe( $value ): string { + $encoded = json_encode( $value, JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE ); + if ( false === $encoded ) { + $encoded = json_encode( array( 'jsonError' => json_last_error_msg() ) ); + } + return $encoded; +} + +function write_json_file( string $path, $value ): void { + file_put_contents( $path, json_encode_safe( $value ) . "\n" ); +} + +function read_json_file( string $path ): ?array { + if ( ! is_file( $path ) ) { + return null; + } + $decoded = json_decode( (string) file_get_contents( $path ), true ); + return is_array( $decoded ) ? $decoded : null; +} + +function append_ndjson( string $path, array $value ): void { + file_put_contents( $path, json_encode_safe( $value ) . "\n", FILE_APPEND | LOCK_EX ); +} + +function timestamp(): string { + return gmdate( 'Ymd-His' ); +} + +/** + * Renders bytes for human inspection: printable ASCII passes through, + * everything else becomes \xHH. + */ +function printable_bytes( string $bytes, int $max_length = 4096 ): string { + $out = ''; + $truncated = strlen( $bytes ) > $max_length; + $bytes = substr( $bytes, 0, $max_length ); + for ( $i = 0; $i < strlen( $bytes ); $i++ ) { + $c = $bytes[ $i ]; + $o = ord( $c ); + if ( $o >= 0x20 && $o <= 0x7E ) { + $out .= '\\' === $c ? '\\\\' : $c; + } else { + $out .= sprintf( '\\x%02X', $o ); + } + } + return $out . ( $truncated ? '…(truncated)' : '' ); +} + +function git_metadata(): array { + $head = trim( (string) shell_exec( 'git -C ' . escapeshellarg( repo_root() ) . ' rev-parse HEAD 2>/dev/null' ) ); + $branch = trim( (string) shell_exec( 'git -C ' . escapeshellarg( repo_root() ) . ' rev-parse --abbrev-ref HEAD 2>/dev/null' ) ); + return array( + 'head' => '' !== $head ? $head : null, + 'branch' => '' !== $branch ? $branch : null, + ); +} + +/** Whether every string anywhere in a nested array is valid UTF-8. 
*/ +function ast_strings_are_utf8( $node ): bool { + if ( is_string( $node ) ) { + return (bool) preg_match( '//u', $node ); + } + if ( is_array( $node ) ) { + foreach ( $node as $child ) { + if ( ! ast_strings_are_utf8( $child ) ) { + return false; + } + } + } + return true; +} + +function ascii_strtolower( string $input ): string { + return strtr( $input, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz' ); +} + +function ascii_strtoupper( string $input ): string { + return strtr( $input, 'abcdefghijklmnopqrstuvwxyz', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' ); +} + +/** Flips the case of each ASCII letter independently with 50% probability. */ +function str_shuffle_case( string $input, Prng $prng ): string { + $out = ''; + for ( $i = 0; $i < strlen( $input ); $i++ ) { + $byte = $input[ $i ]; + if ( $prng->chance( 50 ) ) { + $byte = ctype_lower( $byte ) ? ascii_strtoupper( $byte ) : ascii_strtolower( $byte ); + } + $out .= $byte; + } + return $out; +} + +/** + * Splits a valid UTF-8 string into codepoints. + * + * @return array Pairs of ( utf8 bytes, codepoint value ). 
+ */ +function utf8_codepoints( string $input ): array { + $out = array(); + $len = strlen( $input ); + $i = 0; + while ( $i < $len ) { + $byte = ord( $input[ $i ] ); + if ( $byte < 0x80 ) { + $size = 1; + $cp = $byte; + } elseif ( 0xC0 === ( $byte & 0xE0 ) ) { + $size = 2; + $cp = $byte & 0x1F; + } elseif ( 0xE0 === ( $byte & 0xF0 ) ) { + $size = 3; + $cp = $byte & 0x0F; + } else { + $size = 4; + $cp = $byte & 0x07; + } + $size = min( $size, $len - $i ); + for ( $j = 1; $j < $size; $j++ ) { + $cp = ( $cp << 6 ) | ( ord( $input[ $i + $j ] ) & 0x3F ); + } + $out[] = array( substr( $input, $i, $size ), $cp ); + $i += $size; + } + return $out; +} diff --git a/tools/css-selector-fuzz/lib/wp-stubs.php b/tools/css-selector-fuzz/lib/wp-stubs.php new file mode 100644 index 0000000000000..ec9b154ee58d6 --- /dev/null +++ b/tools/css-selector-fuzz/lib/wp-stubs.php @@ -0,0 +1,62 @@ + (string) $function_name, + 'message' => (string) $message, + ); + } +} + +if ( ! function_exists( '_deprecated_argument' ) ) { + function _deprecated_argument( $function_name, $version, $message = '' ) { + } +} + +if ( ! function_exists( 'wp_trigger_error' ) ) { + function wp_trigger_error( $function_name, $message, $error_level = E_USER_NOTICE ) { + $GLOBALS['css_selector_fuzz_doing_it_wrong'][] = array( + 'function' => (string) $function_name, + 'message' => (string) $message, + ); + } +} + +if ( ! 
function_exists( 'wp_kses_uri_attributes' ) ) { + function wp_kses_uri_attributes() { + return array( + 'action', + 'archive', + 'background', + 'cite', + 'classid', + 'codebase', + 'data', + 'formaction', + 'href', + 'icon', + 'longdesc', + 'manifest', + 'poster', + 'profile', + 'src', + 'usemap', + 'xmlns', + ); + } +} diff --git a/tools/css-selector-fuzz/minimize.php b/tools/css-selector-fuzz/minimize.php new file mode 100644 index 0000000000000..c7fe1f1cb3dda --- /dev/null +++ b/tools/css-selector-fuzz/minimize.php @@ -0,0 +1,268 @@ +#!/usr/bin/env php + metamorphic-ast, Bug 2 -> match-mismatch-html, + * Bug 3 -> metamorphic-parse — reachable via --signature). + * + * Usage: + * php tools/css-selector-fuzz/minimize.php --seed 1234 [--signature SUBSTR] + * php tools/css-selector-fuzz/minimize.php --selector 'sel' --html '<…>' [--signature SUBSTR] + * + * Options: + * --signature SUBSTR Target a signature whose id or invariant contains + * SUBSTR. For --seed, also the way to opt into a + * related self-contained signature when the seed's own + * failure is generator-side (printed as a retarget). + * --max-attempts N Cap test evaluations (default 20000). + * --json Emit the reproducer as JSON. + */ + +require_once __DIR__ . '/lib/autoload.php'; + +use CssSelectorFuzz\Worker; +use function CssSelectorFuzz\json_encode_safe; +use function CssSelectorFuzz\option_bool; +use function CssSelectorFuzz\option_int; +use function CssSelectorFuzz\option_string; +use function CssSelectorFuzz\parse_cli_options; +use function CssSelectorFuzz\printable_bytes; + +$options = parse_cli_options( $argv ); +$max_attempts = option_int( $options, 'max-attempts', 20000 ); +$sig_filter = option_string( $options, 'signature', null ); + +/* + * In --seed mode, the seed's OWN failures ( from run_case ) are the source + * of truth. 
 * The minimizer can only preserve "self-contained" signatures
 * ( those run_pair re-checks without the generator's intended AST ); the
 * generator-side ones ( ast-mismatch, parse-expectation, path-expectation,
 * model-desync ) are invisible to run_pair. Targeting must therefore be
 * restricted to the intersection of the seed's failures and run_pair's
 * view — otherwise the minimizer could silently retarget to an unrelated
 * incidental signature and report a false "reproduced".
 */
// A negative seed ( the default ) selects --selector/--html mode instead.
$seed            = option_int( $options, 'seed', -1 );
$seed_signatures = null;
if ( $seed >= 0 ) {
	$case            = Worker::run_case( $seed );
	$selector        = $case['selector'];
	$html            = $case['html'];
	$seed_signatures = $case['signatures'];
	if ( array() === $seed_signatures ) {
		fwrite( STDERR, "Seed {$seed} produced no failure; nothing to minimize.\n" );
		exit( 1 );
	}
} else {
	$selector = option_string( $options, 'selector', null );
	$html     = option_string( $options, 'html', null );
	if ( null === $selector || null === $html ) {
		fwrite( STDERR, "Provide --seed N, or both --selector and --html.\n" );
		exit( 1 );
	}
}

/** Signatures produced by a pair ( $target lets run_pair short-circuit ). */
$signatures_of = static function ( string $selector, string $html, ?string $target = null ): array {
	return Worker::run_pair( $selector, $html, $target )['signatures'];
};

// The un-minimized pair must itself show at least one signature under run_pair.
$baseline = $signatures_of( $selector, $html );
if ( array() === $baseline ) {
	fwrite( STDERR, "The starting pair does not reproduce any self-contained failure.\n" );
	if ( null !== $seed_signatures ) {
		fwrite( STDERR, 'Seed failure(s): ' . implode( ', ', $seed_signatures ) . "\n" );
		fwrite( STDERR, "These are generator-side signatures the minimizer cannot reproduce from the\n" );
		fwrite( STDERR, "pair alone. Minimize a seed whose failure is self-contained, or pass\n" );
		fwrite( STDERR, "--selector/--html directly.\n" );
	}
	fwrite( STDERR, 'selector: ' . printable_bytes( $selector ) . "\n" );
	exit( 1 );
}

/*
 * Candidate targets are matched at the INVARIANT level, not the exact
 * signature hash: a signature embeds transform-specific detail ( e.g.
 * metamorphic-parse via `rerender` vs via `dup-branch` ), and run_pair's
 * fixed metamorphic draws may expose the same invariant through a
 * different transform than run_case did. Same invariant == same bug class,
 * so that is faithful. A DIFFERENT invariant ( e.g. the seed's generator-
 * side ast-mismatch vs an incidental self-contained metamorphic-ast ) is a
 * genuine retarget and must be opted into.
 */
// Returns the part of a signature after its LAST ':' ( the whole string when
// there is no ':' ).
$invariant_of = static function ( string $signature ): string {
	$pos = strrpos( $signature, ':' );
	return false === $pos ? $signature : substr( $signature, $pos + 1 );
};

$retargeted = false;
if ( null === $seed_signatures ) {
	// --selector/--html mode: every baseline signature is an eligible target.
	$candidates = $baseline;
} else {
	// --seed mode: keep only baseline signatures whose invariant the seed also hit.
	$seed_invariants = array_map( $invariant_of, $seed_signatures );
	$candidates      = array();
	foreach ( $baseline as $signature ) {
		if ( in_array( $invariant_of( $signature ), $seed_invariants, true ) ) {
			$candidates[] = $signature;
		}
	}
}

if ( array() === $candidates ) {
	// The seed's failures are all generator-side ( no self-contained
	// invariant in common ); refuse to silently minimize an unrelated
	// incidental signature.
	fwrite( STDERR, "Seed {$seed}'s failures are not self-contained, so the minimizer cannot\n" );
	fwrite( STDERR, "faithfully reproduce them.\n" );
	fwrite( STDERR, 'Seed failure(s): ' . implode( ', ', $seed_signatures ) . "\n" );
	fwrite( STDERR, 'Self-contained nearby: ' . implode( ', ', $baseline ) . "\n" );
	fwrite( STDERR, "Re-run with --signature to minimize one of the nearby signatures\n" );
	fwrite( STDERR, "explicitly ( understanding it is a related, not identical, failure ).\n" );
	// The explanation above is printed either way; only a missing
	// --signature makes it fatal.
	if ( null === $sig_filter ) {
		exit( 1 );
	}
	// User explicitly opted into a nearby signature.
/*
 * Pick the target signature from the eligible candidates.
 *
 * Without a signature filter the first candidate is used. With a filter, the
 * first candidate containing it is used; a filter that matches nothing is an
 * error rather than a silent fall-back to $candidates[0], because quietly
 * minimizing a signature the user did not ask for is exactly the false
 * "reproduced" this tool is meant to avoid.
 */
$target = $candidates[0];
if ( null !== $sig_filter ) {
	$matched = null;
	foreach ( $candidates as $candidate ) {
		if ( false !== strpos( $candidate, $sig_filter ) ) {
			$matched = $candidate;
			break;
		}
	}
	if ( null === $matched ) {
		fwrite( STDERR, 'Signature filter ' . printable_bytes( $sig_filter ) . " matches none of the eligible signatures.\n" );
		fwrite( STDERR, 'Eligible: ' . implode( ', ', $candidates ) . "\n" );
		exit( 1 );
	}
	$target = $matched;
}

/*
 * Budgeted reproduction test: true when the pair still produces $target.
 * Every call that reaches run_pair counts against $max_attempts; once the
 * budget is spent the answer is always false, which makes the shrinker stop
 * accepting candidates.
 */
$attempts   = 0;
$reproduces = static function ( string $selector, string $html ) use ( $signatures_of, $target, &$attempts, $max_attempts ): bool {
	if ( $attempts >= $max_attempts ) {
		return false;
	}
	++$attempts;
	return in_array( $target, $signatures_of( $selector, $html, $target ), true );
};

/**
 * Delta-debugging shrink of one byte string: ddmin chunk removal followed
 * by per-position single-byte simplification. $test( candidate ) decides
 * whether a candidate still reproduces.
 *
 * @param string   $current Byte string to shrink.
 * @param callable $test    Receives a candidate string, returns bool.
 * @return string The smallest / simplest reproducing string found within the
 *                shared attempt budget.
 */
$shrink = static function ( string $current, callable $test ) use ( &$attempts, $max_attempts ): string {
	// Phase 1: ddmin. Remove one chunk at a time, refining granularity when
	// no chunk can be dropped and coarsening again after a successful removal.
	$chunks = 2;
	while ( strlen( $current ) > 0 && $attempts < $max_attempts ) {
		$length     = strlen( $current );
		$chunk_size = (int) ceil( $length / $chunks );
		$changed    = false;

		for ( $offset = 0; $offset < $length && $attempts < $max_attempts; $offset += $chunk_size ) {
			$candidate = substr( $current, 0, $offset ) . substr( $current, min( $length, $offset + $chunk_size ) );
			if ( $candidate === $current ) {
				continue;
			}
			if ( $test( $candidate ) ) {
				$current = $candidate;
				$chunks  = max( 2, $chunks - 1 );
				$changed = true;
				break;
			}
		}

		if ( ! $changed ) {
			if ( $chunks >= $length ) {
				// Already at single-byte granularity and nothing can go.
				break;
			}
			$chunks = min( $length, $chunks * 2 );
		}
	}

	/*
	 * Phase 2: per-byte canonicalization. Stand-ins are ordered from most to
	 * least preferred, and a byte is only ever rewritten to a stand-in that is
	 * strictly more preferred than itself, or removed outright. That ordering
	 * is what guarantees progress: without it a position where both 'a' and
	 * ' ' reproduce flips between the two on every revisit and drains the
	 * whole attempt budget without ever trying removal.
	 */
	$stand_ins = array( 'a', ' ' );
	for ( $i = 0; $i < strlen( $current ) && $attempts < $max_attempts; $i++ ) {
		$rank         = array_search( $current[ $i ], $stand_ins, true );
		$replacements = array_slice( $stand_ins, 0, false === $rank ? count( $stand_ins ) : $rank );
		$replacements[] = ''; // Removal always shortens the string, so it is always allowed.

		foreach ( $replacements as $replacement ) {
			$candidate = substr( $current, 0, $i ) . $replacement . substr( $current, $i + 1 );
			if ( $test( $candidate ) ) {
				$current = $candidate;
				// Step back so the previous position is re-examined in its new context.
				$i = max( -1, $i - 2 );
				break;
			}
		}
	}

	return $current;
};
" bytes)\n"; +echo "\nreplay:\n"; +echo ' php tools/css-selector-fuzz/replay.php --selector ' . escapeshellarg( $selector ) + . ' --html ' . escapeshellarg( $html ) . "\n"; +exit( $ok ? 0 : 2 ); diff --git a/tools/css-selector-fuzz/replay.php b/tools/css-selector-fuzz/replay.php new file mode 100644 index 0000000000000..38ffb8a43678e --- /dev/null +++ b/tools/css-selector-fuzz/replay.php @@ -0,0 +1,91 @@ +#!/usr/bin/env php + bar' [--html '
'] + */ + +require_once __DIR__ . '/lib/autoload.php'; + +use CssSelectorFuzz\Bootstrap; +use CssSelectorFuzz\Worker; +use function CssSelectorFuzz\json_encode_safe; +use function CssSelectorFuzz\option_bool; +use function CssSelectorFuzz\option_int; +use function CssSelectorFuzz\option_string; +use function CssSelectorFuzz\parse_cli_options; +use function CssSelectorFuzz\printable_bytes; + +$options = parse_cli_options( $argv ); + +$probe_selector = option_string( $options, 'selector', null ); +if ( null !== $probe_selector ) { + // Quick probe mode: parse a selector and report what the API does with it. + Bootstrap::load(); + + $compound = \WP_CSS_Compound_Selector_List::from_selectors( $probe_selector ); + $complex = \WP_CSS_Complex_Selector_List::from_selectors( $probe_selector ); + + $report = array( + 'selector' => printable_bytes( $probe_selector ), + 'compoundList' => null === $compound ? null : \CssSelectorFuzz\AstExtractor::from_compound_list( $compound ), + 'complexList' => null === $complex ? null : \CssSelectorFuzz\AstExtractor::from_complex_list( $complex ), + ); + + $html = option_string( $options, 'html', null ); + if ( null !== $html && null !== $complex ) { + $processor = \WP_HTML_Processor::create_full_parser( $html ); + $matches = array(); + while ( $processor->select( $probe_selector ) ) { + $matches[] = array( + 'tag' => $processor->get_tag(), + 'breadcrumbs' => $processor->get_breadcrumbs(), + ); + } + $report['htmlProcessorMatches'] = $matches; + } + + echo json_encode_safe( $report ) . "\n"; + exit( 0 ); +} + +$seed = option_int( $options, 'seed', -1 ); +if ( $seed < 0 ) { + echo "Usage: php tools/css-selector-fuzz/replay.php --seed N [--json] [--show-html]\n"; + echo " php tools/css-selector-fuzz/replay.php --selector 'div > .cls' [--html '
']\n"; + exit( 1 ); +} + +$result = Worker::run_case( $seed ); + +if ( option_bool( $options, 'json', false ) ) { + echo json_encode_safe( $result ) . "\n"; + exit( array() === $result['failures'] ? 0 : 2 ); +} + +echo "seed: {$result['seed']}\n"; +echo "bucket: {$result['bucket']}\n"; +echo 'selector: ' . printable_bytes( $result['selector'] ) . "\n"; +echo "digest: {$result['digest']}\n"; + +if ( option_bool( $options, 'show-html', false ) ) { + echo "html: " . printable_bytes( $result['html'] ) . "\n"; +} + +if ( array() === $result['failures'] ) { + echo "failures: none\n"; + exit( 0 ); +} + +echo 'failures: ' . count( $result['failures'] ) . "\n"; +foreach ( $result['failures'] as $i => $failure ) { + echo "--- failure {$i}: {$failure['invariant']} ---\n"; + echo json_encode_safe( $failure['detail'] ) . "\n"; +} +exit( 2 ); diff --git a/tools/css-selector-fuzz/runner.php b/tools/css-selector-fuzz/runner.php new file mode 100644 index 0000000000000..414eb167a660e --- /dev/null +++ b/tools/css-selector-fuzz/runner.php @@ -0,0 +1,337 @@ +#!/usr/bin/env php + array( 'pipe', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'pipe', 'w' ), + ); + + $started = microtime( true ); + $proc = proc_open( $command, $descriptors, $pipes, repo_root() ); + if ( ! is_resource( $proc ) ) { + return array( + 'code' => null, + 'timedOut' => false, + 'stdout' => '', + 'stderr' => 'proc_open failed', + 'durationMs' => 0, + ); + } + + fclose( $pipes[0] ); + stream_set_blocking( $pipes[1], false ); + stream_set_blocking( $pipes[2], false ); + + $stdout = ''; + $stderr = ''; + $timed_out = false; + $deadline = $started + $timeout_ms / 1000; + + while ( true ) { + $status = proc_get_status( $proc ); + $stdout .= (string) stream_get_contents( $pipes[1] ); + $stderr .= (string) stream_get_contents( $pipes[2] ); + + if ( ! 
/**
 * Adds the derived `vacuous`, `nonVacuousRate` and `vacuousRate` fields to
 * every bucket/target entry once the raw counts have been aggregated.
 *
 * Missing counts are read as 0, the vacuous count never goes below 0, and
 * both rates are 0.0 when there were no assertions. Rates are rounded to
 * four decimal places. Any other keys of an entry are left untouched.
 *
 * @param array $stats Counts keyed by bucket, then by match target.
 * @return array The same structure with the derived fields filled in.
 */
function css_selector_fuzz_finalize_match_stats( array $stats ): array {
	foreach ( $stats as $bucket => $targets ) {
		foreach ( $targets as $match_target => $counts ) {
			$total      = (int) ( $counts['assertions'] ?? 0 );
			$meaningful = (int) ( $counts['nonVacuous'] ?? 0 );

			$hollow = $total - $meaningful;
			if ( $hollow < 0 ) {
				$hollow = 0;
			}

			if ( $total > 0 ) {
				$meaningful_rate = round( $meaningful / $total, 4 );
				$hollow_rate     = round( $hollow / $total, 4 );
			} else {
				$meaningful_rate = 0.0;
				$hollow_rate     = 0.0;
			}

			$stats[ $bucket ][ $match_target ]['vacuous']        = $hollow;
			$stats[ $bucket ][ $match_target ]['nonVacuousRate'] = $meaningful_rate;
			$stats[ $bucket ][ $match_target ]['vacuousRate']    = $hollow_rate;
		}
	}

	return $stats;
}
'/failures.ndjson'; +$state_path = $output_dir . '/state.json'; +$worker_script = __DIR__ . '/worker.php'; + +$state = array( + 'kind' => 'css-selector-fuzz-runner-state', + 'startedAt' => gmdate( 'c' ), + 'updatedAt' => gmdate( 'c' ), + 'git' => git_metadata(), + 'phpVersion' => PHP_VERSION, + 'outputDir' => $output_dir, + 'startSeed' => $start_seed, + 'maxSeeds' => $max_seeds, + 'durationSeconds' => $duration_seconds, + 'chunkSize' => $chunk_size, + 'casesCompleted' => 0, + 'failures' => 0, + 'crashes' => 0, + 'buckets' => array(), + 'signatures' => array(), + 'lexbor' => array(), + 'matchStats' => array(), + 'nextSeed' => $start_seed, + 'stopReason' => null, +); +css_selector_fuzz_write_state( $state_path, $state ); + +$deadline = $duration_seconds > 0 ? microtime( true ) + $duration_seconds : null; +$seed = $start_seed; +$end_seed = $start_seed + $max_seeds; + +while ( $seed < $end_seed ) { + if ( null !== $deadline && microtime( true ) > $deadline ) { + $state['stopReason'] = 'duration-elapsed'; + break; + } + + $count = min( $chunk_size, $end_seed - $seed ); + $args = array( + $worker_script, + '--start-seed', + (string) $seed, + '--count', + (string) $count, + '--failures-out', + $failures_path, + '--progress-file', + $output_dir . '/progress.txt', + ); + + $proc = css_selector_fuzz_run_php( $args, $timeout_ms ); + $summary = css_selector_fuzz_worker_summary( $proc['stdout'] ); + + if ( null === $summary ) { + /* + * The worker crashed, hung, or died fatally. Re-run each seed of the + * chunk in its own process to attribute the crash. 
+ */ + fwrite( STDERR, "chunk seed={$seed} count={$count}: worker crashed/hung; isolating…\n" ); + for ( $isolated = $seed; $isolated < $seed + $count; $isolated++ ) { + $single = css_selector_fuzz_run_php( + array( + $worker_script, + '--start-seed', + (string) $isolated, + '--count', + '1', + '--failures-out', + $failures_path, + '--determinism-every', + '0', + ), + max( 5000, (int) ( $timeout_ms / $count ) + 5000 ) + ); + $single_summary = css_selector_fuzz_worker_summary( $single['stdout'] ); + if ( null === $single_summary ) { + ++$state['crashes']; + ++$state['failures']; + append_ndjson( + $failures_path, + array( + 'kind' => 'css-selector-fuzz-failure', + 'seed' => $isolated, + 'invariant' => $single['timedOut'] ? 'worker-timeout' : 'worker-crash', + 'signature' => $single['timedOut'] ? 'worker-timeout' : 'worker-crash', + 'exitCode' => $single['code'], + 'stderrTail' => substr( $single['stderr'], -2000 ), + ) + ); + $key = $single['timedOut'] ? 'worker-timeout' : 'worker-crash'; + $state['signatures'][ $key ] = ( $state['signatures'][ $key ] ?? 0 ) + 1; + } else { + ++$state['casesCompleted']; + $state['failures'] += $single_summary['failures']; + foreach ( $single_summary['buckets'] as $bucket => $bucket_count ) { + $state['buckets'][ $bucket ] = ( $state['buckets'][ $bucket ] ?? 0 ) + $bucket_count; + } + foreach ( $single_summary['signatures'] as $signature => $signature_count ) { + $state['signatures'][ $signature ] = ( $state['signatures'][ $signature ] ?? 0 ) + $signature_count; + } + foreach ( $single_summary['lexbor'] ?? array() as $lexbor_state => $lexbor_count ) { + $state['lexbor'][ $lexbor_state ] = ( $state['lexbor'][ $lexbor_state ] ?? 0 ) + $lexbor_count; + } + css_selector_fuzz_merge_match_stats( $state['matchStats'], $single_summary['matchStats'] ?? 
array() ); + } + } + } else { + $state['casesCompleted'] += array_sum( $summary['buckets'] ); + $state['failures'] += $summary['failures']; + foreach ( $summary['buckets'] as $bucket => $bucket_count ) { + $state['buckets'][ $bucket ] = ( $state['buckets'][ $bucket ] ?? 0 ) + $bucket_count; + } + foreach ( $summary['signatures'] as $signature => $signature_count ) { + $state['signatures'][ $signature ] = ( $state['signatures'][ $signature ] ?? 0 ) + $signature_count; + } + foreach ( $summary['lexbor'] ?? array() as $lexbor_state => $lexbor_count ) { + $state['lexbor'][ $lexbor_state ] = ( $state['lexbor'][ $lexbor_state ] ?? 0 ) + $lexbor_count; + } + css_selector_fuzz_merge_match_stats( $state['matchStats'], $summary['matchStats'] ?? array() ); + } + + $seed += $count; + $state['nextSeed'] = $seed; + $state['updatedAt'] = gmdate( 'c' ); + css_selector_fuzz_write_state( $state_path, $state ); + + if ( $stop_on_failure && $state['failures'] > 0 ) { + $state['stopReason'] = 'stop-on-failure'; + break; + } +} + +if ( null === $state['stopReason'] ) { + $state['stopReason'] = 'max-seeds'; +} +$state['updatedAt'] = gmdate( 'c' ); +css_selector_fuzz_write_state( $state_path, $state ); + +/* + * The lexbor differential is the third oracle. If it ever ran ( 'compared' ) + * it was built and live; any 'unavailable' or 'error' tally then means it + * was missing for some cases or died mid-run, so part of the run had only + * two oracles. Surface that loudly rather than letting a green run hide it. + */ +$lexbor = $state['lexbor']; +$lexbor_ran = ( $lexbor['compared'] ?? 0 ) > 0; +$lexbor_lost = ( $lexbor['unavailable'] ?? 0 ) + ( $lexbor['error'] ?? 0 ); +if ( $lexbor_ran && $lexbor_lost > 0 ) { + fwrite( STDERR, "WARNING: lexbor third oracle was unavailable/errored for {$lexbor_lost} case(s); those ran with two oracles.\n" ); +} elseif ( ! 
/**
 * Reports whether a backslash escape that is neither a hex escape nor an
 * escaped newline appears somewhere after the first non-ASCII byte.
 *
 * A backslash counts only when another byte follows it and that byte is not
 * a hex digit, "\n", "\r" or "\f". A trailing backslash never counts.
 *
 * @param string $selector Raw selector bytes.
 * @return bool True when such an escape follows a byte above 0x7F.
 */
function has_identity_escape_after_multibyte( string $selector ): bool {
	// Nothing can qualify until a byte above 0x7F has been seen.
	if ( 1 !== preg_match( '/[\x80-\xFF]/', $selector, $first_high, PREG_OFFSET_CAPTURE ) ) {
		return false;
	}

	$last_index = strlen( $selector ) - 1;
	$from       = $first_high[0][1] + 1;

	while ( true ) {
		$slash = strpos( $selector, '\\', $from );
		if ( false === $slash || $slash >= $last_index ) {
			// No further backslash, or it is the final byte with nothing to escape.
			return false;
		}

		$escaped         = $selector[ $slash + 1 ];
		$is_newline_like = false !== strpos( "\n\r\f", $escaped );
		if ( ! $is_newline_like && ! ctype_xdigit( $escaped ) ) {
			return true;
		}

		$from = $slash + 1;
	}
}
); + +$e = new Prng( '42', 'fork-test' ); +$f = new Prng( '42', 'fork-test' ); +$fork1 = $e->fork( 'x' ); +$fork2 = $f->fork( 'x' ); +check( $fork1->bytes( 32 ) === $fork2->bytes( 32 ), 'Forked streams are deterministic.' ); + +// --- utf8_codepoints -------------------------------------------------------- + +$points = utf8_codepoints( "a\u{E9}\u{1F600}" ); +check( 3 === count( $points ), 'utf8_codepoints splits into 3 codepoints.' ); +check( 0x61 === $points[0][1] && 0xE9 === $points[1][1] && 0x1F600 === $points[2][1], 'utf8_codepoints decodes values.' ); + +// --- Document generator: model matches parse for many seeds --------------- +// ( Worker::run_case checks this per case as model-desync; here only a couple +// of seeds are sampled for a fast signal. ) + +for ( $seed = 1; $seed <= 3; $seed++ ) { + $document = DocumentGenerator::generate( new Prng( (string) $seed, 'self-check-doc' ) ); + check( is_string( $document['html'] ) && '' !== $document['html'], "Document {$seed} renders." ); + check( str_contains( $document['html'], 'data-fid' ) || false !== strpos( $document['html'], 'data-fid' ), "Document {$seed} has fids." ); +} + +// --- Selector generator expectations over many seeds ----------------------- + +$by_bucket = array(); +$allowed_parse_mismatches = array(); +for ( $seed = 1; $seed <= 400; $seed++ ) { + $prng = new Prng( (string) $seed, 'self-check-selector' ); + $document = DocumentGenerator::generate( $prng->fork( 'doc' ) ); + $selector = SelectorGenerator::generate( $prng->fork( 'sel' ), $document['pools'] ); + + $by_bucket[ $selector['bucket'] ] = ( $by_bucket[ $selector['bucket'] ] ?? 
0 ) + 1; + + $compound = WP_CSS_Compound_Selector_List::from_selectors( $selector['selector'] ); + $complex = WP_CSS_Complex_Selector_List::from_selectors( $selector['selector'] ); + + if ( null !== $selector['expectCompound'] ) { + $expected = $selector['expectCompound']; + $actual = null !== $compound; + $known = known_core_parse_mismatch( $selector['selector'], $expected, $actual ); + if ( null !== $known ) { + $allowed_parse_mismatches[ "compound:{$known}" ] = ( $allowed_parse_mismatches[ "compound:{$known}" ] ?? 0 ) + 1; + } else { + check( + $expected === $actual, + "Seed {$seed} ({$selector['bucket']}): compound parse expectation for: " . \CssSelectorFuzz\printable_bytes( $selector['selector'] ) + ); + } + } + if ( null !== $selector['expectComplex'] ) { + $expected = $selector['expectComplex']; + $actual = null !== $complex; + $known = known_core_parse_mismatch( $selector['selector'], $expected, $actual ); + if ( null !== $known ) { + $allowed_parse_mismatches[ "complex:{$known}" ] = ( $allowed_parse_mismatches[ "complex:{$known}" ] ?? 0 ) + 1; + } else { + check( + $expected === $actual, + "Seed {$seed} ({$selector['bucket']}): complex parse expectation for: " . \CssSelectorFuzz\printable_bytes( $selector['selector'] ) + ); + } + } +} + +check( count( $by_bucket ) >= 5, 'Bucket variety: saw ' . count( $by_bucket ) . ' buckets.' ); +if ( array() !== $allowed_parse_mismatches ) { + fwrite( STDERR, 'Allowed known core parse bug signatures: ' . \CssSelectorFuzz\json_encode_safe( $allowed_parse_mismatches ) . 
"\n" ); +} + +// --- Document generator: randomized class NUL injection -------------------- + +$safe_class_nul = 0; +for ( $seed = 1; $seed <= 200; $seed++ ) { + $document = DocumentGenerator::generate( new Prng( (string) $seed, 'self-check-class-nul-safe' ) ); + if ( false !== strpos( $document['html'], "\0" ) ) { + ++$safe_class_nul; + check( false === str_contains( implode( "\n", $document['pools']['attrValues'] ), "\0" ), "Safe document {$seed}: class NUL does not leak into attrValues pool." ); + check( \CssSelectorFuzz\ast_strings_are_utf8( $document['pools']['classes'] ), "Safe document {$seed}: class pool strings stay valid UTF-8." ); + check( in_array( true, array_map( static function ( string $class ): bool { + return false !== strpos( $class, "\u{FFFD}" ); + }, $document['pools']['classes'] ), true ), "Safe document {$seed}: class pool contains decoded U+FFFD token." ); + } +} +check( $safe_class_nul > 0, "Safe document generator emits randomized class NUL values ({$safe_class_nul} of 200)." ); + +$wild_class_nul = 0; +for ( $seed = 1; $seed <= 200; $seed++ ) { + $document = WildDocumentGenerator::generate( new Prng( (string) $seed, 'self-check-class-nul-wild' ) ); + if ( false !== strpos( $document['html'], "\0" ) ) { + ++$wild_class_nul; + check( false === str_contains( implode( "\n", $document['pools']['attrValues'] ), "\0" ), "Wild document {$seed}: class NUL does not leak into attrValues pool." ); + check( \CssSelectorFuzz\ast_strings_are_utf8( $document['pools']['classes'] ), "Wild document {$seed}: class pool strings stay valid UTF-8." ); + check( in_array( true, array_map( static function ( string $class ): bool { + return false !== strpos( $class, "\u{FFFD}" ); + }, $document['pools']['classes'] ), true ), "Wild document {$seed}: class pool contains decoded U+FFFD token." ); + } +} +check( $wild_class_nul > 0, "Wild document generator emits randomized class NUL values ({$wild_class_nul} of 200)." 
); + +// --- Invalid-UTF-8 bucket: post-scrub AST expectations by construction ------ +// from_selectors() replaces each maximal subpart of an ill-formed UTF-8 +// sequence with one U+FFFD before parsing ( CSS Syntax §3.2 via the WHATWG +// decoder ). The bucket injects raw ill-formed sequences and carries the +// post-scrub AST, with the per-class subpart counts hard-coded in the +// generator — independent of wp_scrub_utf8(), so this loop is a real +// differential between the generator's WHATWG expectations and the core +// scrub + parse pipeline. + +$fffd_ast_counts = array(); +$injection_sites = array(); +$byte_classes = array(); + +// The class names AND byte values are duplicated here on purpose: tallying +// from the generator's own table would silently shrink the assertion with a +// deleted entry and self-validate on a drifted byte value. +$expected_byte_classes = array( + 'lone-continuation' => "\x80", + 'truncated-2-byte' => "\xC3", + 'truncated-3-byte' => "\xE2\x8C", + 'truncated-4-byte' => "\xF0\x9F\x82", + 'invalid-lead-f5' => "\xF5", + 'invalid-lead-ff' => "\xFF", + 'overlong-min' => "\xC0\x80", + 'overlong-max' => "\xC1\xBF", + 'surrogate-half' => "\xED\xA0\x80", + 'beyond-max' => "\xF4\x90\x80\x80", +); + +$count_fffd = static function ( $node ) use ( &$count_fffd ): int { + if ( is_string( $node ) ) { + return substr_count( $node, "\u{FFFD}" ); + } + $total = 0; + if ( is_array( $node ) ) { + foreach ( $node as $child ) { + $total += $count_fffd( $child ); + } + } + return $total; +}; + +for ( $seed = 1; $seed <= 150; $seed++ ) { + $prng = new Prng( (string) $seed, 'self-check-invalid-utf8' ); + $document = DocumentGenerator::generate( $prng->fork( 'doc' ) ); + $case = SelectorGenerator::generate( $prng->fork( 'sel' ), $document['pools'], null, 'invalid-utf8' ); + $printable = \CssSelectorFuzz\printable_bytes( $case['selector'] ); + + check( 'invalid-utf8' === $case['bucket'], "Seed {$seed}: forced invalid-utf8 bucket, got {$case['bucket']}." 
); + check( ! wp_is_valid_utf8( $case['selector'] ), "Seed {$seed}: selector must contain invalid UTF-8: {$printable}" ); + check( true === $case['expectCompound'] && true === $case['expectComplex'], "Seed {$seed}: invalid-utf8 cases must expect to parse in both grammars." ); + check( is_array( $case['ast'] ) && \CssSelectorFuzz\ast_strings_are_utf8( $case['ast'] ), "Seed {$seed}: expected AST must be valid UTF-8." ); + + $compound = WP_CSS_Compound_Selector_List::from_selectors( $case['selector'] ); + $complex = WP_CSS_Complex_Selector_List::from_selectors( $case['selector'] ); + check( null !== $compound, "Seed {$seed}: compound parse after scrub for: {$printable}" ); + check( null !== $complex, "Seed {$seed}: complex parse after scrub for: {$printable}" ); + if ( null === $complex || ! is_array( $case['ast'] ) ) { + continue; + } + + $parsed_ast = \CssSelectorFuzz\AstExtractor::from_complex_list( $complex ); + check( $case['ast'] === $parsed_ast, "Seed {$seed}: parsed AST equals maximal-subpart scrub expectation for: {$printable}" ); + + $fffd_ast_counts[ $count_fffd( $case['ast'] ) ] = true; + foreach ( (array) $case['ast'][0]['self']['subs'] as $sub ) { + $injection_sites[ 'attr' === $sub['kind'] && null !== $sub['matcher'] ? 'attr-value' : $sub['kind'] ] = true; + } + foreach ( $expected_byte_classes as $class_name => $class_bytes ) { + // Substring attribution is ambiguous only for lone-continuation, + // whose byte occurs inside three longer classes — good enough for + // an at-least-once variety tally. + if ( str_contains( $case['selector'], $class_bytes ) ) { + $byte_classes[ $class_name ] = true; + } + } +} + +foreach ( array( 1, 2, 3, 4 ) as $expected_count ) { + check( isset( $fffd_ast_counts[ $expected_count ] ), "Invalid-utf8 variety: a {$expected_count}-subpart byte class was generated." 
); +} +foreach ( array( 'class', 'id', 'attr', 'attr-value' ) as $site ) { + check( isset( $injection_sites[ $site ] ), "Invalid-utf8 variety: injection site {$site} was generated." ); +} +foreach ( array_keys( $expected_byte_classes ) as $class_name ) { + check( isset( $byte_classes[ $class_name ] ), "Invalid-utf8 variety: byte class {$class_name} was generated." ); +} + +// --- Mutated bucket: raw invalid-byte splicing ------------------------------- +// mutate() must be able to splice raw ill-formed UTF-8 into a selector at +// arbitrary byte offsets; these cases carry no AST expectation and exercise +// crash / scrub-notice / differential paths only. The marker bytes here can +// appear in NO rendered selector (the pools' multibyte characters use other +// lead bytes), so their presence proves the mutation operation fired. + +$mutated_with_invalid = 0; +for ( $seed = 1; $seed <= 200; $seed++ ) { + $prng = new Prng( (string) $seed, 'self-check-mutated-utf8' ); + $document = DocumentGenerator::generate( $prng->fork( 'doc' ) ); + $case = SelectorGenerator::generate( $prng->fork( 'sel' ), $document['pools'], null, 'mutated' ); + if ( false !== strpbrk( $case['selector'], "\xC0\xC1\xED\xF4\xF5\xFF" ) ) { + ++$mutated_with_invalid; + } +} +check( $mutated_with_invalid >= 10, "Mutated bucket splices raw invalid bytes ({$mutated_with_invalid} of 200 seeds)." ); + +// --- Known-answer matching cases ------------------------------------------- + +$known_html = '' + . '
' + . '
/**
 * Collects the `data-fid` attribute of each element that
 * WP_HTML_Processor::select() stops on for the given selector, in the order
 * select() visits them.
 *
 * NOTE(review): the processor returned by create_full_parser() is used
 * without a null check — confirm it cannot be null for these inputs.
 *
 * @param string $html     Markup to parse with the full parser.
 * @param string $selector Selector passed to select() on every step.
 * @return array One entry per match; each is whatever get_attribute() returns.
 */
function select_fids( string $html, string $selector ): array {
	$fids   = array();
	$parser = WP_HTML_Processor::create_full_parser( $html );

	while ( true ) {
		if ( ! $parser->select( $selector ) ) {
			return $fids;
		}
		$fids[] = $parser->get_attribute( 'data-fid' );
	}
}
/**
 * Computes the reference matcher's answer for a selector over a TreeCapture
 * of the markup.
 *
 * Returns the single-element list `array( '(error)' )` when the capture
 * reports an error or the selector does not parse as a complex selector list.
 *
 * @param string $html     Markup handed to TreeCapture::capture().
 * @param string $selector Selector text to parse and match.
 * @return array Result of ReferenceMatcher::expected_html_matches_rows(), or
 *               the error marker described above.
 */
function ref_fids( string $html, string $selector ): array {
	$snapshot = \CssSelectorFuzz\TreeCapture::capture( $html );
	$parsed   = WP_CSS_Complex_Selector_List::from_selectors( $selector );

	$usable = null === $snapshot['error'] && null !== $parsed;
	if ( ! $usable ) {
		return array( '(error)' );
	}

	return \CssSelectorFuzz\ReferenceMatcher::expected_html_matches_rows(
		\CssSelectorFuzz\AstExtractor::from_complex_list( $parsed ),
		$snapshot['htmlRows'],
		$snapshot['quirks']
	);
}
); +} + +if ( 0 === $failures ) { + echo "self-check OK\n"; + exit( 0 ); +} +echo "self-check FAILED: {$failures} failure(s)\n"; +exit( 1 ); diff --git a/tools/css-selector-fuzz/worker.php b/tools/css-selector-fuzz/worker.php new file mode 100644 index 0000000000000..bdcde442aa943 --- /dev/null +++ b/tools/css-selector-fuzz/worker.php @@ -0,0 +1,34 @@ +#!/usr/bin/env php + 'css-selector-fuzz-worker-fatal', + 'error' => \CssSelectorFuzz\Worker::describe_throwable( $e ), + ) + ) . "\n" + ); + exit( 1 ); +}