diff --git a/src/wp-includes/css-api/class-wp-css-builder.php b/src/wp-includes/css-api/class-wp-css-builder.php
new file mode 100644
index 0000000000000..56ae7a5ba009a
--- /dev/null
+++ b/src/wp-includes/css-api/class-wp-css-builder.php
@@ -0,0 +1,241 @@
+= 0x80 ) {
+ $result .= $value[ $i ];
+ continue;
+ }
+
+ // ASCII letters and underscore: always valid in idents.
+ if (
+ ( $byte >= 0x41 && $byte <= 0x5A ) || // A-Z
+ ( $byte >= 0x61 && $byte <= 0x7A ) || // a-z
+ 0x5F === $byte // _
+ ) {
+ $result .= $value[ $i ];
+ continue;
+ }
+
+ // Hyphen: valid in idents, but check for hyphen-digit at start.
+ if ( 0x2D === $byte ) {
+ // Hyphen at position 0 followed by a digit at position 1: escape the digit.
+ if ( 0 === $i && $i + 1 < $length && ord( $value[ $i + 1 ] ) >= 0x30 && ord( $value[ $i + 1 ] ) <= 0x39 ) {
+ $result .= '-';
+ ++$i;
+ $result .= sprintf( '\\%X ', ord( $value[ $i ] ) );
+ continue;
+ }
+ $result .= '-';
+ continue;
+ }
+
+ // Digits: valid except at position 0.
+ if ( $byte >= 0x30 && $byte <= 0x39 ) {
+ if ( 0 === $i ) {
+ $result .= sprintf( '\\%X ', $byte );
+ } else {
+ $result .= $value[ $i ];
+ }
+ continue;
+ }
+
+ // Everything else: hex-escape.
+ $result .= sprintf( '\\%X ', $byte );
+ }
+
+ return $result;
+ }
+
+	/**
+	 * Creates a double-quoted CSS string from a plain PHP string value.
+	 *
+	 * Example:
+	 *
+	 *     Input:  a "b" & c
+	 *     Output: "a \22 b\22  \26  c"
+	 *
+	 * The value is passed through wp_scrub_utf8() and then wrapped in double
+	 * quotes. Backslashes, quotes, newlines, and a handful of characters that
+	 * are significant in HTML (`<`, `>`, `&`) or in CSS (`,`, `;`, `{`, `}`)
+	 * are written as hexadecimal escape sequences so that rudimentary CSS or
+	 * HTML processors cannot misread them. Every escape is followed by a single
+	 * space, which terminates the escape sequence.
+	 *
+	 * CR, CRLF, and FF are all written as an escaped line feed, and NULL bytes
+	 * become U+FFFD, mirroring CSS input preprocessing.
+	 *
+	 * @see https://www.w3.org/TR/css-syntax-3/#escaping
+	 * @see https://www.w3.org/TR/css-syntax-3/#input-preprocessing
+	 *
+	 * @param string $value Plain text to represent as a CSS string.
+	 * @return string The escaped value wrapped in double quotes.
+	 */
+	public static function string( string $value ): string {
+		$escaped_newline = '\\A ';
+
+		/*
+		 * strtr() with an array always prefers the longest matching key and
+		 * never rescans its own output, so "\r\n" wins over "\r" and the
+		 * backslashes introduced by the replacements are not escaped again.
+		 */
+		$replacements = array(
+			// Existing backslashes would otherwise start unintended escapes.
+			'\\'   => '\\5C ',
+
+			// The string delimiters themselves.
+			'"'    => '\\22 ',
+			"'"    => '\\27 ',
+
+			// NULL is replaced during CSS input preprocessing.
+			"\0"   => "\u{FFFD}",
+
+			// Every newline flavor becomes an escaped line feed.
+			"\r\n" => $escaped_newline,
+			"\r"   => $escaped_newline,
+			"\f"   => $escaped_newline,
+			"\n"   => $escaped_newline,
+
+			// Characters that may be problematic inside HTML.
+			'<'    => '\\3C ',
+			'>'    => '\\3E ',
+			'&'    => '\\26 ',
+
+			// Characters that may be problematic for simple CSS processors.
+			','    => '\\2C ',
+			';'    => '\\3B ',
+			'{'    => '\\7B ',
+			'}'    => '\\7D ',
+		);
+
+		$body = strtr( wp_scrub_utf8( $value ), $replacements );
+
+		return '"' . $body . '"';
+	}
+
+	/**
+	 * Re-serializes CSS one token at a time, escaping identifiers and strings.
+	 *
+	 * The input is scrubbed with wp_scrub_utf8() and tokenized with
+	 * WP_CSS_Token_Processor. Punctuation is emitted verbatim, every run of
+	 * whitespace collapses to one space, ident-like values go through
+	 * self::ident(), and string values go through self::string(). Comments
+	 * and bad strings are copied from the scrubbed input unchanged.
+	 *
+	 * @param string $css CSS source text.
+	 * @return string Re-serialized CSS, or an empty string when no token
+	 *                processor could be created for the input.
+	 * @throws Error When a URL, bad-URL, CDO, CDC, or unrecognized token is
+	 *               encountered; these are not handled yet.
+	 */
+	public static function normalize_and_escape_css( string $css ): string {
+		$css       = wp_scrub_utf8( $css );
+		$processor = WP_CSS_Token_Processor::create( $css );
+		if ( null === $processor ) {
+			return '';
+		}
+
+		// Tokens whose serialization is a fixed piece of text.
+		$fixed_output = array(
+			// Basic punctuation.
+			WP_CSS_Token_Processor::TOKEN_SEMICOLON     => ';',
+			WP_CSS_Token_Processor::TOKEN_COMMA         => ',',
+			WP_CSS_Token_Processor::TOKEN_WHITESPACE    => ' ',
+			WP_CSS_Token_Processor::TOKEN_COLON         => ':',
+
+			// Paired punctuation.
+			WP_CSS_Token_Processor::TOKEN_LEFT_BRACE    => '{',
+			WP_CSS_Token_Processor::TOKEN_RIGHT_BRACE   => '}',
+			WP_CSS_Token_Processor::TOKEN_LEFT_PAREN    => '(',
+			WP_CSS_Token_Processor::TOKEN_RIGHT_PAREN   => ')',
+			WP_CSS_Token_Processor::TOKEN_LEFT_BRACKET  => '[',
+			WP_CSS_Token_Processor::TOKEN_RIGHT_BRACKET => ']',
+		);
+
+		$normalized_css = '';
+
+		while ( $processor->next_token() ) {
+			$token_type = $processor->get_token_type();
+
+			if ( isset( $fixed_output[ $token_type ] ) ) {
+				$normalized_css .= $fixed_output[ $token_type ];
+				continue;
+			}
+
+			switch ( $token_type ) {
+				// "@" + ident
+				case WP_CSS_Token_Processor::TOKEN_AT_KEYWORD:
+					$normalized_css .= '@' . self::ident( $processor->get_token_value() );
+					break;
+
+				// ident + "("
+				case WP_CSS_Token_Processor::TOKEN_FUNCTION:
+					$normalized_css .= self::ident( $processor->get_token_value() ) . '(';
+					break;
+
+				/*
+				 * Hash tokens are not idents but their value can be escaped as such.
+				 *
+				 * ‖→ "#" →─┐   ┌──────────────────────────────┐   ┌─→‖
+				 *          ├─→─┤ a-z A-Z 0-9 _ - or non-ASCII ├─→─┤
+				 *          │   └──────────────────────────────┘   │
+				 *          │   ┌──────────────────────────────┐   │
+				 *          ├─→─┤            escape            ├─→─┤
+				 *          │   └──────────────────────────────┘   │
+				 *          └──────────────────←───────────────────┘
+				 */
+				case WP_CSS_Token_Processor::TOKEN_HASH:
+					$normalized_css .= '#' . self::ident( $processor->get_token_value() );
+					break;
+
+				case WP_CSS_Token_Processor::TOKEN_DIMENSION:
+					$normalized_css .= $processor->get_token_value() . $processor->get_token_unit();
+					break;
+
+				// A percentage is written as the number followed by "%".
+				case WP_CSS_Token_Processor::TOKEN_PERCENTAGE:
+					$normalized_css .= $processor->get_token_value() . '%';
+					break;
+
+				case WP_CSS_Token_Processor::TOKEN_NUMBER:
+				case WP_CSS_Token_Processor::TOKEN_DELIM:
+					$normalized_css .= $processor->get_token_value();
+					break;
+
+				case WP_CSS_Token_Processor::TOKEN_IDENT:
+					$normalized_css .= self::ident( $processor->get_token_value() );
+					break;
+
+				case WP_CSS_Token_Processor::TOKEN_STRING:
+					$normalized_css .= self::string( $processor->get_token_value() );
+					break;
+
+				// Keep or strip comments? For now they are copied verbatim.
+				case WP_CSS_Token_Processor::TOKEN_COMMENT:
+					$normalized_css .= substr( $css, $processor->get_token_start(), $processor->get_token_length() );
+					break;
+
+				/*
+				 * A <bad-string-token> is an open string that reaches a newline.
+				 * The raw text is kept and followed by a newline.
+				 *
+				 * @see https://www.w3.org/TR/css-syntax-3/#consume-string-token
+				 *
+				 * @see https://www.w3.org/TR/css-syntax-3/#preserved-tokens
+				 * > Note: The tokens <}-token>s, <)-token>s, <]-token>, <bad-string-token>,
+				 * > and <bad-url-token> are always parse errors, but they are preserved in
+				 * > the token stream by this specification to allow other specs, such as
+				 * > Media Queries, to define more fine-grained error-handling than just
+				 * > dropping an entire declaration or block.
+				 */
+				case WP_CSS_Token_Processor::TOKEN_BAD_STRING:
+					$normalized_css .= substr( $css, $processor->get_token_start(), $processor->get_token_length() ) . "\n";
+					break;
+
+				case WP_CSS_Token_Processor::TOKEN_URL:
+				case WP_CSS_Token_Processor::TOKEN_BAD_URL:
+				case WP_CSS_Token_Processor::TOKEN_CDC:
+				case WP_CSS_Token_Processor::TOKEN_CDO:
+				default:
+					throw new Error( 'unhandled token type ' . $token_type . ' with value ' . var_export( $processor->get_token_value(), true ) );
+			}
+		}
+
+		/*
+		 * NOTE(review): this makes whitespace visible (space becomes "␠", tab
+		 * and newline gain a leading marker). It looks like a debugging aid
+		 * and means the return value is not directly usable as CSS. Confirm
+		 * whether it should remain before this ships.
+		 */
+		return strtr(
+			$normalized_css,
+			array(
+				' '  => '␠',
+				"\t" => "␉\t",
+				"\n" => "␊\n",
+			)
+		);
+	}
+}
diff --git a/src/wp-includes/css-api/class-wp-css-token-processor.php b/src/wp-includes/css-api/class-wp-css-token-processor.php
new file mode 100644
index 0000000000000..f61775fdf6228
--- /dev/null
+++ b/src/wp-includes/css-api/class-wp-css-token-processor.php
@@ -0,0 +1,1845 @@
+ Replace any U+000D CARRIAGE RETURN (CR) code points, U+000C FORM FEED (FF)
+ * > code points, or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE
+ * > FEED (LF) in input by a single U+000A LINE FEED (LF) code point.
+ * > Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT
+ * > CHARACTER (�).
+ *
+ * This processor delays normalization as much as possible. That keeps the raw byte
+ * positions intact for accurate rewrites while still letting consumers ask for a
+ * normalized token when they need one.
+ *
+ * ### No EOF token
+ *
+ * The EOF token is a CSS parsing concept, not a CSS tokenization concept. Therefore,
+ * this processor does not produce it.
+ *
+ * ### UTF-8 handling
+ *
+ * Only UTF-8 strings are supported. Invalid sequences are replaced with U+FFFD (�)
+ * using the maximal subpart approach described in
+ * https://www.unicode.org/versions/Unicode9.0.0/ch03.pdf, section 3.9 Best Practices
+ * for Using U+FFFD.
+ *
+ * ## Usage
+ *
+ * Basic iteration:
+ *
+ * $css = 'width: 10px;';
+ * $processor = WP_CSS_Token_Processor::create( $css );
+ * while ( $processor->next_token() ) {
+ * echo $processor->get_normalized_token();
+ * }
+ * // Outputs:
+ * // width: 10px;
+ *
+ * Rewriting a URL while keeping the rest of the stylesheet intact:
+ *
+ * $css = 'background: url(old.jpg) center / cover;';
+ * $processor = WP_CSS_Token_Processor::create( $css );
+ * while ( $processor->next_token() ) {
+ * if ( WP_CSS_Token_Processor::TOKEN_URL === $processor->get_token_type() ) {
+ * $processor->set_value( 'uploads/new.jpg' );
+ * }
+ * }
+ * $result = $processor->get_updated_css();
+ * // background: url(uploads/new.jpg) center / cover;
+ *
+ * Gathering diagnostics with byte offsets:
+ *
+ * $css = "color: red;\ncolor: re\nd;";
+ * $processor = WP_CSS_Token_Processor::create( $css );
+ * $bad_strings = array();
+ * while ( $processor->next_token() ) {
+ * if ( WP_CSS_Token_Processor::TOKEN_BAD_STRING === $processor->get_token_type() ) {
+ * $bad_strings[] = array(
+ * 'start' => $processor->get_token_start(),
+ * 'length' => $processor->get_token_length(),
+ * 'value' => $processor->get_unnormalized_token(),
+ * );
+ * }
+ * }
+ *
+ * @see https://www.w3.org/TR/css-syntax-3/#tokenization
+ */
+class WP_CSS_Token_Processor {
+ /**
+ * Token type constants matching the CSS Syntax Level 3 specification.
+ *
+ * @see https://www.w3.org/TR/css-syntax-3/#tokenization
+ */
+ public const TOKEN_WHITESPACE = 'whitespace-token';
+ public const TOKEN_COMMENT = 'comment';
+ public const TOKEN_STRING = 'string-token';
+
+ /**
+ * BAD-STRING tokens occur when a string contains an unescaped newline.
+ *
+ * Valid strings: "hello", 'world', "line1\Aline2" (escaped newline)
+ * Invalid (produces bad-string): "hello
+ * world" (literal newline breaks the string)
+ *
+ * The processor stops at the newline and produces a bad-string token for error recovery.
+ *
+ * @see https://www.w3.org/TR/css-syntax-3/#typedef-bad-string-token
+ */
+ public const TOKEN_BAD_STRING = 'bad-string-token';
+ public const TOKEN_HASH = 'hash-token';
+ public const HASH_TOKEN_ID = 'id';
+ public const HASH_TOKEN_UNRESTRICTED = 'unrestricted';
+ public const TOKEN_DELIM = 'delim-token';
+ public const TOKEN_NUMBER = 'number-token';
+ public const TOKEN_PERCENTAGE = 'percentage-token';
+ public const TOKEN_DIMENSION = 'dimension-token';
+ public const TOKEN_AT_KEYWORD = 'at-keyword-token';
+ public const TOKEN_COLON = 'colon-token';
+ public const TOKEN_SEMICOLON = 'semicolon-token';
+ public const TOKEN_COMMA = 'comma-token';
+ public const TOKEN_LEFT_PAREN = '(-token';
+ public const TOKEN_RIGHT_PAREN = ')-token';
+ public const TOKEN_LEFT_BRACKET = '[-token';
+ public const TOKEN_RIGHT_BRACKET = ']-token';
+ public const TOKEN_LEFT_BRACE = '{-token';
+ public const TOKEN_RIGHT_BRACE = '}-token';
+ public const TOKEN_FUNCTION = 'function-token';
+
+ /**
+ * URL tokens represent unquoted URLs in url() notation.
+ *
+ * For example, `url(image.jpg)` is a URL token.
+ *
+ * Quoted URLs like `url( "https://example.com" )` are handled as a function
+ * token, _not_ a URL token.
+ *
+ * Bad URL tokens are created when invalid characters are encountered in
+ * a URL token.
+ *
+ * @see https://www.w3.org/TR/css-syntax-3/#typedef-url-token
+ */
+ public const TOKEN_URL = 'url-token';
+
+ /**
+ * BAD-URL tokens occur when a URL contains invalid characters.
+ *
+ * Invalid characters: quotes ("), apostrophes ('), parentheses (()
+ * Example invalid: url(image(.jpg) or url(image".jpg)
+ *
+ * When detected, the processor consumes everything up to ) or EOF.
+ * This prevents the bad URL from breaking subsequent tokens.
+ *
+ * @see https://www.w3.org/TR/css-syntax-3/#typedef-bad-url-token
+ */
+ public const TOKEN_BAD_URL = 'bad-url-token';
+
+ /**
+ * Identifier tokens, such as `color`, `margin-top`, `red`,
+ * `inherit`, `--my-var`, `\x-escaped`, `über` (Unicode), etc.
+ *
+ * There are restrictions on the codepoints that start or are contained in
+ * an identifier, and identifiers may contain escape sequences.
+ *
+ * @see https://www.w3.org/TR/css-syntax-3/#typedef-ident-token
+ */
+ public const TOKEN_IDENT = 'ident-token';
+
+ /**
+ * CDC (Comment Delimiter Close) token: -->
+ *
+ * Legacy token from when CSS embedded in HTML was wrapped in HTML comments.
+ *
+ * Modern CSS no longer needs these, but they're preserved for compatibility.
+ * In stylesheets, they're typically treated like whitespace.
+ *
+ * @see https://www.w3.org/TR/css-syntax-3/#typedef-CDC-token
+ */
+ public const TOKEN_CDC = 'CDC-token';
+
+ /**
+ * CDO (Comment Delimiter Open) token: )
+ *
+ * Comment Delimiter Close - legacy HTML comment syntax in CSS.
+ *
+ * @see https://www.w3.org/TR/css-syntax-3/#CDC-token-diagram
+ */
+ if (
+ $this->at + 2 < $this->length &&
+ '-' === $this->css[ $this->at + 1 ] &&
+ '>' === $this->css[ $this->at + 2 ]
+ ) {
+ // Consume them and return a <CDC-token>.
+ $this->at += 3;
+ $this->token_type = self::TOKEN_CDC;
+ $this->token_length = 3;
+ return true;
+ }
+
+ // Otherwise, if the input stream starts with an ident sequence,
+ // reconsume the current input code point, consume an ident-like
+ // token, and return it.
+ if ( $this->check_if_3_code_points_start_an_ident_sequence( $this->at ) ) {
+ return $this->consume_ident_like();
+ }
+
+ // Otherwise, return a <delim-token> with its value set to the current input code point.
+ ++$this->at;
+ $this->token_type = self::TOKEN_DELIM;
+ $this->token_length = 1;
+ return true;
+ }
+
+ /*
+ * U+003C LESS-THAN SIGN (<)
+ * If followed by !--, this is a CDO token (\n",
+ "tokens": [
+ {
+ "type": "CDC-token",
+ "raw": "-->",
+ "startIndex": 0,
+ "endIndex": 3,
+ "normalized": "-->",
+ "value": null
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 3,
+ "endIndex": 4,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident/0001": {
+ "css": "foo\n",
+ "tokens": [
+ {
+ "type": "ident-token",
+ "raw": "foo",
+ "startIndex": 0,
+ "endIndex": 3,
+ "normalized": "foo",
+ "value": "foo"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 3,
+ "endIndex": 4,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident/0002": {
+ "css": "--\n",
+ "tokens": [
+ {
+ "type": "ident-token",
+ "raw": "--",
+ "startIndex": 0,
+ "endIndex": 2,
+ "normalized": "--",
+ "value": "--"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 2,
+ "endIndex": 3,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident/0003": {
+ "css": "--0\n",
+ "tokens": [
+ {
+ "type": "ident-token",
+ "raw": "--0",
+ "startIndex": 0,
+ "endIndex": 3,
+ "normalized": "--0",
+ "value": "--0"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 3,
+ "endIndex": 4,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident/0004": {
+ "css": "-\\\n",
+ "tokens": [
+ {
+ "type": "delim-token",
+ "raw": "-",
+ "startIndex": 0,
+ "endIndex": 1,
+ "normalized": "-",
+ "value": "-"
+ },
+ {
+ "type": "delim-token",
+ "raw": "\\",
+ "startIndex": 1,
+ "endIndex": 2,
+ "normalized": "\\",
+ "value": "\\"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 2,
+ "endIndex": 3,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident/0005": {
+ "css": "-\\ \n",
+ "tokens": [
+ {
+ "type": "ident-token",
+ "raw": "-\\ ",
+ "startIndex": 0,
+ "endIndex": 3,
+ "normalized": "- ",
+ "value": "- "
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 3,
+ "endIndex": 4,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident/0006": {
+ "css": "--💅\n",
+ "tokens": [
+ {
+ "type": "ident-token",
+ "raw": "--💅",
+ "startIndex": 0,
+ "endIndex": 6,
+ "normalized": "--💅",
+ "value": "--💅"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 6,
+ "endIndex": 7,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident/0007": {
+ "css": "-§\n",
+ "tokens": [
+ {
+ "type": "ident-token",
+ "raw": "-§",
+ "startIndex": 0,
+ "endIndex": 3,
+ "normalized": "-§",
+ "value": "-§"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 3,
+ "endIndex": 4,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident/0008": {
+ "css": "-×\n",
+ "tokens": [
+ {
+ "type": "ident-token",
+ "raw": "-×",
+ "startIndex": 0,
+ "endIndex": 3,
+ "normalized": "-×",
+ "value": "-×"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 3,
+ "endIndex": 4,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident/0009": {
+ "css": "--a𐀀\n",
+ "tokens": [
+ {
+ "type": "ident-token",
+ "raw": "--a𐀀",
+ "startIndex": 0,
+ "endIndex": 7,
+ "normalized": "--a𐀀",
+ "value": "--a𐀀"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 7,
+ "endIndex": 8,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident-like/0001": {
+ "css": "url(foo)\n",
+ "tokens": [
+ {
+ "type": "url-token",
+ "raw": "url(foo)",
+ "startIndex": 0,
+ "endIndex": 8,
+ "normalized": "url(foo)",
+ "value": "foo"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 8,
+ "endIndex": 9,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident-like/0002": {
+ "css": "\\75 Rl(foo)\n",
+ "tokens": [
+ {
+ "type": "url-token",
+ "raw": "\\75 Rl(foo)",
+ "startIndex": 0,
+ "endIndex": 11,
+ "normalized": "uRl(foo)",
+ "value": "foo"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 11,
+ "endIndex": 12,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident-like/0003": {
+ "css": "uR\\6c (foo)\n",
+ "tokens": [
+ {
+ "type": "url-token",
+ "raw": "uR\\6c (foo)",
+ "startIndex": 0,
+ "endIndex": 11,
+ "normalized": "uRl(foo)",
+ "value": "foo"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 11,
+ "endIndex": 12,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident-like/0004": {
+ "css": "url('foo')\n",
+ "tokens": [
+ {
+ "type": "function-token",
+ "raw": "url(",
+ "startIndex": 0,
+ "endIndex": 4,
+ "normalized": "url(",
+ "value": "url"
+ },
+ {
+ "type": "string-token",
+ "raw": "'foo'",
+ "startIndex": 4,
+ "endIndex": 9,
+ "normalized": "'foo'",
+ "value": "foo"
+ },
+ {
+ "type": ")-token",
+ "raw": ")",
+ "startIndex": 9,
+ "endIndex": 10,
+ "normalized": ")",
+ "value": null
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 10,
+ "endIndex": 11,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident-like/0005": {
+ "css": "url( 'foo')\n",
+ "tokens": [
+ {
+ "type": "function-token",
+ "raw": "url(",
+ "startIndex": 0,
+ "endIndex": 4,
+ "normalized": "url(",
+ "value": "url"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": " ",
+ "startIndex": 4,
+ "endIndex": 5,
+ "normalized": " ",
+ "value": null
+ },
+ {
+ "type": "string-token",
+ "raw": "'foo'",
+ "startIndex": 5,
+ "endIndex": 10,
+ "normalized": "'foo'",
+ "value": "foo"
+ },
+ {
+ "type": ")-token",
+ "raw": ")",
+ "startIndex": 10,
+ "endIndex": 11,
+ "normalized": ")",
+ "value": null
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 11,
+ "endIndex": 12,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident-like/0006": {
+ "css": "url( 'foo')\n",
+ "tokens": [
+ {
+ "type": "function-token",
+ "raw": "url(",
+ "startIndex": 0,
+ "endIndex": 4,
+ "normalized": "url(",
+ "value": "url"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": " ",
+ "startIndex": 4,
+ "endIndex": 6,
+ "normalized": " ",
+ "value": null
+ },
+ {
+ "type": "string-token",
+ "raw": "'foo'",
+ "startIndex": 6,
+ "endIndex": 11,
+ "normalized": "'foo'",
+ "value": "foo"
+ },
+ {
+ "type": ")-token",
+ "raw": ")",
+ "startIndex": 11,
+ "endIndex": 12,
+ "normalized": ")",
+ "value": null
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 12,
+ "endIndex": 13,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident-like/0007": {
+ "css": "url( 'foo')\n",
+ "tokens": [
+ {
+ "type": "function-token",
+ "raw": "url(",
+ "startIndex": 0,
+ "endIndex": 4,
+ "normalized": "url(",
+ "value": "url"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": " ",
+ "startIndex": 4,
+ "endIndex": 7,
+ "normalized": " ",
+ "value": null
+ },
+ {
+ "type": "string-token",
+ "raw": "'foo'",
+ "startIndex": 7,
+ "endIndex": 12,
+ "normalized": "'foo'",
+ "value": "foo"
+ },
+ {
+ "type": ")-token",
+ "raw": ")",
+ "startIndex": 12,
+ "endIndex": 13,
+ "normalized": ")",
+ "value": null
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 13,
+ "endIndex": 14,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident-like/0008": {
+ "css": "not-url( 'foo')\n",
+ "tokens": [
+ {
+ "type": "function-token",
+ "raw": "not-url(",
+ "startIndex": 0,
+ "endIndex": 8,
+ "normalized": "not-url(",
+ "value": "not-url"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": " ",
+ "startIndex": 8,
+ "endIndex": 11,
+ "normalized": " ",
+ "value": null
+ },
+ {
+ "type": "string-token",
+ "raw": "'foo'",
+ "startIndex": 11,
+ "endIndex": 16,
+ "normalized": "'foo'",
+ "value": "foo"
+ },
+ {
+ "type": ")-token",
+ "raw": ")",
+ "startIndex": 16,
+ "endIndex": 17,
+ "normalized": ")",
+ "value": null
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 17,
+ "endIndex": 18,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/ident-like/0009": {
+ "css": "url( foo)\n",
+ "tokens": [
+ {
+ "type": "url-token",
+ "raw": "url( foo)",
+ "startIndex": 0,
+ "endIndex": 11,
+ "normalized": "url( foo)",
+ "value": "foo"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 11,
+ "endIndex": 12,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/left-curly-bracket/0001": {
+ "css": "{\n",
+ "tokens": [
+ {
+ "type": "{-token",
+ "raw": "{",
+ "startIndex": 0,
+ "endIndex": 1,
+ "normalized": "{",
+ "value": null
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 1,
+ "endIndex": 2,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/left-parenthesis/0001": {
+ "css": "(\n",
+ "tokens": [
+ {
+ "type": "(-token",
+ "raw": "(",
+ "startIndex": 0,
+ "endIndex": 1,
+ "normalized": "(",
+ "value": null
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 1,
+ "endIndex": 2,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/left-square-bracket/0001": {
+ "css": "[\n",
+ "tokens": [
+ {
+ "type": "[-token",
+ "raw": "[",
+ "startIndex": 0,
+ "endIndex": 1,
+ "normalized": "[",
+ "value": null
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 1,
+ "endIndex": 2,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/less-than/0001": {
+ "css": "<\n",
+ "tokens": [
+ {
+ "type": "delim-token",
+ "raw": "<",
+ "startIndex": 0,
+ "endIndex": 1,
+ "normalized": "<",
+ "value": "<"
+ },
+ {
+ "type": "whitespace-token",
+ "raw": "\n",
+ "startIndex": 1,
+ "endIndex": 2,
+ "normalized": "\n",
+ "value": null
+ }
+ ]
+ },
+ "tests/less-than/0002": {
+ "css": "' );
+ $html = '';
+ foreach ( $bits as $bit ) {
+ if ( $this->prng->chance( 35 ) ) {
+ $html .= $this->prng->choice( $filler );
+ }
+ $html .= $bit;
+ }
+
+ foreach ( $this->pools as $key => $values ) {
+ $this->pools[ $key ] = array_values( array_unique( $values ) );
+ }
+
+ return array(
+ 'model' => null,
+ 'children' => $children,
+ 'html' => $html,
+ 'context' => '',
+ 'fragment' => true,
+ 'quirks' => false,
+ 'pools' => $this->pools,
+ );
+ }
+
+	/**
+	 * Produces rows ( TreeCapture shape ) for a `<body>`-context fragment.
+	 *
+	 * Each top-level child is flattened via self::flatten_with_ancestors()
+	 * with synthetic BODY and HTML elements supplied as the starting
+	 * ancestors, matching the implicit ancestors the fragment parser reports.
+	 *
+	 * Per row: the tag and ancestor tags are upper-cased, attribute names are
+	 * lower-cased, and only the first occurrence of an attribute name is kept.
+	 *
+	 * @param array $children Top-level model elements of the fragment.
+	 * @return array List of rows with keys 'tag', 'fid', 'attrs', 'ancestorTags'.
+	 */
+	public static function rows_from_fragment( array $children ): array {
+		$implicit_ancestors = array(
+			array(
+				'tag'      => 'body',
+				'fid'      => '(body)',
+				'attrs'    => array(),
+				'children' => $children,
+			),
+			array(
+				'tag'      => 'html',
+				'fid'      => '(html)',
+				'attrs'    => array(),
+				'children' => array(),
+			),
+		);
+
+		$rows = array();
+		foreach ( $children as $top_level ) {
+			foreach ( self::flatten_with_ancestors( $top_level, $implicit_ancestors ) as $pair ) {
+				$node  = $pair[0];
+				$chain = $pair[1];
+
+				// First occurrence of a (case-insensitive) attribute name wins.
+				$by_name = array();
+				foreach ( $node['attrs'] as $attribute ) {
+					$key = ascii_strtolower( $attribute[0] );
+					if ( ! isset( $by_name[ $key ] ) ) {
+						$by_name[ $key ] = array( $key, $attribute[1] );
+					}
+				}
+
+				$chain_tags = array();
+				foreach ( $chain as $ancestor ) {
+					$chain_tags[] = strtoupper( ascii_strtolower( $ancestor['tag'] ) );
+				}
+
+				$rows[] = array(
+					'tag'          => strtoupper( ascii_strtolower( $node['tag'] ) ),
+					'fid'          => $node['fid'],
+					'attrs'        => array_values( $by_name ),
+					'ancestorTags' => $chain_tags,
+				);
+			}
+		}
+
+		return $rows;
+	}
+
+	/**
+	 * Builds a random full HTML document together with the model describing it.
+	 *
+	 * The document is an `html` element containing a `head` (optionally with
+	 * `title` and `meta`) and a `body` holding between one and six random
+	 * subtrees, limited by the element budget. Whether a doctype precedes the
+	 * markup is decided up front, and the 'quirks' flag mirrors that decision.
+	 *
+	 * The order of the PRNG calls below determines the generated document, so
+	 * statements must not be reordered.
+	 *
+	 * @return array {
+	 *     @type array  $model  Root `html` element of the model.
+	 *     @type string $html   Rendered markup.
+	 *     @type bool   $quirks True when no doctype was emitted.
+	 *     @type array  $pools  Value pools, de-duplicated and re-indexed.
+	 * }
+	 */
+	private function build(): array {
+		$has_doctype = $this->prng->chance( 85 );
+
+		$head_children = array();
+		if ( $this->prng->chance( 60 ) ) {
+			$head_children[] = $this->make_element( 'title', array(), array() );
+		}
+		if ( $this->prng->chance( 30 ) ) {
+			$head_children[] = $this->make_element( 'meta', $this->random_attrs(), array() );
+		}
+
+		$body_children = array();
+		$child_budget  = $this->prng->int( 1, 6 );
+		for ( $i = 0; $i < $child_budget && $this->element_count < $this->max_elements; $i++ ) {
+			$body_children[] = $this->random_subtree( 0 );
+		}
+
+		$head = $this->make_element( 'head', array(), $head_children );
+		$body = $this->make_element( 'body', $this->prng->chance( 30 ) ? $this->random_attrs() : array(), $body_children );
+		$html = $this->make_element( 'html', $this->prng->chance( 20 ) ? $this->random_attrs() : array(), array( $head, $body ) );
+
+		// The doctype must really be emitted, or 'quirks' below would not match the markup.
+		$rendered = ( $has_doctype ? '<!DOCTYPE html>' : '' ) . $this->render_element( $html );
+
+		foreach ( $this->pools as $key => $values ) {
+			$this->pools[ $key ] = array_values( array_unique( $values ) );
+		}
+
+		return array(
+			'model'  => $html,
+			'html'   => $rendered,
+			'quirks' => ! $has_doctype,
+			'pools'  => $this->pools,
+		);
+	}
+
+	/**
+	 * Generates a random element subtree and counts it against the budget.
+	 *
+	 * A leaf is produced once the depth reaches 7, the element budget is
+	 * used up, or the PRNG's `chance( 25 )` check succeeds. Otherwise one to
+	 * four children are requested, stopping early when the budget runs out.
+	 *
+	 * @param int $depth Nesting depth of the element being generated.
+	 * @return array Model element as returned by make_element().
+	 */
+	private function random_subtree( int $depth ): array {
+		++$this->element_count;
+
+		// Short-circuit order matters: the PRNG is only consulted when the limits allow nesting.
+		$stop_here = $depth >= 7
+			|| $this->element_count >= $this->max_elements
+			|| $this->prng->chance( 25 );
+
+		if ( ! $stop_here ) {
+			$kids   = array();
+			$wanted = $this->prng->int( 1, 4 );
+			$made   = 0;
+			while ( $made < $wanted && $this->element_count < $this->max_elements ) {
+				$kids[] = $this->random_subtree( $depth + 1 );
+				++$made;
+			}
+
+			return $this->make_element( $this->prng->choice( self::SAFE_TAGS ), $this->random_attrs(), $kids );
+		}
+
+		// Leaf: sometimes a void element, otherwise an empty safe element.
+		if ( $this->prng->chance( 25 ) ) {
+			return $this->make_element( $this->prng->choice( self::VOID_TAGS ), $this->random_attrs(), array(), true );
+		}
+
+		return $this->make_element( $this->prng->choice( self::SAFE_TAGS ), $this->random_attrs(), array() );
+	}
+
+	/**
+	 * Creates a model element and records its tag name in the tag pool.
+	 *
+	 * Every element receives the next sequential fid ("e0", "e1", ...). When
+	 * the PRNG's `chance( 15 )` check succeeds the tag is stored with
+	 * randomized letter case; the pool always receives the tag as passed in.
+	 *
+	 * @param string $tag      Tag name.
+	 * @param array  $attrs    Attribute name/value pairs.
+	 * @param array  $children Child model elements.
+	 * @param bool   $is_void  Forces the element to be treated as void.
+	 * @return array Model element with keys 'tag', 'fid', 'attrs', 'children', 'void'.
+	 */
+	private function make_element( string $tag, array $attrs, array $children, bool $is_void = false ): array {
+		$fid = 'e' . $this->fid_counter;
+		++$this->fid_counter;
+
+		$written_tag = $tag;
+		if ( $this->prng->chance( 15 ) ) {
+			$written_tag = $this->random_case( $tag );
+		}
+
+		$this->pools['tags'][] = $tag;
+
+		// Tags treated as void even when the caller did not say so.
+		$always_void = array( 'meta', 'br', 'hr', 'img', 'wbr', 'input', 'embed' );
+		$void        = $is_void || in_array( strtolower( $tag ), $always_void, true );
+
+		return array(
+			'tag'      => $written_tag,
+			'fid'      => $fid,
+			'attrs'    => $attrs,
+			'children' => $children,
+			'void'     => $void,
+		);
+	}
+
+	/**
+	 * Generates a random attribute list for one element.
+	 *
+	 * Between zero and four attempts are made. A name that was already used
+	 * is skipped unless the PRNG's `chance( 20 )` check succeeds, so repeats
+	 * stay occasional (the processor keeps the first). Lower-cased names are
+	 * recorded in pools['attrNames']; string values of attributes other than
+	 * `class` are recorded in pools['attrValues'].
+	 *
+	 * @return array Name/value pairs in source order; a value of `true`
+	 *               denotes an attribute written without a value.
+	 */
+	private function random_attrs(): array {
+		$attempts = $this->prng->weighted(
+			array(
+				0 => 15,
+				1 => 30,
+				2 => 30,
+				3 => 15,
+				4 => 10,
+			)
+		);
+
+		$pairs = array();
+		$taken = array();
+		for ( $attempt = 0; $attempt < $attempts; $attempt++ ) {
+			$name = $this->prng->choice( self::ATTR_NAMES );
+			$key  = ascii_strtolower( $name );
+
+			// Occasionally repeat an attribute name: the processor keeps the first.
+			if ( isset( $taken[ $key ] ) && ! $this->prng->chance( 20 ) ) {
+				continue;
+			}
+			$taken[ $key ] = true;
+
+			if ( $this->prng->chance( 12 ) ) {
+				$name = $this->random_case( $name );
+			}
+
+			$lower = ascii_strtolower( $name );
+			if ( 'class' === $lower ) {
+				$value = $this->random_class_value();
+			} elseif ( 'id' === $lower ) {
+				if ( $this->prng->chance( 85 ) ) {
+					$value = $this->random_id_value();
+				} elseif ( $this->prng->chance( 50 ) ) {
+					$value = '';
+				} else {
+					$value = true;
+				}
+			} elseif ( in_array( $lower, array( 'disabled', 'hidden' ), true ) ) {
+				if ( $this->prng->chance( 70 ) ) {
+					$value = true;
+				} else {
+					$value = $this->prng->choice( array( '', 'disabled', 'true' ) );
+				}
+			} elseif ( $this->prng->chance( 12 ) ) {
+				$value = true;
+			} else {
+				$value = $this->random_attr_value();
+			}
+
+			$this->pools['attrNames'][] = $lower;
+			if ( 'class' !== $lower && is_string( $value ) ) {
+				$this->pools['attrValues'][] = $value;
+			}
+
+			$pairs[] = array( $name, $value );
+		}
+
+		return $pairs;
+	}
+
+	/**
+	 * Generates a random `class` attribute value of one to four class names.
+	 *
+	 * All names are generated first, each possibly receiving a NUL byte, and
+	 * their tokens ( per self::class_tokens() ) are added to pools['classes'].
+	 * The names are then joined with randomly chosen whitespace, optionally
+	 * with leading and/or trailing whitespace.
+	 *
+	 * @return string Raw attribute value.
+	 */
+	private function random_class_value(): string {
+		$how_many = $this->prng->int( 1, 4 );
+		$names    = array();
+		for ( $n = 0; $n < $how_many; $n++ ) {
+			$raw_name = $this->maybe_inject_class_nul( $this->random_word( true ) );
+			$names[]  = $raw_name;
+			foreach ( self::class_tokens( $raw_name ) as $token ) {
+				$this->pools['classes'][] = $token;
+			}
+		}
+
+		$ws = array( ' ', ' ', ' ', "\t", "\n", "\f", ' ' );
+
+		$value = '';
+		if ( $this->prng->chance( 20 ) ) {
+			$value = $this->prng->choice( $ws );
+		}
+
+		$is_first = true;
+		foreach ( $names as $class_name ) {
+			if ( ! $is_first ) {
+				$value .= $this->prng->choice( $ws );
+			}
+			$value   .= $class_name;
+			$is_first = false;
+		}
+
+		if ( $this->prng->chance( 20 ) ) {
+			$value .= $this->prng->choice( $ws );
+		}
+
+		return $value;
+	}
+
+	/**
+	 * Occasionally inserts a single NUL byte into a class name.
+	 *
+	 * Empty names are returned untouched without consulting the PRNG. For
+	 * other names, when `chance( 12 )` succeeds, a NUL byte is inserted
+	 * before a randomly chosen code point or appended after the last one.
+	 *
+	 * @param string $class Class name.
+	 * @return string The class name, possibly containing one NUL byte.
+	 */
+	private function maybe_inject_class_nul( string $class ): string {
+		if ( '' === $class ) {
+			return $class;
+		}
+		if ( ! $this->prng->chance( 12 ) ) {
+			return $class;
+		}
+
+		$points = utf8_codepoints( $class );
+		$total  = count( $points );
+		$target = $this->prng->int( 0, $total );
+
+		$with_nul = '';
+		foreach ( $points as $index => $point ) {
+			if ( $target === $index ) {
+				$with_nul .= "\0";
+			}
+			// Element 0 of each entry is what gets concatenated back together.
+			$with_nul .= $point[0];
+		}
+
+		// A target equal to the number of code points means "append at the end".
+		if ( $total === $target ) {
+			$with_nul .= "\0";
+		}
+
+		return $with_nul;
+	}
+
+	/**
+	 * Generates a random `id` value and records it in pools['ids'].
+	 *
+	 * @return string The generated id.
+	 */
+	private function random_id_value(): string {
+		$generated = $this->random_word( true );
+
+		$this->pools['ids'][] = $generated;
+
+		return $generated;
+	}
+
+	/**
+	 * Generates a random value for a generic attribute.
+	 *
+	 * A weighted draw selects the kind of value: a single word, several
+	 * whitespace-separated words, a hyphenated pair, the empty string, a
+	 * value with awkward punctuation, or a value with non-ASCII characters.
+	 *
+	 * @return string Attribute value; 'fallback' if the draw yields an unknown kind.
+	 */
+	private function random_attr_value(): string {
+		$kind = $this->prng->weighted(
+			array(
+				'word'       => 35,
+				'words'      => 20,
+				'hyphenated' => 15,
+				'empty'      => 8,
+				'spicy'      => 12,
+				'unicode'    => 10,
+			)
+		);
+
+		$result = 'fallback';
+
+		switch ( $kind ) {
+			case 'word':
+				$result = $this->random_word( true );
+				break;
+
+			case 'words':
+				$words = array();
+				$total = $this->prng->int( 2, 4 );
+				for ( $w = 0; $w < $total; $w++ ) {
+					$words[] = $this->random_word( true );
+				}
+				$separator = $this->prng->choice( array( ' ', ' ', "\t", "\n" ) );
+				$result    = implode( $separator, $words );
+				break;
+
+			case 'hyphenated':
+				$result = $this->random_word( false ) . '-' . $this->random_word( false );
+				break;
+
+			case 'empty':
+				$result = '';
+				break;
+
+			case 'spicy':
+				$result = $this->prng->choice(
+					array( 'a"b', "a'b", 'a&b', 'ab', 'a=b', 'a b c', '&', '"x', '100%', 'semi;colon', 'a,b' )
+				);
+				break;
+
+			case 'unicode':
+				$picked = $this->prng->choice(
+					array( 'héllo', 'ÄÖÜ', '✓done', 'naïve', 'Ωmega', '\u{1F600}smile' )
+				);
+				// The single-quoted placeholder is swapped for the real code point here.
+				$result = str_replace( '\u{1F600}', "\u{1F600}", $picked );
+				break;
+		}
+
+		return $result;
+	}
+
+	/**
+	 * Picks a random word stem, optionally with a numeric suffix and mixed case.
+	 *
+	 * @param bool $allow_mixed_case Whether the letter case may be randomized.
+	 * @return string The generated word.
+	 */
+	private function random_word( bool $allow_mixed_case ): string {
+		$picked = $this->prng->choice(
+			array( 'alpha', 'beta', 'gamma', 'delta', 'box', 'col', 'item', 'note', 'wide', 'main-item', 'x', 'a', '-lead', '--var', '_under', 'Über', 'mixedCase' )
+		);
+
+		// Sometimes append a number between 0 and 99.
+		if ( $this->prng->chance( 30 ) ) {
+			$picked .= (string) $this->prng->int( 0, 99 );
+		}
+
+		// The PRNG is only consulted for casing when mixed case is allowed.
+		if ( $allow_mixed_case && $this->prng->chance( 15 ) ) {
+			return $this->random_case( $picked );
+		}
+
+		return $picked;
+	}
+
+	/**
+	 * Randomizes the letter case of a string, one byte at a time.
+	 *
+	 * For each byte the PRNG's `chance( 50 )` check selects strtoupper() or
+	 * strtolower().
+	 *
+	 * @param string $input Text to transform.
+	 * @return string Text of the same byte length with randomized case.
+	 */
+	private function random_case( string $input ): string {
+		$length = strlen( $input );
+		$mixed  = '';
+		$at     = 0;
+
+		while ( $at < $length ) {
+			$byte = $input[ $at ];
+			if ( $this->prng->chance( 50 ) ) {
+				$mixed .= strtoupper( $byte );
+			} else {
+				$mixed .= strtolower( $byte );
+			}
+			++$at;
+		}
+
+		return $mixed;
+	}
+
+ /*
+ * ---------
+ * Rendering
+ * ---------
+ */
+
+	/**
+	 * Renders a model element, and recursively its children, to HTML.
+	 *
+	 * The start tag always carries a `data-fid` attribute with the element's
+	 * fid, followed by the model attributes in order. Void elements end after
+	 * the start tag, sometimes written with a trailing ` />`. Other elements
+	 * get their rendered children, optional filler between and after them,
+	 * and a closing tag using the tag exactly as written in the model.
+	 *
+	 * The order of the PRNG calls determines the output, so statements must
+	 * not be reordered.
+	 *
+	 * @param array $element Model element as produced by make_element().
+	 * @return string HTML markup for the element.
+	 */
+	private function render_element( array $element ): string {
+		$out = '<' . $element['tag'];
+
+		$rendered_attrs = array( ' data-fid="' . $element['fid'] . '"' );
+		foreach ( $element['attrs'] as $attr ) {
+			$rendered_attrs[] = ' ' . $this->render_attr( $attr[0], $attr[1] );
+		}
+		$out .= implode( '', $rendered_attrs );
+
+		if ( $element['void'] ) {
+			$out .= $this->prng->chance( 25 ) ? ' />' : '>';
+			return $out;
+		}
+
+		$out .= '>';
+
+		$child_bits = array();
+		foreach ( $element['children'] as $child ) {
+			$child_bits[] = $this->render_element( $child );
+		}
+
+		/*
+		 * Sprinkle text and comments between children — but never directly
+		 * inside `html` or `head`, where character tokens would trigger
+		 * insertion-mode changes (early body creation, head popping) that
+		 * desynchronize the model from the parsed tree.
+		 *
+		 * Filler must never introduce elements of its own, so markup-looking
+		 * text is written with character references.
+		 */
+		$lower_tag       = strtolower( $element['tag'] );
+		$may_have_filler = ! in_array( $lower_tag, array( 'html', 'head' ), true );
+		$filler_options  = array(
+			'',
+			'text',
+			' more text ',
+			"\n ",
+			'&amp; &lt;escaped&gt;',
+			'<!-- comment -->',
+			'café ✓',
+		);
+
+		$content = '';
+		foreach ( $child_bits as $bit ) {
+			if ( $may_have_filler && $this->prng->chance( 40 ) ) {
+				$content .= $this->prng->choice( $filler_options );
+			}
+			$content .= $bit;
+		}
+		if ( $may_have_filler && $this->prng->chance( 40 ) ) {
+			$content .= $this->prng->choice( $filler_options );
+		}
+		if ( 'title' === $lower_tag ) {
+			// RCDATA: keep it plain.
+			$content = $this->prng->chance( 60 ) ? 'Fuzz Title' : '';
+		}
+
+		return $out . $content . '</' . $element['tag'] . '>';
+	}
+
+	/**
+	 * Renders one attribute as `name`, `name=value`, `name='value'` or `name="value"`.
+	 *
+	 * The quoting style is drawn at random. Unquoted output is only used when
+	 * the value is non-empty and made solely of ASCII letters, digits and
+	 * `._:-`; any other value falls back to double quotes. Quoted values have
+	 * `&`, `<` and the active quote character written as character references
+	 * so that the value cannot end the attribute or start a reference.
+	 *
+	 * @param string      $name  Attribute name, emitted as given.
+	 * @param string|true $value Attribute value, or true for a valueless attribute.
+	 * @return string The attribute source text, without a leading space.
+	 */
+	private function render_attr( string $name, $value ): string {
+		if ( true === $value ) {
+			return $name;
+		}
+
+		$style = $this->prng->weighted(
+			array(
+				'double'   => 60,
+				'single'   => 20,
+				'unquoted' => 20,
+			)
+		);
+
+		if ( 'unquoted' === $style && ( '' === $value || strlen( $value ) !== strspn( $value, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._:-' ) ) ) {
+			$style = 'double';
+		}
+
+		switch ( $style ) {
+			case 'unquoted':
+				return $name . '=' . $value;
+			case 'single':
+				return $name . "='" . str_replace( array( '&', "'", '<' ), array( '&amp;', '&#39;', '&lt;' ), $value ) . "'";
+			default:
+				return $name . '="' . str_replace( array( '&', '"', '<' ), array( '&amp;', '&quot;', '&lt;' ), $value ) . '"';
+		}
+	}
+
+ /*
+ * ----------------
+ * Model utilities
+ * ----------------
+ */
+
+	/**
+	 * Lists an element and all of its descendants in pre-order ( document order ).
+	 *
+	 * @param array $element Model element with a `children` list.
+	 * @return array The element followed by every descendant.
+	 */
+	public static function flatten( array $element ): array {
+		$ordered = array( $element );
+
+		foreach ( $element['children'] as $child ) {
+			$ordered = array_merge( $ordered, self::flatten( $child ) );
+		}
+
+		return $ordered;
+	}
+
+	/**
+	 * Lists ( element, ancestors ) pairs in pre-order. Each ancestor chain
+	 * runs from the nearest ancestor to the root — the same orientation as
+	 * WP_HTML_Processor::get_breadcrumbs() reversed past self.
+	 *
+	 * @param array $element   Model element with a `children` list.
+	 * @param array $ancestors Ancestors of $element, nearest first.
+	 * @return array List of array( element, ancestors ) pairs.
+	 */
+	public static function flatten_with_ancestors( array $element, array $ancestors = array() ): array {
+		$pairs = array( array( $element, $ancestors ) );
+
+		// Children see this element as their nearest ancestor.
+		$child_ancestors = $ancestors;
+		array_unshift( $child_ancestors, $element );
+
+		foreach ( $element['children'] as $child ) {
+			$pairs = array_merge( $pairs, self::flatten_with_ancestors( $child, $child_ancestors ) );
+		}
+
+		return $pairs;
+	}
+
+	/**
+	 * Derives flat element rows ( the TreeCapture row shape ) from a model:
+	 * pre-order, tags uppercased, attribute names lowercased with the first
+	 * of duplicates winning — directly comparable to a TreeCapture of the
+	 * rendered document.
+	 *
+	 * @param array $model Root model element.
+	 * @return array Rows with `tag`, `fid`, `attrs` and `ancestorTags`.
+	 */
+	public static function rows_from_model( array $model ): array {
+		$upper_tag = static function ( array $node ): string {
+			return strtoupper( ascii_strtolower( $node['tag'] ) );
+		};
+
+		$rows = array();
+		foreach ( self::flatten_with_ancestors( $model ) as list( $element, $ancestors ) ) {
+			// Keyed by lowercased name so that later duplicates are dropped.
+			$unique_attrs = array();
+			foreach ( $element['attrs'] as $attr ) {
+				$lower_name = ascii_strtolower( $attr[0] );
+				if ( ! isset( $unique_attrs[ $lower_name ] ) ) {
+					$unique_attrs[ $lower_name ] = array( $lower_name, $attr[1] );
+				}
+			}
+
+			$ancestor_tags = array();
+			foreach ( $ancestors as $ancestor ) {
+				$ancestor_tags[] = $upper_tag( $ancestor );
+			}
+
+			$rows[] = array(
+				'tag'          => $upper_tag( $element ),
+				'fid'          => $element['fid'],
+				'attrs'        => array_values( $unique_attrs ),
+				'ancestorTags' => $ancestor_tags,
+			);
+		}
+
+		return $rows;
+	}
+
+	/**
+	 * Finds the value of the first attribute with the given name, comparing
+	 * names ASCII case-insensitively.
+	 *
+	 * @param array  $element Element or row with an `attrs` list of ( name, value ) pairs.
+	 * @param string $name    Attribute name to look up.
+	 * @return mixed The stored value of the first match, or null if absent.
+	 */
+	public static function get_attribute_value( array $element, string $name ) {
+		$wanted_name = ascii_strtolower( $name );
+
+		foreach ( $element['attrs'] as list( $attr_name, $attr_value ) ) {
+			if ( $wanted_name !== ascii_strtolower( $attr_name ) ) {
+				continue;
+			}
+			return $attr_value;
+		}
+
+		return null;
+	}
+
+	/**
+	 * Splits a class attribute value into tokens as selector matching sees
+	 * them: runs of ASCII whitespace ( space, tab, CR, LF, FF ) separate
+	 * tokens, and NUL bytes inside a token are replaced with U+FFFD.
+	 *
+	 * @param string $class_value Raw `class` attribute value.
+	 * @return string[] Tokens in source order; empty when there are none.
+	 */
+	public static function class_tokens( string $class_value ): array {
+		// Byte-oriented split ( no `u` flag ), so any byte sequence is accepted.
+		$pieces = preg_split( '/[ \t\r\n\f]+/', $class_value, -1, PREG_SPLIT_NO_EMPTY );
+		if ( false === $pieces ) {
+			return array();
+		}
+
+		return array_map(
+			static function ( string $piece ): string {
+				return str_replace( "\0", "\u{FFFD}", $piece );
+			},
+			$pieces
+		);
+	}
+}
diff --git a/tools/css-selector-fuzz/lib/LexborOracle.php b/tools/css-selector-fuzz/lib/LexborOracle.php
new file mode 100644
index 0000000000000..afb508ddc1278
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/LexborOracle.php
@@ -0,0 +1,215 @@
+ fuzzer-oracle problem ( investigate
+ * the fuzzer, 'lexbor-divergence' ).
+ * reference == lexbor != WP => high-confidence WP finding ( the
+ * regular match-mismatch-html failure
+ * with no accompanying divergence ).
+ *
+ * Known bug compensated for: lexbor #368 — class and #id selectors match
+ * ASCII case-insensitively even in no-quirks mode ( attribute selectors
+ * like [id=x] are correctly case-sensitive ). Detected by probe at startup;
+ * when present, lexbor is compared against the reference matcher run with
+ * quirks-style class/ID folding. Quirks documents are compared only when
+ * the probe also confirms class and #id selectors fold in quirks mode.
+ */
+class LexborOracle {
+
+ const READ_TIMEOUT_SECONDS = 5;
+
+ /** @var resource|null */
+ private static $process = null;
+ /** @var array|null */
+ private static $pipes = null;
+ /** @var bool|null */
+ private static $available = null;
+ /** @var bool */
+ private static $issue368 = false;
+ /** @var bool */
+ private static $quirks_class_id_reliable = false;
+
+ /**
+  * Path of the lexbor harness executable: `lexbor/harness` under the parent
+  * of this file's directory.
+  *
+  * @return string Filesystem path; the file is not checked for existence here.
+  */
+ public static function harness_path(): string {
+ return dirname( __DIR__ ) . '/lexbor/harness';
+ }
+
+	/**
+	 * Whether the harness is built, starts, and answered the probes.
+	 *
+	 * The answer is computed once and cached. The probes run a known selector
+	 * against tiny documents whose only element carries `data-fid="x"`, in
+	 * both no-quirks ( with doctype ) and quirks ( without doctype ) mode, to
+	 * record how the built lexbor treats class and #id case-sensitivity.
+	 *
+	 * NOTE(review): the probe documents assume the harness reports the
+	 * `data-fid` of each matched element, which is what the `array( 'x' )`
+	 * expectations below imply — confirm against the harness source.
+	 *
+	 * @return bool True when the harness can be used as an oracle.
+	 */
+	public static function available(): bool {
+		if ( null !== self::$available ) {
+			return self::$available;
+		}
+
+		self::$available = false;
+		if ( ! is_executable( self::harness_path() ) || ! self::start() ) {
+			return false;
+		}
+
+		$no_quirks_class_doc = '<!DOCTYPE html><div class="a" data-fid="x"></div>';
+		$no_quirks_id_doc    = '<!DOCTYPE html><div id="a" data-fid="x"></div>';
+		$quirks_class_doc    = '<div class="a" data-fid="x"></div>';
+		$quirks_id_doc       = '<div id="a" data-fid="x"></div>';
+
+		// Probe: sanity plus class/#id case-sensitivity behavior.
+		$sane = self::query( $no_quirks_class_doc, 'div.a' );
+		if ( null === $sane || array( 'x' ) !== $sane['matches'] ) {
+			self::stop();
+			return false;
+		}
+
+		// The selectors differ from the document values only by ASCII case.
+		$no_quirks_class = self::query( $no_quirks_class_doc, '.A' );
+		$no_quirks_id    = self::query( $no_quirks_id_doc, '#A' );
+		$quirks_class    = self::query( $quirks_class_doc, '.A' );
+		$quirks_id       = self::query( $quirks_id_doc, '#A' );
+		foreach ( array( $no_quirks_class, $no_quirks_id, $quirks_class, $quirks_id ) as $probe ) {
+			if ( null === $probe || null !== $probe['error'] ) {
+				self::stop();
+				return false;
+			}
+		}
+
+		// A case-mismatched match in no-quirks mode is the issue #368 signature.
+		self::$issue368                 = array( 'x' ) === $no_quirks_class['matches']
+			|| array( 'x' ) === $no_quirks_id['matches'];
+		self::$quirks_class_id_reliable = ! self::$issue368
+			&& array() === $no_quirks_class['matches']
+			&& array() === $no_quirks_id['matches']
+			&& array( 'x' ) === $quirks_class['matches']
+			&& array( 'x' ) === $quirks_id['matches'];
+		self::$available                = true;
+		return true;
+	}
+
+ /**
+  * Whether the built lexbor exhibits issue #368 ( class/ID case folding ).
+  *
+  * The flag starts as false and is assigned by available() once its probes
+  * have all answered.
+  */
+ public static function has_issue_368(): bool {
+ return self::$issue368;
+ }
+
+ /**
+  * Whether lexbor can be trusted on quirks class/#id case folding.
+  *
+  * The flag starts as false and is assigned by available() once its probes
+  * have all answered.
+  */
+ public static function quirks_class_id_reliable(): bool {
+ return self::$quirks_class_id_reliable;
+ }
+
+	/**
+	 * Runs one case through lexbor.
+	 *
+	 * The request is a single line: base64 HTML, a tab, base64 selector. The
+	 * response is read line by line until a lone `D`; `R` lines describe
+	 * element rows, `M` lines carry one match each and an `X` line carries an
+	 * error. Lines of any other kind are ignored.
+	 *
+	 * @param string $html     Document source.
+	 * @param string $selector Selector text.
+	 * @return array{
+	 *     rows: array,
+	 *     matches: string[],
+	 *     error: string|null,
+	 * }|null Null when the harness is unavailable or misbehaved ( the
+	 *        harness is stopped; the caller should skip the differential ).
+	 */
+	public static function query( string $html, string $selector ): ?array {
+		if ( null === self::$process && ! self::start() ) {
+			return null;
+		}
+
+		$request = base64_encode( $html ) . "\t" . base64_encode( $selector ) . "\n";
+		$sent    = fwrite( self::$pipes[0], $request );
+		fflush( self::$pipes[0] );
+		if ( strlen( $request ) !== $sent ) {
+			self::stop();
+			self::$available = false;
+			return null;
+		}
+
+		$result = array(
+			'rows'    => array(),
+			'matches' => array(),
+			'error'   => null,
+		);
+
+		for ( $response = self::read_line(); 'D' !== $response; $response = self::read_line() ) {
+			if ( null === $response ) {
+				// Timeout, closed pipe or read failure: give up on the harness.
+				self::stop();
+				self::$available = false;
+				return null;
+			}
+
+			$fields = explode( "\t", $response );
+			$kind   = $fields[0];
+
+			if ( 'R' === $kind ) {
+				$ancestor_field   = $fields[3] ?? '';
+				$result['rows'][] = array(
+					'tag'          => $fields[1] ?? '',
+					'fid'          => $fields[2] ?? '',
+					'ancestorTags' => '' === $ancestor_field ? array() : explode( ',', $ancestor_field ),
+				);
+			} elseif ( 'M' === $kind ) {
+				$result['matches'][] = $fields[1] ?? '';
+			} elseif ( 'X' === $kind ) {
+				$result['error'] = $fields[1] ?? 'unknown';
+			}
+		}
+
+		return $result;
+	}
+
+ /**
+  * Launches the harness as a child process and stores its handle and pipes.
+  *
+  * @return bool False when proc_open() does not return a process resource.
+  */
+ private static function start(): bool {
+ // Child stdin and stdout are pipes; child stderr is discarded.
+ $descriptors = array(
+ 0 => array( 'pipe', 'r' ),
+ 1 => array( 'pipe', 'w' ),
+ 2 => array( 'file', '/dev/null', 'w' ),
+ );
+
+ // Array form of the command: the path is passed as-is, not as a shell string.
+ $process = proc_open( array( self::harness_path() ), $descriptors, $pipes );
+ if ( ! is_resource( $process ) ) {
+ return false;
+ }
+
+ self::$process = $process;
+ self::$pipes = $pipes;
+ // Reads are non-blocking; read_line() waits with stream_select() and a deadline.
+ stream_set_blocking( $pipes[1], false );
+ return true;
+ }
+
+ /**
+  * Shuts the harness down and clears the stored handle and pipes.
+  *
+  * Does nothing when no process is stored. Errors from the cleanup calls are
+  * suppressed with `@`.
+  */
+ private static function stop(): void {
+ if ( null === self::$process ) {
+ return;
+ }
+ @fclose( self::$pipes[0] );
+ @fclose( self::$pipes[1] );
+ // Signal 9 is sent before proc_close() is called.
+ @proc_terminate( self::$process, 9 );
+ @proc_close( self::$process );
+ self::$process = null;
+ self::$pipes = null;
+ }
+
+ /**
+  * Reads one newline-terminated line from the harness output pipe.
+  *
+  * A single deadline of READ_TIMEOUT_SECONDS covers the whole line, however
+  * many chunks it arrives in.
+  *
+  * @return string|null The line without its trailing newline, or null when
+  *                     the deadline passes, stream_select() fails or reports
+  *                     nothing ready, or fgets() returns false.
+  */
+ private static function read_line(): ?string {
+ $line = '';
+ $deadline = microtime( true ) + self::READ_TIMEOUT_SECONDS;
+
+ while ( true ) {
+ // stream_select() takes its arrays by reference, so rebuild them each pass.
+ $read = array( self::$pipes[1] );
+ $write = null;
+ $except = null;
+ $left = $deadline - microtime( true );
+ if ( $left <= 0 ) {
+ return null;
+ }
+ // The remaining time is passed entirely as microseconds.
+ $ready = stream_select( $read, $write, $except, 0, (int) ( $left * 1e6 ) );
+ if ( false === $ready || 0 === $ready ) {
+ return null;
+ }
+ $chunk = fgets( self::$pipes[1] );
+ if ( false === $chunk ) {
+ return null;
+ }
+ // A chunk without a newline is a partial line: keep accumulating.
+ $line .= $chunk;
+ if ( str_ends_with( $line, "\n" ) ) {
+ return substr( $line, 0, -1 );
+ }
+ }
+ }
+}
diff --git a/tools/css-selector-fuzz/lib/Metamorph.php b/tools/css-selector-fuzz/lib/Metamorph.php
new file mode 100644
index 0000000000000..159d8e8bedd52
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/Metamorph.php
@@ -0,0 +1,150 @@
+
+ */
+	/**
+	 * Builds rewritten forms of a selector list, each paired with its AST.
+	 *
+	 * Variants, in order: `rerender` ( same AST, escape-boosted ), `typecase`
+	 * ( type names randomly re-cased ), `subs-reorder` ( subclass lists
+	 * rotated ), `universal` ( explicit `*` added ) and `dup-branch` ( one
+	 * branch repeated ). A variant whose AST equals the input is left out,
+	 * except `rerender`, which is always present.
+	 *
+	 * @param array $list_ast Canonical complex selector list AST.
+	 * @param Prng  $prng     Random source; each render uses a fork of it.
+	 * @return array List of arrays with `name`, `selector`, `ast`, `astMustMatch`.
+	 */
+	public static function variants( array $list_ast, Prng $prng ): array {
+		/*
+		 * from_selectors() scrubs invalid UTF-8 to U+FFFD before parsing, so
+		 * parsed AST names are always valid UTF-8 and this guard should be
+		 * unreachable. It stays as defense in depth: the renderer can only
+		 * round-trip valid UTF-8 names, and a future AST source that skips
+		 * normalization would otherwise corrupt the variants silently.
+		 */
+		if ( ! ast_strings_are_utf8( $list_ast ) ) {
+			return array();
+		}
+
+		$out = array();
+
+		$out[] = array(
+			'name'         => 'rerender',
+			'selector'     => SelectorGenerator::render( $prng->fork( 'rerender' ), $list_ast, true ),
+			'ast'          => $list_ast,
+			'astMustMatch' => true,
+		);
+
+		$typecase = self::map_types(
+			$list_ast,
+			static function ( string $type ) use ( $prng ): string {
+				if ( '*' === $type ) {
+					return $type;
+				}
+				$length = strlen( $type );
+				$out    = '';
+				for ( $i = 0; $i < $length; $i++ ) {
+					$c    = $type[ $i ];
+					$out .= $prng->chance( 50 ) ? strtoupper( $c ) : strtolower( $c );
+				}
+				return $out;
+			}
+		);
+		if ( $typecase !== $list_ast ) {
+			$out[] = array(
+				'name'         => 'typecase',
+				'selector'     => SelectorGenerator::render( $prng->fork( 'typecase' ), $typecase ),
+				'ast'          => $typecase,
+				'astMustMatch' => true,
+			);
+		}
+
+		$reordered = self::rotate_subs( $list_ast );
+		if ( $reordered !== $list_ast ) {
+			$out[] = array(
+				'name'         => 'subs-reorder',
+				'selector'     => SelectorGenerator::render( $prng->fork( 'subs-reorder' ), $reordered ),
+				'ast'          => $reordered,
+				'astMustMatch' => true,
+			);
+		}
+
+		$universal = self::explicit_universal( $list_ast );
+		if ( $universal !== $list_ast ) {
+			$out[] = array(
+				'name'         => 'universal',
+				'selector'     => SelectorGenerator::render( $prng->fork( 'universal' ), $universal ),
+				'ast'          => $universal,
+				'astMustMatch' => true,
+			);
+		}
+
+		/*
+		 * An empty list has no branch to repeat: indexing it would append
+		 * null and hand the renderer a malformed AST, so skip the variant.
+		 */
+		if ( array() !== $list_ast ) {
+			$duplicated   = $list_ast;
+			$duplicated[] = $list_ast[ $prng->int( 0, count( $list_ast ) - 1 ) ];
+			$out[]        = array(
+				'name'         => 'dup-branch',
+				'selector'     => SelectorGenerator::render( $prng->fork( 'dup-branch' ), $duplicated ),
+				'ast'          => $duplicated,
+				'astMustMatch' => true,
+			);
+		}
+
+		return $out;
+	}
+
+	/**
+	 * Returns a copy of the list with $fn applied to every type-selector
+	 * name: each context type first, then the compound's own type if set.
+	 *
+	 * @param array    $list_ast Canonical complex selector list AST.
+	 * @param callable $fn       Receives a type name, returns its replacement.
+	 * @return array The transformed list.
+	 */
+	private static function map_types( array $list_ast, callable $fn ): array {
+		foreach ( $list_ast as $branch => $complex ) {
+			foreach ( $complex['context'] as $step => $pair ) {
+				$list_ast[ $branch ]['context'][ $step ][0] = $fn( $pair[0] );
+			}
+
+			$own_type = $complex['self']['type'];
+			if ( null !== $own_type ) {
+				$list_ast[ $branch ]['self']['type'] = $fn( $own_type );
+			}
+		}
+
+		return $list_ast;
+	}
+
+	/**
+	 * Returns a copy of the list in which every compound with two or more
+	 * subclass selectors has its first one moved to the end.
+	 *
+	 * @param array $list_ast Canonical complex selector list AST.
+	 * @return array The transformed list.
+	 */
+	private static function rotate_subs( array $list_ast ): array {
+		foreach ( $list_ast as $branch => $complex ) {
+			$subs = $complex['self']['subs'];
+			if ( ! is_array( $subs ) || count( $subs ) < 2 ) {
+				continue;
+			}
+
+			$first  = array_shift( $subs );
+			$subs[] = $first;
+
+			$list_ast[ $branch ]['self']['subs'] = $subs;
+		}
+
+		return $list_ast;
+	}
+
+	/**
+	 * Returns a copy of the list with an explicit `*` type on every compound
+	 * that has subclass selectors but no type selector.
+	 *
+	 * @param array $list_ast Canonical complex selector list AST.
+	 * @return array The transformed list.
+	 */
+	private static function explicit_universal( array $list_ast ): array {
+		foreach ( $list_ast as $branch => $complex ) {
+			$compound = $complex['self'];
+			if ( null !== $compound['type'] || null === $compound['subs'] ) {
+				continue;
+			}
+			$list_ast[ $branch ]['self']['type'] = '*';
+		}
+
+		return $list_ast;
+	}
+}
diff --git a/tools/css-selector-fuzz/lib/Prng.php b/tools/css-selector-fuzz/lib/Prng.php
new file mode 100644
index 0000000000000..b8d8737304277
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/Prng.php
@@ -0,0 +1,65 @@
+key = $seed . "\x1f" . $label;
+ }
+
+	/**
+	 * Derives a child stream from this stream's key, the given label and one
+	 * 32-bit draw from this stream. Consuming the child does not affect this
+	 * stream, but creating it advances this stream by that one draw.
+	 *
+	 * @param string $label Name for the child stream.
+	 * @return Prng The child stream.
+	 */
+	public function fork( string $label ): Prng {
+		$child_label = $label . ':' . $this->uint32();
+
+		return new Prng( $this->key, $child_label );
+	}
+
+	/**
+	 * Takes the next $length bytes from the stream.
+	 *
+	 * The buffer is topped up with raw SHA-256 digests of the key, a colon
+	 * and an incrementing counter until it holds enough bytes.
+	 *
+	 * @param int $length Number of bytes wanted.
+	 * @return string The bytes, removed from the front of the buffer.
+	 */
+	public function bytes( int $length ): string {
+		while ( strlen( $this->buffer ) < $length ) {
+			$block = hash( 'sha256', $this->key . ':' . $this->counter, true );
+			$this->counter++;
+			$this->buffer .= $block;
+		}
+
+		$taken        = substr( $this->buffer, 0, $length );
+		$this->buffer = substr( $this->buffer, $length );
+
+		return $taken;
+	}
+
+	/**
+	 * Draws four bytes and decodes them as a big-endian unsigned 32-bit integer.
+	 *
+	 * @return int The decoded value.
+	 */
+	public function uint32(): int {
+		$decoded = unpack( 'N', $this->bytes( 4 ) );
+
+		return (int) $decoded[1];
+	}
+
+	/**
+	 * Draws an integer in the inclusive range [ $min, $max ].
+	 *
+	 * When $max is not greater than $min, $min is returned and nothing is
+	 * drawn from the stream.
+	 *
+	 * @param int $min Lower bound.
+	 * @param int $max Upper bound.
+	 * @return int The drawn value.
+	 */
+	public function int( int $min, int $max ): int {
+		if ( $max <= $min ) {
+			return $min;
+		}
+
+		$span = $max - $min + 1;
+
+		return $min + $this->uint32() % $span;
+	}
+
+	/**
+	 * Returns true with probability $numerator / $denominator, by drawing an
+	 * integer from 1 to $denominator and comparing it with $numerator.
+	 *
+	 * @param int $numerator   Number of winning outcomes.
+	 * @param int $denominator Number of possible outcomes; 100 by default.
+	 * @return bool Whether the draw won.
+	 */
+	public function chance( int $numerator, int $denominator = 100 ): bool {
+		$roll = $this->int( 1, $denominator );
+
+		return $roll <= $numerator;
+	}
+
+	/**
+	 * Picks one entry by drawing an integer index from 0 to count - 1.
+	 *
+	 * @param array $values Candidates, indexed 0..n-1.
+	 * @return mixed The entry at the drawn index.
+	 */
+	public function choice( array $values ) {
+		$last_index = count( $values ) - 1;
+		$index      = $this->int( 0, $last_index );
+
+		return $values[ $index ];
+	}
+
+	/**
+	 * Picks a key of $weights with probability proportional to its weight.
+	 *
+	 * One integer is drawn from 1 to the weight total ( at least 1 ); the
+	 * weights are then subtracted in array order until it reaches zero or
+	 * below. If that never happens, the first key is returned.
+	 *
+	 * @param array $weights value => weight
+	 * @return int|string|null The chosen key.
+	 */
+	public function weighted( array $weights ) {
+		$ceiling   = max( 1, (int) array_sum( $weights ) );
+		$remaining = $this->int( 1, $ceiling );
+
+		foreach ( $weights as $candidate => $share ) {
+			$remaining -= $share;
+			if ( $remaining <= 0 ) {
+				return $candidate;
+			}
+		}
+
+		return array_key_first( $weights );
+	}
+}
diff --git a/tools/css-selector-fuzz/lib/ReferenceMatcher.php b/tools/css-selector-fuzz/lib/ReferenceMatcher.php
new file mode 100644
index 0000000000000..ea422078e9893
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/ReferenceMatcher.php
@@ -0,0 +1,323 @@
+ true,
+ 'accept-charset' => true,
+ 'align' => true,
+ 'alink' => true,
+ 'axis' => true,
+ 'bgcolor' => true,
+ 'charset' => true,
+ 'checked' => true,
+ 'clear' => true,
+ 'codetype' => true,
+ 'color' => true,
+ 'compact' => true,
+ 'declare' => true,
+ 'defer' => true,
+ 'dir' => true,
+ 'direction' => true,
+ 'disabled' => true,
+ 'enctype' => true,
+ 'face' => true,
+ 'frame' => true,
+ 'hreflang' => true,
+ 'http-equiv' => true,
+ 'lang' => true,
+ 'language' => true,
+ 'link' => true,
+ 'media' => true,
+ 'method' => true,
+ 'multiple' => true,
+ 'nohref' => true,
+ 'noresize' => true,
+ 'noshade' => true,
+ 'nowrap' => true,
+ 'readonly' => true,
+ 'rel' => true,
+ 'rev' => true,
+ 'rules' => true,
+ 'scope' => true,
+ 'scrolling' => true,
+ 'selected' => true,
+ 'shape' => true,
+ 'target' => true,
+ 'text' => true,
+ 'type' => true,
+ 'valign' => true,
+ 'valuetype' => true,
+ 'vlink' => true,
+ );
+
+	/**
+	 * Computes the expected match list for WP_HTML_Processor::select().
+	 *
+	 * @param array $list_ast     Canonical complex selector list AST.
+	 * @param array $rows         Element rows in visit order, with ancestorTags.
+	 * @param bool  $quirks       Whether the document parses in quirks mode.
+	 * @param bool  $html_attr_ci Whether HTML's case-insensitive attribute value
+	 *                            list applies. True models WP/browsers; false
+	 *                            models an engine without the rule ( lexbor ).
+	 * @return string[] data-fid values in visit order.
+	 */
+	public static function expected_html_matches_rows( array $list_ast, array $rows, bool $quirks, bool $html_attr_ci = true ): array {
+		$fids = array();
+
+		foreach ( $rows as $row ) {
+			if ( ! self::list_matches_row( $list_ast, $row, $quirks, $html_attr_ci ) ) {
+				continue;
+			}
+			$fids[] = $row['fid'];
+		}
+
+		return $fids;
+	}
+
+	/**
+	 * Computes the expected match list for WP_HTML_Tag_Processor::select()
+	 * over the same markup. The tag processor never enters quirks mode on its
+	 * own and a compound selector list never inspects ancestors, so only each
+	 * branch's own compound is tested, without quirks folding.
+	 *
+	 * @param array $list_ast Canonical complex selector list AST ( contexts must be empty ).
+	 * @param array $rows     Tag-view element rows in token order.
+	 * @return string[] data-fid values in token order.
+	 */
+	public static function expected_tag_matches_rows( array $list_ast, array $rows ): array {
+		$fids = array();
+
+		foreach ( $rows as $row ) {
+			foreach ( $list_ast as $complex ) {
+				if ( self::compound_matches( $complex['self'], $row, false, true ) ) {
+					// One matching branch is enough; move on to the next row.
+					$fids[] = $row['fid'];
+					continue 2;
+				}
+			}
+		}
+
+		return $fids;
+	}
+
+	/**
+	 * Back-compat: expected html-processor matches from a generated model.
+	 * The model is converted to rows and handed to expected_html_matches_rows().
+	 *
+	 * @param array $list_ast Canonical complex selector list AST.
+	 * @param array $model    Root model element.
+	 * @param bool  $quirks   Whether the document parses in quirks mode.
+	 * @return array data-fid values of the expected matches.
+	 */
+	public static function expected_html_processor_matches( array $list_ast, array $model, bool $quirks ): array {
+		$rows = DocumentGenerator::rows_from_model( $model );
+
+		return self::expected_html_matches_rows( $list_ast, $rows, $quirks );
+	}
+
+	/**
+	 * Back-compat: expected tag-processor matches from a generated model.
+	 * The model is converted to rows and handed to expected_tag_matches_rows().
+	 *
+	 * @param array $list_ast Canonical complex selector list AST.
+	 * @param array $model    Root model element.
+	 * @return array data-fid values of the expected matches.
+	 */
+	public static function expected_tag_processor_matches( array $list_ast, array $model ): array {
+		$rows = DocumentGenerator::rows_from_model( $model );
+
+		return self::expected_tag_matches_rows( $list_ast, $rows );
+	}
+
+	/**
+	 * Whether any branch of the selector list matches the row: the branch's
+	 * own compound must match the row and its context must match the row's
+	 * ancestor tags.
+	 *
+	 * @param array $list_ast     Canonical complex selector list AST.
+	 * @param array $row          Element row with `ancestorTags`.
+	 * @param bool  $quirks       Whether the document parses in quirks mode.
+	 * @param bool  $html_attr_ci Whether HTML's case-insensitive attribute value list applies.
+	 * @return bool True when at least one branch matches.
+	 */
+	public static function list_matches_row( array $list_ast, array $row, bool $quirks, bool $html_attr_ci = true ): bool {
+		foreach ( $list_ast as $complex ) {
+			if ( ! self::compound_matches( $complex['self'], $row, $quirks, $html_attr_ci ) ) {
+				continue;
+			}
+			if ( self::explore_context( $complex['context'], $row['ancestorTags'] ) ) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+	/**
+	 * Whether a chain of ( type, combinator ) steps can be satisfied by the
+	 * ancestor tags, trying every eligible ancestor for each step.
+	 *
+	 * @param array    $context       Right-to-left ( type, combinator ) pairs.
+	 * @param string[] $ancestor_tags Nearest-ancestor-first tag names.
+	 * @return bool True when every step finds an ancestor.
+	 */
+	private static function explore_context( array $context, array $ancestor_tags ): bool {
+		if ( array() === $context ) {
+			return true;
+		}
+		if ( array() === $ancestor_tags ) {
+			return false;
+		}
+
+		list( $type, $combinator ) = $context[0];
+		$remaining_steps           = array_slice( $context, 1 );
+
+		// A child combinator may only use the nearest ancestor; a descendant
+		// combinator may use any of them.
+		$reach = '>' === $combinator ? 1 : count( $ancestor_tags );
+
+		for ( $depth = 0; $depth < $reach; ++$depth ) {
+			if ( ! self::type_matches( $type, $ancestor_tags[ $depth ] ) ) {
+				continue;
+			}
+			if ( self::explore_context( $remaining_steps, array_slice( $ancestor_tags, $depth + 1 ) ) ) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+	/**
+	 * Whether a compound selector matches a row: its type ( when present )
+	 * and every one of its subclass selectors must match.
+	 *
+	 * @param array $compound     Compound with `type` and `subs`.
+	 * @param array $row          Element row.
+	 * @param bool  $quirks       Whether the document parses in quirks mode.
+	 * @param bool  $html_attr_ci Whether HTML's case-insensitive attribute value list applies.
+	 * @return bool True when the whole compound matches.
+	 */
+	public static function compound_matches( array $compound, array $row, bool $quirks, bool $html_attr_ci = true ): bool {
+		$type = $compound['type'];
+		if ( null !== $type && ! self::type_matches( $type, $row['tag'] ) ) {
+			return false;
+		}
+
+		// A null `subs` casts to an empty list, which matches trivially.
+		foreach ( (array) $compound['subs'] as $sub ) {
+			if ( self::sub_matches( $sub, $row, $quirks, $html_attr_ci ) ) {
+				continue;
+			}
+			return false;
+		}
+
+		return true;
+	}
+
+	/**
+	 * Whether a type selector matches a tag name: `*` matches anything,
+	 * otherwise the two are compared ASCII case-insensitively.
+	 *
+	 * @param string $type Type selector name or `*`.
+	 * @param string $tag  Element tag name.
+	 * @return bool True on a match.
+	 */
+	private static function type_matches( string $type, string $tag ): bool {
+		if ( '*' === $type ) {
+			return true;
+		}
+
+		return ascii_strtolower( $type ) === ascii_strtolower( $tag );
+	}
+
+	/**
+	 * Dispatches one subclass selector to the class, id or attribute matcher
+	 * by its `kind`. Any other kind never matches.
+	 *
+	 * @param array $sub          Subclass selector.
+	 * @param array $row          Element row.
+	 * @param bool  $quirks       Whether the document parses in quirks mode.
+	 * @param bool  $html_attr_ci Whether HTML's case-insensitive attribute value list applies.
+	 * @return bool True on a match.
+	 */
+	private static function sub_matches( array $sub, array $row, bool $quirks, bool $html_attr_ci ): bool {
+		$kind = $sub['kind'];
+
+		if ( 'class' === $kind ) {
+			return self::class_matches( $sub['name'], $row, $quirks );
+		}
+		if ( 'id' === $kind ) {
+			return self::id_matches( $sub['name'], $row, $quirks );
+		}
+		if ( 'attr' === $kind ) {
+			return self::attr_matches( $sub, $row, $html_attr_ci );
+		}
+
+		return false;
+	}
+
+	/**
+	 * Whether the row's `class` attribute contains the wanted token. Tokens
+	 * compare exactly, or ASCII case-insensitively in quirks mode. A missing
+	 * or valueless attribute never matches.
+	 *
+	 * @param string $wanted Class name from the selector.
+	 * @param array  $row    Element row.
+	 * @param bool   $quirks Whether the document parses in quirks mode.
+	 * @return bool True on a match.
+	 */
+	private static function class_matches( string $wanted, array $row, bool $quirks ): bool {
+		$class_value = DocumentGenerator::get_attribute_value( $row, 'class' );
+		if ( ! is_string( $class_value ) ) {
+			return false;
+		}
+
+		$tokens = DocumentGenerator::class_tokens( $class_value );
+		if ( ! $quirks ) {
+			return in_array( $wanted, $tokens, true );
+		}
+
+		$folded_wanted = ascii_strtolower( $wanted );
+		foreach ( $tokens as $token ) {
+			if ( ascii_strtolower( $token ) === $folded_wanted ) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+	/**
+	 * Whether the row's `id` attribute equals the wanted ID: exactly, or ASCII
+	 * case-insensitively in quirks mode. A missing or valueless attribute
+	 * never matches.
+	 *
+	 * @param string $wanted ID from the selector.
+	 * @param array  $row    Element row.
+	 * @param bool   $quirks Whether the document parses in quirks mode.
+	 * @return bool True on a match.
+	 */
+	private static function id_matches( string $wanted, array $row, bool $quirks ): bool {
+		$id = DocumentGenerator::get_attribute_value( $row, 'id' );
+		if ( ! is_string( $id ) ) {
+			return false;
+		}
+
+		if ( $quirks ) {
+			return ascii_strtolower( $id ) === ascii_strtolower( $wanted );
+		}
+
+		return $id === $wanted;
+	}
+
+	/**
+	 * Whether an attribute selector matches the row.
+	 *
+	 * A selector without a matcher only requires the attribute to exist. A
+	 * valueless attribute is compared as the empty string. Comparison is
+	 * ASCII case-insensitive when the selector has the `case-insensitive`
+	 * modifier, or when it has no modifier, $html_attr_ci is set, the row is
+	 * in the `html` namespace ( the default when unspecified ) and the
+	 * attribute name is in HTML_CASE_INSENSITIVE_ATTRIBUTES.
+	 *
+	 * @param array $sub          Attribute selector with `name`, `matcher`, `value`, `modifier`.
+	 * @param array $row          Element row.
+	 * @param bool  $html_attr_ci Whether HTML's case-insensitive attribute value list applies.
+	 * @return bool True on a match.
+	 */
+	private static function attr_matches( array $sub, array $row, bool $html_attr_ci ): bool {
+		$actual = DocumentGenerator::get_attribute_value( $row, $sub['name'] );
+		if ( null === $actual ) {
+			return false;
+		}
+		if ( null === $sub['matcher'] ) {
+			return true;
+		}
+		if ( true === $actual ) {
+			$actual = '';
+		}
+
+		$expected = (string) $sub['value'];
+
+		if ( 'case-insensitive' === $sub['modifier'] ) {
+			$fold_case = true;
+		} else {
+			$fold_case = $html_attr_ci
+				&& null === $sub['modifier']
+				&& 'html' === ( $row['namespace'] ?? 'html' )
+				&& isset( self::HTML_CASE_INSENSITIVE_ATTRIBUTES[ ascii_strtolower( $sub['name'] ) ] );
+		}
+		if ( $fold_case ) {
+			$actual   = ascii_strtolower( $actual );
+			$expected = ascii_strtolower( $expected );
+		}
+
+		switch ( $sub['matcher'] ) {
+			case 'exact':
+				return $actual === $expected;
+
+			case 'one-of':
+				// An empty value, or one containing whitespace, can never be a word.
+				if ( '' === $expected || strlen( $expected ) !== strcspn( $expected, self::WHITESPACE ) ) {
+					return false;
+				}
+				$end = strlen( $actual );
+				$pos = strspn( $actual, self::WHITESPACE );
+				while ( $pos < $end ) {
+					$word_length = strcspn( $actual, self::WHITESPACE, $pos );
+					if ( substr( $actual, $pos, $word_length ) === $expected ) {
+						return true;
+					}
+					$pos += $word_length;
+					$pos += strspn( $actual, self::WHITESPACE, $pos );
+				}
+				return false;
+
+			case 'exact-or-hyphen-suffixed':
+				return $actual === $expected || str_starts_with( $actual, $expected . '-' );
+
+			case 'prefixed':
+				return '' !== $expected && str_starts_with( $actual, $expected );
+
+			case 'suffixed':
+				return '' !== $expected && str_ends_with( $actual, $expected );
+
+			case 'contains':
+				return '' !== $expected && str_contains( $actual, $expected );
+		}
+
+		return false;
+	}
+}
diff --git a/tools/css-selector-fuzz/lib/SelectorGenerator.php b/tools/css-selector-fuzz/lib/SelectorGenerator.php
new file mode 100644
index 0000000000000..a41a3525878c2
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/SelectorGenerator.php
@@ -0,0 +1,1736 @@
+ array( "\x80", 1 ),
+ 'truncated-2-byte' => array( "\xC3", 1 ),
+ 'truncated-3-byte' => array( "\xE2\x8C", 1 ),
+ 'truncated-4-byte' => array( "\xF0\x9F\x82", 1 ),
+ 'invalid-lead-f5' => array( "\xF5", 1 ),
+ 'invalid-lead-ff' => array( "\xFF", 1 ),
+ 'overlong-min' => array( "\xC0\x80", 2 ),
+ 'overlong-max' => array( "\xC1\xBF", 2 ),
+ 'surrogate-half' => array( "\xED\xA0\x80", 3 ),
+ 'beyond-max' => array( "\xF4\x90\x80\x80", 4 ),
+ );
+
+ /** @var Prng */
+ private $prng;
+ /** @var array */
+ private $pools;
+ /** @var bool Escape ident codepoints aggressively when rendering. */
+ private $escape_boost = false;
+
+ /**
+  * Stores the random source and the name pools. Private: instances are
+  * created by the static entry points of this class.
+  *
+  * @param Prng  $prng  Random source used for every draw.
+  * @param array $pools Name pools; may be empty.
+  */
+ private function __construct( Prng $prng, array $pools ) {
+ $this->prng = $prng;
+ $this->pools = $pools;
+ }
+
+	/**
+	 * Renders a canonical complex-list AST to a selector string. Parsing the
+	 * result must yield exactly the given AST. With $escape_boost, idents are
+	 * escaped far more often (exercises the escape decoder on no-op escapes).
+	 *
+	 * @param Prng  $prng         Random source for rendering choices.
+	 * @param array $list_ast     Canonical complex selector list AST.
+	 * @param bool  $escape_boost Whether to escape ident codepoints aggressively.
+	 * @return string Selector text.
+	 */
+	public static function render( Prng $prng, array $list_ast, bool $escape_boost = false ): string {
+		// Rendering needs no name pools, so the renderer gets an empty set.
+		$renderer               = new self( $prng, array() );
+		$renderer->escape_boost = $escape_boost;
+
+		return $renderer->render_complex_list( $list_ast );
+	}
+
+	/**
+	 * Renders a canonical complex-list AST deterministically with minimal
+	 * escaping: single spaces around combinators, `, ` between branches,
+	 * double-quoted attribute values, lowercase `i`/`s` modifiers, and all
+	 * non-ASCII codepoints hex-escaped. Used to hand a semantically-identical
+	 * selector to external engines: lexbor rejects some byte-level forms WP
+	 * correctly accepts ( uppercase I/S attribute modifiers; raw non-ASCII
+	 * ident codepoints in U+00B7, U+00C0-U+00F6 — its non-ASCII ident table
+	 * starts at U+00F8 ). Escaping sidesteps codepoint classification.
+	 *
+	 * @param array $list_ast Canonical complex selector list AST.
+	 * @return string Selector text.
+	 */
+	public static function render_canonical( array $list_ast ): string {
+		$operators = array(
+			'exact'                    => '=',
+			'one-of'                   => '~=',
+			'exact-or-hyphen-suffixed' => '|=',
+			'prefixed'                 => '^=',
+			'suffixed'                 => '$=',
+			'contains'                 => '*=',
+		);
+
+		$type_text = static function ( $type ): string {
+			return '*' === $type ? '*' : self::canonical_ident( $type );
+		};
+
+		$rendered = array();
+		foreach ( $list_ast as $complex ) {
+			$text = '';
+
+			// Context is stored right-to-left; selectors read left-to-right.
+			foreach ( array_reverse( $complex['context'] ) as list( $type, $combinator ) ) {
+				$text .= $type_text( $type ) . ( '>' === $combinator ? ' > ' : ' ' );
+			}
+
+			$compound = $complex['self'];
+			if ( null !== $compound['type'] ) {
+				$text .= $type_text( $compound['type'] );
+			}
+
+			foreach ( (array) $compound['subs'] as $sub ) {
+				if ( 'class' === $sub['kind'] ) {
+					$text .= '.' . self::canonical_ident( $sub['name'] );
+					continue;
+				}
+				if ( 'id' === $sub['kind'] ) {
+					$text .= '#' . self::canonical_ident( $sub['name'] );
+					continue;
+				}
+				if ( 'attr' !== $sub['kind'] ) {
+					continue;
+				}
+
+				$text .= '[' . self::canonical_ident( $sub['name'] );
+				if ( null !== $sub['matcher'] ) {
+					$text .= $operators[ $sub['matcher'] ] . self::canonical_string( (string) $sub['value'] );
+					if ( 'case-insensitive' === $sub['modifier'] ) {
+						$text .= ' i';
+					} elseif ( 'case-sensitive' === $sub['modifier'] ) {
+						$text .= ' s';
+					}
+				}
+				$text .= ']';
+			}
+
+			$rendered[] = $text;
+		}
+
+		return implode( ', ', $rendered );
+	}
+
+	/**
+	 * Writes a name as an identifier, hex-escaping ( `\HEX ` with a trailing
+	 * space ) every codepoint that is not an ASCII letter, digit, `-` or `_`.
+	 * Also escaped: a leading digit, a digit right after a leading `-`, and a
+	 * `-` that is the whole name.
+	 *
+	 * @param string $name Identifier value.
+	 * @return string Identifier source text.
+	 */
+	private static function canonical_ident( string $name ): string {
+		$points       = utf8_codepoints( $name );
+		$lone_char    = 1 === count( $points );
+		$source_text  = '';
+
+		foreach ( $points as $i => $point ) {
+			list( $char, $cp ) = $point;
+
+			$is_digit  = $cp >= 0x30 && $cp <= 0x39;
+			$is_letter = ( $cp >= 0x41 && $cp <= 0x5A ) || ( $cp >= 0x61 && $cp <= 0x7A );
+			$is_plain  = $is_letter || $is_digit || '-' === $char || '_' === $char;
+
+			if ( ! $is_plain ) {
+				$escape = true;
+			} elseif ( $is_digit ) {
+				// Digits may not start an ident, nor follow a leading hyphen.
+				$escape = 0 === $i || ( 1 === $i && '-' === $points[0][0] );
+			} else {
+				$escape = $lone_char && '-' === $char;
+			}
+
+			$source_text .= $escape ? '\\' . dechex( $cp ) . ' ' : $char;
+		}
+
+		return $source_text;
+	}
+
+	/**
+	 * Writes a value as a double-quoted string. Double quotes, backslashes
+	 * and every codepoint outside U+0020–U+007E are hex-escaped ( `\HEX `
+	 * with a trailing space ); everything else is copied as is.
+	 *
+	 * @param string $value String value.
+	 * @return string Quoted string source text.
+	 */
+	private static function canonical_string( string $value ): string {
+		$pieces = array( '"' );
+
+		foreach ( utf8_codepoints( $value ) as list( $char, $cp ) ) {
+			$is_printable_ascii = $cp >= 0x20 && $cp <= 0x7E;
+			$is_safe            = $is_printable_ascii && '"' !== $char && '\\' !== $char;
+
+			$pieces[] = $is_safe ? $char : '\\' . dechex( $cp ) . ' ';
+		}
+
+		$pieces[] = '"';
+
+		return implode( '', $pieces );
+	}
+
+	/**
+	 * Generates one selector test case.
+	 *
+	 * When no bucket is given, one is drawn by weight; the `path-directed`
+	 * bucket is only offered when rows are available, and an explicit
+	 * `path-directed` request without rows becomes `supported-complex`. An
+	 * unrecognized bucket is treated as `mutated`.
+	 *
+	 * @param Prng        $prng   Random source.
+	 * @param array       $pools  Pools from DocumentGenerator ( tags, classes, ids, attrNames, attrValues ).
+	 * @param array|null  $rows   Element rows ( TreeCapture shape ) with real
+	 *                            fids; enables the path-directed bucket.
+	 * @param string|null $bucket Bucket to generate, or null to draw one.
+	 * @return array{
+	 *     bucket: string,
+	 *     selector: string,
+	 *     expectCompound: bool|null,
+	 *     expectComplex: bool|null,
+	 *     ast: array|null,
+	 *     mustMatchFid: string|null,
+	 *     mustNotMatchFid: string|null,
+	 * } The buckets built directly in this method return the first five keys only.
+	 */
+	public static function generate( Prng $prng, array $pools, ?array $rows = null, ?string $bucket = null ): array {
+		$generator = new self( $prng, $pools );
+		$has_rows  = null !== $rows && array() !== $rows;
+
+		if ( null === $bucket ) {
+			if ( $has_rows ) {
+				$weights = array(
+					'supported-compound' => 23,
+					'supported-complex'  => 19,
+					'path-directed'      => 21,
+					'unsupported'        => 11,
+					'invalid'            => 9,
+					'invalid-utf8'       => 5,
+					'chaos'              => 6,
+					'mutated'            => 6,
+					'edge-escape'        => 5,
+				);
+			} else {
+				$weights = array(
+					'supported-compound' => 28,
+					'supported-complex'  => 24,
+					'unsupported'        => 14,
+					'invalid'            => 11,
+					'invalid-utf8'       => 5,
+					'chaos'              => 8,
+					'mutated'            => 10,
+					'edge-escape'        => 5,
+				);
+			}
+			$bucket = $prng->weighted( $weights );
+		}
+
+		if ( 'path-directed' === $bucket && ! $has_rows ) {
+			$bucket = 'supported-complex';
+		}
+
+		$result = static function ( string $bucket_name, string $selector, ?bool $expect_compound, ?bool $expect_complex, ?array $ast ): array {
+			return array(
+				'bucket'         => $bucket_name,
+				'selector'       => $selector,
+				'expectCompound' => $expect_compound,
+				'expectComplex'  => $expect_complex,
+				'ast'            => $ast,
+			);
+		};
+
+		if ( 'supported-compound' === $bucket || 'supported-complex' === $bucket ) {
+			$is_complex = 'supported-complex' === $bucket;
+			$ast        = $generator->gen_complex_list( $is_complex );
+			return $result( $bucket, $generator->render_complex_list( $ast ), ! $is_complex, true, $ast );
+		}
+
+		switch ( $bucket ) {
+			case 'path-directed':
+				return $generator->gen_path_directed( $rows );
+
+			case 'edge-escape':
+				return $generator->gen_edge_escape();
+
+			case 'invalid-utf8':
+				return $generator->gen_invalid_utf8();
+
+			case 'unsupported':
+				return $result( $bucket, $generator->gen_unsupported(), false, false, null );
+
+			case 'invalid':
+				return $result( $bucket, $generator->gen_invalid(), false, false, null );
+
+			case 'chaos':
+				return $result( $bucket, $generator->gen_chaos(), null, null, null );
+		}
+
+		// 'mutated', and any bucket name not handled above.
+		$ast      = $generator->gen_complex_list( $generator->prng->chance( 50 ) );
+		$rendered = $generator->render_complex_list( $ast );
+
+		return $result( 'mutated', $generator->mutate( $rendered ), null, null, null );
+	}
+
+ /*
+ * --------------
+ * AST generation
+ * --------------
+ *
+ * Canonical AST shapes (matching what AstExtractor produces from
+ * parsed WP_CSS_* objects):
+ *
+ * list: array of complex
+ * complex: array( 'context' => array( array( type, combinator ) ... right-to-left ), 'self' => compound )
+ * compound: array( 'type' => string|null, 'subs' => array|null )
+ * sub: array( 'kind' => 'class'|'id', 'name' => string )
+ * | array( 'kind' => 'attr', 'name' => string, 'matcher' => string|null,
+ * 'value' => string|null, 'modifier' => string|null )
+ */
+
+	/**
+	 * Generates a selector list of one to three complex selectors.
+	 *
+	 * When combinators are required, one randomly chosen branch always gets a
+	 * context chain and every other branch gets one 30% of the time. When they
+	 * are not required, no branch gets a context chain.
+	 *
+	 * @param bool $require_combinator Whether at least one branch must use combinators.
+	 * @return array List of complex-selector ASTs.
+	 */
+	private function gen_complex_list( bool $require_combinator ): array {
+		$how_many = $this->prng->weighted(
+			array(
+				1 => 55,
+				2 => 30,
+				3 => 15,
+			)
+		);
+
+		// Index of the branch that is forced to carry a context chain; -1 means none.
+		$forced_index = -1;
+		if ( $require_combinator ) {
+			$forced_index = $this->prng->int( 0, $how_many - 1 );
+		}
+
+		$branches = array();
+		$index    = 0;
+		while ( $index < $how_many ) {
+			$with_context = false;
+			if ( $require_combinator ) {
+				// The forced branch short-circuits, so it never consumes a random draw.
+				$with_context = $index === $forced_index || $this->prng->chance( 30 );
+			}
+			$branches[] = $this->gen_complex( $with_context );
+			++$index;
+		}
+
+		return $branches;
+	}
+
+	/**
+	 * Generates one complex selector: an optional context chain plus a compound.
+	 *
+	 * @param bool $with_combinators Whether to generate a context chain of one to three links.
+	 * @return array Complex-selector AST with 'context' and 'self' keys.
+	 */
+	private function gen_complex( bool $with_combinators ): array {
+		$links = $with_combinators ? $this->prng->int( 1, 3 ) : 0;
+		$chain = array();
+
+		while ( count( $chain ) < $links ) {
+			// Draw the type before the combinator so the random stream order is stable.
+			$type       = $this->gen_type_name( true );
+			$combinator = $this->prng->chance( 50 ) ? ' ' : '>';
+			$chain[]    = array( $type, $combinator );
+		}
+
+		$self = $this->gen_compound();
+
+		return array(
+			'context' => $chain,
+			'self'    => $self,
+		);
+	}
+
+	/**
+	 * Generates a compound selector: an optional type plus zero to three subclass selectors.
+	 *
+	 * A compound with neither a type nor a subclass selector is never produced;
+	 * when the draws would yield one, either a type or one subclass is forced.
+	 *
+	 * @return array Compound AST with 'type' ( string|null ) and 'subs' ( array|null ) keys.
+	 */
+	private function gen_compound(): array {
+		$with_type = $this->prng->chance( 65 );
+		$remaining = $this->prng->weighted(
+			array(
+				0 => 30,
+				1 => 40,
+				2 => 20,
+				3 => 10,
+			)
+		);
+
+		if ( 0 === $remaining && ! $with_type ) {
+			if ( $this->prng->chance( 50 ) ) {
+				$with_type = true;
+			} else {
+				$remaining = 1;
+			}
+		}
+
+		$subclasses = array();
+		while ( $remaining > 0 ) {
+			$subclasses[] = $this->gen_subclass();
+			--$remaining;
+		}
+
+		// The type name is drawn after the subclass selectors.
+		$type = null;
+		if ( $with_type ) {
+			$type = $this->gen_type_name( false );
+		}
+
+		return array(
+			'type' => $type,
+			'subs' => count( $subclasses ) > 0 ? $subclasses : null,
+		);
+	}
+
+	/**
+	 * Picks a type selector name: the universal `*`, a tag from the 'tags'
+	 * pool ( sometimes with randomized case ), or one of a fixed set of names.
+	 *
+	 * @param bool $for_context Whether the name is for a context selector; `*` is then more likely.
+	 * @return string The type name, or '*'.
+	 */
+	private function gen_type_name( bool $for_context ): string {
+		$universal_percent = $for_context ? 25 : 12;
+		if ( $this->prng->chance( $universal_percent ) ) {
+			return '*';
+		}
+
+		$tags = $this->pools['tags'] ?? array();
+		if ( array() === $tags || ! $this->prng->chance( 70 ) ) {
+			return $this->prng->choice( array( 'video', 'table', 'x-absent', 'object', 'span' ) );
+		}
+
+		$tag = $this->prng->choice( $tags );
+		if ( $this->prng->chance( 25 ) ) {
+			return $this->random_case( $tag );
+		}
+		return $tag;
+	}
+
+	/**
+	 * Generates one subclass selector: a class, an id, or an attribute selector.
+	 *
+	 * @return array Subclass AST ( see the canonical shapes for 'class', 'id' and 'attr' ).
+	 */
+	private function gen_subclass(): array {
+		$kind = $this->prng->weighted(
+			array(
+				'class' => 40,
+				'id'    => 25,
+				'attr'  => 35,
+			)
+		);
+
+		if ( 'class' === $kind ) {
+			return array(
+				'kind' => 'class',
+				'name' => $this->pick_name( 'classes' ),
+			);
+		}
+
+		if ( 'id' === $kind ) {
+			return array(
+				'kind' => 'id',
+				'name' => $this->pick_name( 'ids' ),
+			);
+		}
+
+		return $this->gen_attr_selector();
+	}
+
+	/**
+	 * Generates an attribute selector AST: either a bare presence test or a
+	 * matcher with a value and an optional case modifier.
+	 *
+	 * @return array Attribute subclass AST with 'kind', 'name', 'matcher', 'value' and 'modifier' keys.
+	 */
+	private function gen_attr_selector(): array {
+		// Start from a presence selector; the matcher fields are filled in below.
+		$selector = array(
+			'kind'     => 'attr',
+			'name'     => $this->pick_name( 'attrNames' ),
+			'matcher'  => null,
+			'value'    => null,
+			'modifier' => null,
+		);
+
+		$picked_matcher = $this->prng->weighted(
+			array(
+				''                         => 25,
+				'exact'                    => 20,
+				'one-of'                   => 12,
+				'exact-or-hyphen-suffixed' => 11,
+				'prefixed'                 => 11,
+				'suffixed'                 => 11,
+				'contains'                 => 10,
+			)
+		);
+		if ( '' === $picked_matcher ) {
+			return $selector;
+		}
+
+		$picked_modifier = $this->prng->weighted(
+			array(
+				''                 => 70,
+				'case-insensitive' => 18,
+				'case-sensitive'   => 12,
+			)
+		);
+
+		$operand = $this->gen_attr_value();
+
+		/*
+		 * HTML's case-insensitive attribute value list: with no modifier,
+		 * the values of listed attributes ( type, rel, lang, dir, ... )
+		 * match ASCII case-insensitively on HTML elements. Sometimes change
+		 * the case of the selector value for a listed attribute so the
+		 * differential exercises that rule directly instead of depending on
+		 * sampled values that happen to differ in case.
+		 */
+		$is_listed = '' === $picked_modifier
+			&& isset( ReferenceMatcher::HTML_CASE_INSENSITIVE_ATTRIBUTES[ ascii_strtolower( $selector['name'] ) ] );
+		if ( $is_listed && $this->prng->chance( 40 ) ) {
+			if ( $this->prng->chance( 50 ) ) {
+				$operand = ascii_strtoupper( $operand );
+			} else {
+				$operand = str_shuffle_case( $operand, $this->prng );
+			}
+		}
+
+		$selector['matcher']  = $picked_matcher;
+		$selector['value']    = $operand;
+		$selector['modifier'] = '' === $picked_modifier ? null : $picked_modifier;
+
+		return $selector;
+	}
+
+	/**
+	 * Generates an attribute selector value.
+	 *
+	 * The value is a sample from the 'attrValues' pool ( whole, a codepoint
+	 * slice of it, or with randomized case ), the empty string, a plain word,
+	 * or a deliberately awkward string. Pool-based kinds fall back to a plain
+	 * word when the pool is empty.
+	 *
+	 * @return string The value, not yet rendered or escaped.
+	 */
+	private function gen_attr_value(): string {
+		$samples = $this->pools['attrValues'] ?? array();
+
+		$kind = $this->prng->weighted(
+			array(
+				'pool'      => 35,
+				'pool-part' => 20,
+				'pool-case' => 10,
+				'empty'     => 10,
+				'word'      => 15,
+				'tricky'    => 10,
+			)
+		);
+
+		$needs_pool = 'pool' === $kind || 'pool-part' === $kind || 'pool-case' === $kind;
+		if ( $needs_pool && array() === $samples ) {
+			$kind = 'word';
+		}
+
+		if ( 'pool' === $kind ) {
+			return $this->prng->choice( $samples );
+		}
+
+		if ( 'pool-part' === $kind ) {
+			$sample = $this->prng->choice( $samples );
+			if ( '' === $sample ) {
+				return '';
+			}
+
+			// Slice on codepoint boundaries so a multi-byte character is never split.
+			$codepoints = utf8_codepoints( $sample );
+			$total      = count( $codepoints );
+			$first      = $this->prng->int( 0, max( 0, $total - 1 ) );
+			$span       = $this->prng->int( 1, $total - $first );
+
+			return implode( '', array_column( array_slice( $codepoints, $first, $span ), 0 ) );
+		}
+
+		if ( 'pool-case' === $kind ) {
+			return $this->random_case( $this->prng->choice( $samples ) );
+		}
+
+		if ( 'empty' === $kind ) {
+			return '';
+		}
+
+		if ( 'word' === $kind ) {
+			return $this->prng->choice( array( 'alpha', 'beta9', 'value', 'main-item', 'Z', 'i', 's', 'one two', 'x-y-z' ) );
+		}
+
+		// 'tricky': whitespace, quotes, backslashes, ident edge cases and non-ASCII.
+		return $this->prng->choice(
+			array(
+				'a b',
+				" lead",
+				"trail ",
+				"tab\there",
+				"line\nbreak",
+				'quote"inside',
+				"apos'inside",
+				'back\\slash',
+				'-',
+				'--',
+				'0digit',
+				'ünïcode',
+			)
+		);
+	}
+
+	/**
+	 * Picks a non-empty name for a class, id or attribute.
+	 *
+	 * Prefers a name sampled from the given pool ( sometimes with randomized
+	 * case ); otherwise, or when the sampled name is empty, picks from a fixed
+	 * list of absent and awkward names.
+	 *
+	 * @param string $pool_key Key into the pools array, e.g. 'classes', 'ids' or 'attrNames'.
+	 * @return string The name, not yet rendered or escaped.
+	 */
+	private function pick_name( string $pool_key ): string {
+		$candidates = $this->pools[ $pool_key ] ?? array();
+
+		if ( array() !== $candidates && $this->prng->chance( 65 ) ) {
+			$picked = $this->prng->choice( $candidates );
+
+			// An empty sample falls through to the fixed list without a case draw.
+			if ( '' !== $picked ) {
+				return $this->prng->chance( 20 ) ? $this->random_case( $picked ) : $picked;
+			}
+		}
+
+		return $this->prng->choice(
+			array(
+				'absent',
+				'no-such-thing',
+				'x',
+				'-lead',
+				'--double',
+				'_under',
+				'Ünïcode',
+				'with space',
+				'9starts-with-digit',
+				'-9hyphen-digit',
+				'mixedCase',
+			)
+		);
+	}
+
+ /*
+ * ---------------------------
+ * Edge-case escapes and input
+ * ---------------------------
+ *
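+ * Targets parser branches the structural generators can't reach:
+ * - hex escapes whose codepoint is NUL / a surrogate / over-max, which
+ * the tokenizer must decode to U+FFFD;
+ * - a backslash at the very end of the input, which is a valid escape
+ * that decodes to U+FFFD in ident context;
+ * - an attribute selector block ( and a string inside it ) left open at
+ * the end of the input, which EOF closes automatically;
+ * - raw NUL / CR / CRLF / FF bytes in the selector input, which
+ * the selector token stream preprocesses ( NUL→U+FFFD, the rest→LF ).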
+ *
+ * These carry a known intended AST: the decoded ident is the U+FFFD
+ * replacement character ( or, for input normalization, the same selector
+ * with whitespace normalized ), so the AST round-trip still applies.
+ */
+ /**
+ * Generates one edge-case selector together with the AST it must parse to.
+ *
+ * One of five kinds is drawn by weight: 'fffd-ident', 'eof-escape',
+ * 'eof-truncated', 'nul-input' or 'ws-input'. Every kind returns a case in
+ * the 'edge-escape' bucket that expects both compound and complex parsing
+ * to succeed.
+ *
+ * @return array Case with 'bucket', 'selector', 'expectCompound', 'expectComplex' and 'ast' keys.
+ */
+ private function gen_edge_escape(): array {
+ $kind = $this->prng->weighted(
+ array(
+ 'fffd-ident' => 35,
+ 'eof-escape' => 20,
+ 'eof-truncated' => 15,
+ 'nul-input' => 15,
+ 'ws-input' => 15,
+ )
+ );
+
+ if ( 'eof-truncated' === $kind ) {
+ /*
+ * The end of input auto-closes an unterminated attribute selector
+ * block ( and an unterminated string inside it ): `[a=b` is the
+ * same selector as `[a=b]`.
+ *
+ * https://www.w3.org/TR/css-syntax-3/#consume-simple-block
+ */
+ $matcher = $this->prng->choice( array( null, 'exact', 'one-of', 'exact-or-hyphen-suffixed', 'prefixed', 'suffixed', 'contains' ) );
+ $value = null === $matcher ? null : $this->prng->choice( array( 'v' . $this->prng->int( 0, 99 ), 'a b', '', 'x,y', "caf\u{E9}" ) );
+ $modifier = null !== $matcher && $this->prng->chance( 30 )
+ ? $this->prng->choice( array( 'case-insensitive', 'case-sensitive' ) )
+ : null;
+ $compound = array(
+ 'type' => $this->prng->chance( 50 ) ? 'div' : null,
+ 'subs' => array(
+ array(
+ 'kind' => 'attr',
+ 'name' => 'a' . $this->prng->int( 0, 99 ),
+ 'matcher' => $matcher,
+ 'value' => $value,
+ 'modifier' => $modifier,
+ ),
+ ),
+ );
+
+ // The attribute selector is the final rendered unit, so the render always ends with ']'.
+ $rendered = $this->render_compound( $compound );
+ $truncated = substr( $rendered, 0, -1 );
+
+ // Sometimes also drop a closing string quote: EOF terminates the string, then closes the block.
+ // A quote is only the last byte when no whitespace or modifier was rendered after the string.
+ $last_byte = substr( $truncated, -1 );
+ if ( ( '"' === $last_byte || "'" === $last_byte ) && $this->prng->chance( 50 ) ) {
+ $truncated = substr( $truncated, 0, -1 );
+
+ // A backslash at the end of an unterminated string "does nothing": the value is unchanged.
+ if ( $this->prng->chance( 40 ) ) {
+ $truncated .= '\\';
+ }
+ }
+
+ return array(
+ 'bucket' => 'edge-escape',
+ 'selector' => $truncated,
+ 'expectCompound' => true,
+ 'expectComplex' => true,
+ 'ast' => array(
+ array(
+ 'context' => array(),
+ 'self' => $compound,
+ ),
+ ),
+ );
+ }
+
+ if ( 'eof-escape' === $kind ) {
+ /*
+ * A backslash at the end of input is a valid escape ( EOF is not
+ * a newline ) and decodes to U+FFFD, in ident context only:
+ * `.foo\` is the class `foo\u{FFFD}`.
+ *
+ * https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
+ */
+ // 30% of the time the name is empty, so the ident is the lone U+FFFD.
+ $name = $this->prng->chance( 30 ) ? '' : 'a' . $this->prng->int( 0, 99 );
+ // Each candidate pairs the raw selector with the compound it must decode to.
+ list( $selector, $self ) = $this->prng->choice(
+ array(
+ array(
+ '.' . $name . '\\',
+ array(
+ 'type' => null,
+ 'subs' => array( array( 'kind' => 'class', 'name' => $name . "\u{FFFD}" ) ),
+ ),
+ ),
+ array(
+ '#' . $name . '\\',
+ array(
+ 'type' => null,
+ 'subs' => array( array( 'kind' => 'id', 'name' => $name . "\u{FFFD}" ) ),
+ ),
+ ),
+ array(
+ $name . '\\',
+ array(
+ 'type' => $name . "\u{FFFD}",
+ 'subs' => null,
+ ),
+ ),
+ )
+ );
+ return array(
+ 'bucket' => 'edge-escape',
+ 'selector' => $selector,
+ 'expectCompound' => true,
+ 'expectComplex' => true,
+ 'ast' => array(
+ array(
+ 'context' => array(),
+ 'self' => $self,
+ ),
+ ),
+ );
+ }
+
+ if ( 'fffd-ident' === $kind ) {
+ // A class selector whose name is a single U+FFFD, produced by a
+ // hex escape for an out-of-range codepoint.
+ $hex = $this->prng->choice(
+ array(
+ '0',
+ '00',
+ '000000',
+ dechex( $this->prng->int( 0xD800, 0xDFFF ) ), // surrogate
+ dechex( $this->prng->int( 0x110000, 0xFFFFFF ) ), // over-max
+ )
+ );
+ if ( $this->prng->chance( 40 ) ) {
+ $hex = strtoupper( $hex );
+ }
+ // The trailing space belongs to the hex escape.
+ $selector = '.\\' . $hex . ' ';
+ $ast = array(
+ array(
+ 'context' => array(),
+ 'self' => array(
+ 'type' => null,
+ 'subs' => array( array( 'kind' => 'class', 'name' => "\u{FFFD}" ) ),
+ ),
+ ),
+ );
+ return array(
+ 'bucket' => 'edge-escape',
+ 'selector' => $selector,
+ 'expectCompound' => true,
+ 'expectComplex' => true,
+ 'ast' => $ast,
+ );
+ }
+
+ /*
+ * Raw control bytes in the selector input. A small fixed compound
+ * keeps the case focused on selector input preprocessing and avoids
+ * entangling with unrelated attribute-selector edge cases.
+ */
+ // NOTE: this compound is built and rendered ( consuming random draws ) for
+ // both remaining kinds, although only 'ws-input' uses the result.
+ $compound = array(
+ 'type' => $this->prng->chance( 50 ) ? 'span' : null,
+ 'subs' => array(
+ array( 'kind' => 'class', 'name' => 'foo' ),
+ array( 'kind' => 'id', 'name' => 'bar' ),
+ ),
+ );
+ if ( null === $compound['type'] && $this->prng->chance( 50 ) ) {
+ array_pop( $compound['subs'] );
+ }
+ $rendered = $this->render_compound( $compound );
+
+ if ( 'nul-input' === $kind ) {
+ // A raw NUL is preprocessed to U+FFFD. This uses one fixed case:
+ // a class whose name contains a NUL between two letters, so the
+ // expected name is `a`, U+FFFD, `b`.
+ $ast = array(
+ array(
+ 'context' => array(),
+ 'self' => array(
+ 'type' => null,
+ 'subs' => array( array( 'kind' => 'class', 'name' => "a\u{FFFD}b" ) ),
+ ),
+ ),
+ );
+ return array(
+ 'bucket' => 'edge-escape',
+ 'selector' => ".a\0b",
+ 'expectCompound' => true,
+ 'expectComplex' => true,
+ 'ast' => $ast,
+ );
+ }
+
+ // ws-input: wrap/insert CR, CRLF, FF as insignificant whitespace.
+ $lead = $this->prng->choice( array( "\r", "\f", "\r\n", "\r\r", "\f\f" ) );
+ $trail = $this->prng->choice( array( "\r", "\f", "\r\n", '' ) );
+ return array(
+ 'bucket' => 'edge-escape',
+ 'selector' => $lead . $rendered . $trail,
+ 'expectCompound' => true,
+ 'expectComplex' => true,
+ 'ast' => array(
+ array(
+ 'context' => array(),
+ 'self' => $compound,
+ ),
+ ),
+ );
+ }
+
+ /*
+ * -----------------------
+ * Invalid-UTF-8 injection
+ * -----------------------
+ *
+ * Raw ill-formed UTF-8 byte sequences in the selector input, mirroring
+ * the nul-input pattern: a small fixed simple selector keeps the case
+ * focused on the selector token stream scrub. Each maximal subpart
+ * of the injected sequence decodes to one U+FFFD ( per-class counts
+ * pinned in INVALID_UTF8_CLASSES ), and U+FFFD is a valid ident
+ * codepoint — including in start position — so the scrubbed selector
+ * must parse and the post-scrub AST is known by construction.
+ */
+	/**
+	 * Generates a selector with a raw ill-formed UTF-8 sequence injected into
+	 * a class name, id, attribute name or quoted attribute value.
+	 *
+	 * The expected AST holds the same text with the injected bytes replaced by
+	 * the number of U+FFFD characters recorded for the chosen sequence in
+	 * INVALID_UTF8_CLASSES.
+	 *
+	 * @return array Case with 'bucket', 'selector', 'expectCompound', 'expectComplex' and 'ast' keys.
+	 */
+	private function gen_invalid_utf8(): array {
+		$class    = $this->prng->choice( array_values( self::INVALID_UTF8_CLASSES ) );
+		$bytes    = $class[0];
+		$subparts = $class[1];
+
+		// Where the bad bytes sit in the name: 'whole' means there is no surrounding text.
+		$position = $this->prng->choice( array( 'lead', 'mid', 'trail', 'whole' ) );
+
+		$prefix = '';
+		if ( 'lead' !== $position && 'whole' !== $position ) {
+			$prefix = 'a' . $this->prng->int( 0, 9 );
+		}
+
+		$suffix = '';
+		if ( 'trail' !== $position && 'whole' !== $position ) {
+			$suffix = 'z' . $this->prng->int( 0, 9 );
+		}
+
+		$raw     = $prefix . $bytes . $suffix;
+		$decoded = $prefix . str_repeat( "\u{FFFD}", $subparts ) . $suffix;
+
+		$target = $this->prng->choice( array( 'class', 'id', 'attr-name', 'attr-value' ) );
+
+		if ( 'class' === $target ) {
+			$rendered = '.' . $raw;
+			$sub      = array(
+				'kind' => 'class',
+				'name' => $decoded,
+			);
+		} elseif ( 'id' === $target ) {
+			$rendered = '#' . $raw;
+			$sub      = array(
+				'kind' => 'id',
+				'name' => $decoded,
+			);
+		} elseif ( 'attr-name' === $target ) {
+			$rendered = '[' . $raw . ']';
+			$sub      = array(
+				'kind'     => 'attr',
+				'name'     => $decoded,
+				'matcher'  => null,
+				'value'    => null,
+				'modifier' => null,
+			);
+		} else {
+			// 'attr-value': the bad bytes go inside a quoted string.
+			$name     = 'a' . $this->prng->int( 0, 99 );
+			$quote    = $this->prng->chance( 50 ) ? '"' : "'";
+			$rendered = '[' . $name . '=' . $quote . $raw . $quote . ']';
+			$sub      = array(
+				'kind'     => 'attr',
+				'name'     => $name,
+				'matcher'  => 'exact',
+				'value'    => $decoded,
+				'modifier' => null,
+			);
+		}
+
+		$type     = $this->prng->chance( 40 ) ? 'span' : null;
+		$selector = $rendered;
+		if ( null !== $type ) {
+			$selector = $type . $rendered;
+		}
+
+		return array(
+			'bucket'         => 'invalid-utf8',
+			'selector'       => $selector,
+			'expectCompound' => true,
+			'expectComplex'  => true,
+			'ast'            => array(
+				array(
+					'context' => array(),
+					'self'    => array(
+						'type' => $type,
+						'subs' => array( $sub ),
+					),
+				),
+			),
+		);
+	}
+
+ /*
+ * ------------------------
+ * Path-directed generation
+ * ------------------------
+ *
+ * Synthesizes a selector from a real element of the model tree so that
+ * the selector is guaranteed (by construction) to match that element:
+ * the type comes from its tag, subclasses from its actual classes / id /
+ * attributes, and the context chain from its actual ancestor tags with
+ * combinators consistent with the real nesting. Optionally one feature
+ * is then flipped into a "near-miss" that is guaranteed NOT to match
+ * the element ( or, for combinator loosening, still guaranteed to ).
+ */
+
+	/**
+	 * Generates a selector built from one real element row, with membership expectations.
+	 *
+	 * The selector starts as a guaranteed match for the chosen element. It is
+	 * then either turned into a near-miss, extended with an unrelated extra
+	 * branch, or left as is.
+	 *
+	 * @param array $rows Element rows; each is read for its 'ancestorTags' and 'fid' keys here.
+	 * @return array Case with the usual keys plus 'mustMatchFid' and 'mustNotMatchFid'.
+	 */
+	private function gen_path_directed( array $rows ): array {
+		// Bias toward elements deep enough for a meaningful context chain.
+		$deep = array_values(
+			array_filter(
+				$rows,
+				static function ( $row ) {
+					return count( $row['ancestorTags'] ) >= 2;
+				}
+			)
+		);
+
+		if ( array() !== $deep && $this->prng->chance( 75 ) ) {
+			$element = $this->prng->choice( $deep );
+		} else {
+			$element = $this->prng->choice( $rows );
+		}
+
+		$self    = $this->path_compound_for( $element );
+		$context = array();
+		if ( array() !== $element['ancestorTags'] && $this->prng->chance( 75 ) ) {
+			$context = $this->path_context_for( $element['ancestorTags'] );
+		}
+
+		$list = array(
+			array(
+				'context' => $context,
+				'self'    => $self,
+			),
+		);
+
+		$must_match     = $element['fid'];
+		$must_not_match = null;
+
+		if ( $this->prng->chance( 40 ) ) {
+			$outcome        = $this->path_near_miss( $list, $element );
+			$list           = $outcome[0];
+			$must_match     = $outcome[1];
+			$must_not_match = $outcome[2];
+		} elseif ( $this->prng->chance( 20 ) ) {
+			// Extra unrelated branch: a list union can only add matches.
+			$list[] = $this->gen_complex( $this->prng->chance( 30 ) );
+		}
+
+		// The list is compound-only when no branch has a context chain.
+		$compound_only = true;
+		foreach ( $list as $branch ) {
+			if ( array() !== $branch['context'] ) {
+				$compound_only = false;
+				break;
+			}
+		}
+
+		return array(
+			'bucket'          => 'path-directed',
+			'selector'        => $this->render_complex_list( $list ),
+			'expectCompound'  => $compound_only,
+			'expectComplex'   => true,
+			'ast'             => $list,
+			'mustMatchFid'    => $must_match,
+			'mustNotMatchFid' => $must_not_match,
+		);
+	}
+
+	/**
+	 * Builds a compound selector only from features the element row really has.
+	 *
+	 * Candidate features are the element's class tokens, its non-empty id, and
+	 * one attribute selector per distinct ( ASCII-lowercased ) attribute name.
+	 * Up to three of them are picked at random without replacement. A type
+	 * selector is always included when no feature was picked.
+	 *
+	 * @param array $element Element row; read for 'tag', 'attrs' and the optional 'namespace'.
+	 * @return array Compound AST with 'type' and 'subs' keys.
+	 */
+	private function path_compound_for( array $element ): array {
+		$tag     = ascii_strtolower( $element['tag'] );
+		$is_html = 'html' === ( $element['namespace'] ?? 'html' );
+
+		$available = array();
+
+		$classes = DocumentGenerator::get_attribute_value( $element, 'class' );
+		if ( is_string( $classes ) ) {
+			foreach ( DocumentGenerator::class_tokens( $classes ) as $token ) {
+				$available[] = array(
+					'kind' => 'class',
+					'name' => $token,
+				);
+			}
+		}
+
+		$id = DocumentGenerator::get_attribute_value( $element, 'id' );
+		if ( is_string( $id ) && '' !== $id ) {
+			$available[] = array(
+				'kind' => 'id',
+				'name' => $id,
+			);
+		}
+
+		$visited = array();
+		foreach ( $element['attrs'] as $pair ) {
+			$attr_name = ascii_strtolower( $pair[0] );
+
+			// Only the first occurrence of a duplicated attribute name is used.
+			if ( isset( $visited[ $attr_name ] ) ) {
+				continue;
+			}
+			$visited[ $attr_name ] = true;
+
+			// A class attribute whose value contains a NUL byte is not used as an attribute feature.
+			$has_nul = is_string( $pair[1] ) && false !== strpos( $pair[1], "\0" );
+			if ( 'class' === $attr_name && $has_nul ) {
+				continue;
+			}
+
+			$available[] = $this->path_attr_feature( $attr_name, $pair[1], $is_html );
+		}
+
+		$subs = array();
+		if ( count( $available ) > 0 ) {
+			$want = min(
+				count( $available ),
+				$this->prng->weighted(
+					array(
+						0 => 25,
+						1 => 40,
+						2 => 25,
+						3 => 10,
+					)
+				)
+			);
+
+			// Draw without replacement: each pick is removed from the candidates.
+			while ( count( $subs ) < $want ) {
+				$at      = $this->prng->int( 0, count( $available ) - 1 );
+				$removed = array_splice( $available, $at, 1 );
+				$subs[]  = $removed[0];
+			}
+		}
+
+		$type = null;
+		if ( array() === $subs || $this->prng->chance( 70 ) ) {
+			if ( $this->prng->chance( 12 ) ) {
+				$type = '*';
+			} elseif ( $this->prng->chance( 30 ) ) {
+				$type = $this->random_case( $tag );
+			} else {
+				$type = $tag;
+			}
+		}
+
+		return array(
+			'type' => $type,
+			'subs' => array() === $subs ? null : $subs,
+		);
+	}
+
+ /**
+ * Builds an attribute selector that the given ( name, value ) pair satisfies.
+ *
+ * Returns a bare presence selector when the value is not a string, 30% of
+ * the time otherwise, and whenever the chosen substring / word matcher would
+ * end up with an empty operand. In all other cases one matcher is chosen
+ * from candidates derived from the value itself.
+ *
+ * @param string $name Attribute name; the caller in this file passes it ASCII-lowercased.
+ * @param mixed $value Attribute value; `true` is treated as the empty string.
+ * @param bool $is_html_namespace Whether the element is in the HTML namespace.
+ * @return array Attribute subclass AST with 'kind', 'name', 'matcher', 'value' and 'modifier' keys.
+ */
+ private function path_attr_feature( string $name, $value, bool $is_html_namespace = true ): array {
+ $presence = array(
+ 'kind' => 'attr',
+ 'name' => $this->prng->chance( 15 ) ? $this->random_case( $name ) : $name,
+ 'matcher' => null,
+ 'value' => null,
+ 'modifier' => null,
+ );
+
+ if ( true === $value ) {
+ // A boolean attribute has the empty string as its value.
+ $value = '';
+ }
+ if ( ! is_string( $value ) || $this->prng->chance( 30 ) ) {
+ return $presence;
+ }
+
+ $points = utf8_codepoints( $value );
+ $total = count( $points );
+
+ // Each candidate is a ( matcher, operand ) pair derived from the real value.
+ $candidates = array( array( 'exact', $value ) );
+
+ // Only the first whitespace-separated word is offered for `~=`.
+ foreach ( preg_split( '/[ \t\n\f\r]+/', $value, -1, PREG_SPLIT_NO_EMPTY ) as $word ) {
+ $candidates[] = array( 'one-of', $word );
+ break;
+ }
+
+ // For `|=`: the part before the first hyphen, or the whole value when there is none.
+ $hyphen_at = strpos( $value, '-' );
+ $candidates[] = array( 'exact-or-hyphen-suffixed', false === $hyphen_at ? $value : substr( $value, 0, $hyphen_at ) );
+
+ if ( $total > 0 ) {
+ // Joins the characters of $length consecutive codepoint entries starting at $start.
+ $slice = static function ( array $points, int $start, int $length ): string {
+ $out = '';
+ for ( $i = $start; $i < $start + $length; $i++ ) {
+ $out .= $points[ $i ][0];
+ }
+ return $out;
+ };
+
+ // The random draws below happen for every non-empty value, whichever candidate is chosen.
+ $candidates[] = array( 'prefixed', $slice( $points, 0, $this->prng->int( 1, $total ) ) );
+ $length = $this->prng->int( 1, $total );
+ $candidates[] = array( 'suffixed', $slice( $points, $total - $length, $length ) );
+ $start = $this->prng->int( 0, $total - 1 );
+ $candidates[] = array( 'contains', $slice( $points, $start, $this->prng->int( 1, $total - $start ) ) );
+ }
+
+ list( $matcher, $operand ) = $this->prng->choice( $candidates );
+
+ /*
+ * `|=` with an operand cut at a hyphen only matches when the operand
+ * is non-empty and actually a value prefix; an operand equal to the
+ * value always matches. Guard the degenerate empty-operand cases.
+ */
+ if ( 'exact-or-hyphen-suffixed' === $matcher && '' === $operand && '' !== $value ) {
+ $matcher = 'exact';
+ $operand = $value;
+ }
+ if ( in_array( $matcher, array( 'one-of', 'prefixed', 'suffixed', 'contains' ), true ) && '' === $operand ) {
+ return $presence;
+ }
+
+ $modifier = null;
+ if ( $this->prng->chance( 25 ) ) {
+ if ( $this->prng->chance( 60 ) ) {
+ // With the case-insensitive modifier the operand's case may be scrambled.
+ $modifier = 'case-insensitive';
+ $operand = $this->random_case( $operand );
+ } else {
+ // With the case-sensitive modifier the operand is left exactly as sampled.
+ $modifier = 'case-sensitive';
+ }
+ } elseif (
+ $is_html_namespace &&
+ isset( ReferenceMatcher::HTML_CASE_INSENSITIVE_ATTRIBUTES[ $name ] ) &&
+ $this->prng->chance( 50 )
+ ) {
+ /*
+ * HTML's case-insensitive attribute value list: with no modifier
+ * the flipped operand still satisfies the (name, value) pair on
+ * an html-namespace element, which makes the folding rule
+ * load-bearing for the mustMatchFid invariant — name and value
+ * here come from the same real element, unlike the independent
+ * pools in gen_attr_selector.
+ */
+ $operand = $this->random_case( $operand );
+ }
+
+ return array(
+ 'kind' => 'attr',
+ 'name' => $presence['name'],
+ 'matcher' => $matcher,
+ 'value' => $operand,
+ 'modifier' => $modifier,
+ );
+ }
+
+	/**
+	 * Builds a context chain ( right-to-left ( type, combinator ) pairs ) from
+	 * the element's real ancestors, so the chain is satisfied by construction.
+	 *
+	 * A child combinator `>` is only used when the very next ancestor is
+	 * picked; a descendant combinator may skip generations. The first link is
+	 * always produced when there is an ancestor; each further link is added
+	 * 45% of the time while ancestors remain.
+	 *
+	 * @param string[] $ancestor_tags Nearest-first ancestor tag names.
+	 * @return array List of ( type, combinator ) pairs, nearest ancestor first.
+	 */
+	private function path_context_for( array $ancestor_tags ): array {
+		$depth = count( $ancestor_tags );
+		$chain = array();
+
+		for ( $cursor = 0; $cursor < $depth; $cursor = $picked + 1 ) {
+			if ( array() !== $chain && ! $this->prng->chance( 45 ) ) {
+				break;
+			}
+
+			// How many generations to skip past the next available ancestor.
+			$skip = 0;
+			if ( ! $this->prng->chance( 65 ) ) {
+				$skip = $this->prng->int( 0, $depth - 1 - $cursor );
+			}
+			$picked = $cursor + $skip;
+
+			$combinator = ' ';
+			if ( 0 === $skip && $this->prng->chance( 55 ) ) {
+				$combinator = '>';
+			}
+
+			$tag = ascii_strtolower( $ancestor_tags[ $picked ] );
+			if ( $this->prng->chance( 12 ) ) {
+				$type = '*';
+			} elseif ( $this->prng->chance( 25 ) ) {
+				$type = $this->random_case( $tag );
+			} else {
+				$type = $tag;
+			}
+
+			$chain[] = array( $type, $combinator );
+		}
+
+		return $chain;
+	}
+
+	/**
+	 * Flips one feature of the guaranteed-match selector.
+	 *
+	 * A wrong type, an extra absent class or an extra absent attribute
+	 * guarantee the element no longer matches. Loosening `>` to a descendant
+	 * combinator must keep it matching. Tightening a combinator to `>` carries
+	 * no membership expectation.
+	 *
+	 * @param array $list    Selector list whose first complex selector is flipped.
+	 * @param array $element Element row the selector was built from; read for 'fid' and 'tag'.
+	 * @return array{0: array, 1: string|null, 2: string|null} list, mustMatchFid, mustNotMatchFid.
+	 */
+	private function path_near_miss( array $list, array $element ): array {
+		$target = $list[0];
+		$fid    = $element['fid'];
+		$type   = $target['self']['type'];
+
+		$options = array( 'wrong-class', 'wrong-attr' );
+		if ( null !== $type && '*' !== $type ) {
+			$options[] = 'wrong-type';
+		}
+		if ( count( $target['context'] ) > 0 ) {
+			// Only the nearest link decides whether loosening is offered.
+			$nearest = reset( $target['context'] );
+			if ( '>' === $nearest[1] ) {
+				$options[] = 'loosen-combinator';
+			}
+			$options[] = 'tighten-combinator';
+		}
+
+		$flip = $this->prng->choice( $options );
+
+		if ( 'wrong-type' === $flip ) {
+			$actual = ascii_strtolower( $element['tag'] );
+			do {
+				$wrong = $this->prng->choice( DocumentGenerator::SAFE_TAGS );
+			} while ( $wrong === $actual );
+
+			$target['self']['type'] = $this->prng->chance( 25 ) ? $this->random_case( $wrong ) : $wrong;
+			return array( array( $target ), null, $fid );
+		}
+
+		if ( 'loosen-combinator' === $flip ) {
+			// Replacing every `>` with a descendant combinator can only
+			// widen the context; the element must still match.
+			foreach ( array_keys( $target['context'] ) as $at ) {
+				$target['context'][ $at ][1] = ' ';
+			}
+			$list[0] = $target;
+			return array( $list, $fid, null );
+		}
+
+		if ( 'tighten-combinator' === $flip ) {
+			// May or may not still match; no membership expectation.
+			$at                          = $this->prng->int( 0, count( $target['context'] ) - 1 );
+			$target['context'][ $at ][1] = '>';
+			$list[0]                     = $target;
+			return array( $list, null, null );
+		}
+
+		// 'wrong-attr' or 'wrong-class': append a feature the element is assumed not to have.
+		if ( 'wrong-attr' === $flip ) {
+			$extra = array(
+				'kind'     => 'attr',
+				'name'     => 'zz-no-such-attr',
+				'matcher'  => null,
+				'value'    => null,
+				'modifier' => null,
+			);
+		} else {
+			$extra = array(
+				'kind' => 'class',
+				'name' => 'zz-no-such-class',
+			);
+		}
+
+		$subs                   = (array) $target['self']['subs'];
+		$subs[]                 = $extra;
+		$target['self']['subs'] = $subs;
+
+		return array( array( $target ), null, $fid );
+	}
+
+ /*
+ * ---------
+ * Rendering
+ * ---------
+ */
+
+	/**
+	 * Renders a selector list AST as a comma-separated selector string with
+	 * randomly inserted optional whitespace around the list and the commas.
+	 *
+	 * @param array $list List of complex-selector ASTs.
+	 * @return string The rendered selector list.
+	 */
+	private function render_complex_list( array $list ): string {
+		// Render every branch first; the surrounding whitespace is drawn afterwards.
+		$rendered = array();
+		foreach ( $list as $complex ) {
+			$rendered[] = $this->render_complex( $complex );
+		}
+
+		$out      = $this->maybe_ws( 25 );
+		$is_first = true;
+		foreach ( $rendered as $piece ) {
+			if ( ! $is_first ) {
+				$out .= $this->maybe_ws( 40 );
+				$out .= ',';
+				$out .= $this->maybe_ws( 60 );
+			}
+			$out     .= $piece;
+			$is_first = false;
+		}
+
+		$out .= $this->maybe_ws( 25 );
+		return $out;
+	}
+
+	/**
+	 * Renders one complex selector: its context chain followed by its compound.
+	 *
+	 * @param array $complex Complex-selector AST with 'context' and 'self' keys.
+	 * @return string The rendered complex selector.
+	 */
+	private function render_complex( array $complex ): string {
+		$pieces = array();
+
+		// Context selectors are stored right-to-left; render left-to-right.
+		foreach ( array_reverse( $complex['context'] ) as $link ) {
+			$type     = $link[0];
+			$pieces[] = '*' === $type ? '*' : $this->render_ident( $type );
+
+			if ( '>' === $link[1] ) {
+				$pieces[] = $this->maybe_ws( 50 );
+				$pieces[] = '>';
+				$pieces[] = $this->maybe_ws( 50 );
+			} else {
+				// A descendant combinator is whitespace, so it is mandatory.
+				$pieces[] = $this->ws();
+			}
+		}
+
+		$pieces[] = $this->render_compound( $complex['self'] );
+
+		return implode( '', $pieces );
+	}
+
+	/**
+	 * Renders a compound selector: the optional type followed by each subclass selector.
+	 *
+	 * Subclass entries whose 'kind' is not 'class', 'id' or 'attr' render nothing.
+	 *
+	 * @param array $compound Compound AST with 'type' and 'subs' keys.
+	 * @return string The rendered compound selector.
+	 */
+	private function render_compound( array $compound ): string {
+		$pieces = array();
+
+		$type = $compound['type'];
+		if ( null !== $type ) {
+			$pieces[] = '*' === $type ? '*' : $this->render_ident( $type );
+		}
+
+		foreach ( (array) $compound['subs'] as $sub ) {
+			if ( 'class' === $sub['kind'] ) {
+				$pieces[] = '.' . $this->render_ident( $sub['name'] );
+			} elseif ( 'id' === $sub['kind'] ) {
+				$pieces[] = '#' . $this->render_ident( $sub['name'] );
+			} elseif ( 'attr' === $sub['kind'] ) {
+				$pieces[] = $this->render_attr_selector( $sub );
+			}
+		}
+
+		return implode( '', $pieces );
+	}
+
+	/**
+	 * Renders an attribute selector, with random optional whitespace, a value
+	 * written either as an ident or as a quoted string, and a modifier flag in
+	 * either letter case.
+	 *
+	 * @param array $sub Attribute subclass AST with 'name', 'matcher', 'value' and 'modifier' keys.
+	 * @return string The rendered attribute selector, always ending in ']'.
+	 */
+	private function render_attr_selector( array $sub ): string {
+		$pieces   = array( '[' );
+		$pieces[] = $this->maybe_ws( 20 );
+		$pieces[] = $this->render_ident( $sub['name'] );
+		$pieces[] = $this->maybe_ws( 20 );
+
+		if ( null !== $sub['matcher'] ) {
+			$operators = array(
+				'exact'                    => '=',
+				'one-of'                   => '~=',
+				'exact-or-hyphen-suffixed' => '|=',
+				'prefixed'                 => '^=',
+				'suffixed'                 => '$=',
+				'contains'                 => '*=',
+			);
+			$pieces[]  = $operators[ $sub['matcher'] ];
+			$pieces[]  = $this->maybe_ws( 25 );
+
+			$value    = $sub['value'];
+			$as_ident = '' !== $value && $this->can_render_as_ident( $value ) && $this->prng->chance( 45 );
+			$pieces[] = $as_ident ? $this->render_ident( $value ) : $this->render_string( $value );
+
+			if ( null !== $sub['modifier'] ) {
+				// After an ident value, whitespace is mandatory before the modifier.
+				$pieces[] = $as_ident ? $this->ws() : $this->maybe_ws( 60 );
+
+				$flags    = 'case-insensitive' === $sub['modifier'] ? array( 'i', 'I' ) : array( 's', 'S' );
+				$pieces[] = $this->prng->chance( 70 ) ? $flags[0] : $flags[1];
+			}
+
+			$pieces[] = $this->maybe_ws( 25 );
+		}
+
+		$pieces[] = ']';
+
+		return implode( '', $pieces );
+	}
+
+	/**
+	 * Whether this renderer is willing to write a value as an ident token.
+	 *
+	 * Every codepoint can be escaped, so the only value refused is the empty
+	 * string: an ident token cannot be empty. Values containing whitespace or
+	 * other awkward characters are accepted and rely on the ident renderer to
+	 * escape them.
+	 *
+	 * @param string $value The attribute value to check.
+	 * @return bool True when the value is non-empty.
+	 */
+	private function can_render_as_ident( string $value ): bool {
+		return 0 !== strlen( $value );
+	}
+
+	/**
+	 * Renders a name as a CSS ident token, escaping wherever required and
+	 * sometimes where merely allowed. Parsing the result must yield $name.
+	 *
+	 * An escape is required for any codepoint that is not `-`, `_`, an ASCII
+	 * letter or digit, or above U+007F; for a digit in first position; for a
+	 * digit right after a leading `-`; and for a `-` that is the whole name.
+	 *
+	 * @param string $name The name to render.
+	 * @return string The ident source text.
+	 */
+	private function render_ident( string $name ): string {
+		$codepoints        = utf8_codepoints( $name );
+		$is_single         = 1 === count( $codepoints );
+		$voluntary_percent = $this->escape_boost ? 50 : 8;
+		$pieces            = array();
+
+		foreach ( $codepoints as $index => $entry ) {
+			$char = $entry[0];
+			$cp   = $entry[1];
+
+			$is_digit  = $cp >= 0x30 && $cp <= 0x39;
+			$is_letter = ( $cp >= 0x41 && $cp <= 0x5A ) || ( $cp >= 0x61 && $cp <= 0x7A );
+			$is_plain  = $is_digit || $is_letter || '-' === $char || '_' === $char || $cp > 0x7F;
+
+			if ( ! $is_plain ) {
+				$required = true;
+			} elseif ( $is_digit ) {
+				// A digit cannot start an ident, nor directly follow a leading hyphen.
+				$required = 0 === $index || ( 1 === $index && '-' === $codepoints[0][0] );
+			} else {
+				// A lone hyphen is not an ident.
+				$required = $is_single && '-' === $char;
+			}
+
+			// The voluntary-escape draw only happens when no escape is required.
+			if ( $required || $this->prng->chance( $voluntary_percent ) ) {
+				$pieces[] = $this->render_escape( $char, $cp );
+			} else {
+				$pieces[] = $char;
+			}
+		}
+
+		return implode( '', $pieces );
+	}
+
+	/**
+	 * Renders one codepoint as a CSS escape sequence.
+	 *
+	 * The result is either an identity escape ( backslash plus the character )
+	 * or a hex escape with optional zero padding up to six digits, in either
+	 * letter case, always followed by one space.
+	 *
+	 * @param string $char The character to escape.
+	 * @param int    $cp   The codepoint of that character.
+	 * @return string The escape sequence.
+	 */
+	private function render_escape( string $char, int $cp ): string {
+		$is_hex_digit = ( $cp >= 0x30 && $cp <= 0x39 )
+			|| ( $cp >= 0x41 && $cp <= 0x46 )
+			|| ( $cp >= 0x61 && $cp <= 0x66 );
+		$is_newline   = in_array( $char, array( "\n", "\r", "\f" ), true );
+
+		/*
+		 * An identity escape is only used for codepoints from U+0020 up that
+		 * are not hex digits ( they would start a hex escape ) and not
+		 * newlines ( backslash-newline is not a valid escape ).
+		 */
+		if ( $cp >= 0x20 && ! $is_newline && ! $is_hex_digit ) {
+			if ( $this->prng->chance( 35 ) ) {
+				return '\\' . $char;
+			}
+		}
+
+		$digits = dechex( $cp );
+
+		// The padding draw is made even when the hex is already six digits long.
+		$wants_padding = $this->prng->chance( 25 );
+		if ( $wants_padding && strlen( $digits ) < 6 ) {
+			$width  = $this->prng->int( strlen( $digits ), 6 );
+			$digits = str_pad( $digits, $width, '0', STR_PAD_LEFT );
+		}
+
+		if ( $this->prng->chance( 30 ) ) {
+			$digits = strtoupper( $digits );
+		}
+
+		// The trailing space is always emitted; it is consumed by the escape.
+		return '\\' . $digits . ' ';
+	}
+
+	/**
+	 * Renders a value as a quoted CSS string token. Parsing must yield $value.
+	 *
+	 * Newline-like characters are always hex-escaped, the active quote and the
+	 * backslash are always escaped, and any other character is occasionally
+	 * escaped even though it does not need to be.
+	 *
+	 * @param string $value The value to render.
+	 * @return string The string token source, including its quotes.
+	 */
+	private function render_string( string $value ): string {
+		$quote = $this->prng->chance( 60 ) ? '"' : "'";
+		$body  = '';
+
+		foreach ( utf8_codepoints( $value ) as $entry ) {
+			$char = $entry[0];
+			$cp   = $entry[1];
+
+			if ( in_array( $char, array( "\n", "\r", "\f" ), true ) ) {
+				// Literal newlines end (break) the string; always hex-escape.
+				$body .= '\\' . dechex( $cp ) . ' ';
+			} elseif ( $quote === $char || '\\' === $char ) {
+				$body .= $this->prng->chance( 60 ) ? '\\' . $char : '\\' . dechex( $cp ) . ' ';
+			} elseif ( $this->prng->chance( 5 ) ) {
+				$body .= $this->render_escape( $char, $cp );
+			} else {
+				$body .= $char;
+			}
+		}
+
+		// Rarely add a backslash-newline line continuation (decodes to nothing).
+		$continuation = $this->prng->chance( 4 ) ? "\\\n" : '';
+
+		return $quote . $body . $continuation . $quote;
+	}
+
+	/**
+	 * Picks one run of CSS whitespace at random; plain spaces are the most likely.
+	 *
+	 * @return string A non-empty whitespace string.
+	 */
+	private function ws(): string {
+		return $this->prng->choice(
+			array( ' ', ' ', ' ', "\t", "\n", "\f", "\r", ' ', " \t " )
+		);
+	}
+
+	/**
+	 * Returns random whitespace the given percentage of the time, otherwise the empty string.
+	 *
+	 * @param int $percent Chance, in percent, of producing whitespace.
+	 * @return string Whitespace or ''.
+	 */
+	private function maybe_ws( int $percent ): string {
+		if ( ! $this->prng->chance( $percent ) ) {
+			return '';
+		}
+		return $this->ws();
+	}
+
+	/**
+	 * Randomly flips the case of each ASCII letter in a string.
+	 *
+	 * Works byte by byte. Only the bytes `A-Z` and `a-z` are ever changed;
+	 * every other byte, including the bytes of multi-byte UTF-8 sequences, is
+	 * copied through untouched. strtoupper() / strtolower() are deliberately
+	 * not used: before PHP 8.2 they are locale-sensitive and, under a
+	 * single-byte locale, can rewrite bytes >= 0x80 and corrupt UTF-8 names.
+	 *
+	 * One random draw is made for every byte, letter or not, so the random
+	 * stream does not depend on the content of the input.
+	 *
+	 * @param string $input The string to re-case.
+	 * @return string A string of the same byte length with ASCII letters randomly re-cased.
+	 */
+	private function random_case( string $input ): string {
+		$out    = '';
+		$length = strlen( $input );
+
+		for ( $i = 0; $i < $length; $i++ ) {
+			$byte     = ord( $input[ $i ] );
+			$to_upper = $this->prng->chance( 50 );
+
+			if ( $to_upper && $byte >= 0x61 && $byte <= 0x7A ) {
+				// a-z → A-Z
+				$byte -= 0x20;
+			} elseif ( ! $to_upper && $byte >= 0x41 && $byte <= 0x5A ) {
+				// A-Z → a-z
+				$byte += 0x20;
+			}
+
+			$out .= chr( $byte );
+		}
+
+		return $out;
+	}
+
+ /*
+ * ---------------------
+ * Unsupported selectors
+ * ---------------------
+ */
+
+	/**
+	 * Generates a selector from a category the generator labels unsupported:
+	 * pseudo-classes, pseudo-elements, sibling and column combinators,
+	 * namespace prefixes, and context selectors that are not bare type
+	 * selectors.
+	 *
+	 * @return string The selector source text.
+	 */
+	private function gen_unsupported(): string {
+		$kind = $this->prng->weighted(
+			array(
+				'pseudo-class'       => 25,
+				'pseudo-element'     => 15,
+				'sibling-combinator' => 20,
+				'column-combinator'  => 8,
+				'namespace-type'     => 12,
+				'namespace-attr'     => 8,
+				'non-type-context'   => 12,
+			)
+		);
+
+		if ( 'pseudo-class' === $kind || 'pseudo-element' === $kind ) {
+			if ( 'pseudo-class' === $kind ) {
+				$pseudos = array(
+					':hover',
+					':focus',
+					':first-child',
+					':last-child',
+					':nth-child(2n+1)',
+					':nth-of-type(3)',
+					':not(.excluded)',
+					':is(div, span)',
+					':where(*)',
+					':root',
+					':empty',
+					':checked',
+					':lang(en)',
+					':has(> img)',
+				);
+			} else {
+				$pseudos = array( '::before', '::after', '::first-line', '::first-letter', '::marker', '::placeholder' );
+			}
+
+			// The pseudo is drawn before the compound it is appended to.
+			$pseudo = $this->prng->choice( $pseudos );
+			return $this->render_compound( $this->gen_compound() ) . $pseudo;
+		}
+
+		if ( 'sibling-combinator' === $kind ) {
+			$combinator = $this->prng->choice( array( '+', '~' ) );
+			$left       = $this->render_compound( $this->gen_compound() );
+			$before     = $this->maybe_ws( 60 );
+			$after      = $this->maybe_ws( 60 );
+			$right      = $this->render_compound( $this->gen_compound() );
+
+			return $left . $before . $combinator . $after . $right;
+		}
+
+		if ( 'column-combinator' === $kind ) {
+			$left   = $this->gen_type_name( true );
+			$before = $this->maybe_ws( 50 );
+			$after  = $this->maybe_ws( 50 );
+			$right  = $this->gen_type_name( true );
+
+			return $left . $before . '||' . $after . $right;
+		}
+
+		if ( 'namespace-type' === $kind ) {
+			$ns    = $this->prng->choice( array( 'svg', 'html', '*', '' ) );
+			$local = $this->prng->choice( array( 'title', 'a', 'circle', 'div' ) );
+
+			return $ns . '|' . $local;
+		}
+
+		if ( 'namespace-attr' === $kind ) {
+			// `[ns|name]` — must not be confused with the `|=` matcher,
+			// so the char after `|` must not be `=`.
+			$ns = $this->prng->choice( array( 'xlink', 'svg', 'xml' ) );
+
+			return '[' . $ns . '|href]';
+		}
+
+		// 'non-type-context': a context selector that is not a bare type selector.
+		$context = $this->prng->choice( array( '.ctx', '#ctx', '[ctx]', 'div.ctx', 'div#ctx', 'div[ctx]', '*.ctx' ) );
+		if ( $this->prng->chance( 50 ) ) {
+			$joiner = $this->ws();
+		} else {
+			$before = $this->maybe_ws( 50 );
+			$after  = $this->maybe_ws( 50 );
+			$joiner = $before . '>' . $after;
+		}
+
+		return $context . $joiner . $this->render_compound( $this->gen_compound() );
+	}
+
+ /*
+ * -----------------
+ * Invalid selectors
+ * -----------------
+ */
+
+	/**
+	 * Builds a selector from one of the "invalid" categories.
+	 *
+	 * Either a fixed template is returned, or a generated compound selector
+	 * is damaged with leading / trailing garbage or a malformed comma list.
+	 *
+	 * @return string The generated selector text.
+	 */
+	private function gen_invalid(): string {
+		$kind = $this->prng->weighted(
+			array(
+				'template'         => 45,
+				'trailing-garbage' => 25,
+				'leading-garbage'  => 15,
+				'comma-trouble'    => 15,
+			)
+		);
+
+		if ( 'template' === $kind ) {
+			$templates = array(
+				'',
+				' ',
+				"\t\n\f ",
+				'.',
+				'a.',
+				'#',
+				'[',
+				']',
+				'[]',
+				'[ ]',
+				'.5x',
+				'#5',
+				'. x',
+				'..a',
+				'.#a',
+				/*
+				 * EOF auto-closes an open attribute selector block
+				 * ( '[a', '[a=b', '[a="b]', '[a=b i' are valid ), but
+				 * grammar-level truncation is still invalid.
+				 */
+				'[a=',
+				'[a= ',
+				'[a~',
+				'[a^',
+				'[a=]',
+				'[=b]',
+				'[a==b]',
+				'[a~b]',
+				'[a!=b]',
+				"[a=\"b\nc\"]",
+				"[a=\"b\nc",
+				'[a=b x]',
+				'[a=b x',
+				'[a=b ix]',
+				'[a=b ix',
+				'[a=b i x',
+				'[5=b]',
+				'[5=b',
+				'a >',
+				'> a',
+				'a > > b',
+				'a >> b',
+				'>',
+				'-',
+				// A lone '\' is a valid escape at EOF ( type selector U+FFFD );
+				// '\' before a newline is not a valid escape.
+				"\\\n",
+				"a\\\nb",
+				'a/**/b',
+				'!important',
+				'@media screen',
+				'{}',
+				';',
+				'a;b',
+				'a{color:red}',
+				'()',
+				'a()',
+				'*5',
+				'%',
+				'a%',
+			);
+			return $this->prng->choice( $templates );
+		}
+
+		if ( 'trailing-garbage' === $kind ) {
+			$suffix = $this->prng->choice( array( ':', '(', ')', '{', '}', ';', '!', '@', '%', '/', '=', '|', '^', '$' ) );
+			return $this->render_compound( $this->gen_compound() ) . $suffix;
+		}
+
+		if ( 'leading-garbage' === $kind ) {
+			$prefix = $this->prng->choice( array( '%', ';', ')', '}', '=', '~', '+', '/', ',' ) );
+			return $prefix . $this->render_compound( $this->gen_compound() );
+		}
+
+		// 'comma-trouble', and the fallback for any other key.
+		$item     = $this->render_compound( $this->gen_compound() );
+		$variants = array(
+			$item . ',',
+			',' . $item,
+			$item . ',,' . $item,
+			$item . ', ,' . $item,
+			$item . ' , ',
+		);
+		return $this->prng->choice( $variants );
+	}
+
+ /*
+ * -----
+ * Chaos
+ * -----
+ */
+
+	/**
+	 * Builds a random string drawn from one of several alphabets.
+	 *
+	 * The 'css', 'ident' and 'mixed' alphabets are single-byte and sampled one
+	 * byte at a time ( 0-40 bytes ). The 'unicode' alphabet is multibyte, so it
+	 * is split into code points first and sampled one whole code point at a
+	 * time ( 0-24 code points ), together with a few ASCII characters.
+	 *
+	 * @return string The generated text; may be empty.
+	 */
+	private function gen_chaos(): string {
+		$alphabets = array(
+			// Tab and newline are appended from a double-quoted literal:
+			// inside single quotes "\t" / "\n" would be a backslash plus a letter.
+			'css'     => '.#[]=~|^$*>+,:()"\'\\ ' . "\t\n" . '-_',
+			'ident'   => 'abcXYZ019-_',
+			'mixed'   => '.#[]=~|^$*>+,:()"\'\\ abcXYZ019-_iIsS',
+			'unicode' => '✓Ωé🙂',
+		);
+
+		// Keep the drawn KEY: the branch below must test the alphabet's name,
+		// not its contents.
+		$kind     = $this->prng->weighted(
+			array(
+				'css'     => 25,
+				'ident'   => 15,
+				'mixed'   => 45,
+				'unicode' => 15,
+			)
+		);
+		$alphabet = $alphabets[ $kind ];
+
+		if ( 'unicode' === $kind ) {
+			// Sample whole code points so multibyte characters are never split.
+			$points = utf8_codepoints( $alphabet . '.#[]= aZ9' );
+			$length = $this->prng->int( 0, 24 );
+			$out    = '';
+			for ( $i = 0; $i < $length; $i++ ) {
+				$out .= $this->prng->choice( $points )[0];
+			}
+			return $out;
+		}
+
+		$last_index = strlen( $alphabet ) - 1;
+		$length     = $this->prng->int( 0, 40 );
+		$out        = '';
+		for ( $i = 0; $i < $length; $i++ ) {
+			$out .= $alphabet[ $this->prng->int( 0, $last_index ) ];
+		}
+		return $out;
+	}
+
+ /*
+ * --------
+ * Mutation
+ * --------
+ */
+
+	/**
+	 * Applies between one and four random byte-level mutations to a selector.
+	 *
+	 * Each round draws one mutation kind by weight. Mutations that need an
+	 * existing byte ( delete, replace, duplicate, case-flip ) are no-ops on an
+	 * empty string. Offsets are byte offsets, so multibyte characters may be
+	 * split.
+	 *
+	 * @param string $selector The selector text to mutate.
+	 * @return string The mutated selector.
+	 */
+	private function mutate( string $selector ): string {
+		$mutation_count = $this->prng->int( 1, 4 );
+		// Tab and newline are appended from a double-quoted literal:
+		// inside single quotes "\t" / "\n" would be a backslash plus a letter.
+		$alphabet   = '.#[]=~|^$*>+,:()"\'\\ ' . "\t\n" . 'iIsSabcXYZ019-_';
+		$last_index = strlen( $alphabet ) - 1;
+
+		for ( $m = 0; $m < $mutation_count; $m++ ) {
+			// Re-measured every round: earlier mutations change the length.
+			$length = strlen( $selector );
+			$kind   = $this->prng->weighted(
+				array(
+					'insert'       => 30,
+					'delete'       => 25,
+					'replace'      => 25,
+					'duplicate'    => 10,
+					'case-flip'    => 10,
+					'invalid-utf8' => 12,
+				)
+			);
+
+			switch ( $kind ) {
+				case 'insert':
+					// Offset may equal $length, which appends.
+					$at       = $this->prng->int( 0, $length );
+					$char     = $alphabet[ $this->prng->int( 0, $last_index ) ];
+					$selector = substr( $selector, 0, $at ) . $char . substr( $selector, $at );
+					break;
+
+				case 'delete':
+					if ( $length > 0 ) {
+						$at       = $this->prng->int( 0, $length - 1 );
+						$selector = substr( $selector, 0, $at ) . substr( $selector, $at + 1 );
+					}
+					break;
+
+				case 'replace':
+					if ( $length > 0 ) {
+						$at       = $this->prng->int( 0, $length - 1 );
+						$char     = $alphabet[ $this->prng->int( 0, $last_index ) ];
+						$selector = substr( $selector, 0, $at ) . $char . substr( $selector, $at + 1 );
+					}
+					break;
+
+				case 'duplicate':
+					if ( $length > 0 ) {
+						// Repeat a span of 1-6 bytes immediately after itself.
+						$start    = $this->prng->int( 0, $length - 1 );
+						$span     = $this->prng->int( 1, min( 6, $length - $start ) );
+						$selector = substr( $selector, 0, $start + $span )
+							. substr( $selector, $start, $span )
+							. substr( $selector, $start + $span );
+					}
+					break;
+
+				case 'case-flip':
+					if ( $length > 0 ) {
+						$at       = $this->prng->int( 0, $length - 1 );
+						$char     = $selector[ $at ];
+						$flip     = ctype_lower( $char ) ? strtoupper( $char ) : strtolower( $char );
+						$selector = substr( $selector, 0, $at ) . $flip . substr( $selector, $at + 1 );
+					}
+					break;
+
+				case 'invalid-utf8':
+					// Splice a raw ill-formed sequence at an arbitrary byte
+					// offset — possibly splitting an existing multibyte
+					// character or landing before a continuation byte that
+					// completes a truncated lead. No expectations here; these
+					// exercise crash / scrub-notice / differential paths.
+					$bytes    = $this->prng->choice( array_column( self::INVALID_UTF8_CLASSES, 0 ) );
+					$at       = $this->prng->int( 0, $length );
+					$selector = substr( $selector, 0, $at ) . $bytes . substr( $selector, $at );
+					break;
+			}
+		}
+
+		return $selector;
+	}
+}
diff --git a/tools/css-selector-fuzz/lib/TreeCapture.php b/tools/css-selector-fuzz/lib/TreeCapture.php
new file mode 100644
index 0000000000000..2350db024c8ad
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/TreeCapture.php
@@ -0,0 +1,145 @@
+,
+ * ancestorTags: string[] nearest-first )
+ * tag row: same without ancestorTags.
+ */
+class TreeCapture {
+
+	const CAPTURE_ITERATION_LIMIT = 20000;
+
+	/**
+	 * Records what the HTML processor ( and, for full documents, the tag
+	 * processor ) sees when walking the given markup.
+	 *
+	 * On any failure the 'error' key is set and the result is returned
+	 * immediately.
+	 *
+	 * @param string      $html    The markup ( full document or fragment ).
+	 * @param string|null $context When set, parse as a fragment in this
+	 *                             context ( e.g. '<body>' ); the tag
+	 *                             processor has no fragment mode, so tagRows
+	 *                             is null in that case.
+	 * @return array{
+	 *     htmlRows: array|null,
+	 *     tagRows: array|null,
+	 *     quirks: bool,
+	 *     error: string|null,
+	 * }
+	 */
+	public static function capture( string $html, ?string $context = null ): array {
+		$result = array(
+			'htmlRows' => null,
+			'tagRows'  => null,
+			'quirks'   => false,
+			'error'    => null,
+		);
+
+		$is_fragment = null !== $context;
+		if ( $is_fragment ) {
+			$html_processor = \WP_HTML_Processor::create_fragment( $html, $context );
+		} else {
+			$html_processor = \WP_HTML_Processor::create_full_parser( $html );
+		}
+		if ( null === $html_processor ) {
+			$result['error'] = 'fragment-context-unsupported';
+			return $result;
+		}
+
+		$html_rows = array();
+		for ( $visited = 1; $html_processor->next_tag(); ++$visited ) {
+			if ( $visited > self::CAPTURE_ITERATION_LIMIT ) {
+				$result['error'] = 'html-capture-iteration-limit';
+				return $result;
+			}
+
+			// Drop the last breadcrumb entry, then flip the remainder
+			// so it reads nearest-first.
+			$ancestors = $html_processor->get_breadcrumbs();
+			array_pop( $ancestors );
+
+			$html_rows[] = array(
+				'tag'          => (string) $html_processor->get_tag(),
+				'fid'          => self::fid_of( $html_processor ),
+				'attrs'        => self::attrs_of( $html_processor ),
+				'ancestorTags' => array_reverse( $ancestors ),
+				'namespace'    => $html_processor->get_namespace(),
+			);
+		}
+
+		$last_error = $html_processor->get_last_error();
+		if ( null !== $last_error ) {
+			$result['error'] = 'html-processor-error: ' . $last_error;
+			return $result;
+		}
+		$unsupported = $html_processor->get_unsupported_exception();
+		if ( null !== $unsupported ) {
+			$result['error'] = 'html-processor-unsupported: ' . $unsupported->getMessage();
+			return $result;
+		}
+
+		$result['htmlRows'] = $html_rows;
+		$result['quirks']   = $html_processor->is_quirks_mode();
+
+		// The tag processor has no fragment mode; a fragment case exercises
+		// the html processor's select() only.
+		if ( $is_fragment ) {
+			return $result;
+		}
+
+		$tag_processor = new \WP_HTML_Tag_Processor( $html );
+		$tag_rows      = array();
+		for ( $visited = 1; $tag_processor->next_tag(); ++$visited ) {
+			if ( $visited > self::CAPTURE_ITERATION_LIMIT ) {
+				$result['error'] = 'tag-capture-iteration-limit';
+				return $result;
+			}
+			$tag_rows[] = array(
+				'tag'   => (string) $tag_processor->get_tag(),
+				'fid'   => self::fid_of( $tag_processor ),
+				'attrs' => self::attrs_of( $tag_processor ),
+			);
+		}
+		$result['tagRows'] = $tag_rows;
+
+		return $result;
+	}
+
+	/** The element's data-fid, or the same placeholder collect_matches() uses. */
+	private static function fid_of( $processor ): string {
+		$fid = $processor->get_attribute( 'data-fid' );
+		if ( is_string( $fid ) ) {
+			return self::sanitize_fid( $fid );
+		}
+		return '(missing-fid:' . $processor->get_tag() . ')';
+	}
+
+	/**
+	 * Replaces the lexbor protocol framing bytes ( TAB / LF / CR ) in a fid
+	 * with '?'. Generated fids never contain these, but the lexbor harness
+	 * applies the same replacement, so matching this here keeps the two trees
+	 * comparable even for a hypothetical control-char fid ( the worst case is
+	 * a benign tree-gated skip, never a false divergence ).
+	 */
+	public static function sanitize_fid( string $fid ): string {
+		return str_replace( array( "\t", "\n", "\r" ), '?', $fid );
+	}
+
+	/**
+	 * All attributes as ( lowercase name, decoded value ) pairs, excluding
+	 * data-fid ( stored separately, mirroring the generated model's shape ).
+	 * A value of true is kept as true; every other value is cast to string.
+	 *
+	 * @return array
+	 */
+	private static function attrs_of( $processor ): array {
+		$pairs = array();
+		$names = (array) $processor->get_attribute_names_with_prefix( '' );
+		foreach ( $names as $attr_name ) {
+			if ( 'data-fid' !== $attr_name ) {
+				$raw     = $processor->get_attribute( $attr_name );
+				$pairs[] = array( $attr_name, true === $raw ? true : (string) $raw );
+			}
+		}
+		return $pairs;
+	}
+}
diff --git a/tools/css-selector-fuzz/lib/WildDocumentGenerator.php b/tools/css-selector-fuzz/lib/WildDocumentGenerator.php
new file mode 100644
index 0000000000000..3cd7dada17cc6
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/WildDocumentGenerator.php
@@ -0,0 +1,381 @@
+ '',
+ 'html' => '',
+ 'legacy-compat' => '',
+ 'quirky' => '',
+ 'limited' => '',
+ );
+
+ /** @var Prng */
+ private $prng;
+ private $fid_counter = 0;
+ private $pools;
+
+	/**
+	 * Stores the PRNG and seeds the value pools.
+	 *
+	 * The tag pool starts with 'html', 'head' and 'body'; every other pool
+	 * starts empty.
+	 *
+	 * @param Prng $prng Random source used for every later draw.
+	 */
+	private function __construct( Prng $prng ) {
+		$initial_pools = array(
+			'tags'       => array( 'html', 'head', 'body' ),
+			'classes'    => array(),
+			'ids'        => array(),
+			'attrNames'  => array(),
+			'attrValues' => array(),
+		);
+
+		$this->prng  = $prng;
+		$this->pools = $initial_pools;
+	}
+
+ /**
+ * @return array{model: null, html: string, pools: array, wild: true, doctype: string}
+ */
+	public static function generate( Prng $prng ): array {
+		// A fresh generator per call keeps the fid counter and pools isolated.
+		return ( new self( $prng ) )->build();
+	}
+
+	/**
+	 * Assembles one "wild" document: an optional doctype, optional root-level
+	 * start tags, then a random token stream of start tags, void tags, end
+	 * tags, text, comments and stray end tags.
+	 *
+	 * Every generated element start tag carries a `data-fid="w<N>"` attribute.
+	 * No model is produced; the returned 'model' is always null.
+	 *
+	 * @return array{model: null, html: string, pools: array, wild: true, doctype: string}
+	 */
+	private function build(): array {
+		$doctype_kind = $this->prng->weighted(
+			array(
+				'none'          => 25,
+				'html'          => 45,
+				'legacy-compat' => 10,
+				'quirky'        => 12,
+				'limited'       => 8,
+			)
+		);
+
+		$out = self::DOCTYPES[ $doctype_kind ];
+
+		// NOTE(review): the two tag names below are reconstructed as
+		// '<html' and '<body' — confirm against the intended markup.
+		if ( $this->prng->chance( 15 ) ) {
+			$out .= '<html' . $this->render_attrs( $this->random_attrs() ) . '>';
+		}
+		if ( $this->prng->chance( 10 ) ) {
+			$out .= '<body' . $this->render_attrs( $this->random_attrs() ) . '>';
+		}
+
+		$max_elements = $this->prng->int( 4, 35 );
+		$token_budget = $this->prng->int( 8, 70 );
+		$open         = array();
+
+		for ( $i = 0; $i < $token_budget; $i++ ) {
+			$in_table = $this->in_table_context( $open );
+
+			// Void and stray tokens get weight 0 in table context.
+			$kind = $this->prng->weighted(
+				array(
+					'start'   => 42,
+					'void'    => $in_table ? 0 : 8,
+					'end'     => 24,
+					'text'    => 16,
+					'comment' => 5,
+					'stray'   => $in_table ? 0 : 5,
+				)
+			);
+
+			switch ( $kind ) {
+				case 'start':
+					if ( $this->fid_counter >= $max_elements ) {
+						break;
+					}
+					$tag = $in_table
+						? $this->prng->choice( array( 'caption', 'colgroup', 'thead', 'tbody', 'tfoot', 'tr', 'tr', 'td', 'td', 'th' ) )
+						: $this->prng->choice( self::TAGS );
+					if ( 'a' === $tag && in_array( 'a', $open, true ) ) {
+						// A nested <a> immediately runs the adoption agency.
+						$tag = 'span';
+					}
+					$this->pools['tags'][] = $tag;
+					$out                  .= '<' . $this->maybe_case( $tag )
+						. ' data-fid="w' . $this->fid_counter++ . '"'
+						. $this->render_attrs( $this->random_attrs() ) . '>';
+					$open[]                = $tag;
+					break;
+
+				case 'void':
+					if ( $this->fid_counter >= $max_elements ) {
+						break;
+					}
+					$tag                   = $this->prng->choice( self::VOID_TAGS );
+					$this->pools['tags'][] = $tag;
+					$out                  .= '<' . $this->maybe_case( $tag )
+						. ' data-fid="w' . $this->fid_counter++ . '"'
+						. $this->render_attrs( $this->random_attrs() )
+						. ( $this->prng->chance( 25 ) ? ' />' : '>' );
+					break;
+
+				case 'end':
+					if ( array() === $open ) {
+						break;
+					}
+					$pick = $this->prng->weighted(
+						array(
+							'top'    => 60,
+							'random' => 40,
+						)
+					);
+					if ( 'top' === $pick ) {
+						$tag = array_pop( $open );
+					} else {
+						/*
+						 * Close a non-top open element: misnesting. Never
+						 * across a formatting element — the processor only
+						 * supports the trivial adoption-agency cases and
+						 * bails on the rest ( "any other end tag" /
+						 * "common ancestor" / reconstruction-with-rewind ).
+						 */
+						$formatting = array( 'a', 'b', 'i', 'em', 'strong', 'u', 's', 'code', 'small' );
+
+						// Walk down from the top to the nearest open formatting
+						// element; only entries above it may be closed.
+						$lowest = count( $open ) - 1;
+						while ( $lowest > 0 && ! in_array( $open[ $lowest ], $formatting, true ) ) {
+							$lowest--;
+						}
+						if ( in_array( $open[ $lowest ], $formatting, true ) ) {
+							$lowest++;
+						}
+						if ( $lowest > count( $open ) - 1 ) {
+							// The top element itself is formatting: just close it.
+							$tag = array_pop( $open );
+						} else {
+							$at  = $this->prng->int( $lowest, count( $open ) - 1 );
+							$tag = $open[ $at ];
+							array_splice( $open, $at, 1 );
+						}
+					}
+					$out .= '</' . $this->maybe_case( $tag ) . '>';
+					break;
+
+				case 'text':
+					// Non-whitespace text in table context is unsupported
+					// (pending-table-character-tokens), keep it whitespace.
+					// NOTE(review): the '&amp; &lt;x&gt;' sample is reconstructed
+					// in escaped form so it stays text; confirm.
+					$out .= $in_table
+						? "\n "
+						: $this->prng->choice(
+							array(
+								'text',
+								' wild text ',
+								"\n",
+								'&amp; &lt;x&gt;',
+								'café ✓',
+								'a < b',
+							)
+						);
+					break;
+
+				case 'comment':
+					// NOTE(review): comment text reconstructed; confirm the
+					// intended content.
+					$out .= '<!-- wild comment -->';
+					break;
+
+				case 'stray':
+					// An end tag for something that is not open.
+					// No formatting tags here: a stray formatting end tag
+					// runs the adoption agency's unsupported branches.
+					$out .= '</' . $this->prng->choice( array( 'div', 'p', 'table', 'tr', 'li', 'span', 'x-wild' ) ) . '>';
+					break;
+			}
+		}
+
+		// Leave roughly half of the still-open elements unclosed.
+		foreach ( array_reverse( $open ) as $tag ) {
+			if ( $this->prng->chance( 50 ) ) {
+				$out .= '</' . $this->maybe_case( $tag ) . '>';
+			}
+		}
+
+		// De-duplicate every pool and reindex it as a list.
+		foreach ( $this->pools as $key => $values ) {
+			$this->pools[ $key ] = array_values( array_unique( $values ) );
+		}
+
+		return array(
+			'model'   => null,
+			'html'    => $out,
+			'pools'   => $this->pools,
+			'wild'    => true,
+			'doctype' => $doctype_kind,
+		);
+	}
+
+ /**
+ * Whether the insertion point is in table context outside any cell or
+ * caption — where arbitrary content would foster-parent (unsupported).
+ */
+	private function in_table_context( array $open ): bool {
+		$cell_like  = array( 'td', 'th', 'caption' );
+		$table_like = array( 'table', 'thead', 'tbody', 'tfoot', 'tr', 'colgroup' );
+
+		// Scan from the innermost open element outwards; the first cell-like
+		// or table-like tag found decides the answer.
+		foreach ( array_reverse( $open ) as $tag ) {
+			if ( in_array( $tag, $cell_like, true ) ) {
+				return false;
+			}
+			if ( in_array( $tag, $table_like, true ) ) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+ /** @return array */
+	private function random_attrs(): array {
+		$how_many = $this->prng->weighted( array( 0 => 30, 1 => 35, 2 => 25, 3 => 10 ) );
+		$result   = array();
+
+		for ( $n = 0; $n < $how_many; $n++ ) {
+			$name       = $this->prng->choice( DocumentGenerator::ATTR_NAMES );
+			$lower_name = ascii_strtolower( $name );
+
+			if ( 'class' === $lower_name ) {
+				// One to three words; each may get a NUL injected, and its
+				// tokens are recorded in the class pool.
+				$word_count = $this->prng->int( 1, 3 );
+				$class_list = array();
+				for ( $w = 0; $w < $word_count; $w++ ) {
+					$class_word   = $this->maybe_inject_class_nul( $this->random_word() );
+					$class_list[] = $class_word;
+					foreach ( DocumentGenerator::class_tokens( $class_word ) as $token ) {
+						$this->pools['classes'][] = $token;
+					}
+				}
+				$value = implode( ' ', $class_list );
+			} elseif ( 'id' === $lower_name ) {
+				$value                = $this->random_word();
+				$this->pools['ids'][] = $value;
+			} elseif ( $this->prng->chance( 15 ) ) {
+				// Boolean attribute: rendered as a bare name.
+				$value = true;
+			} else {
+				$value = $this->random_word();
+				if ( $this->prng->chance( 20 ) ) {
+					$value .= ' ' . $this->random_word();
+				}
+			}
+
+			$this->pools['attrNames'][] = $lower_name;
+			// Class values are tracked through the class pool instead.
+			if ( 'class' !== $lower_name && is_string( $value ) ) {
+				$this->pools['attrValues'][] = $value;
+			}
+
+			$result[] = array( $name, $value );
+		}
+
+		return $result;
+	}
+
+	/**
+	 * Occasionally inserts a single NUL byte into a class word.
+	 *
+	 * An empty word is returned untouched without consulting the PRNG.
+	 * Otherwise a chance( 12 ) draw decides whether to inject; the NUL goes
+	 * before a randomly chosen code point, or after the last one.
+	 *
+	 * @param string $class The class word.
+	 * @return string The word, possibly with one "\0" inserted.
+	 */
+	private function maybe_inject_class_nul( string $class ): string {
+		if ( '' === $class ) {
+			return $class;
+		}
+		if ( ! $this->prng->chance( 12 ) ) {
+			return $class;
+		}
+
+		$points    = utf8_codepoints( $class );
+		$total     = count( $points );
+		$insert_at = $this->prng->int( 0, $total );
+
+		$result = '';
+		foreach ( $points as $index => $point ) {
+			$result .= ( $index === $insert_at ? "\0" : '' ) . $point[0];
+		}
+		if ( $insert_at === $total ) {
+			$result .= "\0";
+		}
+
+		return $result;
+	}
+
+	/**
+	 * Renders ( name, value ) pairs as HTML attribute source text.
+	 *
+	 * A value of true renders as a bare attribute name. Any other value is
+	 * rendered double-quoted, with '&', '"' and '<' replaced by character
+	 * references so the value cannot end the attribute early.
+	 *
+	 * @param array $attrs List of array( name, value ) pairs.
+	 * @return string Attribute text, each attribute preceded by one space;
+	 *                '' for an empty list.
+	 */
+	private function render_attrs( array $attrs ): string {
+		// Single-pass map, so the '&' of an emitted reference is never
+		// escaped a second time.
+		$escapes = array(
+			'&' => '&amp;',
+			'"' => '&quot;',
+			'<' => '&lt;',
+		);
+
+		$out = '';
+		foreach ( $attrs as $attr ) {
+			list( $name, $value ) = $attr;
+			if ( true === $value ) {
+				$out .= ' ' . $name;
+				continue;
+			}
+			$out .= ' ' . $name . '="' . strtr( $value, $escapes ) . '"';
+		}
+		return $out;
+	}
+
+	/**
+	 * Picks a word stem and, in a chance( 30 ) draw, appends a number 0-99.
+	 *
+	 * @return string The generated word.
+	 */
+	private function random_word(): string {
+		$picked = $this->prng->choice(
+			array( 'wild', 'soup', 'alpha', 'beta', 'item', 'note', 'x', 'mixedCase', 'Über', 'main-thing', '--var', '_u' )
+		);
+
+		if ( ! $this->prng->chance( 30 ) ) {
+			return $picked;
+		}
+
+		return $picked . (string) $this->prng->int( 0, 99 );
+	}
+
+	/**
+	 * Returns the tag name unchanged, or with randomized letter case.
+	 *
+	 * A chance( 15 ) draw decides whether to re-case at all; when it does,
+	 * one chance( 50 ) draw per byte picks upper or lower case.
+	 *
+	 * @param string $tag Tag name.
+	 * @return string The tag name, possibly re-cased.
+	 */
+	private function maybe_case( string $tag ): string {
+		if ( ! $this->prng->chance( 15 ) ) {
+			return $tag;
+		}
+
+		$length  = strlen( $tag );
+		$recased = '';
+		for ( $offset = 0; $offset < $length; ++$offset ) {
+			$byte = $tag[ $offset ];
+			if ( $this->prng->chance( 50 ) ) {
+				$recased .= strtoupper( $byte );
+			} else {
+				$recased .= strtolower( $byte );
+			}
+		}
+		return $recased;
+	}
+}
diff --git a/tools/css-selector-fuzz/lib/Worker.php b/tools/css-selector-fuzz/lib/Worker.php
new file mode 100644
index 0000000000000..f701d87974657
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/Worker.php
@@ -0,0 +1,1308 @@
+chance( 30 );
+ $is_fragment = ! $is_wild && $prng->chance( 20 );
+
+ $failures = array();
+ $record = static function ( string $invariant, array $detail ) use ( &$failures ) {
+ $failures[] = array(
+ 'invariant' => $invariant,
+ 'detail' => $detail,
+ );
+ };
+ $match_stats = array();
+
+ /*
+ * The processor's own parse is the matching oracle's ground truth.
+ * For safe (model-built) documents the model must agree with the
+ * capture — that soundness check is what lets the capture be trusted
+ * on wild documents, where no model exists.
+ *
+ * Wild documents that hit one of the processor's unsupported
+ * constructs (it bails on foster parenting, complex adoption-agency
+ * runs, …) are deterministically regenerated a bounded number of
+ * times so nearly every wild case carries a usable ground truth.
+ */
+ $document = null;
+ $capture = null;
+ $capture_error = null;
+ $attempts = $is_wild ? 8 : 1;
+ for ( $attempt = 0; $attempt < $attempts; $attempt++ ) {
+ if ( $is_wild ) {
+ $document = WildDocumentGenerator::generate( $prng->fork( "wild-document:{$attempt}" ) );
+ } elseif ( $is_fragment ) {
+ $document = DocumentGenerator::generate_fragment( $prng->fork( 'fragment' ) );
+ } else {
+ $document = DocumentGenerator::generate( $prng->fork( 'document' ) );
+ }
+
+ $context = ( $document['fragment'] ?? false ) ? $document['context'] : null;
+ list( $capture, $capture_error ) = self::guard(
+ static function () use ( $document, $context ) {
+ return TreeCapture::capture( $document['html'], $context );
+ }
+ );
+
+ if ( null === $capture_error && null === $capture['error'] ) {
+ break;
+ }
+ }
+
+ $rows = null;
+ $tag_rows = null;
+ $quirks = false;
+
+ if ( null !== $capture_error ) {
+ $record( 'model-desync', array( 'phase' => 'capture', 'error' => self::describe_throwable( $capture_error ) ) );
+ } elseif ( null !== $capture['error'] ) {
+ if ( ! $is_wild ) {
+ $record( 'model-desync', array( 'phase' => 'capture', 'error' => $capture['error'] ) );
+ }
+ // Wild markup the processor cannot fully visit is skipped:
+ // parsing invariants still run, matching has no ground truth.
+ } else {
+ $rows = $capture['htmlRows'];
+ $tag_rows = $capture['tagRows'];
+ $quirks = $capture['quirks'];
+
+ if ( $is_fragment ) {
+ self::check_fragment_capture_against_model( $document, $capture, $record );
+ } elseif ( ! $is_wild ) {
+ self::check_capture_against_model( $document, $capture, $record );
+ }
+ }
+
+ $path_rows = null;
+ if ( null !== $rows ) {
+ $path_rows = array();
+ foreach ( $rows as $row ) {
+ if ( 0 !== strpos( $row['fid'], '(missing-fid:' ) ) {
+ $path_rows[] = $row;
+ }
+ }
+ }
+
+ $selector = SelectorGenerator::generate( $prng->fork( 'selector' ), $document['pools'], $path_rows );
+
+ $selector_string = $selector['selector'];
+
+ // --- Parse phase -------------------------------------------------
+
+ list( $compound_list, $compound_error ) = self::guard(
+ static function () use ( $selector_string ) {
+ return \WP_CSS_Compound_Selector_List::from_selectors( $selector_string );
+ }
+ );
+ list( $complex_list, $complex_error ) = self::guard(
+ static function () use ( $selector_string ) {
+ return \WP_CSS_Complex_Selector_List::from_selectors( $selector_string );
+ }
+ );
+
+ if ( null !== $compound_error ) {
+ $record( 'parse-error', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $compound_error ) ) );
+ }
+ if ( null !== $complex_error ) {
+ $record( 'parse-error', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $complex_error ) ) );
+ }
+
+ if ( null === $compound_error && null !== $selector['expectCompound'] && $selector['expectCompound'] !== ( null !== $compound_list ) ) {
+ $record(
+ 'parse-expectation',
+ array(
+ 'grammar' => 'compound',
+ 'expected' => $selector['expectCompound'] ? 'parse' : 'null',
+ 'actual' => null !== $compound_list ? 'parse' : 'null',
+ )
+ );
+ }
+ if ( null === $complex_error && null !== $selector['expectComplex'] && $selector['expectComplex'] !== ( null !== $complex_list ) ) {
+ $record(
+ 'parse-expectation',
+ array(
+ 'grammar' => 'complex',
+ 'expected' => $selector['expectComplex'] ? 'parse' : 'null',
+ 'actual' => null !== $complex_list ? 'parse' : 'null',
+ )
+ );
+ }
+
+ if ( null !== $compound_list && null === $complex_list && null === $complex_error ) {
+ $record( 'compound-implies-complex', array() );
+ }
+
+ // Parse determinism: a second parse must agree with the first.
+ list( $compound_again, ) = self::guard(
+ static function () use ( $selector_string ) {
+ return \WP_CSS_Compound_Selector_List::from_selectors( $selector_string );
+ }
+ );
+ list( $complex_again, ) = self::guard(
+ static function () use ( $selector_string ) {
+ return \WP_CSS_Complex_Selector_List::from_selectors( $selector_string );
+ }
+ );
+ if ( ( null === $compound_list ) !== ( null === $compound_again ) || ( null === $complex_list ) !== ( null === $complex_again ) ) {
+ $record( 'parse-determinism', array( 'note' => 'null-ness changed between identical parses' ) );
+ }
+
+ // --- AST extraction ----------------------------------------------
+
+ $compound_ast = null;
+ $complex_ast = null;
+
+ if ( null !== $compound_list ) {
+ list( $compound_ast, $shape_error ) = self::guard(
+ static function () use ( $compound_list ) {
+ return AstExtractor::from_compound_list( $compound_list );
+ }
+ );
+ if ( null !== $shape_error ) {
+ $record( 'ast-shape', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $shape_error ) ) );
+ }
+ }
+ if ( null !== $complex_list ) {
+ list( $complex_ast, $shape_error ) = self::guard(
+ static function () use ( $complex_list ) {
+ return AstExtractor::from_complex_list( $complex_list );
+ }
+ );
+ if ( null !== $shape_error ) {
+ $record( 'ast-shape', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $shape_error ) ) );
+ }
+ }
+
+ if ( null !== $compound_ast && null !== $complex_ast && $compound_ast !== $complex_ast ) {
+ $record(
+ 'ast-cross-grammar',
+ array(
+ 'compoundAst' => $compound_ast,
+ 'complexAst' => $complex_ast,
+ )
+ );
+ }
+
+ if ( null !== $selector['ast'] && null !== $complex_ast && $selector['ast'] !== $complex_ast ) {
+ $record(
+ 'ast-mismatch',
+ array(
+ 'generatedAst' => $selector['ast'],
+ 'parsedAst' => $complex_ast,
+ )
+ );
+ }
+
+ // --- Match phase ---------------------------------------------------
+
+ $html_matches = null;
+ // 'n/a' = the lexbor differential does not apply to this case
+ // ( unparseable selector, fragment, no captured tree ). Distinct from
+ // 'unavailable', which check_lexbor_differential reports only when the
+ // harness itself is missing or died — so a silently-dropped third
+ // oracle shows up in the per-batch tally instead of hiding in 'off'.
+ $lexbor_state = 'n/a';
+ if ( null !== $complex_ast && null !== $rows ) {
+ $expected = ReferenceMatcher::expected_html_matches_rows( $complex_ast, $rows, $quirks );
+
+ /*
+ * Path-directed selectors are guaranteed by construction to match
+ * ( or, for near-misses, not to match ) a specific element. The
+ * reference matcher disagreeing means the generator or the
+ * reference matcher itself is wrong — a fuzzer-side defect.
+ */
+ $must_match = $selector['mustMatchFid'] ?? null;
+ $must_not_match = $selector['mustNotMatchFid'] ?? null;
+ if ( null !== $must_match && ! in_array( $must_match, $expected, true ) ) {
+ $record(
+ 'path-expectation',
+ array(
+ 'expectation' => 'must-match',
+ 'fid' => $must_match,
+ 'expected' => $expected,
+ )
+ );
+ }
+ if ( null !== $must_not_match && in_array( $must_not_match, $expected, true ) ) {
+ $record(
+ 'path-expectation',
+ array(
+ 'expectation' => 'must-not-match',
+ 'fid' => $must_not_match,
+ 'expected' => $expected,
+ )
+ );
+ }
+
+ $html_matches = self::check_select_matches( 'html', $selector_string, $document, $expected, $record );
+ if ( null !== $html_matches ) {
+ self::note_match_assertion( $match_stats, 'html', $expected, $html_matches );
+ }
+
+ // lexbor parses full documents only; fragments skip it.
+ if ( ! ( $document['fragment'] ?? false ) ) {
+ $lexbor_state = self::check_lexbor_differential( $complex_ast, $selector_string, $document, $rows, $quirks, $expected, $record );
+ }
+ } elseif ( null === $complex_list && null === $complex_error ) {
+ self::check_select_rejection( 'html', $selector_string, $document, $record );
+ }
+
+ if ( null !== $compound_ast && null !== $tag_rows ) {
+ $expected = ReferenceMatcher::expected_tag_matches_rows( $compound_ast, $tag_rows );
+ $tag_matches = self::check_select_matches( 'tag', $selector_string, $document, $expected, $record );
+ if ( null !== $tag_matches ) {
+ self::note_match_assertion( $match_stats, 'tag', $expected, $tag_matches );
+ }
+ } elseif ( null === $compound_list && null === $compound_error ) {
+ self::check_select_rejection( 'tag', $selector_string, $document, $record );
+ }
+
+ // --- Metamorphic phase ----------------------------------------------
+ // Oracle-free relations: meaning-preserving transforms of the selector
+ // must select exactly the same elements. Run only on otherwise-clean
+ // cases so a single root cause does not multiply into noise.
+
+ if ( null !== $complex_ast && null !== $html_matches && array() === $failures ) {
+ self::check_metamorphic( $complex_ast, $html_matches, $document, $prng->fork( 'metamorph' ), $record );
+ }
+
+ $digest = sha1(
+ json_encode_safe(
+ array(
+ $selector_string,
+ $document['html'],
+ null !== $compound_list,
+ null !== $complex_list,
+ $compound_ast,
+ $complex_ast,
+ array_map(
+ static function ( $failure ) {
+ return $failure['invariant'];
+ },
+ $failures
+ ),
+ )
+ )
+ );
+
+ $signatures = array();
+ foreach ( $failures as $failure ) {
+ $signatures[] = self::signature( $failure );
+ }
+
+ return array(
+ 'seed' => $seed,
+ 'bucket' => $selector['bucket'],
+ 'digest' => $digest,
+ 'failures' => $failures,
+ 'signatures' => array_values( array_unique( $signatures ) ),
+ 'selector' => $selector_string,
+ 'html' => $document['html'],
+ 'lexbor' => $lexbor_state,
+ 'matchStats' => $match_stats,
+ );
+ }
+
+ /**
+ * Runs the SELF-CONTAINED invariants on an explicit ( selector, html )
+ * pair — no generated model, intended AST, or parse expectation. This is
+ * what the minimizer drives: every checked property is computable from
+ * the pair alone ( WP select() vs the reference matcher over WP's own
+ * parsed AST and the captured tree; metamorphic relations; the lexbor
+ * differential; parse/shape/cross-grammar invariants; rejection
+ * bookkeeping for unparseable selectors ).
+ *
+ * Bug 1 surfaces here as metamorphic-ast, Bug 2 as match-mismatch-*,
+ * Bug 3 as metamorphic-parse — so all three known bugs are minimizable
+ * without the generator.
+ *
+ * @return array{
+ * failures: array,
+ * signatures: string[],
+ * }
+ */
+ public static function run_pair( string $selector_string, string $html, ?string $target = null ): array {
+ Bootstrap::load();
+
+ $failures = array();
+ $record = static function ( string $invariant, array $detail ) use ( &$failures ) {
+ $failures[] = array(
+ 'invariant' => $invariant,
+ 'detail' => $detail,
+ );
+ };
+
+ // When the minimizer fixes a target signature, the metamorphic loop
+ // ( the only expensive, multi-draw stage ) is only worth running if
+ // the target is itself a metamorphic signature.
+ $target_invariant = null === $target ? null : substr( strrchr( $target, ':' ), 1 );
+ $target_is_metamorph = null !== $target_invariant && 0 === strpos( $target_invariant, 'metamorphic' );
+ $has_target_signature = static function () use ( &$failures, $target ) {
+ if ( null === $target ) {
+ return false;
+ }
+ foreach ( $failures as $failure ) {
+ if ( self::signature( $failure ) === $target ) {
+ return true;
+ }
+ }
+ return false;
+ };
+
+ list( $capture, $capture_error ) = self::guard(
+ static function () use ( $html ) {
+ return TreeCapture::capture( $html );
+ }
+ );
+
+ $rows = null;
+ $tag_rows = null;
+ $quirks = false;
+ if ( null === $capture_error && null === $capture['error'] ) {
+ $rows = $capture['htmlRows'];
+ $tag_rows = $capture['tagRows'];
+ $quirks = $capture['quirks'];
+ }
+
+ $document = array( 'html' => $html );
+
+ list( $compound_list, $compound_error ) = self::guard(
+ static function () use ( $selector_string ) {
+ return \WP_CSS_Compound_Selector_List::from_selectors( $selector_string );
+ }
+ );
+ list( $complex_list, $complex_error ) = self::guard(
+ static function () use ( $selector_string ) {
+ return \WP_CSS_Complex_Selector_List::from_selectors( $selector_string );
+ }
+ );
+
+ if ( null !== $compound_error ) {
+ $record( 'parse-error', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $compound_error ) ) );
+ }
+ if ( null !== $complex_error ) {
+ $record( 'parse-error', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $complex_error ) ) );
+ }
+ if ( null !== $compound_list && null === $complex_list && null === $complex_error ) {
+ $record( 'compound-implies-complex', array() );
+ }
+
+ $compound_ast = null;
+ $complex_ast = null;
+ if ( null !== $compound_list ) {
+ list( $compound_ast, $shape_error ) = self::guard(
+ static function () use ( $compound_list ) {
+ return AstExtractor::from_compound_list( $compound_list );
+ }
+ );
+ if ( null !== $shape_error ) {
+ $record( 'ast-shape', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $shape_error ) ) );
+ }
+ }
+ if ( null !== $complex_list ) {
+ list( $complex_ast, $shape_error ) = self::guard(
+ static function () use ( $complex_list ) {
+ return AstExtractor::from_complex_list( $complex_list );
+ }
+ );
+ if ( null !== $shape_error ) {
+ $record( 'ast-shape', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $shape_error ) ) );
+ }
+ }
+ if ( null !== $compound_ast && null !== $complex_ast && $compound_ast !== $complex_ast ) {
+ $record( 'ast-cross-grammar', array( 'compoundAst' => $compound_ast, 'complexAst' => $complex_ast ) );
+ }
+
+ $html_matches = null;
+ if ( null !== $complex_ast && null !== $rows ) {
+ $expected = ReferenceMatcher::expected_html_matches_rows( $complex_ast, $rows, $quirks );
+ $html_matches = self::check_select_matches( 'html', $selector_string, $document, $expected, $record );
+ self::check_lexbor_differential( $complex_ast, $selector_string, $document, $rows, $quirks, $expected, $record );
+ } elseif ( null === $complex_list && null === $complex_error && null !== $rows ) {
+ self::check_select_rejection( 'html', $selector_string, $document, $record );
+ }
+
+ if ( null !== $compound_ast && null !== $tag_rows ) {
+ $expected = ReferenceMatcher::expected_tag_matches_rows( $compound_ast, $tag_rows );
+ self::check_select_matches( 'tag', $selector_string, $document, $expected, $record );
+ } elseif ( null === $compound_list && null === $compound_error && null !== $tag_rows ) {
+ self::check_select_rejection( 'tag', $selector_string, $document, $record );
+ }
+
+ $run_metamorph = ( null === $target || $target_is_metamorph )
+ && null !== $complex_ast && null !== $html_matches && array() === $failures;
+ if ( $run_metamorph ) {
+ /*
+ * Metamorphic transforms randomize escapes / case / order, so a
+ * transform-sensitive bug ( e.g. Bug 1 and Bug 3 ) only fires for
+ * some PRNG draws. run_case sees one draw; here several fixed
+ * draws are tried so minimization can reliably preserve such a
+ * signature regardless of which draw first exposed it. With a
+ * target fixed, stop at the first draw that reproduces it.
+ */
+ for ( $i = 0; $i < self::PAIR_METAMORPH_DRAWS && array() === $failures; $i++ ) {
+ // A FIXED draw seed ( not derived from the pair ) keeps the
+ // test monotonic under shrinking: the same coin-flips apply to
+ // whatever AST survives, so a smaller selector that still has
+ // the bug reproduces the same transform signature.
+ $metamorph_prng = new Prng( 'css-selector-fuzz-minimize', "metamorph:{$i}" );
+ self::check_metamorphic( $complex_ast, $html_matches, $document, $metamorph_prng, $record );
+ if ( $has_target_signature() ) {
+ break;
+ }
+ }
+ }
+
+ $signatures = array();
+ foreach ( $failures as $failure ) {
+ $signatures[] = self::signature( $failure );
+ }
+
+ return array(
+ 'failures' => $failures,
+ 'signatures' => array_values( array_unique( $signatures ) ),
+ );
+ }
+
+	/**
+	 * Fragment counterpart of check_capture_against_model.
+	 *
+	 * Builds the model rows from `$document['children']` via
+	 * DocumentGenerator::rows_from_fragment() and requires them to equal the
+	 * fragment capture's `htmlRows` once both sides are reduced to
+	 * tag / fid / key-sorted attribute map / ancestor tags. Any difference is
+	 * reported as 'model-desync' with processor 'fragment'.
+	 *
+	 * NOTE(review): the fragment context is presumably `<body>` ( the model
+	 * rows are described as carrying the implicit HTML/BODY ancestors ) —
+	 * confirm against the caller.
+	 *
+	 * @param array    $document Case document; only `children` is read here.
+	 * @param array    $capture  Capture result; only `htmlRows` is read here.
+	 * @param callable $record   Failure sink, called as $record( $invariant, $detail ).
+	 */
+	private static function check_fragment_capture_against_model( array $document, array $capture, callable $record ): void {
+		$model_rows = DocumentGenerator::rows_from_fragment( $document['children'] );
+
+		// Reduce a row list to the comparable fields, with attributes as a
+		// name => value map sorted by name so attribute order is irrelevant.
+		$to_comparable = static function ( array $rows ): array {
+			$comparable = array();
+			foreach ( $rows as $row ) {
+				$attr_map = array();
+				foreach ( $row['attrs'] as $pair ) {
+					$attr_map[ $pair[0] ] = $pair[1];
+				}
+				ksort( $attr_map );
+
+				$comparable[] = array(
+					'tag'          => $row['tag'],
+					'fid'          => $row['fid'],
+					'attrs'        => $attr_map,
+					'ancestorTags' => $row['ancestorTags'],
+				);
+			}
+			return $comparable;
+		};
+
+		$expected = $to_comparable( $model_rows );
+		$actual   = $to_comparable( $capture['htmlRows'] );
+		if ( $expected === $actual ) {
+			return;
+		}
+
+		$record(
+			'model-desync',
+			array(
+				'processor' => 'fragment',
+				'expected'  => $expected,
+				'actual'    => $actual,
+			)
+		);
+	}
+
+	/**
+	 * Checks that the captured view of a model-built document agrees with
+	 * the generated model.
+	 *
+	 * This guards the oracle itself against renderer/model drift, and is what
+	 * justifies trusting the capture on wild documents. Three comparisons are
+	 * made, each reported as 'model-desync' with a distinguishing 'processor':
+	 *
+	 *  - 'html':   model rows vs. `htmlRows`, including ancestor tags.
+	 *  - 'tag':    model rows vs. `tagRows`, without ancestor tags.
+	 *  - 'quirks': `$document['quirks']` vs. `$capture['quirks']`.
+	 *
+	 * @param array    $document Case document; `model` and `quirks` are read.
+	 * @param array    $capture  Capture result; `htmlRows`, `tagRows` and `quirks` are read.
+	 * @param callable $record   Failure sink, called as $record( $invariant, $detail ).
+	 */
+	private static function check_capture_against_model( array $document, array $capture, callable $record ): void {
+		$model_rows = DocumentGenerator::rows_from_model( $document['model'] );
+
+		// Reduce rows to comparable fields; attributes become a map sorted by
+		// name, and ancestry is only kept when requested.
+		$to_comparable = static function ( array $rows, bool $with_ancestors ): array {
+			$comparable = array();
+			foreach ( $rows as $row ) {
+				$attr_map = array();
+				foreach ( $row['attrs'] as $pair ) {
+					$attr_map[ $pair[0] ] = $pair[1];
+				}
+				ksort( $attr_map );
+
+				$entry = array(
+					'tag'   => $row['tag'],
+					'fid'   => $row['fid'],
+					'attrs' => $attr_map,
+				);
+				if ( $with_ancestors ) {
+					$entry['ancestorTags'] = $row['ancestorTags'];
+				}
+				$comparable[] = $entry;
+			}
+			return $comparable;
+		};
+
+		// ( processor label, capture key, compare ancestry? ) — html first, then tag.
+		$comparisons = array(
+			array( 'html', 'htmlRows', true ),
+			array( 'tag', 'tagRows', false ),
+		);
+		foreach ( $comparisons as list( $processor, $capture_key, $with_ancestors ) ) {
+			$expected = $to_comparable( $model_rows, $with_ancestors );
+			$actual   = $to_comparable( $capture[ $capture_key ], $with_ancestors );
+			if ( $expected !== $actual ) {
+				$record(
+					'model-desync',
+					array(
+						'processor' => $processor,
+						'expected'  => $expected,
+						'actual'    => $actual,
+					)
+				);
+			}
+		}
+
+		if ( $document['quirks'] === $capture['quirks'] ) {
+			return;
+		}
+
+		$record(
+			'model-desync',
+			array(
+				'processor' => 'quirks',
+				'expected'  => $document['quirks'],
+				'actual'    => $capture['quirks'],
+			)
+		);
+	}
+
+	/**
+	 * Runs a select() loop over the document, collecting matched data-fids.
+	 *
+	 * The processor is chosen from `$target` and the document: a
+	 * WP_HTML_Tag_Processor for 'tag'; otherwise a WP_HTML_Processor created
+	 * as a fragment parser when the document requests fragment mode, or as a
+	 * full parser when it does not. Everything runs inside self::guard(), so
+	 * any Throwable ( including the exceptions thrown below ) is returned as
+	 * the second tuple element instead of propagating.
+	 *
+	 * @param string $target          'html' or 'tag'.
+	 * @param string $selector_string Selector passed verbatim to select().
+	 * @param array  $document        The case document ( may request fragment mode ).
+	 * @return array{0: string[]|null, 1: \Throwable|null}
+	 */
+	private static function collect_matches( string $target, string $selector_string, array $document ): array {
+		$html    = $document['html'];
+		$context = ( $document['fragment'] ?? false ) ? $document['context'] : null;
+		return self::guard(
+			static function () use ( $target, $selector_string, $html, $context ) {
+				if ( 'tag' === $target ) {
+					$processor = new \WP_HTML_Tag_Processor( $html );
+				} elseif ( null !== $context ) {
+					$processor = \WP_HTML_Processor::create_fragment( $html, $context );
+				} else {
+					$processor = \WP_HTML_Processor::create_full_parser( $html );
+				}
+
+				// The WP_HTML_Processor factories can return null; fail with
+				// an explicit message rather than a "member function on
+				// null" Error from the select() call below.
+				if ( null === $processor ) {
+					throw new \RuntimeException(
+						null !== $context
+							? 'Could not create the fragment HTML processor.'
+							: 'Could not create the full HTML processor.'
+					);
+				}
+
+				$matches    = array();
+				$iterations = 0;
+				while ( $processor->select( $selector_string ) ) {
+					$fid = $processor->get_attribute( 'data-fid' );
+					// Sanitize identically to TreeCapture/lexbor so a fid with
+					// a control char can never produce a false divergence on
+					// the match path ( unreachable today: fids are integers ).
+					$matches[] = is_string( $fid ) ? TreeCapture::sanitize_fid( $fid ) : '(missing-fid:' . $processor->get_tag() . ')';
+					// Guard against a select() that never stops matching.
+					if ( ++$iterations > self::SELECT_ITERATION_LIMIT ) {
+						throw new \RuntimeException( 'select() did not terminate within the iteration limit.' );
+					}
+				}
+
+				// An HTML processor that stopped because of an error or an
+				// unsupported construct yields an untrustworthy match set.
+				if ( $processor instanceof \WP_HTML_Processor ) {
+					$last_error = $processor->get_last_error();
+					if ( null !== $last_error ) {
+						throw new \RuntimeException( 'Processor error state: ' . $last_error );
+					}
+					$unsupported = $processor->get_unsupported_exception();
+					if ( null !== $unsupported ) {
+						throw new \RuntimeException( 'Processor unsupported state: ' . $unsupported->getMessage() );
+					}
+				}
+
+				return $matches;
+			}
+		);
+	}
+
+	/**
+	 * Flushes the select() parse caches.
+	 *
+	 * Both select() implementations memoize the most recently parsed selector
+	 * string in a function-static cache, so whether a select() call re-parses
+	 * — and therefore whether parse-time notices ( the invalid-UTF-8 scrub
+	 * notice from from_selectors() ) fire — depends on what the worker parsed
+	 * before. Selecting a sentinel selector on each processor type first
+	 * makes the next select() for the case selector deterministic: it always
+	 * re-parses, so exactly one parse happens inside each notice-assertion
+	 * window regardless of worker history or case re-runs.
+	 */
+	private static function flush_select_parse_caches(): void {
+		$sentinel = '#-fuzz-cache-flush-';
+
+		$tag_processor = new \WP_HTML_Tag_Processor( '' );
+		$tag_processor->select( $sentinel );
+
+		$html_processor = \WP_HTML_Processor::create_full_parser( '' );
+		$html_processor->select( $sentinel );
+	}
+
+	/**
+	 * Names the _doing_it_wrong() source under which from_selectors() reports
+	 * that an invalid-UTF-8 selector string was scrubbed to U+FFFD before
+	 * parsing.
+	 *
+	 * @param string $target 'html' or 'tag'.
+	 * @return string The compound-list name for 'tag', the complex-list name otherwise.
+	 */
+	private static function scrub_notice_name( string $target ): string {
+		if ( 'tag' === $target ) {
+			$list_class = 'WP_CSS_Compound_Selector_List';
+		} else {
+			$list_class = 'WP_CSS_Complex_Selector_List';
+		}
+		return $list_class . '::from_selectors';
+	}
+
+	/**
+	 * Runs a select() loop on a parseable selector and compares the match set
+	 * against the reference matcher.
+	 *
+	 * Records, via `$record`:
+	 *  - 'match-error' when the select() loop threw ( and returns null );
+	 *  - 'doing-it-wrong-unexpected' when the recorded notices differ from
+	 *    the expected ones;
+	 *  - 'match-mismatch-{target}' when the match set differs from `$expected`.
+	 *
+	 * @param string   $target          'html' or 'tag'.
+	 * @param string   $selector_string Selector under test.
+	 * @param array    $document        The case document.
+	 * @param array    $expected        Match set predicted by the reference matcher.
+	 * @param callable $record          Failure sink, called as $record( $invariant, $detail ).
+	 * @return string[]|null The actual match set, or null when matching failed.
+	 */
+	private static function check_select_matches( string $target, string $selector_string, array $document, array $expected, callable $record ): ?array {
+		// Start from a known state: cold parse caches, empty notice log.
+		self::flush_select_parse_caches();
+		Bootstrap::reset_doing_it_wrong();
+
+		$outcome = self::collect_matches( $target, $selector_string, $document );
+		$actual  = $outcome[0];
+		$failure = $outcome[1];
+
+		if ( null !== $failure ) {
+			$record(
+				'match-error',
+				array(
+					'target' => $target,
+					'error'  => self::describe_throwable( $failure ),
+				)
+			);
+			return null;
+		}
+
+		/*
+		 * A selector string containing invalid UTF-8 is scrubbed to U+FFFD by
+		 * from_selectors(), which reports the replacement with exactly one
+		 * notice on the (single, cache-flushed) parse. Anything else is
+		 * unexpected for a selector that parses.
+		 */
+		$expected_calls = array();
+		if ( ! \wp_is_valid_utf8( $selector_string ) ) {
+			$expected_calls[] = array(
+				'function' => self::scrub_notice_name( $target ),
+			);
+		}
+
+		$recorded_calls = Bootstrap::doing_it_wrong_calls();
+		if ( ! self::notices_match( $expected_calls, $recorded_calls ) ) {
+			$record(
+				'doing-it-wrong-unexpected',
+				array(
+					'target'        => $target,
+					'expectedCalls' => $expected_calls,
+					'calls'         => $recorded_calls,
+				)
+			);
+		}
+
+		if ( $expected !== $actual ) {
+			$record(
+				'match-mismatch-' . $target,
+				array(
+					'expected' => $expected,
+					'actual'   => $actual,
+				)
+			);
+		}
+
+		return $actual;
+	}
+
+	/**
+	 * Tallies one match assertion for `$target` into `$match_stats`.
+	 *
+	 * Every call counts as an assertion; it also counts as non-vacuous unless
+	 * both the expected and the actual match sets are empty.
+	 *
+	 * @param array  $match_stats Per-target counters, updated in place.
+	 * @param string $target      Counter bucket to update.
+	 * @param array  $expected    Expected match set.
+	 * @param array  $actual      Actual match set.
+	 */
+	private static function note_match_assertion( array &$match_stats, string $target, array $expected, array $actual ): void {
+		$match_stats[ $target ] = $match_stats[ $target ] ?? array(
+			'assertions' => 0,
+			'nonVacuous' => 0,
+		);
+
+		++$match_stats[ $target ]['assertions'];
+
+		$is_vacuous = array() === $expected && array() === $actual;
+		if ( ! $is_vacuous ) {
+			++$match_stats[ $target ]['nonVacuous'];
+		}
+	}
+
+	/**
+	 * Adds derived figures to accumulated match statistics.
+	 *
+	 * For every bucket/target entry this adds 'vacuous' ( assertions minus
+	 * non-vacuous, floored at zero ) plus 'nonVacuousRate' and 'vacuousRate',
+	 * each rounded to four decimals and 0.0 when there were no assertions.
+	 *
+	 * @param array $match_stats Counters keyed by bucket, then by target.
+	 * @return array The same structure with the derived keys added.
+	 */
+	private static function finalize_match_stats( array $match_stats ): array {
+		// Share of $part in $whole, defined as 0.0 for an empty whole.
+		$rate = static function ( int $part, int $whole ): float {
+			return $whole > 0 ? round( $part / $whole, 4 ) : 0.0;
+		};
+
+		foreach ( $match_stats as $bucket => $targets ) {
+			foreach ( $targets as $target => $counts ) {
+				$total      = (int) ( $counts['assertions'] ?? 0 );
+				$meaningful = (int) ( $counts['nonVacuous'] ?? 0 );
+				$empty      = max( 0, $total - $meaningful );
+
+				$match_stats[ $bucket ][ $target ]['vacuous']        = $empty;
+				$match_stats[ $bucket ][ $target ]['nonVacuousRate'] = $rate( $meaningful, $total );
+				$match_stats[ $bucket ][ $target ]['vacuousRate']    = $rate( $empty, $total );
+			}
+		}
+
+		return $match_stats;
+	}
+
+	/**
+	 * Runs the lexbor differential — the THIRD, independent matching opinion.
+	 *
+	 * Quirks-mode documents are excluded unless the startup probe confirms
+	 * lexbor has reliable class/#id case folding in both no-quirks and quirks
+	 * mode. The comparison only runs when lexbor built the same element tree
+	 * as WP ( fid/tag/ancestry multiset ), so it tests the selector layer,
+	 * not tree construction.
+	 *
+	 * Verdict triage:
+	 *   - 'lexbor-divergence'   lexbor != reference: a fuzzer-oracle problem
+	 *                           ( or an un-compensated lexbor bug ) — never a
+	 *                           WP verdict on its own.
+	 *   - 'lexbor-parse-reject' lexbor refused a selector WP accepted.
+	 *   - match-mismatch-html with NO lexbor-divergence on the same case
+	 *                           means reference == lexbor != WP: a
+	 *                           high-confidence WP finding.
+	 *
+	 * `$selector_string` and `$expected` are accepted but not read by this
+	 * method: lexbor is given a canonical re-render of the AST and is compared
+	 * against a reference run computed here with lexbor-specific settings.
+	 *
+	 * @param array    $complex_ast     Canonical AST of the selector.
+	 * @param string   $selector_string Original selector bytes ( unused here ).
+	 * @param array    $document        The case document; `html` is sent to lexbor.
+	 * @param array    $rows            WP's captured element rows.
+	 * @param bool     $quirks          Whether the document is in quirks mode.
+	 * @param array    $expected        Strict reference match set ( unused here ).
+	 * @param callable $record          Failure sink, called as $record( $invariant, $detail ).
+	 * @return string Tally state:
+	 *                unavailable|skipped-quirks|skipped-utf8|error|tree-gated|compared.
+	 */
+	private static function check_lexbor_differential( array $complex_ast, string $selector_string, array $document, array $rows, bool $quirks, array $expected, callable $record ): string {
+		if ( ! LexborOracle::available() ) {
+			return 'unavailable';
+		}
+
+		if ( $quirks && ! LexborOracle::quirks_class_id_reliable() ) {
+			return 'skipped-quirks';
+		}
+
+		/*
+		 * lexbor receives a canonical re-render of the (already verified)
+		 * AST rather than the original byte form: the differential targets
+		 * matching semantics, while byte-level parsing (escapes, whitespace,
+		 * modifier case — lexbor e.g. rejects uppercase I/S modifiers) is
+		 * covered by the AST round-trip and metamorphic invariants. ASTs
+		 * containing invalid UTF-8 cannot be re-rendered; since
+		 * from_selectors() scrubs input to U+FFFD before parsing, none should
+		 * exist and this skip is defensive ( a nonzero skipped-utf8 tally
+		 * indicates a normalization bypass ).
+		 */
+		if ( ! ast_strings_are_utf8( $complex_ast ) ) {
+			return 'skipped-utf8';
+		}
+
+		$canonical = SelectorGenerator::render_canonical( $complex_ast );
+		$response  = LexborOracle::query( $document['html'], $canonical );
+		if ( null === $response ) {
+			return 'error';
+		}
+
+		$lexbor_error = $response['error'];
+		if ( 'parse' === $lexbor_error ) {
+			$record(
+				'lexbor-parse-reject',
+				array(
+					'note'      => 'lexbor rejected the canonical form of a selector the WP parser accepted',
+					'canonical' => printable_bytes( $canonical ),
+				)
+			);
+			return 'compared';
+		}
+		if ( null !== $lexbor_error ) {
+			return 'error';
+		}
+
+		// Only compare match sets when both parsers built the same tree.
+		if ( ! self::trees_agree( $rows, $response['rows'] ) ) {
+			return 'tree-gated';
+		}
+
+		/*
+		 * Two known lexbor deviations are compensated for so the rest of the
+		 * semantics still get differential coverage; WP itself is still held
+		 * to the strict expectation:
+		 *
+		 *  - lexbor #368: class/#id match ASCII case-insensitively even in
+		 *    no-quirks documents. Compare lexbor against the reference run
+		 *    with quirks-style class/ID folding.
+		 *  - lexbor does not implement HTML's case-insensitive attribute
+		 *    value list ( [rel=NOFOLLOW] does not match rel="nofollow" ),
+		 *    where browsers and WP do. Compare lexbor against the reference
+		 *    run with that list disabled.
+		 */
+		$fold_class_and_id = LexborOracle::has_issue_368() || $quirks;
+		$reference         = ReferenceMatcher::expected_html_matches_rows(
+			$complex_ast,
+			$rows,
+			$fold_class_and_id,
+			false
+		);
+
+		// lexbor reports in document order, WP/reference in visit order —
+		// compare as multisets.
+		$lexbor_matches = $response['matches'];
+		sort( $lexbor_matches );
+		sort( $reference );
+
+		if ( $lexbor_matches === $reference ) {
+			return 'compared';
+		}
+
+		$record(
+			'lexbor-divergence',
+			array(
+				'reference' => $reference,
+				'lexbor'    => $lexbor_matches,
+				'issue368'  => LexborOracle::has_issue_368(),
+			)
+		);
+
+		return 'compared';
+	}
+
+	/**
+	 * Multiset equality of ( tag, fid, ancestry ) between WP and lexbor rows.
+	 *
+	 * Each row is flattened to "tag|fid|ancestor,ancestor,…"; the two lists
+	 * of flattened rows are sorted and compared, so row order is ignored but
+	 * duplicates still count.
+	 *
+	 * @param array $wp_rows     Rows captured from the WP processor.
+	 * @param array $lexbor_rows Rows reported by lexbor.
+	 */
+	private static function trees_agree( array $wp_rows, array $lexbor_rows ): bool {
+		$fingerprints = static function ( array $rows ): array {
+			$lines = array_map(
+				static function ( $row ) {
+					return $row['tag'] . '|' . $row['fid'] . '|' . implode( ',', $row['ancestorTags'] );
+				},
+				$rows
+			);
+			sort( $lines );
+			return $lines;
+		};
+
+		$wp_side     = $fingerprints( $wp_rows );
+		$lexbor_side = $fingerprints( $lexbor_rows );
+
+		return $wp_side === $lexbor_side;
+	}
+
+	/**
+	 * Checks the metamorphic relations: each meaning-preserving transform of
+	 * the parsed selector must parse, must (for AST-preserving transforms)
+	 * parse to exactly the transformed AST, and must select exactly the same
+	 * elements the original selector selected.
+	 *
+	 * Per variant, the first violated relation is recorded and the remaining
+	 * checks for that variant are skipped:
+	 *  - 'metamorphic-error'    parsing, AST extraction or matching threw;
+	 *  - 'metamorphic-parse'    the variant parsed to null;
+	 *  - 'metamorphic-ast'      the parsed AST differs from the variant's AST;
+	 *  - 'metamorphic-mismatch' the match set differs from `$html_matches`.
+	 *
+	 * @param array    $complex_ast  Canonical AST of the original selector.
+	 * @param string[] $html_matches The original's WP_HTML_Processor match set.
+	 * @param array    $document     The case document.
+	 * @param Prng     $prng         Randomness source handed to Metamorph::variants().
+	 * @param callable $record       Failure sink, called as $record( $invariant, $detail ).
+	 */
+	private static function check_metamorphic( array $complex_ast, array $html_matches, array $document, Prng $prng, callable $record ): void {
+		foreach ( Metamorph::variants( $complex_ast, $prng ) as $variant ) {
+			$transform = $variant['name'];
+			$candidate = $variant['selector'];
+
+			// Every record for this variant starts with the same two keys.
+			$report = static function ( string $invariant, array $extra = array() ) use ( $record, $transform, $candidate ): void {
+				$record(
+					$invariant,
+					array(
+						'transform' => $transform,
+						'selector'  => printable_bytes( $candidate ),
+					) + $extra
+				);
+			};
+
+			list( $parsed, $parse_failure ) = self::guard(
+				static function () use ( $candidate ) {
+					return \WP_CSS_Complex_Selector_List::from_selectors( $candidate );
+				}
+			);
+			if ( null !== $parse_failure ) {
+				$report( 'metamorphic-error', array( 'error' => self::describe_throwable( $parse_failure ) ) );
+				continue;
+			}
+			if ( null === $parsed ) {
+				$report( 'metamorphic-parse' );
+				continue;
+			}
+
+			if ( $variant['astMustMatch'] ) {
+				list( $parsed_ast, $shape_failure ) = self::guard(
+					static function () use ( $parsed ) {
+						return AstExtractor::from_complex_list( $parsed );
+					}
+				);
+				if ( null !== $shape_failure ) {
+					$report( 'metamorphic-error', array( 'error' => self::describe_throwable( $shape_failure ) ) );
+					continue;
+				}
+				if ( $parsed_ast !== $variant['ast'] ) {
+					$report(
+						'metamorphic-ast',
+						array(
+							'expectedAst' => $variant['ast'],
+							'parsedAst'   => $parsed_ast,
+						)
+					);
+					continue;
+				}
+			}
+
+			Bootstrap::reset_doing_it_wrong();
+			list( $variant_matches, $match_failure ) = self::collect_matches( 'html', $candidate, $document );
+
+			if ( null !== $match_failure ) {
+				$report( 'metamorphic-error', array( 'error' => self::describe_throwable( $match_failure ) ) );
+				continue;
+			}
+
+			if ( $variant_matches !== $html_matches ) {
+				$report(
+					'metamorphic-mismatch',
+					array(
+						'expected' => $html_matches,
+						'actual'   => $variant_matches,
+					)
+				);
+			}
+		}
+	}
+
+	/**
+	 * For unparseable selectors: select() must return false, leave the
+	 * processor usable, and report misuse exactly once per call.
+	 *
+	 * select() is called twice on one processor. Records 'match-error' when
+	 * either call threw, 'select-on-null' when the results are not
+	 * ( false, false ), and 'doing-it-wrong-missing' when the recorded
+	 * notices differ from the expected sequence.
+	 *
+	 * @param string   $target          'html' or 'tag'.
+	 * @param string   $selector_string Selector that the grammar rejected.
+	 * @param array    $document        The case document ( may request fragment mode ).
+	 * @param callable $record          Failure sink, called as $record( $invariant, $detail ).
+	 */
+	private static function check_select_rejection( string $target, string $selector_string, array $document, callable $record ): void {
+		// Start from a known state: cold parse caches, empty notice log.
+		self::flush_select_parse_caches();
+		Bootstrap::reset_doing_it_wrong();
+
+		$context = ( $document['fragment'] ?? false ) ? $document['context'] : null;
+
+		$attempt = self::guard(
+			static function () use ( $target, $selector_string, $document, $context ) {
+				if ( 'tag' === $target ) {
+					$processor = new \WP_HTML_Tag_Processor( $document['html'] );
+				} elseif ( null !== $context ) {
+					$processor = \WP_HTML_Processor::create_fragment( $document['html'], $context );
+				} else {
+					$processor = \WP_HTML_Processor::create_full_parser( $document['html'] );
+				}
+
+				// Two calls: the second exercises the parse cache.
+				$first  = $processor->select( $selector_string );
+				$second = $processor->select( $selector_string );
+				return array( $first, $second );
+			}
+		);
+		$results = $attempt[0];
+		$failure = $attempt[1];
+
+		if ( null !== $failure ) {
+			$record(
+				'match-error',
+				array(
+					'target'   => $target,
+					'rejected' => true,
+					'error'    => self::describe_throwable( $failure ),
+				)
+			);
+			return;
+		}
+
+		if ( array( false, false ) !== $results ) {
+			$record(
+				'select-on-null',
+				array(
+					'target'  => $target,
+					'results' => $results,
+				)
+			);
+		}
+
+		/*
+		 * Two select() calls report the unparseable selector once each; the
+		 * parse cache only skips re-parsing, never the per-call notice. An
+		 * invalid-UTF-8 selector additionally reports the U+FFFD scrub once,
+		 * on the first call ( the only one that parses after the flush ), so
+		 * that notice comes first in the expected sequence.
+		 */
+		$processor_class = 'tag' === $target ? 'WP_HTML_Tag_Processor' : 'WP_HTML_Processor';
+		$select_notice   = array( 'function' => $processor_class . '::select' );
+
+		$expected_calls = array();
+		if ( ! \wp_is_valid_utf8( $selector_string ) ) {
+			$expected_calls[] = array( 'function' => self::scrub_notice_name( $target ) );
+		}
+		$expected_calls[] = $select_notice;
+		$expected_calls[] = $select_notice;
+
+		$recorded_calls = Bootstrap::doing_it_wrong_calls();
+		if ( self::notices_match( $expected_calls, $recorded_calls ) ) {
+			return;
+		}
+
+		$record(
+			'doing-it-wrong-missing',
+			array(
+				'target'        => $target,
+				'expectedCalls' => $expected_calls,
+				'calls'         => $recorded_calls,
+			)
+		);
+	}
+
+	/**
+	 * Compares recorded _doing_it_wrong() calls against expectations: same
+	 * count, in order, matching on every key the expectation specifies
+	 * ( recorded calls also carry 'message', which expectations omit ).
+	 *
+	 * A key that the recorded call lacks is treated as null, so it only
+	 * matches an expectation whose value for that key is null.
+	 *
+	 * @param array[] $expected_calls Expected calls, each a subset of record keys.
+	 * @param array[] $actual_calls   Recorded calls.
+	 * @return bool True when every expectation is met and the counts agree.
+	 */
+	private static function notices_match( array $expected_calls, array $actual_calls ): bool {
+		$same_count = count( $expected_calls ) === count( $actual_calls );
+		if ( ! $same_count ) {
+			return false;
+		}
+
+		foreach ( $expected_calls as $index => $wanted ) {
+			foreach ( $wanted as $field => $wanted_value ) {
+				$seen_value = $actual_calls[ $index ][ $field ] ?? null;
+				if ( $wanted_value !== $seen_value ) {
+					return false;
+				}
+			}
+		}
+
+		return true;
+	}
+
+ /*
+ * -------------
+ * Batch running
+ * -------------
+ */
+
+	/**
+	 * Runs a batch of sequential seeds.
+	 *
+	 * Options read ( with defaults ): 'start-seed' (1), 'count' (100),
+	 * 'failures-out' (none: failures go to STDERR as JSON lines),
+	 * 'progress-file' (none), 'determinism-every' (16; seeds divisible by it
+	 * are run twice and their digests compared, 0 or less disables this) and
+	 * 'max-failures' (200; 0 or less disables the cap).
+	 *
+	 * @param array $options Parsed command-line options.
+	 * @return array Summary.
+	 */
+	public static function run_batch( array $options ): array {
+		Bootstrap::load();
+
+		$first_seed     = option_int( $options, 'start-seed', 1 );
+		$seed_count     = option_int( $options, 'count', 100 );
+		$failure_log    = option_string( $options, 'failures-out', null );
+		$progress_path  = option_string( $options, 'progress-file', null );
+		$recheck_period = option_int( $options, 'determinism-every', 16 );
+		$failure_cap    = option_int( $options, 'max-failures', 200 );
+
+		$clock_start     = microtime( true );
+		$failure_total   = 0;
+		$bucket_tally    = array();
+		$signature_tally = array();
+		$lexbor_tally    = array();
+		$match_stats     = array();
+		$last_seed       = null;
+		$stop_reason     = 'completed';
+		$end_seed        = $first_seed + $seed_count;
+
+		for ( $seed = $first_seed; $seed < $end_seed; ++$seed ) {
+			// The cap is checked before each seed, so a seed always finishes
+			// reporting all of its failures.
+			if ( $failure_cap > 0 && $failure_total >= $failure_cap ) {
+				$stop_reason = 'max-failures';
+				break;
+			}
+
+			// Written before the case runs, so the file names the seed in
+			// flight.
+			if ( null !== $progress_path ) {
+				file_put_contents( $progress_path, (string) $seed );
+			}
+
+			$result = self::run_case( $seed );
+
+			$is_recheck_seed = $recheck_period > 0 && 0 === $seed % $recheck_period;
+			if ( $is_recheck_seed ) {
+				$repeat = self::run_case( $seed );
+				if ( $repeat['digest'] !== $result['digest'] ) {
+					$result['failures'][] = array(
+						'invariant' => 'case-determinism',
+						'detail'    => array(
+							'firstDigest'  => $result['digest'],
+							'secondDigest' => $repeat['digest'],
+						),
+					);
+				}
+			}
+
+			$bucket       = $result['bucket'];
+			$lexbor_state = $result['lexbor'];
+
+			$bucket_tally[ $bucket ]       = ( $bucket_tally[ $bucket ] ?? 0 ) + 1;
+			$lexbor_tally[ $lexbor_state ] = ( $lexbor_tally[ $lexbor_state ] ?? 0 ) + 1;
+			$last_seed                     = $seed;
+
+			foreach ( $result['matchStats'] as $target => $stats ) {
+				if ( ! isset( $match_stats[ $bucket ][ $target ] ) ) {
+					$match_stats[ $bucket ][ $target ] = array(
+						'assertions' => 0,
+						'nonVacuous' => 0,
+					);
+				}
+				$match_stats[ $bucket ][ $target ]['assertions'] += $stats['assertions'];
+				$match_stats[ $bucket ][ $target ]['nonVacuous'] += $stats['nonVacuous'];
+			}
+
+			foreach ( $result['failures'] as $failure ) {
+				++$failure_total;
+
+				$signature                     = self::signature( $failure );
+				$signature_tally[ $signature ] = ( $signature_tally[ $signature ] ?? 0 ) + 1;
+
+				$entry = array(
+					'kind'           => 'css-selector-fuzz-failure',
+					'seed'           => $result['seed'],
+					'bucket'         => $bucket,
+					'invariant'      => $failure['invariant'],
+					'signature'      => $signature,
+					'selector'       => printable_bytes( $result['selector'] ),
+					'selectorBase64' => base64_encode( $result['selector'] ),
+					'htmlBase64'     => base64_encode( $result['html'] ),
+					'detail'         => $failure['detail'],
+				);
+
+				if ( null === $failure_log ) {
+					fwrite( STDERR, json_encode_safe( $entry ) . "\n" );
+				} else {
+					append_ndjson( $failure_log, $entry );
+				}
+			}
+		}
+
+		$elapsed_ms = (int) round( 1000 * ( microtime( true ) - $clock_start ) );
+
+		return array(
+			'kind'       => 'css-selector-fuzz-batch-summary',
+			'startSeed'  => $first_seed,
+			'count'      => $seed_count,
+			'lastSeed'   => $last_seed,
+			'failures'   => $failure_total,
+			'buckets'    => $bucket_tally,
+			'signatures' => $signature_tally,
+			'lexbor'     => $lexbor_tally,
+			'matchStats' => self::finalize_match_stats( $match_stats ),
+			'stopReason' => $stop_reason,
+			'durationMs' => $elapsed_ms,
+		);
+	}
+
+	/**
+	 * Stable identity for de-duplicating equivalent failures.
+	 *
+	 * The identity hashes the invariant together with whichever of the
+	 * detail's 'grammar', 'target' and 'transform' are set and, when the
+	 * detail carries an error class, that class plus the error message with
+	 * every digit run replaced by "N". The result is the first 12 hex digits
+	 * of the SHA-1, a colon, and the invariant name.
+	 *
+	 * @param array $failure Failure record with 'invariant' and optional 'detail'.
+	 */
+	private static function signature( array $failure ): string {
+		$invariant = $failure['invariant'];
+		$parts     = array( $invariant );
+
+		foreach ( array( 'grammar', 'target', 'transform' ) as $key ) {
+			if ( isset( $failure['detail'][ $key ] ) ) {
+				$parts[] = $failure['detail'][ $key ];
+			}
+		}
+
+		if ( isset( $failure['detail']['error']['class'] ) ) {
+			$error   = $failure['detail']['error'];
+			$parts[] = $error['class'];
+			// Numbers ( offsets, lengths, line numbers ) vary between cases
+			// that are otherwise the same failure.
+			$parts[] = preg_replace( '/[0-9]+/', 'N', (string) ( $error['message'] ?? '' ) );
+		}
+
+		$digest = sha1( implode( '|', $parts ) );
+
+		return substr( $digest, 0, 12 ) . ':' . $invariant;
+	}
+
+ /*
+ * -------
+ * Helpers
+ * -------
+ */
+
+	/**
+	 * Calls $fn with PHP warnings/notices converted to exceptions.
+	 *
+	 * While $fn runs, an error handler is installed that ignores
+	 * E_DEPRECATED / E_USER_DEPRECATED and throws an \ErrorException for any
+	 * other error it receives. Anything thrown — by the handler or by $fn —
+	 * is caught and returned rather than propagated, and the previous error
+	 * handler is always restored.
+	 *
+	 * @param callable $fn Zero-argument callable to run.
+	 * @return array{0: mixed, 1: \Throwable|null} ( result, null ) on success,
+	 *                                             ( null, throwable ) on failure.
+	 */
+	private static function guard( callable $fn ): array {
+		$raise = static function ( $severity, $message, $file, $line ) {
+			$is_deprecation = E_DEPRECATED === $severity || E_USER_DEPRECATED === $severity;
+			if ( $is_deprecation ) {
+				return true;
+			}
+			throw new \ErrorException( $message, 0, $severity, $file, $line );
+		};
+
+		$result = null;
+		$thrown = null;
+
+		set_error_handler( $raise );
+		try {
+			$result = $fn();
+		} catch ( \Throwable $caught ) {
+			$thrown = $caught;
+		} finally {
+			restore_error_handler();
+		}
+
+		return array( $result, $thrown );
+	}
+
+	/**
+	 * Summarizes a throwable as a plain array for failure records.
+	 *
+	 * File paths have the repo_root() prefix ( plus directory separator )
+	 * removed. The trace is limited to its first six frames, each rendered as
+	 * "file:line callable", with "[internal]" standing in for frames that
+	 * have no file.
+	 *
+	 * @param \Throwable $e Throwable to describe.
+	 * @return array Keys: 'class', 'message', 'at', 'trace'.
+	 */
+	public static function describe_throwable( \Throwable $e ): array {
+		$root = repo_root() . DIRECTORY_SEPARATOR;
+
+		$frames = array();
+		foreach ( array_slice( $e->getTrace(), 0, 6 ) as $frame ) {
+			if ( isset( $frame['file'] ) ) {
+				$where = str_replace( $root, '', $frame['file'] ) . ':' . ( $frame['line'] ?? '?' );
+			} else {
+				$where = '[internal]';
+			}
+			$what     = ( $frame['class'] ?? '' ) . ( $frame['type'] ?? '' ) . ( $frame['function'] ?? '' );
+			$frames[] = $where . ' ' . $what;
+		}
+
+		return array(
+			'class'   => get_class( $e ),
+			'message' => $e->getMessage(),
+			'at'      => str_replace( $root, '', $e->getFile() ) . ':' . $e->getLine(),
+			'trace'   => $frames,
+		);
+	}
+}
diff --git a/tools/css-selector-fuzz/lib/autoload.php b/tools/css-selector-fuzz/lib/autoload.php
new file mode 100644
index 0000000000000..6ebdcbc75d6c3
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/autoload.php
@@ -0,0 +1,13 @@
+ array() );
+ $count = count( $argv );
+ for ( $i = 1; $i < $count; $i++ ) {
+ $arg = $argv[ $i ];
+ if ( 0 === strpos( $arg, '--' ) ) {
+ $name = substr( $arg, 2 );
+ if ( false !== strpos( $name, '=' ) ) {
+ list( $name, $value ) = explode( '=', $name, 2 );
+ $options[ $name ] = $value;
+ } elseif ( $i + 1 < $count && 0 !== strpos( $argv[ $i + 1 ], '--' ) ) {
+ $options[ $name ] = $argv[ ++$i ];
+ } else {
+ $options[ $name ] = true;
+ }
+ } else {
+ $options['_'][] = $arg;
+ }
+ }
+ return $options;
+}
+
+/**
+ * Reads an option as a string.
+ *
+ * Returns `$default` when the option is absent or was given as a bare flag
+ * ( stored as boolean true ); any other stored value is cast to string.
+ */
+function option_string( array $options, string $name, ?string $default = null ): ?string {
+	$has_value = array_key_exists( $name, $options ) && true !== $options[ $name ];
+	if ( $has_value ) {
+		return (string) $options[ $name ];
+	}
+	return $default;
+}
+
+/**
+ * Reads an option as an integer.
+ *
+ * Returns `$default` when option_string() yields no value; otherwise the
+ * string is converted with an (int) cast.
+ */
+function option_int( array $options, string $name, int $default ): int {
+	$raw = option_string( $options, $name, null );
+	if ( null === $raw ) {
+		return $default;
+	}
+	return (int) $raw;
+}
+
+/**
+ * Reads an option as a float.
+ *
+ * Returns `$default` when option_string() yields no value; otherwise the
+ * string is converted with a (float) cast.
+ */
+function option_float( array $options, string $name, float $default ): float {
+	$raw = option_string( $options, $name, null );
+	if ( null === $raw ) {
+		return $default;
+	}
+	return (float) $raw;
+}
+
+/**
+ * Reads an option as a boolean.
+ *
+ * An absent option yields `$default`. A bare flag ( stored as true ) is
+ * true. Any other value is true only when, lowercased, it is one of
+ * "1", "true", "yes" or "on".
+ */
+function option_bool( array $options, string $name, bool $default ): bool {
+	if ( ! array_key_exists( $name, $options ) ) {
+		return $default;
+	}
+
+	$raw = $options[ $name ];
+
+	return true === $raw
+		|| in_array( strtolower( (string) $raw ), array( '1', 'true', 'yes', 'on' ), true );
+}
+
+/**
+ * Makes sure a directory exists, creating it ( and parents ) when needed.
+ *
+ * The directory is re-checked after a failed mkdir() so that another
+ * process creating it in the meantime is not treated as an error.
+ *
+ * @throws \RuntimeException When the directory does not exist and could not be created.
+ */
+function ensure_dir( string $dir ): void {
+	if ( is_dir( $dir ) ) {
+		return;
+	}
+	if ( mkdir( $dir, 0777, true ) ) {
+		return;
+	}
+	if ( is_dir( $dir ) ) {
+		return;
+	}
+	throw new \RuntimeException( "Could not create directory: {$dir}" );
+}
+
+/**
+ * JSON-encodes a value without ever returning false.
+ *
+ * Slashes are left unescaped and invalid UTF-8 is substituted rather than
+ * rejected. If encoding still fails, the result is a JSON object whose
+ * 'jsonError' member holds the encoder's error message.
+ */
+function json_encode_safe( $value ): string {
+	$flags = JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE;
+	$json  = json_encode( $value, $flags );
+	if ( false !== $json ) {
+		return $json;
+	}
+	return json_encode( array( 'jsonError' => json_last_error_msg() ) );
+}
+
+/**
+ * Writes a value to a file as one line of JSON ( newline-terminated ),
+ * replacing any previous content.
+ *
+ * @throws \RuntimeException When the file could not be written.
+ */
+function write_json_file( string $path, $value ): void {
+	// file_put_contents() signals failure by returning false; surface it
+	// instead of silently leaving a missing or stale file behind.
+	if ( false === file_put_contents( $path, json_encode_safe( $value ) . "\n" ) ) {
+		throw new \RuntimeException( "Could not write JSON file: {$path}" );
+	}
+}
+
+/**
+ * Reads a JSON file and returns it decoded as an array.
+ *
+ * Returns null when the path is not a regular file or when the content
+ * does not decode to an array ( invalid JSON, or a scalar/null document ).
+ */
+function read_json_file( string $path ): ?array {
+	if ( ! is_file( $path ) ) {
+		return null;
+	}
+
+	$raw     = (string) file_get_contents( $path );
+	$decoded = json_decode( $raw, true );
+	if ( ! is_array( $decoded ) ) {
+		return null;
+	}
+	return $decoded;
+}
+
+/**
+ * Appends a value to a file as one newline-delimited JSON record.
+ *
+ * The write takes an exclusive lock ( LOCK_EX ) and appends ( FILE_APPEND ).
+ *
+ * @throws \RuntimeException When the record could not be written.
+ */
+function append_ndjson( string $path, array $value ): void {
+	$written = file_put_contents( $path, json_encode_safe( $value ) . "\n", FILE_APPEND | LOCK_EX );
+	// A failed append would otherwise drop the record without a trace.
+	if ( false === $written ) {
+		throw new \RuntimeException( "Could not append to NDJSON file: {$path}" );
+	}
+}
+
+/** Current UTC time formatted as YYYYMMDD-HHMMSS. */
+function timestamp(): string {
+	$now = time();
+	return gmdate( 'Ymd-His', $now );
+}
+
+/**
+ * Renders bytes for human inspection.
+ *
+ * Printable ASCII ( 0x20–0x7E ) passes through, except that a backslash is
+ * doubled; every other byte becomes \xHH. Input longer than `$max_length`
+ * bytes is cut to that length and the output is suffixed with
+ * "…(truncated)".
+ */
+function printable_bytes( string $bytes, int $max_length = 4096 ): string {
+	$suffix = strlen( $bytes ) > $max_length ? '…(truncated)' : '';
+	$window = substr( $bytes, 0, $max_length );
+	$length = strlen( $window );
+
+	$rendered = '';
+	for ( $offset = 0; $offset < $length; ++$offset ) {
+		$char = $window[ $offset ];
+		$code = ord( $char );
+
+		if ( $code < 0x20 || $code > 0x7E ) {
+			$rendered .= sprintf( '\\x%02X', $code );
+		} elseif ( '\\' === $char ) {
+			$rendered .= '\\\\';
+		} else {
+			$rendered .= $char;
+		}
+	}
+
+	return $rendered . $suffix;
+}
+
+/**
+ * Reports the repository's current commit and branch via the git CLI.
+ *
+ * Each value is the trimmed output of `git rev-parse` run against
+ * repo_root(), with stderr discarded; an empty result is reported as null.
+ *
+ * @return array Keys 'head' and 'branch', each a string or null.
+ */
+function git_metadata(): array {
+	$rev_parse = static function ( string $arguments ): ?string {
+		$command = 'git -C ' . escapeshellarg( repo_root() ) . ' rev-parse ' . $arguments . ' 2>/dev/null';
+		$output  = trim( (string) shell_exec( $command ) );
+		return '' !== $output ? $output : null;
+	};
+
+	$head   = $rev_parse( 'HEAD' );
+	$branch = $rev_parse( '--abbrev-ref HEAD' );
+
+	return array(
+		'head'   => $head,
+		'branch' => $branch,
+	);
+}
+
+/**
+ * Whether every string anywhere in a nested array is valid UTF-8.
+ *
+ * Only array values are inspected ( recursively ); array keys and
+ * non-string, non-array values are accepted as-is. Validity is tested by
+ * matching the empty pattern in PCRE's UTF-8 mode.
+ */
+function ast_strings_are_utf8( $node ): bool {
+	if ( is_array( $node ) ) {
+		foreach ( $node as $child ) {
+			if ( ! ast_strings_are_utf8( $child ) ) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	if ( ! is_string( $node ) ) {
+		return true;
+	}
+
+	return 1 === preg_match( '//u', $node );
+}
+
+/** Lowercases the ASCII letters A–Z; every other byte is left untouched. */
+function ascii_strtolower( string $input ): string {
+	$upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+	$lower = 'abcdefghijklmnopqrstuvwxyz';
+
+	return strtr( $input, $upper, $lower );
+}
+
+/** Uppercases the ASCII letters a–z; every other byte is left untouched. */
+function ascii_strtoupper( string $input ): string {
+	$lower = 'abcdefghijklmnopqrstuvwxyz';
+	$upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+
+	return strtr( $input, $lower, $upper );
+}
+
+/**
+ * Flips the case of each ASCII letter independently with 50% probability.
+ *
+ * One `$prng->chance( 50 )` draw is made for every byte of the input, in
+ * order, whether or not that byte is a letter; bytes that are not ASCII
+ * letters come out unchanged either way.
+ */
+function str_shuffle_case( string $input, Prng $prng ): string {
+	$length  = strlen( $input );
+	$flipped = '';
+
+	for ( $offset = 0; $offset < $length; ++$offset ) {
+		$char = $input[ $offset ];
+		$flip = $prng->chance( 50 );
+
+		if ( $flip ) {
+			if ( ctype_lower( $char ) ) {
+				$char = ascii_strtoupper( $char );
+			} else {
+				$char = ascii_strtolower( $char );
+			}
+		}
+
+		$flipped .= $char;
+	}
+
+	return $flipped;
+}
+
+/**
+ * Splits a valid UTF-8 string into codepoints.
+ *
+ * The sequence length is taken from the lead byte alone: below 0x80 is one
+ * byte, 0xC0–0xDF two, 0xE0–0xEF three, and anything else is treated as a
+ * four-byte lead. A sequence that would run past the end of the input is
+ * shortened to the bytes available. Continuation bytes are not validated,
+ * so the result is only meaningful for valid UTF-8.
+ *
+ * @return array Pairs of ( utf8 bytes, codepoint value ).
+ */
+function utf8_codepoints( string $input ): array {
+	$pairs  = array();
+	$total  = strlen( $input );
+	$offset = 0;
+
+	while ( $offset < $total ) {
+		$lead = ord( $input[ $offset ] );
+
+		// Sequence width and the payload bits carried by the lead byte.
+		if ( $lead < 0x80 ) {
+			$width = 1;
+			$value = $lead;
+		} elseif ( $lead >= 0xC0 && $lead <= 0xDF ) {
+			$width = 2;
+			$value = $lead & 0x1F;
+		} elseif ( $lead >= 0xE0 && $lead <= 0xEF ) {
+			$width = 3;
+			$value = $lead & 0x0F;
+		} else {
+			$width = 4;
+			$value = $lead & 0x07;
+		}
+
+		// Never read past the end of the input.
+		$width = min( $width, $total - $offset );
+
+		// Each continuation byte contributes its low six bits.
+		for ( $step = 1; $step < $width; ++$step ) {
+			$value = ( $value << 6 ) | ( ord( $input[ $offset + $step ] ) & 0x3F );
+		}
+
+		$pairs[] = array( substr( $input, $offset, $width ), $value );
+		$offset += $width;
+	}
+
+	return $pairs;
+}
diff --git a/tools/css-selector-fuzz/lib/wp-stubs.php b/tools/css-selector-fuzz/lib/wp-stubs.php
new file mode 100644
index 0000000000000..ec9b154ee58d6
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/wp-stubs.php
@@ -0,0 +1,62 @@
+ (string) $function_name,
+ 'message' => (string) $message,
+ );
+ }
+}
+
+// Stub defined only when no `_deprecated_argument()` exists yet: an empty
+// body, so calls are accepted and ignored.
+if ( ! function_exists( '_deprecated_argument' ) ) {
+ function _deprecated_argument( $function_name, $version, $message = '' ) {
+ }
+}
+
+if ( ! function_exists( 'wp_trigger_error' ) ) {
+	/**
+	 * Stub that records the call instead of raising a PHP error.
+	 *
+	 * Each call appends one entry to the global
+	 * `css_selector_fuzz_doing_it_wrong` list. `$error_level` is accepted
+	 * but not used.
+	 *
+	 * @param mixed $function_name Cast to string and stored under 'function'.
+	 * @param mixed $message       Cast to string and stored under 'message'.
+	 * @param int   $error_level   Ignored.
+	 */
+	function wp_trigger_error( $function_name, $message, $error_level = E_USER_NOTICE ) {
+		$record = array(
+			'function' => (string) $function_name,
+			'message'  => (string) $message,
+		);
+
+		$GLOBALS['css_selector_fuzz_doing_it_wrong'][] = $record;
+	}
+}
+
+if ( ! function_exists( 'wp_kses_uri_attributes' ) ) {
+	/**
+	 * Stub returning a fixed list of attribute names.
+	 *
+	 * @return string[] Attribute names, in a fixed order.
+	 */
+	function wp_kses_uri_attributes() {
+		$uri_attributes = array(
+			'action',
+			'archive',
+			'background',
+			'cite',
+			'classid',
+			'codebase',
+			'data',
+			'formaction',
+			'href',
+			'icon',
+			'longdesc',
+			'manifest',
+			'poster',
+			'profile',
+			'src',
+			'usemap',
+			'xmlns',
+		);
+
+		return $uri_attributes;
+	}
+}
diff --git a/tools/css-selector-fuzz/minimize.php b/tools/css-selector-fuzz/minimize.php
new file mode 100644
index 0000000000000..c7fe1f1cb3dda
--- /dev/null
+++ b/tools/css-selector-fuzz/minimize.php
@@ -0,0 +1,268 @@
+#!/usr/bin/env php
+ metamorphic-ast, Bug 2 -> match-mismatch-html,
+ * Bug 3 -> metamorphic-parse — reachable via --signature).
+ *
+ * Usage:
+ * php tools/css-selector-fuzz/minimize.php --seed 1234 [--signature SUBSTR]
+ * php tools/css-selector-fuzz/minimize.php --selector 'sel' --html '<…>' [--signature SUBSTR]
+ *
+ * Options:
+ * --signature SUBSTR Target a signature whose id or invariant contains
+ * SUBSTR. For --seed, also the way to opt into a
+ * related self-contained signature when the seed's own
+ * failure is generator-side (printed as a retarget).
+ * --max-attempts N Cap test evaluations (default 20000).
+ * --json Emit the reproducer as JSON.
+ */
+
+require_once __DIR__ . '/lib/autoload.php';
+
+use CssSelectorFuzz\Worker;
+use function CssSelectorFuzz\json_encode_safe;
+use function CssSelectorFuzz\option_bool;
+use function CssSelectorFuzz\option_int;
+use function CssSelectorFuzz\option_string;
+use function CssSelectorFuzz\parse_cli_options;
+use function CssSelectorFuzz\printable_bytes;
+
+// CLI flags: the cap on test evaluations and an optional signature substring.
+$options = parse_cli_options( $argv );
+$max_attempts = option_int( $options, 'max-attempts', 20000 );
+$sig_filter = option_string( $options, 'signature', null );
+
+/*
+ * In --seed mode, the seed's OWN failures ( from run_case ) are the source
+ * of truth. The minimizer can only preserve "self-contained" signatures
+ * ( those run_pair re-checks without the generator's intended AST ); the
+ * generator-side ones ( ast-mismatch, parse-expectation, path-expectation,
+ * model-desync ) are invisible to run_pair. Targeting must therefore be
+ * restricted to the intersection of the seed's failures and run_pair's
+ * view — otherwise the minimizer could silently retarget to an unrelated
+ * incidental signature and report a false "reproduced".
+ */
+// A negative seed ( presumably the fallback when --seed is absent ) selects
+// the --selector/--html mode below.
+$seed = option_int( $options, 'seed', -1 );
+$seed_signatures = null;
+if ( $seed >= 0 ) {
+ $case = Worker::run_case( $seed );
+ $selector = $case['selector'];
+ $html = $case['html'];
+ $seed_signatures = $case['signatures'];
+ // A seed with no recorded signature has nothing to minimize.
+ if ( array() === $seed_signatures ) {
+ fwrite( STDERR, "Seed {$seed} produced no failure; nothing to minimize.\n" );
+ exit( 1 );
+ }
+} else {
+ // Pair mode: both halves are required; $seed_signatures stays null.
+ $selector = option_string( $options, 'selector', null );
+ $html = option_string( $options, 'html', null );
+ if ( null === $selector || null === $html ) {
+ fwrite( STDERR, "Provide --seed N, or both --selector and --html.\n" );
+ exit( 1 );
+ }
+}
+
+/**
+ * Returns the 'signatures' entry of Worker::run_pair() for one pair.
+ *
+ * $target is forwarded unchanged ( it lets run_pair short-circuit ).
+ */
+$signatures_of = static function ( string $selector, string $html, ?string $target = null ): array {
+	$pair_result = Worker::run_pair( $selector, $html, $target );
+
+	return $pair_result['signatures'];
+};
+
+// Signatures the starting pair produces through run_pair alone. With none,
+// there is nothing to preserve while shrinking, so explain and exit 1.
+$baseline = $signatures_of( $selector, $html );
+if ( array() === $baseline ) {
+ fwrite( STDERR, "The starting pair does not reproduce any self-contained failure.\n" );
+ // Extra context only in --seed mode, where the seed's own signatures are known.
+ if ( null !== $seed_signatures ) {
+ fwrite( STDERR, 'Seed failure(s): ' . implode( ', ', $seed_signatures ) . "\n" );
+ fwrite( STDERR, "These are generator-side signatures the minimizer cannot reproduce from the\n" );
+ fwrite( STDERR, "pair alone. Minimize a seed whose failure is self-contained, or pass\n" );
+ fwrite( STDERR, "--selector/--html directly.\n" );
+ }
+ fwrite( STDERR, 'selector: ' . printable_bytes( $selector ) . "\n" );
+ exit( 1 );
+}
+
+/*
+ * Candidate targets are matched at the INVARIANT level, not the exact
+ * signature hash: a signature embeds transform-specific detail ( e.g.
+ * metamorphic-parse via `rerender` vs via `dup-branch` ), and run_pair's
+ * fixed metamorphic draws may expose the same invariant through a
+ * different transform than run_case did. Same invariant == same bug class,
+ * so that is faithful. A DIFFERENT invariant ( e.g. the seed's generator-
+ * side ast-mismatch vs an incidental self-contained metamorphic-ast ) is a
+ * genuine retarget and must be opted into.
+ */
+// Takes the text after the LAST ':' of a signature; a signature without any
+// ':' is returned unchanged.
+$invariant_of = static function ( string $signature ): string {
+	$last_colon = strrpos( $signature, ':' );
+	if ( false === $last_colon ) {
+		return $signature;
+	}
+
+	return substr( $signature, $last_colon + 1 );
+};
+
+/*
+ * Eligible targets. In pair mode every baseline signature qualifies. In
+ * --seed mode only baseline signatures whose invariant also appears among
+ * the seed's own signatures qualify.
+ */
+$retargeted = false;
+if ( null === $seed_signatures ) {
+	$candidates = $baseline;
+} else {
+	$seed_invariants = array_map( $invariant_of, $seed_signatures );
+	$candidates      = array();
+	foreach ( $baseline as $signature ) {
+		if ( in_array( $invariant_of( $signature ), $seed_invariants, true ) ) {
+			$candidates[] = $signature;
+		}
+	}
+}
+
+if ( array() === $candidates ) {
+	// The seed's failures are all generator-side ( no self-contained
+	// invariant in common ); refuse to silently minimize an unrelated
+	// incidental signature.
+	fwrite( STDERR, "Seed {$seed}'s failures are not self-contained, so the minimizer cannot\n" );
+	fwrite( STDERR, "faithfully reproduce them.\n" );
+	fwrite( STDERR, 'Seed failure(s): ' . implode( ', ', $seed_signatures ) . "\n" );
+	fwrite( STDERR, 'Self-contained nearby: ' . implode( ', ', $baseline ) . "\n" );
+	if ( null === $sig_filter ) {
+		// Only advise --signature when it was not already supplied.
+		fwrite( STDERR, "Re-run with --signature to minimize one of the nearby signatures\n" );
+		fwrite( STDERR, "explicitly ( understanding it is a related, not identical, failure ).\n" );
+		exit( 1 );
+	}
+	// User explicitly opted into a nearby signature.
+	fwrite( STDERR, "--signature given: minimizing a nearby signature ( related, not identical ).\n" );
+	$candidates = $baseline;
+	$retargeted = true;
+}
+
+/*
+ * Pick the target signature from the eligible candidates.
+ *
+ * Without a filter the first candidate wins. With a ( non-empty ) filter the
+ * first candidate containing it wins. A filter that matches nothing is an
+ * error: falling back to an arbitrary candidate would minimize a signature
+ * the user did not ask for.
+ */
+$target = $candidates[0];
+if ( null !== $sig_filter && '' !== $sig_filter ) {
+	$target = null;
+	foreach ( $candidates as $candidate ) {
+		if ( false !== strpos( $candidate, $sig_filter ) ) {
+			$target = $candidate;
+			break;
+		}
+	}
+
+	if ( null === $target ) {
+		fwrite( STDERR, 'No eligible signature contains ' . printable_bytes( $sig_filter ) . ".\n" );
+		fwrite( STDERR, 'Eligible signature(s): ' . implode( ', ', $candidates ) . "\n" );
+		exit( 1 );
+	}
+}
+
+// Number of pair evaluations spent so far; shared by reference with the closures below.
+$attempts = 0;
+
+/**
+ * Whether a pair still yields the exact $target signature.
+ *
+ * Each evaluation counts against the budget. Once the budget is spent the
+ * answer is false and nothing is evaluated.
+ */
+$reproduces = static function ( string $selector, string $html ) use ( $signatures_of, $target, &$attempts, $max_attempts ): bool {
+	if ( $attempts < $max_attempts ) {
+		++$attempts;
+		$found = $signatures_of( $selector, $html, $target );
+
+		return in_array( $target, $found, true );
+	}
+
+	return false;
+};
+
+/**
+ * Delta-debugging shrink of one byte string: ddmin-style chunk removal
+ * followed by per-position single-byte simplification.
+ *
+ * Every candidate is judged by $test. The shared attempt counter is only
+ * read here, to stop looping once the budget is spent.
+ *
+ * @param string   $current Starting bytes ( assumed to reproduce ).
+ * @param callable $test    Receives a candidate string; returns whether it
+ *                          still reproduces.
+ * @return string The last candidate accepted by $test, or the input if none was.
+ */
+$shrink = static function ( string $current, callable $test ) use ( &$attempts, $max_attempts ): string {
+	// Phase 1: try deleting one chunk at a time, refining the granularity on failure.
+	$chunks = 2;
+	while ( strlen( $current ) > 0 && $attempts < $max_attempts ) {
+		$length     = strlen( $current );
+		$chunk_size = (int) ceil( $length / $chunks );
+		$changed    = false;
+
+		for ( $offset = 0; $offset < $length && $attempts < $max_attempts; $offset += $chunk_size ) {
+			$candidate = substr( $current, 0, $offset ) . substr( $current, min( $length, $offset + $chunk_size ) );
+			if ( $candidate === $current ) {
+				continue;
+			}
+			if ( $test( $candidate ) ) {
+				$current = $candidate;
+				// Coarsen again after a success, but never below two chunks.
+				$chunks  = max( 2, $chunks - 1 );
+				$changed = true;
+				break;
+			}
+		}
+
+		if ( ! $changed ) {
+			// Single-byte chunks were already tried; nothing finer is left.
+			if ( $chunks >= $length ) {
+				break;
+			}
+			$chunks = min( $length, $chunks * 2 );
+		}
+	}
+
+	/*
+	 * Phase 2: per-byte canonicalization. Stand-ins are ordered from most to
+	 * least preferred, and a byte is only ever moved to a MORE preferred
+	 * stand-in or deleted. Without that restriction a position where both
+	 * 'a' and ' ' reproduce would flip between the two until the attempt
+	 * budget ran out.
+	 */
+	$stand_ins = array( 'a', ' ' );
+	for ( $i = 0; $i < strlen( $current ) && $attempts < $max_attempts; $i++ ) {
+		$rank      = array_search( $current[ $i ], $stand_ins, true );
+		$simpler   = false === $rank ? $stand_ins : array_slice( $stand_ins, 0, $rank );
+		$simpler[] = ''; // Deleting the byte always shortens the string.
+
+		foreach ( $simpler as $replacement ) {
+			$candidate = substr( $current, 0, $i ) . $replacement . substr( $current, $i + 1 );
+			if ( $test( $candidate ) ) {
+				$current = $candidate;
+				// Step back one position ( the loop's $i++ follows ) to revisit the neighbour.
+				$i = max( -1, $i - 2 );
+				break;
+			}
+		}
+	}
+
+	return $current;
+};
+
+// Alternate shrinking the HTML and the selector until neither moves.
+// HTML first: when the signature is selector-only (e.g. metamorphic-parse)
+// the document collapses cheaply before the costlier selector pass.
+// $prev holds the pair as it was before the round ( joined with a NUL byte );
+// an unchanged pair after a round ends the loop, as does a spent budget.
+$prev = null;
+while ( $attempts < $max_attempts && ( $selector . "\0" . $html ) !== $prev ) {
+ $prev = $selector . "\0" . $html;
+
+ // $selector is captured by reference so the test sees its current value.
+ $html = $shrink(
+ $html,
+ static function ( string $candidate ) use ( $reproduces, &$selector ): bool {
+ return $reproduces( $selector, $candidate );
+ }
+ );
+ // Likewise $html, which now holds the result of the pass above.
+ $selector = $shrink(
+ $selector,
+ static function ( string $candidate ) use ( $reproduces, &$html ): bool {
+ return $reproduces( $candidate, $html );
+ }
+ );
+}
+
+// Re-evaluate the final pair without a target hint and confirm the target is
+// still present. This call does not go through $reproduces, so it is not
+// counted in $attempts.
+$final = $signatures_of( $selector, $html );
+$ok = in_array( $target, $final, true );
+
+// JSON report. Exit status is 0 when the target reproduced, 2 otherwise.
+if ( option_bool( $options, 'json', false ) ) {
+ echo json_encode_safe(
+ array(
+ 'target' => $target,
+ 'retargeted' => $retargeted,
+ 'seedSignatures' => $seed_signatures,
+ 'reproduced' => $ok,
+ 'attempts' => $attempts,
+ 'selector' => printable_bytes( $selector ),
+ 'selectorBytes' => strlen( $selector ),
+ 'html' => printable_bytes( $html ),
+ 'htmlBytes' => strlen( $html ),
+ 'selectorBase64' => base64_encode( $selector ),
+ 'htmlBase64' => base64_encode( $html ),
+ )
+ ) . "\n";
+ exit( $ok ? 0 : 2 );
+}
+
+// Human-readable report with the same exit status convention.
+echo "target: {$target}\n";
+if ( $retargeted ) {
+ echo 'NOTE: seed failure(s) ' . implode( ', ', $seed_signatures ) . " are generator-side;\n";
+ echo " minimized the related self-contained signature above instead.\n";
+}
+echo 'reproduced: ' . ( $ok ? 'yes' : 'NO' ) . "\n";
+echo "attempts: {$attempts}\n";
+echo 'selector: ' . printable_bytes( $selector ) . ' (' . strlen( $selector ) . " bytes)\n";
+echo 'html: ' . printable_bytes( $html ) . ' (' . strlen( $html ) . " bytes)\n";
+// Shell-quoted command line for replaying the minimized pair.
+echo "\nreplay:\n";
+echo ' php tools/css-selector-fuzz/replay.php --selector ' . escapeshellarg( $selector )
+ . ' --html ' . escapeshellarg( $html ) . "\n";
+exit( $ok ? 0 : 2 );
diff --git a/tools/css-selector-fuzz/replay.php b/tools/css-selector-fuzz/replay.php
new file mode 100644
index 0000000000000..38ffb8a43678e
--- /dev/null
+++ b/tools/css-selector-fuzz/replay.php
@@ -0,0 +1,91 @@
+#!/usr/bin/env php
+ bar' [--html '…
']
+ */
+
+require_once __DIR__ . '/lib/autoload.php';
+
+use CssSelectorFuzz\Bootstrap;
+use CssSelectorFuzz\Worker;
+use function CssSelectorFuzz\json_encode_safe;
+use function CssSelectorFuzz\option_bool;
+use function CssSelectorFuzz\option_int;
+use function CssSelectorFuzz\option_string;
+use function CssSelectorFuzz\parse_cli_options;
+use function CssSelectorFuzz\printable_bytes;
+
+$options = parse_cli_options( $argv );
+
+// --selector takes precedence: when present, the --seed path below is never reached.
+$probe_selector = option_string( $options, 'selector', null );
+if ( null !== $probe_selector ) {
+ // Quick probe mode: parse a selector and report what the API does with it.
+ Bootstrap::load();
+
+ $compound = \WP_CSS_Compound_Selector_List::from_selectors( $probe_selector );
+ $complex = \WP_CSS_Complex_Selector_List::from_selectors( $probe_selector );
+
+ // A null parse result is reported as null rather than extracted.
+ $report = array(
+ 'selector' => printable_bytes( $probe_selector ),
+ 'compoundList' => null === $compound ? null : \CssSelectorFuzz\AstExtractor::from_compound_list( $compound ),
+ 'complexList' => null === $complex ? null : \CssSelectorFuzz\AstExtractor::from_complex_list( $complex ),
+ );
+
+ // Matching is only attempted when --html was given and the complex parse succeeded.
+ $html = option_string( $options, 'html', null );
+ if ( null !== $html && null !== $complex ) {
+ $processor = \WP_HTML_Processor::create_full_parser( $html );
+ $matches = array();
+ while ( $processor->select( $probe_selector ) ) {
+ $matches[] = array(
+ 'tag' => $processor->get_tag(),
+ 'breadcrumbs' => $processor->get_breadcrumbs(),
+ );
+ }
+ $report['htmlProcessorMatches'] = $matches;
+ }
+
+ // Probe mode always exits 0, whatever the parse results were.
+ echo json_encode_safe( $report ) . "\n";
+ exit( 0 );
+}
+
+// Seed mode. A negative seed ( presumably the fallback when --seed is absent )
+// prints usage and exits 1.
+$seed = option_int( $options, 'seed', -1 );
+if ( $seed < 0 ) {
+ echo "Usage: php tools/css-selector-fuzz/replay.php --seed N [--json] [--show-html]\n";
+ echo " php tools/css-selector-fuzz/replay.php --selector 'div > .cls' [--html '…
']\n";
+ exit( 1 );
+}
+
+$result = Worker::run_case( $seed );
+
+// JSON output: the whole result; exit 0 without failures, 2 with.
+if ( option_bool( $options, 'json', false ) ) {
+ echo json_encode_safe( $result ) . "\n";
+ exit( array() === $result['failures'] ? 0 : 2 );
+}
+
+echo "seed: {$result['seed']}\n";
+echo "bucket: {$result['bucket']}\n";
+echo 'selector: ' . printable_bytes( $result['selector'] ) . "\n";
+echo "digest: {$result['digest']}\n";
+
+// The HTML is only printed on request.
+if ( option_bool( $options, 'show-html', false ) ) {
+ echo "html: " . printable_bytes( $result['html'] ) . "\n";
+}
+
+if ( array() === $result['failures'] ) {
+ echo "failures: none\n";
+ exit( 0 );
+}
+
+// One section per failure: its invariant name, then its detail as JSON.
+echo 'failures: ' . count( $result['failures'] ) . "\n";
+foreach ( $result['failures'] as $i => $failure ) {
+ echo "--- failure {$i}: {$failure['invariant']} ---\n";
+ echo json_encode_safe( $failure['detail'] ) . "\n";
+}
+exit( 2 );
diff --git a/tools/css-selector-fuzz/runner.php b/tools/css-selector-fuzz/runner.php
new file mode 100644
index 0000000000000..414eb167a660e
--- /dev/null
+++ b/tools/css-selector-fuzz/runner.php
@@ -0,0 +1,337 @@
+#!/usr/bin/env php
+ array( 'pipe', 'r' ),
+ 1 => array( 'pipe', 'w' ),
+ 2 => array( 'pipe', 'w' ),
+ );
+
+ $started = microtime( true );
+ $proc = proc_open( $command, $descriptors, $pipes, repo_root() );
+ if ( ! is_resource( $proc ) ) {
+ return array(
+ 'code' => null,
+ 'timedOut' => false,
+ 'stdout' => '',
+ 'stderr' => 'proc_open failed',
+ 'durationMs' => 0,
+ );
+ }
+
+ fclose( $pipes[0] );
+ stream_set_blocking( $pipes[1], false );
+ stream_set_blocking( $pipes[2], false );
+
+ $stdout = '';
+ $stderr = '';
+ $timed_out = false;
+ $deadline = $started + $timeout_ms / 1000;
+
+ while ( true ) {
+ $status = proc_get_status( $proc );
+ $stdout .= (string) stream_get_contents( $pipes[1] );
+ $stderr .= (string) stream_get_contents( $pipes[2] );
+
+ if ( ! $status['running'] ) {
+ $code = $status['exitcode'];
+ break;
+ }
+ if ( microtime( true ) > $deadline ) {
+ $timed_out = true;
+ proc_terminate( $proc, 9 );
+ $code = null;
+ break;
+ }
+ usleep( 10000 );
+ }
+
+ $stdout .= (string) stream_get_contents( $pipes[1] );
+ $stderr .= (string) stream_get_contents( $pipes[2] );
+ fclose( $pipes[1] );
+ fclose( $pipes[2] );
+ proc_close( $proc );
+
+ return array(
+ 'code' => $code,
+ 'timedOut' => $timed_out,
+ 'stdout' => $stdout,
+ 'stderr' => $stderr,
+ 'durationMs' => (int) round( 1000 * ( microtime( true ) - $started ) ),
+ );
+}
+
+/**
+ * Extracts the batch summary from worker stdout.
+ *
+ * Lines are scanned from last to first. The first one that decodes to a
+ * JSON array whose 'kind' is 'css-selector-fuzz-batch-summary' wins.
+ *
+ * @param string $stdout Captured worker output.
+ * @return array|null The decoded summary, or null when no line qualifies.
+ */
+function css_selector_fuzz_worker_summary( string $stdout ): ?array {
+	$lines = explode( "\n", trim( $stdout ) );
+
+	for ( $index = count( $lines ) - 1; $index >= 0; --$index ) {
+		$record = json_decode( $lines[ $index ], true );
+		if ( ! is_array( $record ) ) {
+			continue;
+		}
+		if ( 'css-selector-fuzz-batch-summary' === ( $record['kind'] ?? null ) ) {
+			return $record;
+		}
+	}
+
+	return null;
+}
+
+/**
+ * Merges per-bucket/per-target match assertion counts.
+ *
+ * Adds each 'assertions' and 'nonVacuous' count from $source onto the
+ * matching entry of $target, creating a zeroed entry first when needed.
+ * Missing counts are treated as 0.
+ *
+ * @param array $target Running totals, updated in place.
+ * @param array $source Counts to add, keyed bucket => match target => stats.
+ */
+function css_selector_fuzz_merge_match_stats( array &$target, array $source ): void {
+	$zeroed = array(
+		'assertions' => 0,
+		'nonVacuous' => 0,
+	);
+
+	foreach ( $source as $bucket => $targets ) {
+		foreach ( $targets as $match_target => $stats ) {
+			$merged = $target[ $bucket ][ $match_target ] ?? $zeroed;
+
+			$merged['assertions'] += (int) ( $stats['assertions'] ?? 0 );
+			$merged['nonVacuous'] += (int) ( $stats['nonVacuous'] ?? 0 );
+
+			$target[ $bucket ][ $match_target ] = $merged;
+		}
+	}
+}
+
+/**
+ * Adds derived rates after all count aggregation is finished.
+ *
+ * For every bucket/target entry this adds 'vacuous' ( assertions minus
+ * nonVacuous, floored at 0 ) plus 'nonVacuousRate' and 'vacuousRate',
+ * each rounded to four decimals and 0.0 when there were no assertions.
+ *
+ * @param array $stats Counts keyed bucket => match target => counts.
+ * @return array The same structure with the derived fields added.
+ */
+function css_selector_fuzz_finalize_match_stats( array $stats ): array {
+	foreach ( $stats as $bucket => $targets ) {
+		foreach ( $targets as $match_target => $counts ) {
+			$total  = (int) ( $counts['assertions'] ?? 0 );
+			$hits   = (int) ( $counts['nonVacuous'] ?? 0 );
+			$misses = max( 0, $total - $hits );
+
+			if ( $total > 0 ) {
+				$hit_rate  = round( $hits / $total, 4 );
+				$miss_rate = round( $misses / $total, 4 );
+			} else {
+				$hit_rate  = 0.0;
+				$miss_rate = 0.0;
+			}
+
+			$stats[ $bucket ][ $match_target ]['vacuous']        = $misses;
+			$stats[ $bucket ][ $match_target ]['nonVacuousRate'] = $hit_rate;
+			$stats[ $bucket ][ $match_target ]['vacuousRate']    = $miss_rate;
+		}
+	}
+
+	return $stats;
+}
+
+/**
+ * Writes the runner state as JSON, with match-stat rates finalized first.
+ *
+ * The caller's array is not modified; only the written copy is finalized.
+ *
+ * @param string $state_path Destination file.
+ * @param array  $state      Runner state; 'matchStats' may be absent.
+ */
+function css_selector_fuzz_write_state( string $state_path, array $state ): void {
+	$raw_stats           = $state['matchStats'] ?? array();
+	$state['matchStats'] = css_selector_fuzz_finalize_match_stats( $raw_stats );
+
+	write_json_file( $state_path, $state );
+}
+
+/**
+ * Returns a copy of the runner state with match-stat rates finalized.
+ *
+ * @param array $state Runner state; 'matchStats' may be absent.
+ * @return array State whose 'matchStats' carries the derived fields.
+ */
+function css_selector_fuzz_state_for_output( array $state ): array {
+	$raw_stats = $state['matchStats'] ?? array();
+
+	return array_replace(
+		$state,
+		array( 'matchStats' => css_selector_fuzz_finalize_match_stats( $raw_stats ) )
+	);
+}
+
+// --help / --h: print usage and exit successfully.
+$options = parse_cli_options( $argv );
+if ( option_bool( $options, 'help', false ) || option_bool( $options, 'h', false ) ) {
+ echo "Usage: php tools/css-selector-fuzz/runner.php [--start-seed N] [--max-seeds N] [--duration-seconds N] [--chunk-size N] [--timeout-ms N] [--output-dir DIR] [--stop-on-failure]\n";
+ exit( 0 );
+}
+
+$start_seed = option_int( $options, 'start-seed', 1 );
+$max_seeds = option_int( $options, 'max-seeds', 1000 );
+$duration_seconds = option_int( $options, 'duration-seconds', 120 );
+$chunk_size = max( 1, option_int( $options, 'chunk-size', 200 ) );
+$timeout_ms = option_int( $options, 'timeout-ms', 0 );
+$stop_on_failure = option_bool( $options, 'stop-on-failure', false );
+$output_dir = option_string( $options, 'output-dir', repo_root() . '/artifacts/css-selector-fuzz/run-' . timestamp() );
+
+if ( $max_seeds < 1 ) {
+ fwrite( STDERR, "--max-seeds must be at least 1; refusing to run unbounded.\n" );
+ exit( 1 );
+}
+// A timeout of 0 means "derive one from the chunk size".
+if ( 0 === $timeout_ms ) {
+ // Generous per-chunk budget: ~50ms per case plus startup.
+ $timeout_ms = $chunk_size * 50 + 10000;
+}
+
+ensure_dir( $output_dir );
+$failures_path = $output_dir . '/failures.ndjson';
+$state_path = $output_dir . '/state.json';
+$worker_script = __DIR__ . '/worker.php';
+
+// Aggregate run state; rewritten to state.json after every chunk.
+$state = array(
+ 'kind' => 'css-selector-fuzz-runner-state',
+ 'startedAt' => gmdate( 'c' ),
+ 'updatedAt' => gmdate( 'c' ),
+ 'git' => git_metadata(),
+ 'phpVersion' => PHP_VERSION,
+ 'outputDir' => $output_dir,
+ 'startSeed' => $start_seed,
+ 'maxSeeds' => $max_seeds,
+ 'durationSeconds' => $duration_seconds,
+ 'chunkSize' => $chunk_size,
+ 'casesCompleted' => 0,
+ 'failures' => 0,
+ 'crashes' => 0,
+ 'buckets' => array(),
+ 'signatures' => array(),
+ 'lexbor' => array(),
+ 'matchStats' => array(),
+ 'nextSeed' => $start_seed,
+ 'stopReason' => null,
+);
+css_selector_fuzz_write_state( $state_path, $state );
+
+// A non-positive duration disables the wall-clock deadline. $end_seed is exclusive.
+$deadline = $duration_seconds > 0 ? microtime( true ) + $duration_seconds : null;
+$seed = $start_seed;
+$end_seed = $start_seed + $max_seeds;
+
+// Main loop: one worker process per chunk of seeds, until the seed range,
+// the deadline, or ( with --stop-on-failure ) the first failure ends the run.
+while ( $seed < $end_seed ) {
+ // The deadline is only checked between chunks, never inside one.
+ if ( null !== $deadline && microtime( true ) > $deadline ) {
+ $state['stopReason'] = 'duration-elapsed';
+ break;
+ }
+
+ // The last chunk may be shorter than $chunk_size.
+ $count = min( $chunk_size, $end_seed - $seed );
+ $args = array(
+ $worker_script,
+ '--start-seed',
+ (string) $seed,
+ '--count',
+ (string) $count,
+ '--failures-out',
+ $failures_path,
+ '--progress-file',
+ $output_dir . '/progress.txt',
+ );
+
+ $proc = css_selector_fuzz_run_php( $args, $timeout_ms );
+ $summary = css_selector_fuzz_worker_summary( $proc['stdout'] );
+
+ if ( null === $summary ) {
+ /*
+ * The worker crashed, hung, or died fatally. Re-run each seed of the
+ * chunk in its own process to attribute the crash.
+ */
+ fwrite( STDERR, "chunk seed={$seed} count={$count}: worker crashed/hung; isolating…\n" );
+ for ( $isolated = $seed; $isolated < $seed + $count; $isolated++ ) {
+ // Per-seed timeout: this seed's share of the chunk budget plus 5s, and at least 5s.
+ $single = css_selector_fuzz_run_php(
+ array(
+ $worker_script,
+ '--start-seed',
+ (string) $isolated,
+ '--count',
+ '1',
+ '--failures-out',
+ $failures_path,
+ '--determinism-every',
+ '0',
+ ),
+ max( 5000, (int) ( $timeout_ms / $count ) + 5000 )
+ );
+ $single_summary = css_selector_fuzz_worker_summary( $single['stdout'] );
+ if ( null === $single_summary ) {
+ // Still no summary in isolation: record a crash/timeout failure for this seed.
+ ++$state['crashes'];
+ ++$state['failures'];
+ append_ndjson(
+ $failures_path,
+ array(
+ 'kind' => 'css-selector-fuzz-failure',
+ 'seed' => $isolated,
+ 'invariant' => $single['timedOut'] ? 'worker-timeout' : 'worker-crash',
+ 'signature' => $single['timedOut'] ? 'worker-timeout' : 'worker-crash',
+ 'exitCode' => $single['code'],
+ 'stderrTail' => substr( $single['stderr'], -2000 ),
+ )
+ );
+ $key = $single['timedOut'] ? 'worker-timeout' : 'worker-crash';
+ $state['signatures'][ $key ] = ( $state['signatures'][ $key ] ?? 0 ) + 1;
+ } else {
+ // The isolated seed completed: fold its one-case summary into the totals.
+ ++$state['casesCompleted'];
+ $state['failures'] += $single_summary['failures'];
+ foreach ( $single_summary['buckets'] as $bucket => $bucket_count ) {
+ $state['buckets'][ $bucket ] = ( $state['buckets'][ $bucket ] ?? 0 ) + $bucket_count;
+ }
+ foreach ( $single_summary['signatures'] as $signature => $signature_count ) {
+ $state['signatures'][ $signature ] = ( $state['signatures'][ $signature ] ?? 0 ) + $signature_count;
+ }
+ foreach ( $single_summary['lexbor'] ?? array() as $lexbor_state => $lexbor_count ) {
+ $state['lexbor'][ $lexbor_state ] = ( $state['lexbor'][ $lexbor_state ] ?? 0 ) + $lexbor_count;
+ }
+ css_selector_fuzz_merge_match_stats( $state['matchStats'], $single_summary['matchStats'] ?? array() );
+ }
+ }
+ } else {
+ // Normal chunk: completed cases are the sum of the per-bucket counts.
+ $state['casesCompleted'] += array_sum( $summary['buckets'] );
+ $state['failures'] += $summary['failures'];
+ foreach ( $summary['buckets'] as $bucket => $bucket_count ) {
+ $state['buckets'][ $bucket ] = ( $state['buckets'][ $bucket ] ?? 0 ) + $bucket_count;
+ }
+ foreach ( $summary['signatures'] as $signature => $signature_count ) {
+ $state['signatures'][ $signature ] = ( $state['signatures'][ $signature ] ?? 0 ) + $signature_count;
+ }
+ foreach ( $summary['lexbor'] ?? array() as $lexbor_state => $lexbor_count ) {
+ $state['lexbor'][ $lexbor_state ] = ( $state['lexbor'][ $lexbor_state ] ?? 0 ) + $lexbor_count;
+ }
+ css_selector_fuzz_merge_match_stats( $state['matchStats'], $summary['matchStats'] ?? array() );
+ }
+
+ // Persist progress after every chunk.
+ $seed += $count;
+ $state['nextSeed'] = $seed;
+ $state['updatedAt'] = gmdate( 'c' );
+ css_selector_fuzz_write_state( $state_path, $state );
+
+ if ( $stop_on_failure && $state['failures'] > 0 ) {
+ $state['stopReason'] = 'stop-on-failure';
+ break;
+ }
+}
+
+// No earlier stop reason means the loop ran through the whole seed range.
+if ( null === $state['stopReason'] ) {
+ $state['stopReason'] = 'max-seeds';
+}
+$state['updatedAt'] = gmdate( 'c' );
+css_selector_fuzz_write_state( $state_path, $state );
+
+/*
+ * The lexbor differential is the third oracle. If it ever ran ( 'compared' )
+ * it was built and live; any 'unavailable' or 'error' tally then means it
+ * was missing for some cases or died mid-run, so part of the run had only
+ * two oracles. Surface that loudly rather than letting a green run hide it.
+ */
+$lexbor = $state['lexbor'];
+$lexbor_ran = ( $lexbor['compared'] ?? 0 ) > 0;
+$lexbor_lost = ( $lexbor['unavailable'] ?? 0 ) + ( $lexbor['error'] ?? 0 );
+if ( $lexbor_ran && $lexbor_lost > 0 ) {
+ fwrite( STDERR, "WARNING: lexbor third oracle was unavailable/errored for {$lexbor_lost} case(s); those ran with two oracles.\n" );
+} elseif ( ! $lexbor_ran ) {
+ fwrite( STDERR, "NOTE: lexbor third oracle never ran (harness not built?); run `sh tools/css-selector-fuzz/lexbor/build.sh` for the differential.\n" );
+}
+
+// Final state goes to stdout as one JSON line; exit 0 with no failures, 2 otherwise.
+echo json_encode_safe( css_selector_fuzz_state_for_output( $state ) ) . "\n";
+exit( 0 === $state['failures'] ? 0 : 2 );
diff --git a/tools/css-selector-fuzz/tests/self-check.php b/tools/css-selector-fuzz/tests/self-check.php
new file mode 100644
index 0000000000000..9664367f1e300
--- /dev/null
+++ b/tools/css-selector-fuzz/tests/self-check.php
@@ -0,0 +1,368 @@
+#!/usr/bin/env php
+ substr_count( $selector, ']' ) ) {
+ return 'eof-auto-closes-attribute-selector';
+ }
+ if ( preg_match( '/\\[[^\\]]*=\\s*[-_a-zA-Z0-9]\\]$/', $selector ) ) {
+ return 'single-char-unquoted-attribute-value-at-eof';
+ }
+ if ( has_identity_escape_after_multibyte( $selector ) ) {
+ return 'identity-escape-after-multibyte';
+ }
+
+ return null;
+}
+
+/**
+ * Detects a backslash escape of a non-hex, non-newline character that occurs
+ * somewhere after the first non-ASCII byte of the selector.
+ *
+ * A backslash that is the very last byte never counts, nor does one followed
+ * by LF, CR, FF, or a hex digit.
+ *
+ * @param string $selector Raw selector bytes.
+ * @return bool True when such an escape exists.
+ */
+function has_identity_escape_after_multibyte( string $selector ): bool {
+	$length = strlen( $selector );
+
+	// Nothing before the first byte above 0x7F can qualify; skip straight to it.
+	$offset = 0;
+	while ( $offset < $length && ord( $selector[ $offset ] ) <= 0x7F ) {
+		++$offset;
+	}
+
+	// Only backslashes that still have a following byte are of interest.
+	for ( ++$offset; $offset + 1 < $length; ++$offset ) {
+		if ( '\\' !== $selector[ $offset ] ) {
+			continue;
+		}
+
+		$escaped    = $selector[ $offset + 1 ];
+		$is_newline = in_array( $escaped, array( "\n", "\r", "\f" ), true );
+		if ( ! $is_newline && ! ctype_xdigit( $escaped ) ) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+Bootstrap::load();
+
+// --- Prng determinism and independence -------------------------------------
+
+// Same ( seed, label ) pair twice: the byte streams must be identical.
+$a = new Prng( '42', 'label' );
+$b = new Prng( '42', 'label' );
+check( $a->bytes( 64 ) === $b->bytes( 64 ), 'Identical seeds produce identical streams.' );
+
+// Same label, different seed: the byte streams must differ.
+$c = new Prng( '42', 'label' );
+$d = new Prng( '43', 'label' );
+check( $c->bytes( 64 ) !== $d->bytes( 64 ), 'Different seeds produce different streams.' );
+
+// Forking two identical generators with the same name must agree as well.
+$e = new Prng( '42', 'fork-test' );
+$f = new Prng( '42', 'fork-test' );
+$fork1 = $e->fork( 'x' );
+$fork2 = $f->fork( 'x' );
+check( $fork1->bytes( 32 ) === $fork2->bytes( 32 ), 'Forked streams are deterministic.' );
+
+// --- utf8_codepoints --------------------------------------------------------
+
+// One 1-byte, one 2-byte and one 4-byte sequence.
+$points = utf8_codepoints( "a\u{E9}\u{1F600}" );
+check( 3 === count( $points ), 'utf8_codepoints splits into 3 codepoints.' );
+check( 0x61 === $points[0][1] && 0xE9 === $points[1][1] && 0x1F600 === $points[2][1], 'utf8_codepoints decodes values.' );
+
+// --- Document generator: smoke checks for a few seeds ----------------------
+// ( Worker::run_case checks model-vs-parse per case as model-desync; here only
+// a few seeds are sampled for a fast signal that documents render and carry
+// the `data-fid` marker. )
+
+for ( $seed = 1; $seed <= 3; $seed++ ) {
+	$document = DocumentGenerator::generate( new Prng( (string) $seed, 'self-check-doc' ) );
+	check( is_string( $document['html'] ) && '' !== $document['html'], "Document {$seed} renders." );
+	check( str_contains( $document['html'], 'data-fid' ), "Document {$seed} has fids." );
+}
+
+// --- Selector generator expectations over many seeds -----------------------
+
+// $by_bucket counts generated selectors per bucket; $allowed_parse_mismatches
+// counts cases that known_core_parse_mismatch() recognised instead of checking.
+$by_bucket = array();
+$allowed_parse_mismatches = array();
+for ( $seed = 1; $seed <= 400; $seed++ ) {
+ $prng = new Prng( (string) $seed, 'self-check-selector' );
+ $document = DocumentGenerator::generate( $prng->fork( 'doc' ) );
+ $selector = SelectorGenerator::generate( $prng->fork( 'sel' ), $document['pools'] );
+
+ $by_bucket[ $selector['bucket'] ] = ( $by_bucket[ $selector['bucket'] ] ?? 0 ) + 1;
+
+ $compound = WP_CSS_Compound_Selector_List::from_selectors( $selector['selector'] );
+ $complex = WP_CSS_Complex_Selector_List::from_selectors( $selector['selector'] );
+
+ // A null expectation means the generator makes no claim for that grammar.
+ if ( null !== $selector['expectCompound'] ) {
+ $expected = $selector['expectCompound'];
+ $actual = null !== $compound;
+ $known = known_core_parse_mismatch( $selector['selector'], $expected, $actual );
+ if ( null !== $known ) {
+ $allowed_parse_mismatches[ "compound:{$known}" ] = ( $allowed_parse_mismatches[ "compound:{$known}" ] ?? 0 ) + 1;
+ } else {
+ check(
+ $expected === $actual,
+ "Seed {$seed} ({$selector['bucket']}): compound parse expectation for: " . \CssSelectorFuzz\printable_bytes( $selector['selector'] )
+ );
+ }
+ }
+ // Same comparison for the complex grammar.
+ if ( null !== $selector['expectComplex'] ) {
+ $expected = $selector['expectComplex'];
+ $actual = null !== $complex;
+ $known = known_core_parse_mismatch( $selector['selector'], $expected, $actual );
+ if ( null !== $known ) {
+ $allowed_parse_mismatches[ "complex:{$known}" ] = ( $allowed_parse_mismatches[ "complex:{$known}" ] ?? 0 ) + 1;
+ } else {
+ check(
+ $expected === $actual,
+ "Seed {$seed} ({$selector['bucket']}): complex parse expectation for: " . \CssSelectorFuzz\printable_bytes( $selector['selector'] )
+ );
+ }
+ }
+}
+
+// At least five distinct buckets must have appeared across the 400 seeds.
+check( count( $by_bucket ) >= 5, 'Bucket variety: saw ' . count( $by_bucket ) . ' buckets.' );
+// Allowed mismatches are reported on stderr rather than failing the run.
+if ( array() !== $allowed_parse_mismatches ) {
+ fwrite( STDERR, 'Allowed known core parse bug signatures: ' . \CssSelectorFuzz\json_encode_safe( $allowed_parse_mismatches ) . "\n" );
+}
+
+// --- Document generator: randomized class NUL injection --------------------
+// The same three checks run for both generators, on every document whose HTML
+// contains a NUL byte: the attrValues pool holds no NUL, the classes pool is
+// valid UTF-8, and at least one class contains U+FFFD.
+
+$safe_class_nul = 0;
+for ( $seed = 1; $seed <= 200; $seed++ ) {
+ $document = DocumentGenerator::generate( new Prng( (string) $seed, 'self-check-class-nul-safe' ) );
+ if ( false !== strpos( $document['html'], "\0" ) ) {
+ ++$safe_class_nul;
+ check( false === str_contains( implode( "\n", $document['pools']['attrValues'] ), "\0" ), "Safe document {$seed}: class NUL does not leak into attrValues pool." );
+ check( \CssSelectorFuzz\ast_strings_are_utf8( $document['pools']['classes'] ), "Safe document {$seed}: class pool strings stay valid UTF-8." );
+ check( in_array( true, array_map( static function ( string $class ): bool {
+ return false !== strpos( $class, "\u{FFFD}" );
+ }, $document['pools']['classes'] ), true ), "Safe document {$seed}: class pool contains decoded U+FFFD token." );
+ }
+}
+// Guards against the loop above being vacuous: some seed must have produced a NUL.
+check( $safe_class_nul > 0, "Safe document generator emits randomized class NUL values ({$safe_class_nul} of 200)." );
+
+$wild_class_nul = 0;
+for ( $seed = 1; $seed <= 200; $seed++ ) {
+ $document = WildDocumentGenerator::generate( new Prng( (string) $seed, 'self-check-class-nul-wild' ) );
+ if ( false !== strpos( $document['html'], "\0" ) ) {
+ ++$wild_class_nul;
+ check( false === str_contains( implode( "\n", $document['pools']['attrValues'] ), "\0" ), "Wild document {$seed}: class NUL does not leak into attrValues pool." );
+ check( \CssSelectorFuzz\ast_strings_are_utf8( $document['pools']['classes'] ), "Wild document {$seed}: class pool strings stay valid UTF-8." );
+ check( in_array( true, array_map( static function ( string $class ): bool {
+ return false !== strpos( $class, "\u{FFFD}" );
+ }, $document['pools']['classes'] ), true ), "Wild document {$seed}: class pool contains decoded U+FFFD token." );
+ }
+}
+// Same vacuity guard for the wild generator.
+check( $wild_class_nul > 0, "Wild document generator emits randomized class NUL values ({$wild_class_nul} of 200)." );
+
+// --- Invalid-UTF-8 bucket: post-scrub AST expectations by construction ------
+// from_selectors() replaces each maximal subpart of an ill-formed UTF-8
+// sequence with one U+FFFD before parsing ( CSS Syntax §3.2 via the WHATWG
+// decoder ). The bucket injects raw ill-formed sequences and carries the
+// post-scrub AST, with the per-class subpart counts hard-coded in the
+// generator — independent of wp_scrub_utf8(), so this loop is a real
+// differential between the generator's WHATWG expectations and the core
+// scrub + parse pipeline.
+
+$fffd_ast_counts = array();
+$injection_sites = array();
+$byte_classes = array();
+
+// The class names AND byte values are duplicated here on purpose: tallying
+// from the generator's own table would silently shrink the assertion with a
+// deleted entry and self-validate on a drifted byte value.
+$expected_byte_classes = array(
+ 'lone-continuation' => "\x80",
+ 'truncated-2-byte' => "\xC3",
+ 'truncated-3-byte' => "\xE2\x8C",
+ 'truncated-4-byte' => "\xF0\x9F\x82",
+ 'invalid-lead-f5' => "\xF5",
+ 'invalid-lead-ff' => "\xFF",
+ 'overlong-min' => "\xC0\x80",
+ 'overlong-max' => "\xC1\xBF",
+ 'surrogate-half' => "\xED\xA0\x80",
+ 'beyond-max' => "\xF4\x90\x80\x80",
+);
+
+$count_fffd = static function ( $node ) use ( &$count_fffd ): int {
+ if ( is_string( $node ) ) {
+ return substr_count( $node, "\u{FFFD}" );
+ }
+ $total = 0;
+ if ( is_array( $node ) ) {
+ foreach ( $node as $child ) {
+ $total += $count_fffd( $child );
+ }
+ }
+ return $total;
+};
+
+// Run 150 seeded cases forced into the 'invalid-utf8' bucket. Each iteration
+// checks the generated case's own invariants, parses the raw selector with
+// both selector-list classes, compares the parsed AST with the case's expected
+// AST, and records variety tallies that are asserted after the loop.
+for ( $seed = 1; $seed <= 150; $seed++ ) {
+ // Document and selector draw from separate forks of the same seeded PRNG.
+ $prng = new Prng( (string) $seed, 'self-check-invalid-utf8' );
+ $document = DocumentGenerator::generate( $prng->fork( 'doc' ) );
+ $case = SelectorGenerator::generate( $prng->fork( 'sel' ), $document['pools'], null, 'invalid-utf8' );
+ // Printable rendering of the raw selector bytes, used only in check messages.
+ $printable = \CssSelectorFuzz\printable_bytes( $case['selector'] );
+
+ // Generator-side invariants: right bucket, selector really is ill-formed,
+ // both grammars are expected to parse, and the expected AST is valid UTF-8.
+ check( 'invalid-utf8' === $case['bucket'], "Seed {$seed}: forced invalid-utf8 bucket, got {$case['bucket']}." );
+ check( ! wp_is_valid_utf8( $case['selector'] ), "Seed {$seed}: selector must contain invalid UTF-8: {$printable}" );
+ check( true === $case['expectCompound'] && true === $case['expectComplex'], "Seed {$seed}: invalid-utf8 cases must expect to parse in both grammars." );
+ check( is_array( $case['ast'] ) && \CssSelectorFuzz\ast_strings_are_utf8( $case['ast'] ), "Seed {$seed}: expected AST must be valid UTF-8." );
+
+ // Parser side: the raw ( unscrubbed ) selector goes straight to from_selectors().
+ $compound = WP_CSS_Compound_Selector_List::from_selectors( $case['selector'] );
+ $complex = WP_CSS_Complex_Selector_List::from_selectors( $case['selector'] );
+ check( null !== $compound, "Seed {$seed}: compound parse after scrub for: {$printable}" );
+ check( null !== $complex, "Seed {$seed}: complex parse after scrub for: {$printable}" );
+ // Without a complex parse or an array AST there is nothing to compare or
+ // tally; the failures were already recorded by the checks above.
+ if ( null === $complex || ! is_array( $case['ast'] ) ) {
+ continue;
+ }
+
+ // Differential: AST extracted from the parsed list must equal the expected AST exactly.
+ $parsed_ast = \CssSelectorFuzz\AstExtractor::from_complex_list( $complex );
+ check( $case['ast'] === $parsed_ast, "Seed {$seed}: parsed AST equals maximal-subpart scrub expectation for: {$printable}" );
+
+ // Tally the total number of U+FFFD characters in the expected AST.
+ $fffd_ast_counts[ $count_fffd( $case['ast'] ) ] = true;
+ // Tally the kinds of subselectors in the first AST entry's 'self' part;
+ // an 'attr' sub with a non-null matcher is recorded as 'attr-value'.
+ foreach ( (array) $case['ast'][0]['self']['subs'] as $sub ) {
+ $injection_sites[ 'attr' === $sub['kind'] && null !== $sub['matcher'] ? 'attr-value' : $sub['kind'] ] = true;
+ }
+ // Tally which pinned byte sequences occur anywhere in the raw selector.
+ foreach ( $expected_byte_classes as $class_name => $class_bytes ) {
+ // Substring attribution is ambiguous only for lone-continuation,
+ // whose byte occurs inside three longer classes — good enough for
+ // an at-least-once variety tally.
+ if ( str_contains( $case['selector'], $class_bytes ) ) {
+ $byte_classes[ $class_name ] = true;
+ }
+ }
+}
+
+// Every U+FFFD total from one through four must have been observed.
+for ( $expected_count = 1; $expected_count <= 4; $expected_count++ ) {
+	check( array_key_exists( $expected_count, $fffd_ast_counts ), "Invalid-utf8 variety: a {$expected_count}-subpart byte class was generated." );
+}
+
+// Every injection site must have been hit at least once.
+foreach ( array( 'class', 'id', 'attr', 'attr-value' ) as $site ) {
+	check( array_key_exists( $site, $injection_sites ), "Invalid-utf8 variety: injection site {$site} was generated." );
+}
+
+// Every pinned byte class must have appeared in at least one selector.
+foreach ( $expected_byte_classes as $class_name => $class_bytes ) {
+	check( array_key_exists( $class_name, $byte_classes ), "Invalid-utf8 variety: byte class {$class_name} was generated." );
+}
+
+// --- Mutated bucket: raw invalid-byte splicing -------------------------------
+// mutate() must be able to splice raw ill-formed UTF-8 into a selector at
+// arbitrary byte offsets; these cases carry no AST expectation and exercise
+// crash / scrub-notice / differential paths only. The marker bytes here can
+// appear in NO rendered selector (the pools' multibyte characters use other
+// lead bytes), so their presence proves the mutation operation fired.
+
+// Count how many of 200 seeded 'mutated' cases contain one of the marker bytes.
+$mutated_with_invalid = 0;
+for ( $seed = 1; $seed <= 200; $seed++ ) {
+	$prng     = new Prng( (string) $seed, 'self-check-mutated-utf8' );
+	$document = DocumentGenerator::generate( $prng->fork( 'doc' ) );
+	$case     = SelectorGenerator::generate( $prng->fork( 'sel' ), $document['pools'], null, 'mutated' );
+
+	// No marker byte anywhere in this selector: nothing to count.
+	if ( false === strpbrk( $case['selector'], "\xC0\xC1\xED\xF4\xF5\xFF" ) ) {
+		continue;
+	}
+	++$mutated_with_invalid;
+}
+check( $mutated_with_invalid >= 10, "Mutated bucket splices raw invalid bytes ({$mutated_with_invalid} of 200 seeds)." );
+
+// --- Known-answer matching cases -------------------------------------------
+
+$known_html = ''
+ . '
'
+ . ''
+ . '';
+
+/**
+ * Collects the `data-fid` attribute of every element that select() visits.
+ *
+ * A full-parser HTML processor is created for the markup and select() is
+ * called repeatedly with the same selector until it reports no further match.
+ *
+ * @param string $html     Markup to scan.
+ * @param string $selector Selector passed to select().
+ * @return array `data-fid` values in the order select() produced them.
+ */
+function select_fids( string $html, string $selector ): array {
+	$fids   = array();
+	$walker = WP_HTML_Processor::create_full_parser( $html );
+	while ( true ) {
+		if ( ! $walker->select( $selector ) ) {
+			return $fids;
+		}
+		$fids[] = $walker->get_attribute( 'data-fid' );
+	}
+}
+
+// Each row: expected data-fid list, selector, check label.
+$known_cases = array(
+	array( array( 'e4' ), '#x', 'Known: #x.' ),
+	array( array( 'e3', 'e4' ), '.b', 'Known: .b.' ),
+	array( array( 'e4' ), 'div > span.b', 'Known: div > span.b.' ),
+	array( array( 'e7' ), 'section em', 'Known: section em.' ),
+	array( array(), 'section > em', 'Known: section > em matches nothing.' ),
+	array( array( 'e4' ), '[data-v|="hello"]', 'Known: [data-v|=hello].' ),
+	array( array( 'e7' ), '[lang^="en"]', 'Known: [lang^=en].' ),
+);
+foreach ( $known_cases as $known_case ) {
+	check( select_fids( $known_html, $known_case[1] ) === $known_case[0], $known_case[2] );
+}
+
+// --- Class-value decode boundary (ReferenceMatcher vs WP class_list) --------
+// WP's class_list() folds NUL -> U+FFFD and treats form feed (FF, U+000C) as a separator; the
+// reference matcher reimplements tokenization independently. Pin both engines
+// against each other on these boundary inputs; randomized generator sampling
+// above verifies that the same NUL boundary is present in the hot path. Each
+// case also checks the reference matcher agrees with select() over a
+// TreeCapture of the same markup.
+
+/**
+ * Computes the reference matcher's result for a selector over captured markup.
+ *
+ * The markup is captured with TreeCapture and the selector is parsed as a
+ * complex selector list; both steps always run. If the capture reports an
+ * error or the selector does not parse, a single '(error)' sentinel is
+ * returned instead of a match list.
+ *
+ * @param string $html     Markup to capture.
+ * @param string $selector Selector to parse.
+ * @return array ReferenceMatcher result, or array( '(error)' ).
+ */
+function ref_fids( string $html, string $selector ): array {
+	$tree          = \CssSelectorFuzz\TreeCapture::capture( $html );
+	$selector_list = WP_CSS_Complex_Selector_List::from_selectors( $selector );
+
+	$usable = null === $tree['error'] && null !== $selector_list;
+	if ( ! $usable ) {
+		return array( '(error)' );
+	}
+
+	return \CssSelectorFuzz\ReferenceMatcher::expected_html_matches_rows(
+		\CssSelectorFuzz\AstExtractor::from_complex_list( $selector_list ),
+		$tree['htmlRows'],
+		$tree['quirks']
+	);
+}
+
+$nul_html = "";
+$ff_html = "";
+
+// Each row: label, markup, selector, expected data-fid list.
+$nul_cases = array(
+	array( 'class NUL -> FFFD', $nul_html, ".foo\u{FFFD}bar", array( 'n0' ) ),
+	array( 'class trailing NUL', $nul_html, ".x\u{FFFD}", array( 'n1' ) ),
+	array( 'class raw NUL no-match', $nul_html, '.foobar', array() ),
+	array( 'class FF separator (first)', $ff_html, '.alpha', array( 'f0' ) ),
+	array( 'class FF separator (second)', $ff_html, '.beta', array( 'f0' ) ),
+);
+foreach ( $nul_cases as list( $label, $html, $selector, $expected ) ) {
+	// Run select() first, then the reference matcher, and compare both ways.
+	$wp  = select_fids( $html, $selector );
+	$ref = ref_fids( $html, $selector );
+	check( $wp === $expected, "Decode boundary ({$label}): select() == expected." );
+	check( $wp === $ref, "Decode boundary ({$label}): ReferenceMatcher == select()." );
+}
+
+// --- Worker end-to-end on a few seeds ---------------------------------------
+
+// Running the same seed twice must produce the same case digest.
+foreach ( range( 1, 5 ) as $seed ) {
+	$first_digest  = Worker::run_case( $seed )['digest'];
+	$second_digest = Worker::run_case( $seed )['digest'];
+	check( $second_digest === $first_digest, "Seed {$seed}: case digest is deterministic." );
+}
+
+// Report the outcome and turn the failure count into the process exit status.
+if ( 0 !== $failures ) {
+	echo "self-check FAILED: {$failures} failure(s)\n";
+	exit( 1 );
+}
+echo "self-check OK\n";
+exit( 0 );
diff --git a/tools/css-selector-fuzz/worker.php b/tools/css-selector-fuzz/worker.php
new file mode 100644
index 0000000000000..bdcde442aa943
--- /dev/null
+++ b/tools/css-selector-fuzz/worker.php
@@ -0,0 +1,34 @@
+#!/usr/bin/env php
+ 'css-selector-fuzz-worker-fatal',
+ 'error' => \CssSelectorFuzz\Worker::describe_throwable( $e ),
+ )
+ ) . "\n"
+ );
+ exit( 1 );
+}