/**
 * Advances through the document, stopping on the next tag that matches a CSS selector string.
 *
 * The most recently used selector string and its parsed form are kept in
 * static variables, so calling this in a loop with the same selector string
 * parses that string only once.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment(
 *         '<meta charset="utf-8"><title>Example</title><meta property="og:type" content="website"><meta property="og:description" content="An example.">'
 *     );
 *     while ( $processor->select( 'meta[property^="og:" i]' ) ) {
 *         // Loop is entered twice.
 *         var_dump(
 *             $processor->get_tag(),                      // string(4) "META"
 *             $processor->get_attribute( 'property' ),    // string(7) "og:type" / string(14) "og:description"
 *             $processor->get_attribute( 'content' ),     // string(7) "website" / string(11) "An example."
 *         );
 *     }
 *
 * @since {WP_VERSION}
 *
 * @param string $selector_string Selector string.
 * @return bool Whether a matching tag was found.
 */
public function select( $selector_string ): bool {
	static $cached_selector_string = null;
	static $cached_selector        = null;

	// Re-parse only when the selector string differs from the previous call.
	if ( $selector_string !== $cached_selector_string ) {
		$cached_selector        = WP_CSS_Complex_Selector_List::from_selectors( $selector_string );
		$cached_selector_string = $selector_string;
	}

	$selector = $cached_selector;

	// A null selector means the string was invalid or uses unsupported syntax.
	if ( null === $selector ) {
		_doing_it_wrong(
			__METHOD__,
			sprintf( 'Received unsupported or invalid selector "%s".', $selector_string ),
			'{WP_VERSION}'
		);
		return false;
	}

	// Step tag by tag until one matches or the document is exhausted.
	do {
		if ( ! $this->next_tag() ) {
			return false;
		}
	} while ( ! $selector->matches( $this ) );

	return true;
}
/**
 * Advances through the document, stopping on the next tag that matches a CSS selector string.
 *
 * Selectors are parsed as a compound selector list. The most recently used
 * selector string and its parsed form are kept in static variables, so
 * calling this in a loop with the same selector string parses it only once.
 *
 * Example:
 *
 *     $processor = new WP_HTML_Tag_Processor(
 *         '<meta charset="utf-8"><title>Example</title><meta property="og:type" content="website"><meta property="og:description" content="An example.">'
 *     );
 *     while ( $processor->select( 'meta[property^="og:" i]' ) ) {
 *         // Loop is entered twice.
 *         var_dump(
 *             $processor->get_tag(),                      // string(4) "META"
 *             $processor->get_attribute( 'property' ),    // string(7) "og:type" / string(14) "og:description"
 *             $processor->get_attribute( 'content' ),     // string(7) "website" / string(11) "An example."
 *         );
 *     }
 *
 * @since {WP_VERSION}
 *
 * @param string $selector_string Selector string.
 * @return bool Whether a matching tag was found.
 */
public function select( $selector_string ): bool {
	static $cached_selector_string = null;
	static $cached_selector        = null;

	// Re-parse only when the selector string differs from the previous call.
	if ( $selector_string !== $cached_selector_string ) {
		$cached_selector        = WP_CSS_Compound_Selector_List::from_selectors( $selector_string );
		$cached_selector_string = $selector_string;
	}

	$selector = $cached_selector;

	// A null selector means the string was invalid or uses unsupported syntax.
	if ( null === $selector ) {
		_doing_it_wrong(
			__METHOD__,
			sprintf( 'Received unsupported or invalid selector "%s".', $selector_string ),
			'{WP_VERSION}'
		);
		return false;
	}

	// Step tag by tag until one matches or the document is exhausted.
	do {
		if ( ! $this->next_tag() ) {
			return false;
		}
	} while ( ! $selector->matches( $this ) );

	return true;
}
/**
 * Builds an attribute selector from its already-parsed parts.
 *
 * The constructor is private; instances are produced by the static parse() method.
 *
 * @param string      $name     Name of the attribute to match.
 * @param string|null $matcher  One of the class MATCH_* constants, or null
 *                              when only the attribute's presence is tested.
 * @param string|null $value    Value to compare against, or null.
 * @param string|null $modifier One of the class MODIFIER_* constants, or null
 *                              when no case modifier was given.
 */
private function __construct( string $name, ?string $matcher = null, ?string $value = null, ?string $modifier = null ) {
	$this->name     = $name;
	$this->matcher  = $matcher;
	$this->value    = $value;
	$this->modifier = $modifier;
}
/**
 * Determines if the processor's current position matches the selector.
 *
 * Matching proceeds in this order:
 *  1. An element without the attribute never matches.
 *  2. A selector without a value (`[attr]`) matches any element that has the attribute.
 *  3. Substring matchers (`^=`, `$=`, `*=`) with an empty value never match.
 *  4. Otherwise the attribute value is compared according to the matcher,
 *     case-insensitively when the `i` modifier is present or when no modifier
 *     is present and the attribute is listed in
 *     HTML_CASE_INSENSITIVE_ATTRIBUTE_VALUES on an element in the HTML namespace.
 *
 * @param WP_HTML_Tag_Processor $processor The processor.
 * @return bool True if the processor's current position matches the selector.
 */
public function matches( WP_HTML_Tag_Processor $processor ): bool {
	$attr_value = $processor->get_attribute( $this->name );
	if ( null === $attr_value ) {
		return false;
	}

	// Presence-only selector: having the attribute is enough.
	if ( null === $this->value ) {
		return true;
	}

	/*
	 * The substring matchers match nothing when the value is empty:
	 *
	 * > If "val" is the empty string then the selector does not represent anything.
	 *
	 * https://www.w3.org/TR/selectors-4/#attribute-substrings
	 */
	if (
		'' === $this->value &&
		(
			self::MATCH_PREFIXED_BY === $this->matcher ||
			self::MATCH_SUFFIXED_BY === $this->matcher ||
			self::MATCH_CONTAINS === $this->matcher
		)
	) {
		return false;
	}

	// A `true` attribute value is compared as the empty string.
	if ( true === $attr_value ) {
		$attr_value = '';
	}

	/*
	 * Without an explicit modifier, HTML defines some attributes' values
	 * as ASCII case-insensitive on HTML elements. An explicit `s`
	 * modifier forces case-sensitive matching even for those.
	 */
	$case_insensitive = self::MODIFIER_CASE_INSENSITIVE === $this->modifier || (
		null === $this->modifier &&
		'html' === $processor->get_namespace() &&
		isset( self::HTML_CASE_INSENSITIVE_ATTRIBUTE_VALUES[ strtolower( $this->name ) ] )
	);

	switch ( $this->matcher ) {
		case self::MATCH_EXACT:
			return $case_insensitive
				? 0 === strcasecmp( $attr_value, $this->value )
				: $attr_value === $this->value;

		case self::MATCH_ONE_OF_EXACT:
			// The helper never yields an empty token, so an empty value never matches.
			foreach ( $this->whitespace_delimited_list( $attr_value ) as $val ) {
				if (
					$case_insensitive
						? 0 === strcasecmp( $val, $this->value )
						: $val === $this->value
				) {
					return true;
				}
			}
			return false;

		case self::MATCH_EXACT_OR_HYPHEN_SUFFIXED:
			$exact_length   = strlen( $this->value );
			$matches_prefix = substr_compare( $attr_value, $this->value, 0, $exact_length, $case_insensitive );
			// The prefix must be the whole value or be immediately followed by `-`.
			return (
				0 === $matches_prefix &&
				( strlen( $attr_value ) === $exact_length || '-' === $attr_value[ $exact_length ] )
			);

		case self::MATCH_PREFIXED_BY:
			return 0 === substr_compare( $attr_value, $this->value, 0, strlen( $this->value ), $case_insensitive );

		case self::MATCH_SUFFIXED_BY:
			// A negative offset anchors the comparison at the end of the attribute value.
			return 0 === substr_compare( $attr_value, $this->value, -strlen( $this->value ), null, $case_insensitive );

		case self::MATCH_CONTAINS:
			return false !== (
				$case_insensitive
					? stripos( $attr_value, $this->value )
					: strpos( $attr_value, $this->value )
			);
	}

	/*
	 * An unrecognized matcher matches nothing. Without this, control would
	 * fall off the end of a method declared to return bool.
	 */
	return false;
}

/**
 * Splits a string into a list of whitespace delimited values.
 *
 * This is useful for the {@see WP_CSS_Attribute_Selector::MATCH_ONE_OF_EXACT} matcher.
 * Leading, trailing and repeated whitespace is skipped, so no empty
 * values are ever yielded.
 *
 * @param string $input The string to split.
 *
 * @return Generator Yields each whitespace-delimited value from the input string.
 */
private function whitespace_delimited_list( string $input ): Generator {
	$end = strlen( $input );

	// Start by skipping whitespace.
	$offset = strspn( $input, self::WHITESPACE_CHARACTERS );

	while ( $offset < $end ) {
		// Find the byte length until the next boundary.
		$length = strcspn( $input, self::WHITESPACE_CHARACTERS, $offset );
		$value  = substr( $input, $offset, $length );

		// Move past trailing whitespace.
		$offset += $length + strspn( $input, self::WHITESPACE_CHARACTERS, $offset + $length );

		yield $value;
	}
}
/**
 * Parses an attribute selector starting at the given offset.
 *
 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
 *
 * Reaching the end of input behaves like a closing `]`: `[att=val` yields
 * the same selector as `[att=val]`. Input that stops inside the selector
 * grammar itself (e.g. `[` or `[att=`) is rejected.
 *
 * https://www.w3.org/TR/css-syntax-3/#consume-simple-block
 *
 * @param string $input  The selector string.
 * @param int    $offset The offset into the string. The offset is passed by reference and
 *                       will be updated if the parse is successful.
 * @return static|null The selector instance, or null if the parse was unsuccessful.
 */
public static function parse( string $input, int &$offset ) {
	$length = strlen( $input );

	// At least two bytes (`[x`) are required, and the first must open the block.
	if ( $offset + 1 >= $length || '[' !== $input[ $offset ] ) {
		return null;
	}

	$cursor = $offset + 1;

	self::parse_whitespace( $input, $cursor );
	$name = self::parse_ident( $input, $cursor );
	if ( null === $name ) {
		return null;
	}
	self::parse_whitespace( $input, $cursor );

	// `[att]`, or `[att` closed by the end of input: presence-only selector.
	if ( $cursor >= $length || ']' === $input[ $cursor ] ) {
		$offset = $cursor < $length ? $cursor + 1 : $cursor;
		return new self( $name );
	}

	if ( '=' === $input[ $cursor ] ) {
		$matcher = self::MATCH_EXACT;
		++$cursor;
	} else {
		// Every other matcher is a single byte followed by `=`.
		$two_byte_matchers = array(
			'~' => self::MATCH_ONE_OF_EXACT,
			'|' => self::MATCH_EXACT_OR_HYPHEN_SUFFIXED,
			'^' => self::MATCH_PREFIXED_BY,
			'$' => self::MATCH_SUFFIXED_BY,
			'*' => self::MATCH_CONTAINS,
		);

		$lead = $input[ $cursor ];
		if (
			$cursor + 1 >= $length ||
			'=' !== $input[ $cursor + 1 ] ||
			! isset( $two_byte_matchers[ $lead ] )
		) {
			return null;
		}

		$matcher = $two_byte_matchers[ $lead ];
		$cursor += 2;
	}

	// The value is a quoted string or, failing that, an identifier.
	self::parse_whitespace( $input, $cursor );
	$value = self::parse_string( $input, $cursor );
	if ( null === $value ) {
		$value = self::parse_ident( $input, $cursor );
	}
	if ( null === $value ) {
		return null;
	}
	self::parse_whitespace( $input, $cursor );

	// Optional case modifier: `i`/`I` or `s`/`S`.
	$modifier = null;
	if ( $cursor < $length ) {
		$flag = $input[ $cursor ];
		if ( 'i' === $flag || 'I' === $flag ) {
			$modifier = self::MODIFIER_CASE_INSENSITIVE;
		} elseif ( 's' === $flag || 'S' === $flag ) {
			$modifier = self::MODIFIER_CASE_SENSITIVE;
		}

		if ( null !== $modifier ) {
			++$cursor;
			self::parse_whitespace( $input, $cursor );
		}
	}

	// Anything other than `]` or the end of input at this point is invalid.
	if ( $cursor < $length ) {
		if ( ']' !== $input[ $cursor ] ) {
			return null;
		}
		++$cursor;
	}

	$offset = $cursor;
	return new self( $name, $matcher, $value, $modifier );
}
!== $input[ $offset ] ) { + return null; + } + + $updated_offset = $offset + 1; + $result = self::parse_ident( $input, $updated_offset ); + + if ( null === $result ) { + return null; + } + + $offset = $updated_offset; + return new self( $result ); + } +} diff --git a/src/wp-includes/html-api/css/class-wp-css-complex-selector-list.php b/src/wp-includes/html-api/css/class-wp-css-complex-selector-list.php new file mode 100644 index 0000000000000..da5e17011e0d8 --- /dev/null +++ b/src/wp-includes/html-api/css/class-wp-css-complex-selector-list.php @@ -0,0 +1,81 @@ + in the grammar. + * See {@see WP_CSS_Compound_Selector_List} for more details on the grammar. + * + * This class supports the same selector syntax as {@see WP_CSS_Compound_Selector_List} as well as + * the following combinators: + * - Descendant (`ancestor descendant`) + * - Child (`parent > child`) + * + * Combinators may only be used with type selectors in the non-final position, for example: + * - `div [type=input]` is valid because the `div` type selector appears in a non-final position. + * - `[disabled] option` is NOT valid, because the `[disabled]` attribute selector appears + * in a non-final position. + * + * These combinators are not supported: + * - Next sibling (`former-sibling + next-sibling`) + * - Subsequent sibling (`former-sibling ~ subsequent-sibling`) + * + * @since {WP_VERSION} + * + * @access private + */ +class WP_CSS_Complex_Selector_List extends WP_CSS_Compound_Selector_List { + /** + * Parses a selector string to create a selector instance. + * + * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method. + * + * @param string $input The selector string. + * @param int $offset The offset into the string. The offset is passed by reference and + * will be updated if the parse is successful. + * @return static|null The selector instance, or null if the parse was unsuccessful. 
/**
 * Parses a comma-separated list of complex selectors.
 *
 * To create an instance of this class, use
 * {@see WP_CSS_Complex_Selector_List::from_selectors()}. That inherited
 * factory dispatches to this method through late static binding.
 *
 * Parsing works on a local copy of the offset, so the caller's offset is
 * left untouched when null is returned.
 *
 * @param string $input  The selector string.
 * @param int    $offset The offset into the string. The offset is passed by reference and
 *                       will be updated if the parse is successful.
 * @return static|null The selector instance, or null if the parse was unsuccessful.
 */
public static function parse( string $input, int &$offset ) {
	$updated_offset = $offset;

	$selector = WP_CSS_Complex_Selector::parse( $input, $updated_offset );
	if ( null === $selector ) {
		return null;
	}
	self::parse_whitespace( $input, $updated_offset );

	$selectors = array( $selector );
	$length    = strlen( $input );
	while ( $updated_offset < $length ) {
		// Each loop should stop on a `,` selector list delimiter.
		if ( ',' !== $input[ $updated_offset ] ) {
			return null;
		}
		++$updated_offset;
		self::parse_whitespace( $input, $updated_offset );

		// A delimiter must be followed by another selector.
		$selector = WP_CSS_Complex_Selector::parse( $input, $updated_offset );
		if ( null === $selector ) {
			return null;
		}
		$selectors[] = $selector;
		self::parse_whitespace( $input, $updated_offset );
	}

	// Commit the consumed length only once the whole list has parsed.
	$offset = $updated_offset;
	return new self( $selectors );
}
/**
 * Determines if the processor's current position matches the selector.
 *
 * The self selector is tested against the current element first. When
 * context selectors exist, they are then tested against the processor's
 * breadcrumbs, ordered from the nearest ancestor outward.
 *
 * @param WP_HTML_Processor $processor The processor.
 * @return bool True if the processor's current position matches the selector.
 */
public function matches( $processor ): bool {
	// The selector in the final position must match this location.
	if ( ! $this->self_selector->matches( $processor ) ) {
		return false;
	}

	$context = $this->context_selectors;
	if ( null !== $context && array() !== $context ) {
		// Reverse so the current element comes first, then drop it to leave only ancestors.
		$ancestors = array_reverse( $processor->get_breadcrumbs() );
		array_shift( $ancestors );

		return $this->explore_matches( $context, $ancestors );
	}

	// No context selectors: matching the self selector is sufficient.
	return true;
}

/**
 * Checks for matches by recursively comparing context selectors with breadcrumbs.
 *
 * Each call consumes the first selector/combinator pair. A child
 * combinator must match the first breadcrumb; a descendant combinator may
 * match any breadcrumb, and every candidate is tried until one leads to
 * a complete match.
 *
 * @param array{WP_CSS_Type_Selector, string}[] $selectors   Selectors to match.
 * @param string[]                              $breadcrumbs Breadcrumbs.
 * @return bool True if a match is found, otherwise false.
 */
private function explore_matches( array $selectors, array $breadcrumbs ): bool {
	// Every selector has been satisfied.
	if ( array() === $selectors ) {
		return true;
	}
	// Selectors remain but there are no ancestors left to test.
	if ( array() === $breadcrumbs ) {
		return false;
	}

	list( $selector, $combinator ) = $selectors[0];
	$remaining_selectors           = array_slice( $selectors, 1 );

	switch ( $combinator ) {
		case self::COMBINATOR_CHILD:
			return $selector->matches_tag( $breadcrumbs[0] )
				&& $this->explore_matches( $remaining_selectors, array_slice( $breadcrumbs, 1 ) );

		case self::COMBINATOR_DESCENDANT:
			// Find _all_ the breadcrumbs that match and recurse from each of them.
			foreach ( $breadcrumbs as $index => $tag_name ) {
				if (
					$selector->matches_tag( $tag_name ) &&
					$this->explore_matches( $remaining_selectors, array_slice( $breadcrumbs, $index + 1 ) )
				) {
					return true;
				}
			}
			return false;

		default:
			_doing_it_wrong(
				__METHOD__,
				sprintf(
					// translators: %s: A CSS selector combinator like ">" or "+".
					__( 'Unsupported combinator "%s" found.' ),
					$combinator
				),
				'{WP_VERSION}'
			);
			return false;
	}
}
/**
 * Parses a complex selector: compound selectors joined by combinators.
 *
 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
 *
 * Only the child (`>`) and descendant (whitespace) combinators are
 * accepted. Every compound selector in a non-final position must consist
 * of a type selector only.
 *
 * @param string $input  The selector string.
 * @param int    $offset The offset into the string. The offset is passed by reference and
 *                       will be updated if the parse is successful.
 * @return static|null The selector instance, or null if the parse was unsuccessful.
 */
public static function parse( string $input, int &$offset ) {
	$length = strlen( $input );
	if ( $offset >= $length ) {
		return null;
	}

	$cursor  = $offset;
	$current = WP_CSS_Compound_Selector::parse( $input, $cursor );
	if ( null === $current ) {
		return null;
	}

	/** @var array{WP_CSS_Type_Selector, string}[] */
	$context = array();

	$had_whitespace = self::parse_whitespace( $input, $cursor );
	while ( $cursor < $length ) {
		$char = $input[ $cursor ];

		// Sibling (`+` and `~`) combinators are not supported at this time.
		if ( self::COMBINATOR_NEXT_SIBLING === $char || self::COMBINATOR_SUBSEQUENT_SIBLING === $char ) {
			return null;
		}

		if ( self::COMBINATOR_CHILD === $char ) {
			$combinator = self::COMBINATOR_CHILD;
			++$cursor;
			self::parse_whitespace( $input, $cursor );

			// An explicit combinator must be followed by a selector.
			$next = WP_CSS_Compound_Selector::parse( $input, $cursor );
			if ( null === $next ) {
				return null;
			}
		} elseif ( $had_whitespace ) {
			/*
			 * Whitespace is ambiguous: it is a descendant combinator only
			 * if a selector follows, otherwise it is insignificant.
			 */
			$combinator = self::COMBINATOR_DESCENDANT;
			$next       = WP_CSS_Compound_Selector::parse( $input, $cursor );
			if ( null === $next ) {
				break;
			}
		} else {
			// Neither a combinator nor whitespace: the complex selector ends here.
			break;
		}

		// The previous selector becomes context, where only a bare type selector is allowed.
		if ( null === $current->type_selector || null !== $current->subclass_selectors ) {
			return null;
		}

		$context[] = array( $current->type_selector, $combinator );
		$current   = $next;

		$had_whitespace = self::parse_whitespace( $input, $cursor );
	}

	$offset = $cursor;

	// Context selectors are stored nearest-first, i.e. right to left in the selector text.
	return new self( $current, array_reverse( $context ) );
}
/**
 * Determines if the processor's current position matches any selector in the list.
 *
 * Only `#tag` tokens can match; for any other token type the result is false.
 *
 * @param WP_HTML_Tag_Processor $processor The processor.
 * @return bool True if the processor's current position matches the selector.
 */
public function matches( $processor ): bool {
	if ( '#tag' === $processor->get_token_type() ) {
		// A selector list matches as soon as one of its members matches.
		foreach ( $this->selectors as $candidate ) {
			if ( $candidate->matches( $processor ) ) {
				return true;
			}
		}
	}

	return false;
}
/**
 * Takes a CSS selector string and returns an instance of itself or `null` if the selector
 * string is invalid or unsupported.
 *
 * The input is first passed through normalize_selector_input(), which is
 * defined outside this method. NOTE(review): the original documentation
 * states that ill-formed UTF-8 byte sequences are replaced with U+FFFD and
 * reported with `_doing_it_wrong()`; confirm against that helper.
 *
 * An input that is empty after normalization yields null. Otherwise
 * parsing starts at offset 0 and is dispatched through late static binding,
 * so a subclass that overrides parse() uses its own grammar.
 *
 * @param string $input CSS selectors.
 * @return static|null
 */
public static function from_selectors( string $input ) {
	$input = self::normalize_selector_input( $input );

	if ( '' === $input ) {
		return null;
	}

	$offset = 0;
	return static::parse( $input, $offset );
}

/**
 * Parses a comma-separated list of compound selectors.
 *
 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
 *
 * Parsing works on a local copy of the offset, so the caller's offset is
 * left untouched when null is returned.
 *
 * @param string $input  The selector string.
 * @param int    $offset The offset into the string. The offset is passed by reference and
 *                       will be updated if the parse is successful.
 * @return static|null The selector instance, or null if the parse was unsuccessful.
 */
public static function parse( string $input, int &$offset ) {
	$updated_offset = $offset;

	$selector = WP_CSS_Compound_Selector::parse( $input, $updated_offset );
	if ( null === $selector ) {
		return null;
	}
	self::parse_whitespace( $input, $updated_offset );

	$selectors = array( $selector );
	$length    = strlen( $input );
	while ( $updated_offset < $length ) {
		// Each loop should stop on a `,` selector list delimiter.
		if ( ',' !== $input[ $updated_offset ] ) {
			return null;
		}
		++$updated_offset;
		self::parse_whitespace( $input, $updated_offset );

		// A delimiter must be followed by another selector.
		$selector = WP_CSS_Compound_Selector::parse( $input, $updated_offset );
		if ( null === $selector ) {
			return null;
		}
		$selectors[] = $selector;
		self::parse_whitespace( $input, $updated_offset );
	}

	// Commit the consumed length only once the whole list has parsed.
	$offset = $updated_offset;
	return new self( $selectors );
}
/**
 * Parses a compound selector: an optional type selector followed by any
 * number of subclass selectors, with at least one of the two present.
 *
 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
 *
 * @param string $input  The selector string.
 * @param int    $offset The offset into the string. The offset is passed by reference and
 *                       will be updated if the parse is successful.
 * @return static|null The selector instance, or null if the parse was unsuccessful.
 */
public static function parse( string $input, int &$offset ) {
	if ( $offset >= strlen( $input ) ) {
		return null;
	}

	$cursor        = $offset;
	$type_selector = WP_CSS_Type_Selector::parse( $input, $cursor );

	// Collect subclass selectors until one fails to parse.
	$subclass_selectors = array();
	while ( true ) {
		$parsed = self::parse_subclass_selector( $input, $cursor );
		if ( null === $parsed ) {
			break;
		}
		$subclass_selectors[] = $parsed;
	}

	// There must be at least one selector.
	if ( null === $type_selector && array() === $subclass_selectors ) {
		return null;
	}

	$offset = $cursor;
	return new self( $type_selector, $subclass_selectors );
}

/**
 * Parses a subclass selector.
 *
 * > <subclass-selector> = <id-selector> | <class-selector> | <attribute-selector>
 *
 * The byte at the offset selects which parser is tried: `.` for a class
 * selector, `#` for an ID selector and `[` for an attribute selector.
 *
 * @return WP_CSS_ID_Selector|WP_CSS_Class_Selector|WP_CSS_Attribute_Selector|null
 */
private static function parse_subclass_selector( string $input, int &$offset ) {
	if ( $offset >= strlen( $input ) ) {
		return null;
	}

	$lead = $input[ $offset ];

	if ( '.' === $lead ) {
		return WP_CSS_Class_Selector::parse( $input, $offset );
	}
	if ( '#' === $lead ) {
		return WP_CSS_ID_Selector::parse( $input, $offset );
	}
	if ( '[' === $lead ) {
		return WP_CSS_Attribute_Selector::parse( $input, $offset );
	}

	return null;
}
+	 */
+	public function matches( WP_HTML_Tag_Processor $processor ): bool {
+		$element_id = $processor->get_attribute( 'id' );
+		if ( ! is_string( $element_id ) ) {
+			return false;
+		}
+
+		// In quirks mode the ID is compared without regard to letter case.
+		if ( $processor->is_quirks_mode() ) {
+			return 0 === strcasecmp( $element_id, $this->id );
+		}
+
+		return $element_id === $this->id;
+	}
+
+	/**
+	 * Parses a selector string to create a selector instance.
+	 *
+	 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string. The offset is passed by reference and
+	 *                       will be updated if the parse is successful.
+	 * @return static|null The selector instance, or null if the parse was unsuccessful.
+	 */
+	public static function parse( string $input, int &$offset ) {
+		$id = self::parse_hash_token( $input, $offset );
+
+		return null === $id ? null : new self( $id );
+	}
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-selector-parser-matcher.php b/src/wp-includes/html-api/css/class-wp-css-selector-parser-matcher.php
new file mode 100644
index 0000000000000..14d9d28a771cc
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-selector-parser-matcher.php
@@ -0,0 +1,573 @@
+ 0;
+		$offset += $length;
+		return $advanced;
+	}
+
+	/**
+	 * Tokenization of hash tokens
+	 *
+	 * > U+0023 NUMBER SIGN (#)
+	 * >   If the next input code point is an ident code point or the next two input code points are a valid escape, then:
+	 * >     1. Create a <hash-token>.
+	 * >     2. If the next 3 input code points would start an ident sequence, set the
+	 * >        <hash-token>’s type flag to "id".
+	 * >     3. Consume an ident sequence, and set the <hash-token>’s value to the
+	 * >        returned string.
+	 * >     4. Return the <hash-token>.
+	 * >   Otherwise, return a <delim-token> with its value set to the current input code point.
+	 *
+	 * This implementation is not interested in the <delim-token>, a '#' delim token is not relevant for selectors.
+	 */
+	final protected static function parse_hash_token( string $input, int &$offset ): ?string {
+		// A hash token needs a "#" with at least one more byte after it.
+		$has_room = strlen( $input ) > $offset + 1;
+		if ( ! $has_room || '#' !== $input[ $offset ] ) {
+			return null;
+		}
+
+		$after_hash = $offset + 1;
+		$name       = self::parse_ident( $input, $after_hash );
+
+		// Commit the advance only once the ident sequence has parsed.
+		if ( null !== $name ) {
+			$offset = $after_hash;
+		}
+
+		return $name;
+	}
+
+	/**
+	 * Parse a string token
+	 *
+	 * > 4.3.5. Consume a string token
+	 * > This section describes how to consume a string token from a stream of code points. It returns either a <string-token> or <bad-string-token>.
+	 * >
+	 * > This algorithm may be called with an ending code point, which denotes the code point that ends the string. If an ending code point is not specified, the current input code point is used.
+	 * >
+	 * > Initially create a <string-token> with its value set to the empty string.
+	 * >
+	 * > Repeatedly consume the next input code point from the stream:
+	 * >
+	 * > ending code point
+	 * >   Return the <string-token>.
+	 * > EOF
+	 * >   This is a parse error. Return the <string-token>.
+	 * > newline
+	 * >   This is a parse error. Reconsume the current input code point, create a <bad-string-token>, and return it.
+	 * > U+005C REVERSE SOLIDUS (\)
+	 * >   If the next input code point is EOF, do nothing.
+	 * >   Otherwise, if the next input code point is a newline, consume it.
+	 * >   Otherwise, (the stream starts with a valid escape) consume an escaped code point and append the returned code point to the <string-token>’s value.
+	 * >
+	 * > anything else
+	 * >   Append the current input code point to the <string-token>’s value.
+	 *
+	 * https://www.w3.org/TR/css-syntax-3/#consume-string-token
+	 *
+	 * This implementation will never return a <bad-string-token> because
+	 * the <bad-string-token> is not a part of the selector grammar. That
+	 * case is treated as failure to parse and null is returned.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The byte offset of the opening quote. Passed by reference and
+	 *                       updated only when a string token is returned.
+	 * @return string|null The parsed string token value, or null if parsing failed.
+	 */
+	final protected static function parse_string( string $input, int &$offset ): ?string {
+		if ( $offset >= strlen( $input ) ) {
+			return null;
+		}
+
+		// A string token opens with a double or single quote; the same quote ends it.
+		$ending_code_point = $input[ $offset ];
+		if ( '"' !== $ending_code_point && "'" !== $ending_code_point ) {
+			return null;
+		}
+
+		$string_token = '';
+
+		// Scan from just past the opening quote; $offset itself is only written on success.
+		$updated_offset = $offset + 1;
+		// The bytes that need special handling: backslash, newline, and the ending quote.
+		$anything_else_mask = "\\\n{$ending_code_point}";
+		while ( $updated_offset < strlen( $input ) ) {
+			// Copy the whole run of ordinary bytes up to the next special byte in one step.
+			$anything_else_length = strcspn( $input, $anything_else_mask, $updated_offset );
+			if ( $anything_else_length > 0 ) {
+				$string_token .= substr( $input, $updated_offset, $anything_else_length );
+				$updated_offset += $anything_else_length;
+
+				// The input ended inside the string: what was collected so far is returned below.
+				if ( $updated_offset >= strlen( $input ) ) {
+					break;
+				}
+			}
+
+			switch ( $input[ $updated_offset ] ) {
+				case '\\':
+					++$updated_offset;
+					// > If the next input code point is EOF, do nothing.
+					if ( $updated_offset >= strlen( $input ) ) {
+						break;
+					}
+					// > Otherwise, if the next input code point is a newline, consume it.
+					if ( "\n" === $input[ $updated_offset ] ) {
+						++$updated_offset;
+						break;
+					} else {
+						$string_token .= self::consume_escaped_codepoint( $input, $updated_offset );
+					}
+					break;
+
+				/*
+				 * This case would return a <bad-string-token>.
+				 * The <bad-string-token> is not a part of the selector grammar
+				 * so we do not return it and instead treat this as a
+				 * failure to parse a string token.
+				 */
+				case "\n":
+					return null;
+
+				// The ending quote: step over it and leave both the switch and the loop.
+				case $ending_code_point:
+					++$updated_offset;
+					break 2;
+			}
+		}
+
+		$offset = $updated_offset;
+		return $string_token;
+	}
+
+	/**
+	 * Consume an escaped code point.
+	 *
+	 * > 4.3.7. Consume an escaped code point
+	 * > This section describes how to consume an escaped code point. It assumes that the U+005C
+	 * > REVERSE SOLIDUS (\) has already been consumed and that the next input code point has
+	 * > already been verified to be part of a valid escape. It will return a code point.
+	 * >
+	 * > Consume the next input code point.
+	 * >
+	 * > hex digit
+	 * >   Consume as many hex digits as possible, but no more than 5. Note that this means 1-6
+	 * >   hex digits have been consumed in total. If the next input code point is whitespace,
+	 * >   consume it as well. Interpret the hex digits as a hexadecimal number. If this number is
+	 * >   zero, or is for a surrogate, or is greater than the maximum allowed code point, return
+	 * >   U+FFFD REPLACEMENT CHARACTER (�). Otherwise, return the code point with that value.
+	 * > EOF
+	 * >   This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
+	 * > anything else
+	 * >   Return the current input code point.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The byte offset just past the backslash. Passed by reference and
+	 *                       advanced past every byte that is consumed; left unchanged when
+	 *                       the offset is already at the end of the input.
+	 * @return string The code point the escape stands for. U+FFFD REPLACEMENT CHARACTER is
+	 *                returned at the end of input, for a hex value that is zero, a surrogate,
+	 *                or above the maximum code point, and for bytes that are not valid UTF-8.
+	 */
+	final protected static function consume_escaped_codepoint( $input, &$offset ): string {
+		/*
+		 * > EOF
+		 * >   This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
+		 */
+		if ( $offset >= strlen( $input ) ) {
+			return "\u{FFFD}";
+		}
+
+		// A hex escape is between one and six hex digits long.
+		$hex_length = strspn( $input, '0123456789abcdefABCDEF', $offset, 6 );
+		if ( $hex_length > 0 ) {
+			/**
+			 * The 6-character hex string has a maximum value of 0xFFFFFF.
+			 * It is likely to fit in an int value and not be a float.
+			 *
+			 * @var int
+			 */
+			$codepoint_value = hexdec( substr( $input, $offset, $hex_length ) );
+
+			/*
+			 * > A surrogate is a leading surrogate or a trailing surrogate.
+			 * > A leading surrogate is a code point that is in the range U+D800 to U+DBFF, inclusive.
+			 * > A trailing surrogate is a code point that is in the range U+DC00 to U+DFFF, inclusive.
+			 *
+			 * The surrogate ranges are adjacent, so the complete range is 0xD800 to 0xDFFF, inclusive.
+			 */
+			$codepoint_char = (
+				0 === $codepoint_value ||
+				$codepoint_value > self::UTF8_MAX_CODEPOINT_VALUE ||
+				( 0xD800 <= $codepoint_value && $codepoint_value <= 0xDFFF )
+			)
+				? "\u{FFFD}"
+				: mb_chr( $codepoint_value, 'UTF-8' );
+
+			$offset += $hex_length;
+
+			// If the next input code point is whitespace, consume it as well.
+			if (
+				strlen( $input ) > $offset &&
+				(
+					"\n" === $input[ $offset ] ||
+					"\t" === $input[ $offset ] ||
+					' ' === $input[ $offset ]
+				)
+			) {
+				++$offset;
+			}
+			return $codepoint_char;
+		}
+
+		/*
+		 * Find the byte length of the code point at $offset without copying the rest
+		 * of the input: a code point is at most 4 bytes, so the scan is bounded and
+		 * an escape of valid UTF-8 decodes in O(1) regardless of selector length.
+		 *
+		 * `_wp_utf8_codepoint_span()` is not suitable here: it does not bound the
+		 * scan, so its ASCII fast-path reads to the end of the input on every call,
+		 * which is quadratic over a selector composed of escapes.
+		 */
+		$at = $offset;
+		$invalid_length = 0;
+		_wp_scan_utf8( $input, $at, $invalid_length, 4, 1 );
+		if ( $at > $offset ) {
+			$codepoint_char = substr( $input, $offset, $at - $offset );
+			$offset = $at;
+			return $codepoint_char;
+		}
+
+		/*
+		 * The bytes at $offset are not valid UTF-8, which can only happen when
+		 * `parse()` was called directly with un-normalized input: the public
+		 * `from_selectors()` API replaces ill-formed byte sequences with U+FFFD
+		 * before parsing. Decode consistently with that normalization — consume
+		 * the maximal subpart of the ill-formed sequence, whose length the scan
+		 * above reported, and return a single U+FFFD.
+		 */
+		$offset += max( 1, $invalid_length );
+		return "\u{FFFD}";
+	}
+
+	/**
+	 * Parse an ident token
+	 *
+	 * CAUTION: This method is _not_ for parsing an ID selector!
+	 *
+	 * > 4.3.11. Consume an ident sequence
+	 * > This section describes how to consume an ident sequence from a stream of code points. It returns a string containing the largest name that can be formed from adjacent code points in the stream, starting from the first.
+	 * >
+	 * > Note: This algorithm does not do the verification of the first few code points that are necessary to ensure the returned code points would constitute an <ident-token>.
If that is the intended use, ensure that the stream starts with an ident sequence before calling this algorithm.
+	 * >
+	 * > Let result initially be an empty string.
+	 * >
+	 * > Repeatedly consume the next input code point from the stream:
+	 * >
+	 * > ident code point
+	 * >   Append the code point to result.
+	 * > the stream starts with a valid escape
+	 * >   Consume an escaped code point. Append the returned code point to result.
+	 * > anything else
+	 * >   Reconsume the current input code point. Return result.
+	 *
+	 * https://www.w3.org/TR/css-syntax-3/#consume-name
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The byte offset in the string. Passed by reference and advanced
+	 *                       past the identifier; left unchanged when null is returned.
+	 * @return string|null The parsed identifier name, or null if parsing failed.
+	 */
+	final protected static function parse_ident( string $input, int &$offset ): ?string {
+		// The verification the specification leaves to the caller is done here.
+		if ( ! self::check_if_three_code_points_would_start_an_ident_sequence( $input, $offset ) ) {
+			return null;
+		}
+
+		$name   = '';
+		$length = strlen( $input );
+
+		while ( $offset < $length ) {
+			if ( self::next_two_are_valid_escape( $input, $offset ) ) {
+				// Step over the `\` character, then decode what it escapes.
+				++$offset;
+				$name .= self::consume_escaped_codepoint( $input, $offset );
+			} elseif ( self::is_ident_codepoint( $input, $offset ) ) {
+				$name .= $input[ $offset++ ];
+			} else {
+				// Anything else ends the identifier and is left unconsumed.
+				break;
+			}
+		}
+
+		return $name;
+	}
+
+	/*
+	 * --------------------------
+	 * Selector parsing utilities
+	 * --------------------------
+	 *
+	 * The following functions are used for parsing but do not consume any input.
+	 */
+
+	/**
+	 * Checks for two valid escape codepoints.
+	 *
+	 * > 4.3.8. Check if two code points are a valid escape
+	 * > This section describes how to check if two code points are a valid escape. The algorithm
+	 * > described here can be called explicitly with two code points, or can be called with the
+	 * > input stream itself. In the latter case, the two code points in question are the current
+	 * > input code point and the next input code point, in that order.
+	 * >
+	 * > Note: This algorithm will not consume any additional code point.
+	 * >
+	 * > If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
+	 * >
+	 * > Otherwise, if the second code point is a newline, return false.
+	 * >
+	 * > Otherwise, return true.
+	 *
+	 * https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape
+	 *
+	 * @todo The second codepoint is not checked for validity.
+	 *
+	 * @param string $input  The input string.
+	 * @param int    $offset The byte offset in the string.
+	 * @return bool True if the next two codepoints are a valid escape, otherwise false.
+	 */
+	final protected static function next_two_are_valid_escape( string $input, int $offset ): bool {
+		$length = strlen( $input );
+
+		// An escape has to begin with a backslash that lies inside the input.
+		if ( $offset >= $length || '\\' !== $input[ $offset ] ) {
+			return false;
+		}
+
+		/*
+		 * The second code point may be EOF. EOF is not a newline, so a
+		 * backslash at the end of input is a valid escape; consuming it
+		 * produces U+FFFD REPLACEMENT CHARACTER.
+		 */
+		$second = $offset + 1;
+
+		return $second >= $length || "\n" !== $input[ $second ];
+	}
+
+	/**
+	 * Checks if the next code point is an "ident start code point."
+	 *
+	 * Caution! This method does not do any bounds checking, it should not be passed
+	 * a string with an offset that is out of bounds.
+	 *
+	 * > ident-start code point
+	 * >   A letter, a non-ASCII code point, or U+005F LOW LINE (_).
+	 * > uppercase letter
+	 * >   A code point between U+0041 LATIN CAPITAL LETTER A (A) and U+005A LATIN CAPITAL LETTER Z (Z) inclusive.
+	 * > lowercase letter
+	 * >   A code point between U+0061 LATIN SMALL LETTER A (a) and U+007A LATIN SMALL LETTER Z (z) inclusive.
+	 * > letter
+	 * >   An uppercase letter or a lowercase letter.
+	 * > non-ASCII code point
+	 * >   A code point with a value equal to or greater than U+0080 <control>.
+	 *
+	 * @link https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
+	 *
+	 * @param string $input  The input string.
+	 * @param int    $offset The byte offset in the string.
+	 * @return bool True if the next codepoint is an ident start code point, otherwise false.
+	 */
+	final protected static function is_ident_start_codepoint( string $input, int $offset ): bool {
+		// U+005F LOW LINE (_).
+		if ( '_' === $input[ $offset ] ) {
+			return true;
+		}
+
+		// Lowercase letter.
+		if ( 'a' <= $input[ $offset ] && $input[ $offset ] <= 'z' ) {
+			return true;
+		}
+
+		// Uppercase letter.
+		if ( 'A' <= $input[ $offset ] && $input[ $offset ] <= 'Z' ) {
+			return true;
+		}
+
+		// Any byte above 0x7F belongs to a non-ASCII code point.
+		return ord( $input[ $offset ] ) > 0x7F;
+	}
+
+	/**
+	 * Checks if the next code point is an "ident code point."
+	 *
+	 * Caution! This method does not do any bounds checking, it should not be passed
+	 * a string with an offset that is out of bounds.
+	 *
+	 * > ident code point
+	 * >   An ident-start code point, a digit, or U+002D HYPHEN-MINUS (-).
+	 * > digit
+	 * >   A code point between U+0030 DIGIT ZERO (0) and U+0039 DIGIT NINE (9) inclusive.
+	 *
+	 * @link https://www.w3.org/TR/css-syntax-3/#ident-code-point
+	 *
+	 * @param string $input  The input string.
+	 * @param int    $offset The byte offset in the string.
+	 * @return bool True if the next codepoint is an ident code point, otherwise false.
+	 */
+	final protected static function is_ident_codepoint( string $input, int $offset ): bool {
+		// U+002D HYPHEN-MINUS (-).
+		if ( '-' === $input[ $offset ] ) {
+			return true;
+		}
+
+		// Digit.
+		if ( '0' <= $input[ $offset ] && $input[ $offset ] <= '9' ) {
+			return true;
+		}
+
+		return self::is_ident_start_codepoint( $input, $offset );
+	}
+
+	/**
+	 * Checks if three code points would start an ident sequence.
+	 *
+	 * > 4.3.9. Check if three code points would start an ident sequence
+	 * > This section describes how to check if three code points would start an ident sequence. The algorithm described here can be called explicitly with three code points, or can be called with the input stream itself. In the latter case, the three code points in question are the current input code point and the next two input code points, in that order.
+	 * >
+	 * > Note: This algorithm will not consume any additional code points.
+	 * >
+	 * > Look at the first code point:
+	 * >
+	 * > U+002D HYPHEN-MINUS
+	 * >   If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS, or the second and third code points are a valid escape, return true. Otherwise, return false.
+	 * > ident-start code point
+	 * >   Return true.
+	 * > U+005C REVERSE SOLIDUS (\)
+	 * >   If the first and second code points are a valid escape, return true. Otherwise, return false.
+	 * > anything else
+	 * >   Return false.
+	 *
+	 * @link https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
+	 *
+	 * @param string $input  The input string.
+	 * @param int    $offset The byte offset in the string.
+	 * @return bool True if the next three codepoints would start an ident sequence, otherwise false.
+	 */
+	final protected static function check_if_three_code_points_would_start_an_ident_sequence( string $input, int $offset ): bool {
+		$length = strlen( $input );
+		if ( $offset >= $length ) {
+			return false;
+		}
+
+		$first = $input[ $offset ];
+
+		// > U+005C REVERSE SOLIDUS (\)
+		if ( '\\' === $first ) {
+			return self::next_two_are_valid_escape( $input, $offset );
+		}
+
+		/*
+		 * > ident-start code point
+		 * >   Return true.
+		 * > anything else
+		 * >   Return false.
+		 */
+		if ( '-' !== $first ) {
+			return self::is_ident_start_codepoint( $input, $offset );
+		}
+
+		// > U+002D HYPHEN-MINUS
+		$second_offset = $offset + 1;
+		if ( $second_offset >= $length ) {
+			return false;
+		}
+
+		/*
+		 * > If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS,
+		 * > or the second and third code points are a valid escape, return true.
+		 * > Otherwise, return false.
+		 *
+		 * The checks run in this order: hyphen, then escape, then ident-start.
+		 */
+		return '-' === $input[ $second_offset ]
+			|| self::next_two_are_valid_escape( $input, $second_offset )
+			|| self::is_ident_start_codepoint( $input, $second_offset );
+	}
+
+	/**
+	 * Normalizes selector input for processing: decodes the byte stream as
+	 * UTF-8 ( replacing ill-formed sequences with U+FFFD ), then filters the
+	 * code points per the input-preprocessing rules.
+	 *
+	 * @see https://www.w3.org/TR/css-syntax-3/#input-byte-stream
+	 * @see https://www.w3.org/TR/css-syntax-3/#input-preprocessing
+	 *
+	 * @param string $input The selector string.
+	 * @return string The normalized selector string.
+	 */
+	final protected static function normalize_selector_input( string $input ): string {
+		/*
+		 * > The input byte stream defines the byte stream that comprises a style sheet.
+		 * > To decode bytes into a stream of code points…
+		 *
+		 * Selector strings are UTF-8 text. Decoding replaces each maximal
+		 * subpart of an ill-formed byte sequence with U+FFFD REPLACEMENT
+		 * CHARACTER (�), per the WHATWG Encoding Standard's UTF-8 decoder.
+		 * The replaced selector is unlikely to match the elements the
+		 * developer intended, so the replacement also reports a notice.
+		 *
+		 * https://www.w3.org/TR/css-syntax-3/#input-byte-stream
+		 */
+		$scrubbed = wp_scrub_utf8( $input );
+		if ( $scrubbed !== $input ) {
+			_doing_it_wrong(
+				get_called_class() . '::from_selectors',
+				'Selector strings must be valid UTF-8: ill-formed byte sequences were replaced with U+FFFD (�), which is unlikely to match the intended elements.',
+				'{WP_VERSION}'
+			);
+		}
+
+		/*
+		 * > A selector string is a list of one or more complex selectors ([SELECTORS4], section 3.1) that may be surrounded by whitespace…
+		 *
+		 * This list includes \f.
+		 *
+		 * Only leading whitespace is removed here. Trailing whitespace may be
+		 * significant: a backslash may escape a final whitespace code point
+		 * into an ident (`.foo\ ` is the class `foo `), and a backslash
+		 * before a final newline is an invalid escape, while a backslash at
+		 * the end of input is a valid escape that decodes to U+FFFD. The
+		 * selector grammar consumes insignificant trailing whitespace itself.
+		 */
+		$normalized = ltrim( $scrubbed, " \t\r\n\f" );
+
+		/*
+		 * > The input stream consists of the filtered code points pushed into it as the input byte stream is decoded.
+		 * >
+		 * > To filter code points from a stream of (unfiltered) code points input:
+		 * >   Replace any U+000D CARRIAGE RETURN (CR) code points, U+000C FORM FEED (FF) code points, or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF) in input by a single U+000A LINE FEED (LF) code point.
+		 * >   Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�).
+		 *
+		 * https://www.w3.org/TR/css-syntax-3/#input-preprocessing
+		 *
+		 * The search strings are applied in order, so CR LF pairs collapse
+		 * to one LF before any remaining lone CR is replaced.
+		 */
+		$normalized = str_replace( array( "\r\n", "\r", "\f" ), "\n", $normalized );
+
+		return str_replace( "\0", "\u{FFFD}", $normalized );
+	}
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-type-selector.php b/src/wp-includes/html-api/css/class-wp-css-type-selector.php
new file mode 100644
index 0000000000000..c7c7baa2d5508
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-type-selector.php
@@ -0,0 +1,90 @@
+type = $type;
+	}
+
+	/**
+	 * Determines if the processor's current position matches the selector.
+	 *
+	 * @param WP_HTML_Tag_Processor $processor The processor.
+	 * @return bool True if the processor's current position matches the selector.
+	 */
+	public function matches( WP_HTML_Tag_Processor $processor ): bool {
+		$tag_name = $processor->get_tag();
+
+		// Without a tag name there is nothing to compare against.
+		return null !== $tag_name && $this->matches_tag( $tag_name );
+	}
+
+	/**
+	 * Checks whether the selector matches the provided tag name.
+	 *
+	 * The universal selector `*` matches every tag name; any other type is
+	 * compared with the tag name without regard to letter case.
+	 *
+	 * @param string $tag_name The tag name to compare with the selector's type.
+	 * @return bool True if the tag name matches, otherwise false.
+	 */
+	public function matches_tag( string $tag_name ): bool {
+		return '*' === $this->type || 0 === strcasecmp( $tag_name, $this->type );
+	}
+
+	/**
+	 * Parses a selector string to create a selector instance.
+	 *
+	 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string. The offset is passed by reference and
+	 *                       will be updated if the parse is successful.
+	 * @return static|null The selector instance, or null if the parse was unsuccessful.
+	 */
+	public static function parse( string $input, int &$offset ) {
+		if ( strlen( $input ) <= $offset ) {
+			return null;
+		}
+
+		// The universal selector is a single `*`.
+		if ( '*' === $input[ $offset ] ) {
+			++$offset;
+			return new self( '*' );
+		}
+
+		// Anything else has to be an identifier naming the element type.
+		$type = self::parse_ident( $input, $offset );
+
+		return null === $type ? null : new self( $type );
+	}
+}
diff --git a/src/wp-settings.php b/src/wp-settings.php
index ef5c7784ee561..e9ff3af23a096 100644
--- a/src/wp-settings.php
+++ b/src/wp-settings.php
@@ -278,6 +278,15 @@
 require ABSPATH . WPINC . '/html-api/class-wp-html-processor-state.php';
 require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php';
 require ABSPATH . WPINC . '/class-wp-block-processor.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-selector-parser-matcher.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-attribute-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-class-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-id-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-type-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-compound-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-complex-selector.php';
+require ABSPATH . WPINC .
'/html-api/css/class-wp-css-compound-selector-list.php'; +require ABSPATH . WPINC . '/html-api/css/class-wp-css-complex-selector-list.php'; require ABSPATH . WPINC . '/class-wp-http.php'; require ABSPATH . WPINC . '/class-wp-http-streams.php'; require ABSPATH . WPINC . '/class-wp-http-curl.php'; diff --git a/tests/phpunit/tests/html-api/wpCssAttributeSelector.php b/tests/phpunit/tests/html-api/wpCssAttributeSelector.php new file mode 100644 index 0000000000000..99051f2cc971c --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssAttributeSelector.php @@ -0,0 +1,123 @@ +assertNull( $result ); + } else { + $this->assertNotNull( $result, "Failed to parse attribute selector: {$input}" ); + $this->assertSame( $expected_name, $result->name ); + $this->assertSame( $expected_matcher, $result->matcher ); + $this->assertSame( $expected_value, $result->value ); + $this->assertSame( $expected_modifier, $result->modifier ); + $this->assertSame( $rest, substr( $input, $offset ) ); + } + } + + /** + * Data provider. 
+ * + * @return array + */ + public static function data_attribute_selectors(): array { + return array( + '[href]' => array( '[href]', 'href', null, null, null, '' ), + '[href] type' => array( '[href] type', 'href', null, null, null, ' type' ), + '[href]#id' => array( '[href]#id', 'href', null, null, null, '#id' ), + '[href].class' => array( '[href].class', 'href', null, null, null, '.class' ), + '[href][href2]' => array( '[href][href2]', 'href', null, null, null, '[href2]' ), + '[\n href\t\r]' => array( "[\n href\t\r]", 'href', null, null, null, '' ), + '[href=foo]' => array( '[href=foo]', 'href', WP_CSS_Attribute_Selector::MATCH_EXACT, 'foo', null, '' ), + '[a=b]' => array( '[a=b]', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ), + '[href \n = bar ]' => array( "[href \n = bar ]", 'href', WP_CSS_Attribute_Selector::MATCH_EXACT, 'bar', null, '' ), + '[href \n ^= baz ]' => array( "[href \n ^= baz ]", 'href', WP_CSS_Attribute_Selector::MATCH_PREFIXED_BY, 'baz', null, '' ), + + '[match $= insensitive i]' => array( '[match $= insensitive i]', 'match', WP_CSS_Attribute_Selector::MATCH_SUFFIXED_BY, 'insensitive', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ), + '[match|=sensitive s]' => array( '[match|=sensitive s]', 'match', WP_CSS_Attribute_Selector::MATCH_EXACT_OR_HYPHEN_SUFFIXED, 'sensitive', WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ), + '[att=val I]' => array( '[att=val I]', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ), + '[att=val S]' => array( '[att=val S]', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ), + + '[match~="quoted[][]"]' => array( '[match~="quoted[][]"]', 'match', WP_CSS_Attribute_Selector::MATCH_ONE_OF_EXACT, 'quoted[][]', null, '' ), + "[match$='quoted!{}']" => array( "[match$='quoted!{}']", 'match', WP_CSS_Attribute_Selector::MATCH_SUFFIXED_BY, 'quoted!{}', null, '' ), + 
"[match*='quoted's]" => array( "[match*='quoted's]", 'match', WP_CSS_Attribute_Selector::MATCH_CONTAINS, 'quoted', WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ), + + '[escape-nl="foo\\nbar"]' => array( "[escape-nl='foo\\\nbar']", 'escape-nl', WP_CSS_Attribute_Selector::MATCH_EXACT, 'foobar', null, '' ), + '[escape-seq="\\31 23"]' => array( "[escape-seq='\\31 23']", 'escape-seq', WP_CSS_Attribute_Selector::MATCH_EXACT, '123', null, '' ), + + /* + * The end of input closes an open attribute selector: tokenization + * auto-closes unterminated simple blocks (and strings) at EOF. + * + * https://www.w3.org/TR/css-syntax-3/#consume-simple-block + */ + 'EOF [foo' => array( '[foo', 'foo', null, null, null, '' ), + 'EOF [ \n foo' => array( "[ \n foo", 'foo', null, null, null, '' ), + 'EOF [foo ' => array( '[foo ', 'foo', null, null, null, '' ), + 'EOF [a=b' => array( '[a=b', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ), + 'EOF [att=val ' => array( '[att=val ', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', null, '' ), + 'EOF [a="b' => array( '[a="b', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ), + "EOF [a='b" => array( "[a='b", 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ), + 'EOF [a="b\\' => array( '[a="b\\', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ), + 'EOF [a=b\\' => array( '[a=b\\', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, "b\u{FFFD}", null, '' ), + 'EOF [a^=b' => array( '[a^=b', 'a', WP_CSS_Attribute_Selector::MATCH_PREFIXED_BY, 'b', null, '' ), + 'EOF [att=val i' => array( '[att=val i', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ), + 'EOF [att=val i ' => array( '[att=val i ', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ), + 'EOF [att="val"s' => array( '[att="val"s', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', 
WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ), + + // Invalid + 'Invalid: (empty string)' => array( '' ), + 'Invalid: foo' => array( 'foo' ), + 'Invalid: [' => array( '[' ), + 'Invalid: [ ' => array( '[ ' ), + 'Invalid: [a=' => array( '[a=' ), + 'Invalid: [a= ' => array( '[a= ' ), + 'Invalid: [a~' => array( '[a~' ), + 'Invalid: [a=b x' => array( '[a=b x' ), + 'Invalid: [a i' => array( '[a i' ), + 'Invalid: [#foo]' => array( '[#foo]' ), + 'Invalid: [*|*]' => array( '[*|*]' ), + 'Invalid: [ns|*]' => array( '[ns|*]' ), + 'Invalid: [* |att]' => array( '[* |att]' ), + 'Invalid: [*| att]' => array( '[*| att]' ), + 'Invalid: [att * =]' => array( '[att * =]' ), + 'Invalid: [att+=val]' => array( '[att+=val]' ), + 'Invalid: [a=]' => array( '[a=]' ), + 'Invalid: [a~=]' => array( '[a~=]' ), + 'Invalid: [a==b]' => array( '[a==b]' ), + 'Invalid: [a=1]' => array( '[a=1]' ), + 'Invalid: [a=1' => array( '[a=1' ), + 'Invalid: [att i]' => array( '[att i]' ), + 'Invalid: [att s]' => array( '[att s]' ), + "Invalid: [att='val\\n']" => array( "[att='val\n']" ), + "Invalid: [att='val\\n" => array( "[att='val\n" ), + 'Invalid: [att="val"ix' => array( '[att="val"ix' ), + 'Invalid: [att="val"ix ' => array( '[att="val"ix ' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssClassSelector.php b/tests/phpunit/tests/html-api/wpCssClassSelector.php new file mode 100644 index 0000000000000..3328b047fa143 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssClassSelector.php @@ -0,0 +1,50 @@ +assertNull( $result ); + } else { + $this->assertSame( $expected, $result->class_name ); + $this->assertSame( $rest, substr( $input, $offset ) ); + } + } + + /** + * Data provider. 
+ * + * @return array + */ + public static function data_class_selectors(): array { + return array( + 'valid ._-foo123' => array( '._-foo123', '_-foo123', '' ), + 'valid .foo.bar' => array( '.foo.bar', 'foo', '.bar' ), + 'escaped .\31 23' => array( '.\\31 23', '123', '' ), + 'with descendant .\31 23 div' => array( '.\\31 23 div', '123', ' div' ), + 'escape at EOF .foo\\' => array( '.foo\\', "foo\u{fffd}", '' ), + + 'not class foo' => array( 'foo' ), + 'not class #bar' => array( '#bar' ), + 'not valid .1foo' => array( '.1foo' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssComplexSelector.php b/tests/phpunit/tests/html-api/wpCssComplexSelector.php new file mode 100644 index 0000000000000..8738bb6fc32d2 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssComplexSelector.php @@ -0,0 +1,71 @@ + .child#bar[baz=quux] , rest'; + $offset = 0; + + /** @var WP_CSS_Complex_Selector|null */ + $sel = WP_CSS_Complex_Selector::parse( $input, $offset ); + + $this->assertSame( 2, count( $sel->context_selectors ) ); + + // Relative selectors should be reverse ordered. 
+ $this->assertSame( 'el2', $sel->context_selectors[0][0]->type ); + $this->assertSame( WP_CSS_Complex_Selector::COMBINATOR_CHILD, $sel->context_selectors[0][1] ); + + $this->assertSame( 'el1', $sel->context_selectors[1][0]->type ); + $this->assertSame( WP_CSS_Complex_Selector::COMBINATOR_DESCENDANT, $sel->context_selectors[1][1] ); + + $this->assertSame( 3, count( $sel->self_selector->subclass_selectors ) ); + $this->assertNull( $sel->self_selector->type_selector ); + $this->assertSame( 'child', $sel->self_selector->subclass_selectors[0]->class_name ); + + $this->assertSame( ', rest', substr( $input, $offset ) ); + } + + /** + * @ticket 62653 + */ + public function test_parse_invalid_complex_selector() { + $input = 'el.foo#bar[baz=quux] > , rest'; + $offset = 0; + $result = WP_CSS_Complex_Selector::parse( $input, $offset ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_invalid_complex_selector_nonfinal_subclass() { + $input = 'el.foo#bar[baz=quux] > final, rest'; + $offset = 0; + $result = WP_CSS_Complex_Selector::parse( $input, $offset ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_empty_complex_selector() { + $input = ''; + $offset = 0; + $result = WP_CSS_Complex_Selector::parse( $input, $offset ); + $this->assertNull( $result ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssComplexSelectorList.php b/tests/phpunit/tests/html-api/wpCssComplexSelectorList.php new file mode 100644 index 0000000000000..b85f788f98f0d --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssComplexSelectorList.php @@ -0,0 +1,65 @@ + selector'; + $result = WP_CSS_Complex_Selector_List::from_selectors( $input ); + $this->assertNotNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_invalid_selector_list() { + $input = 'el,,'; + $result = WP_CSS_Complex_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 
+ */ + public function test_parse_invalid_selector_list2() { + $input = 'el!'; + $result = WP_CSS_Complex_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_empty_selector_list() { + $input = " \t \t\n\r\f"; + $result = WP_CSS_Complex_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * The invalid-UTF-8 scrub notice reports the called class: through this + * class it must be named WP_CSS_Complex_Selector_List::from_selectors, + * not the WP_CSS_Compound_Selector_List parent where from_selectors() + * and the scrub are implemented. The fuzzer's notice model depends on + * the per-class name. + * + * @expectedIncorrectUsage WP_CSS_Complex_Selector_List::from_selectors + */ + public function test_invalid_utf8_scrub_notice_reports_the_called_class() { + $result = WP_CSS_Complex_Selector_List::from_selectors( "el \xC2.child" ); + $this->assertNotNull( $result, 'Selector with invalid UTF-8 should parse after scrubbing.' 
); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssCompoundSelector.php b/tests/phpunit/tests/html-api/wpCssCompoundSelector.php new file mode 100644 index 0000000000000..8092ee049b6e1 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssCompoundSelector.php @@ -0,0 +1,44 @@ + .child'; + $offset = 0; + $sel = WP_CSS_Compound_Selector::parse( $input, $offset ); + + $this->assertSame( 'el', $sel->type_selector->type ); + $this->assertSame( 3, count( $sel->subclass_selectors ) ); + $this->assertSame( 'foo', $sel->subclass_selectors[0]->class_name, 'foo' ); + $this->assertSame( 'bar', $sel->subclass_selectors[1]->id, 'bar' ); + $this->assertSame( 'baz', $sel->subclass_selectors[2]->name, 'baz' ); + $this->assertSame( WP_CSS_Attribute_Selector::MATCH_EXACT, $sel->subclass_selectors[2]->matcher ); + $this->assertSame( 'quux', $sel->subclass_selectors[2]->value ); + $this->assertSame( ' > .child', substr( $input, $offset ) ); + } + + /** + * @ticket 62653 + */ + public function test_parse_empty_selector() { + $input = ''; + $offset = 0; + $result = WP_CSS_Compound_Selector::parse( $input, $offset ); + $this->assertNull( $result ); + $this->assertSame( 0, $offset ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssCompoundSelectorList.php b/tests/phpunit/tests/html-api/wpCssCompoundSelectorList.php new file mode 100644 index 0000000000000..33149c22ed400 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssCompoundSelectorList.php @@ -0,0 +1,143 @@ +assertNotNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_invalid_selector_list() { + $input = 'el,,'; + $result = WP_CSS_Compound_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_invalid_selector_list2() { + $input = 'el!'; + $result = WP_CSS_Compound_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * An escaped whitespace code point at the end of input 
belongs to the + * ident and must survive input normalization: `.foo\ ` is the valid + * class `foo ` (with a space), not a backslash at the end of input. + * + * @ticket 62653 + */ + public function test_parse_escaped_whitespace_at_end_of_input() { + $result = WP_CSS_Compound_Selector_List::from_selectors( '.foo\\ ' ); + $this->assertNotNull( $result ); + } + + /** + * A backslash before a newline is not a valid escape; at the end of + * input it must not be mistaken for trimmable trailing whitespace. + * + * @ticket 62653 + */ + public function test_parse_escape_before_newline_at_end_of_input_is_invalid() { + $result = WP_CSS_Compound_Selector_List::from_selectors( ".foo\\\n" ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_empty_selector_list() { + $input = " \t \t\n\r\f"; + $result = WP_CSS_Compound_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_unsupported_complex_selector() { + $input = 'ancestor descendant'; + $result = WP_CSS_Compound_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * Selector strings are UTF-8 text: invalid byte sequences are replaced + * with U+FFFD per maximal subpart (CSS Syntax §3.2 via the WHATWG + * Encoding Standard) before parsing, so the selector parses rather than + * being rejected. The replacement is almost certainly not what the + * developer meant, so it also triggers `_doing_it_wrong()`. + * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_invalid_utf8_is_scrubbed_to_replacement_character_and_notifies() { + $result = WP_CSS_Compound_Selector_List::from_selectors( ".B\xFCcher" ); + $this->assertNotNull( $result, 'Selector with invalid UTF-8 should parse after scrubbing.' 
); + } + + /** + * Valid UTF-8 — including a literal U+FFFD — must parse without any + * incorrect-usage notice: scrubbing is the identity function on valid + * input. + */ + public function test_valid_utf8_with_literal_replacement_character_is_not_notified() { + $result = WP_CSS_Compound_Selector_List::from_selectors( ".B\u{FFFD}cher" ); + $this->assertNotNull( $result, 'Selector containing a literal U+FFFD should parse.' ); + } + + /** + * The whole input is scrubbed uniformly, so a selector list with invalid + * bytes in one of several selectors still parses as a list. + * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_invalid_utf8_in_selector_list_is_scrubbed() { + $result = WP_CSS_Compound_Selector_List::from_selectors( ".ok, .B\xE2\x8Ccher" ); + $this->assertNotNull( $result, 'Selector list with invalid UTF-8 should parse after scrubbing.' ); + } + + /** + * A selector consisting of nothing but an invalid byte parses: it scrubs + * to U+FFFD, which is an ident-start code point and therefore a valid + * type selector. Surprising, but it follows from the scrub running + * before tokenization — the parser never sees the invalid byte. + * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_lone_invalid_byte_parses_as_replacement_character_type_selector() { + $result = WP_CSS_Compound_Selector_List::from_selectors( "\x80" ); + $this->assertNotNull( $result, 'A lone invalid byte should parse as a U+FFFD type selector.' ); + } + + /** + * The scrub notice reports the byte replacement, which happens before + * parsing — it fires even when the scrubbed selector is then rejected + * by the grammar. 
+ * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_invalid_utf8_notice_fires_even_when_selector_is_rejected() { + $result = WP_CSS_Compound_Selector_List::from_selectors( "\x80 div" ); + $this->assertNull( $result, 'Descendant combinators are unsupported by the compound list; the scrubbed selector should still be rejected.' ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssIdSelector.php b/tests/phpunit/tests/html-api/wpCssIdSelector.php new file mode 100644 index 0000000000000..03694fa4456e5 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssIdSelector.php @@ -0,0 +1,51 @@ +assertNull( $result ); + } else { + $this->assertSame( $expected, $result->id ); + $this->assertSame( $rest, substr( $input, $offset ) ); + } + } + + /** + * Data provider. + * + * @return array + */ + public static function data_id_selectors(): array { + return array( + 'valid #_-foo123' => array( '#_-foo123', '_-foo123', '' ), + 'valid #foo#bar' => array( '#foo#bar', 'foo', '#bar' ), + 'escaped #\31 23' => array( '#\\31 23', '123', '' ), + 'with descendant #\31 23 div' => array( '#\\31 23 div', '123', ' div' ), + 'escape at EOF #foo\\' => array( '#foo\\', "foo\u{fffd}", '' ), + + // Invalid + 'not ID foo' => array( 'foo' ), + 'not ID .bar' => array( '.bar' ), + 'not valid #1foo' => array( '#1foo' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssSelectorParserMatcher.php b/tests/phpunit/tests/html-api/wpCssSelectorParserMatcher.php new file mode 100644 index 0000000000000..181519b3cbed3 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssSelectorParserMatcher.php @@ -0,0 +1,281 @@ +original_substitute_character = mb_substitute_character(); + mb_substitute_character( 0x2603 ); + $this->test_class = new class() extends WP_CSS_Selector_Parser_Matcher { + public function matches( $processor ): bool { + throw new Error( 'Matches called on test class.' 
); + } + public static function parse( string $input, int &$offset ) { + throw new Error( 'Parse called on test class.' ); + } + + /* + * Parsing + */ + public static function test_parse_ident( string $input, int &$offset ) { + return self::parse_ident( $input, $offset ); + } + + public static function test_parse_string( string $input, int &$offset ) { + return self::parse_string( $input, $offset ); + } + + /* + * Utilities + */ + public static function test_is_ident_codepoint( string $input, int $offset ) { + return self::is_ident_codepoint( $input, $offset ); + } + + public static function test_is_ident_start_codepoint( string $input, int $offset ) { + return self::is_ident_start_codepoint( $input, $offset ); + } + }; + } + + public function tear_down(): void { + mb_substitute_character( $this->original_substitute_character ); + parent::tear_down(); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_idents(): array { + return array( + 'trailing #' => array( '_-foo123#xyz', '_-foo123', '#xyz' ), + 'trailing .' 
=> array( '😍foo123.xyz', '😍foo123', '.xyz' ), + 'trailing " "' => array( '😍foo123 more', '😍foo123', ' more' ), + 'escaped ASCII character' => array( '\\xyz', 'xyz', '' ), + 'escape after multibyte character' => array( 'Ü\\sup', 'Üsup', '' ), + 'escape after multibyte characters' => array( 'ÜÜ\\sup', 'ÜÜsup', '' ), + 'hex escape after multibyte character' => array( 'Ü\\31 23', 'Ü123', '' ), + 'escaped space' => array( '\\ x', ' x', '' ), + 'escaped emoji' => array( '\\😍', '😍', '' ), + 'hex unicode codepoint' => array( '\\1f0a1', '🂡', '' ), + 'HEX UNICODE CODEPOINT' => array( '\\1D4B2', '𝒲', '' ), + + 'hex tab-suffixed 1' => array( "\\31\t23", '123', '' ), + 'hex newline-suffixed 1' => array( "\\31\n23", '123', '' ), + 'hex space-suffixed 1' => array( "\\31 23", '123', '' ), + 'hex tab' => array( '\\9', "\t", '' ), + 'hex a' => array( '\\61 bc', 'abc', '' ), + 'hex a max escape length' => array( '\\000061bc', 'abc', '' ), + + 'out of range replacement min' => array( '\\110000 ', "\u{fffd}", '' ), + 'out of range replacement max' => array( '\\ffffff ', "\u{fffd}", '' ), + 'leading surrogate min replacement' => array( '\\d800 ', "\u{fffd}", '' ), + 'leading surrogate max replacement' => array( '\\dbff ', "\u{fffd}", '' ), + 'trailing surrogate min replacement' => array( '\\dc00 ', "\u{fffd}", '' ), + 'trailing surrogate max replacement' => array( '\\dfff ', "\u{fffd}", '' ), + 'can start with -ident' => array( '-ident', '-ident', '' ), + 'can start with --anything' => array( '--anything', '--anything', '' ), + 'can start with ---anything' => array( '--_anything', '--_anything', '' ), + 'can start with --1anything' => array( '--1anything', '--1anything', '' ), + 'can start with -\31 23' => array( '-\31 23', '-123', '' ), + 'can start with --\31 23' => array( '--\31 23', '--123', '' ), + 'ident ends before ]' => array( 'ident]', 'ident', ']' ), + + /* + * > EOF + * > This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�). 
+ * + * https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point + */ + 'escape at EOF' => array( 'foo\\', "foo\u{fffd}", '' ), + 'lone escape at EOF' => array( '\\', "\u{fffd}", '' ), + 'hyphen then escape at EOF' => array( '-\\', "-\u{fffd}", '' ), + + // Identity escapes of multibyte characters, by UTF-8 sequence length. + 'escaped 2-byte character' => array( "\\\u{FC}z", "\u{FC}z", '' ), + 'escaped 3-byte character' => array( "\\\u{270F}z", "\u{270F}z", '' ), + 'escaped 4-byte character' => array( "\\\u{1F0A1}z", "\u{1F0A1}z", '' ), + 'escaped 2-byte character at EOF' => array( "a\\\u{FC}", "a\u{FC}", '' ), + 'escaped 3-byte character at EOF' => array( "a\\\u{270F}", "a\u{270F}", '' ), + 'escaped 4-byte character at EOF' => array( "a\\\u{1F0A1}", "a\u{1F0A1}", '' ), + + /* + * An escaped NUL byte passes through this low-level helper unchanged. + * This is unreachable through the public selector API, where + * normalize_selector_input() replaces NUL with U+FFFD before parsing. + */ + 'escaped NUL byte' => array( "a\\\x00z", "a\x00z", '' ), + + /* + * Identity escapes of invalid UTF-8 byte sequences. + * + * These inputs are not valid UTF-8, which can only reach the parser + * through a direct `parse()` call: the public `from_selectors()` API + * replaces invalid byte sequences with U+FFFD before parsing. On + * this un-normalized path the escape decodes the maximal subpart of + * the invalid sequence (CSS Syntax §3.2 via the WHATWG Encoding + * Standard) to a single U+FFFD — independent of the + * `mb_substitute_character()` setting, which set_up() pins to ☃ + * precisely to prove that independence. Invalid bytes *after* the + * escaped subpart are not escaped; they pass through this low-level + * helper raw, exactly as unescaped invalid bytes do (the 0xAF, + * 0xA0 0x80, and 0x90 0x80 0x80 tails below). 
+ */ + 'escaped lone continuation byte' => array( "a\\\x80z", "a\u{FFFD}z", '' ), + 'escaped overlong lead 0xC0' => array( "a\\\xC0\xAFz", "a\u{FFFD}\xAFz", '' ), + 'escaped invalid lead 0xF5' => array( "a\\\xF5z", "a\u{FFFD}z", '' ), + 'escaped truncated 3-byte sequence' => array( "a\\\xE2\x80z", "a\u{FFFD}z", '' ), + 'escaped truncated 4-byte at EOF' => array( "a\\\xF0\x9F\x82", "a\u{FFFD}", '' ), + 'escaped UTF-8-encoded surrogate' => array( "a\\\xED\xA0\x80z", "a\u{FFFD}\xA0\x80z", '' ), + 'escaped sequence above U+10FFFF' => array( "a\\\xF4\x90\x80\x80z", "a\u{FFFD}\x90\x80\x80z", '' ), + + // Invalid + 'Invalid: (empty string)' => array( '' ), + 'Invalid: bad start >' => array( '>ident' ), + 'Invalid: bad start [' => array( '[ident' ), + 'Invalid: bad start #' => array( '#ident' ), + 'Invalid: bad start " "' => array( ' ident' ), + 'Invalid: bad start 1' => array( '1ident' ), + 'Invalid: bad start -1' => array( '-1ident' ), + 'Invalid: bad start -' => array( '-' ), + ); + } + + /** + * @ticket 62653 + */ + public function test_is_ident_and_is_ident_start() { + $this->assertFalse( $this->test_class::test_is_ident_codepoint( '[', 0 ) ); + $this->assertFalse( $this->test_class::test_is_ident_codepoint( ']', 0 ) ); + $this->assertFalse( $this->test_class::test_is_ident_start_codepoint( '[', 0 ) ); + $this->assertFalse( $this->test_class::test_is_ident_start_codepoint( ']', 0 ) ); + } + + /** + * @ticket 62653 + * + * @dataProvider data_idents + */ + public function test_parse_ident( string $input, ?string $expected = null, ?string $rest = null ) { + + $offset = 0; + $result = $this->test_class::test_parse_ident( $input, $offset ); + if ( null === $expected ) { + $this->assertNull( $result ); + } else { + $this->assertSame( $expected, $result, 'Ident did not match.' ); + $this->assertSame( $rest, substr( $input, $offset ), 'Offset was not updated correctly.' 
); + } + } + + /** + * The rest-of-input assertion above cannot distinguish an offset at the end + * of the input from one past it (`substr()` returns '' for both), so the + * offset arithmetic of the invalid-byte decode is pinned explicitly here: + * the escape consumes exactly the 1-byte maximal subpart and the following + * `z`, leaving the offset at — never past — the end of the input. (The + * previous `mb_substr()`-based decode advanced by the byte length of the + * substitute character and overran the end by one byte under the ☃ canary.) + */ + public function test_parse_ident_escaped_invalid_byte_does_not_overrun_offset() { + $input = "a\\\x80z"; + $offset = 0; + $result = $this->test_class::test_parse_ident( $input, $offset ); + + $this->assertSame( "a\u{FFFD}z", $result, 'Ident did not match.' ); + $this->assertSame( strlen( $input ), $offset, 'Offset should stop exactly at the end of input.' ); + } + + /** + * @ticket 62653 + * + * @dataProvider data_strings + */ + public function test_parse_string( string $input, ?string $expected = null, ?string $rest = null ) { + $offset = 0; + $result = $this->test_class::test_parse_string( $input, $offset ); + if ( null === $expected ) { + $this->assertNull( $result ); + } else { + $this->assertSame( $expected, $result, 'String did not match.' ); + $this->assertSame( $rest, substr( $input, $offset ), 'Offset was not updated correctly.' ); + } + } + + /** + * Data provider. 
+ * + * @return array + */ + public static function data_strings(): array { + return array( + '"foo"' => array( '"foo"', 'foo', '' ), + '"foo"after' => array( '"foo"after', 'foo', 'after' ), + '"foo""two"' => array( '"foo""two"', 'foo', '"two"' ), + '"foo"\'two\'' => array( '"foo"\'two\'', 'foo', "'two'" ), + + "'foo'" => array( "'foo'", 'foo', '' ), + "'foo'after" => array( "'foo'after", 'foo', 'after' ), + "'foo'\"two\"" => array( "'foo'\"two\"", 'foo', '"two"' ), + "'foo''two'" => array( "'foo''two'", 'foo', "'two'" ), + + "'foo\\nbar'" => array( "'foo\\\nbar'", 'foobar', '' ), + "'foo\\31 23'" => array( "'foo\\31 23'", 'foo123', '' ), + "'Ü\\sup'" => array( "'Ü\\sup'", 'Üsup', '' ), + "'foo\\31\\n23'" => array( "'foo\\31\n23'", 'foo123', '' ), + "'foo\\31\\t23'" => array( "'foo\\31\t23'", 'foo123', '' ), + "'foo\\00003123'" => array( "'foo\\00003123'", 'foo123', '' ), + + "'foo\\" => array( "'foo\\", 'foo', '' ), + + /* + * Invalid UTF-8 in string context, reachable only via a direct + * parse() call ( from_selectors() scrubs first ): an escaped + * invalid byte decodes its maximal subpart to U+FFFD, exactly as + * in ident context; raw invalid bytes pass through unexamined. 
+ */ + 'string with escaped invalid byte' => array( "'a\\\xC0z'", "a\u{FFFD}z", '' ), + 'string with raw invalid byte' => array( "'a\xC0z'", "a\xC0z", '' ), + + '"' => array( '"', '', '' ), + '"\\"' => array( '"\\"', '"', '' ), + '"missing close' => array( '"missing close', 'missing close', '' ), + + // Invalid + 'Invalid: (empty string)' => array( '' ), + 'Invalid: .foo' => array( '.foo' ), + 'Invalid: #foo' => array( '#foo' ), + "Invalid: 'newline\\n'" => array( "'newline\n'" ), + 'Invalid: foo' => array( 'foo' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssTypeSelector.php b/tests/phpunit/tests/html-api/wpCssTypeSelector.php new file mode 100644 index 0000000000000..94ae49bff474a --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssTypeSelector.php @@ -0,0 +1,52 @@ +assertNull( $result ); + } else { + $this->assertSame( $expected, $result->type ); + $this->assertSame( $rest, substr( $input, $offset ) ); + } + } + + /** + * Data provider. + * + * @return array + */ + public static function data_type_selectors(): array { + return array( + 'any *' => array( '* .class', '*', ' .class' ), + 'a' => array( 'a', 'a', '' ), + 'div.class' => array( 'div.class', 'div', '.class' ), + 'custom-type#id' => array( 'custom-type#id', 'custom-type', '#id' ), + 'escape at EOF foo\\' => array( 'foo\\', "foo\u{fffd}", '' ), + + // Invalid + 'Invalid: (empty string)' => array( '' ), + 'Invalid: #id' => array( '#id' ), + 'Invalid: .class' => array( '.class' ), + 'Invalid: [attr]' => array( '[attr]' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-select.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-select.php new file mode 100644 index 0000000000000..fcb1acf3fa7d6 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-select.php @@ -0,0 +1,110 @@ +' ); + $this->assertFalse( $processor->select( 'div' ) ); + } + + /** + * @ticket 62653 + * + * @dataProvider data_selectors + */ + public function test_selects_all_matches( string $html, 
string $selector, int $match_count ) { + $processor = WP_HTML_Processor::create_full_parser( $html ); + $count = 0; + while ( $processor->select( $selector ) ) { + $breadcrumb_string = implode( ', ', $processor->get_breadcrumbs() ); + $this->assertTrue( + $processor->get_attribute( 'match' ), + "Matched unexpected tag {$processor->get_tag()} @ {$breadcrumb_string}" + ); + ++$count; + } + $this->assertSame( $match_count, $count, 'Did not match expected number of tags.' ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_selectors(): array { + return array( + 'any' => array( '

', '*', 5 ), + 'quirks mode ID' => array( '

In quirks mode, ID matching is case-insensitive.', '#id', 2 ), + 'quirks mode class' => array( '

In quirks mode, class matching is case-insensitive.', '.c', 2 ), + 'no-quirks mode ID' => array( '

In no-quirks mode, ID matching is case-sensitive.', '#id', 1 ), + 'no-quirks mode class' => array( '

In no-quirks mode, class matching is case-sensitive.', '.c', 1 ), + 'any descendant' => array( '

', 'section *', 4 ), + 'any child matches all children' => array( '

', 'section > *', 2 ), + + 'multiple complex selectors' => array( '

', 'section > div p > i', 1 ), + + // Per Selectors-4, the substring matchers ^= $= *= match nothing when the value + // is empty. ~= also matches nothing: an empty string is never a list item. + 'empty value ^= matches nothing' => array( '', '[x^=""]', 0 ), + 'empty value $= matches nothing' => array( '', '[x$=""]', 0 ), + 'empty value *= matches nothing' => array( '', '[x*=""]', 0 ), + 'empty value ~= matches nothing' => array( '', '[x~=""]', 0 ), + 'empty value ^= i matches nothing' => array( '', '[x^="" i]', 0 ), + 'empty value = matches empty' => array( '', '[x=""]', 1 ), + 'empty value |= matches empty or hyphen-prefixed' => array( '', '[x|=""]', 2 ), + + /* + * HTML's case-insensitive attribute value list applies to + * "an HTML element in an HTML document": a foreign element with + * the same attribute name keeps case-sensitive matching. + * ( Chromium applies the list to foreign elements as well, + * diverging from the HTML specification here. ) + * + * https://html.spec.whatwg.org/multipage/semantics-other.html#case-sensitivity-of-selectors + */ + 'HTML-namespace-only attribute case-insensitivity' => array( '', '[type=TEXT]', 1 ), + ); + } + + /** + * @ticket 62653 + * + * @expectedIncorrectUsage WP_HTML_Processor::select + * + * @dataProvider data_invalid_selectors + */ + public function test_invalid_selector( string $selector ) { + $processor = WP_HTML_Processor::create_fragment( 'irrelevant' ); + $this->assertFalse( $processor->select( $selector ) ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_invalid_selectors(): array { + return array( + 'invalid selector' => array( '[invalid!selector]' ), + + // The class selectors below are not allowed in non-final position. 
+ 'unsupported child selector' => array( '.parent > .child' ), + 'unsupported descendant selector' => array( '.ancestor .descendant' ), + + // Unsupported combinators + 'unsupported next sibling selector' => array( 'p + p' ), + 'unsupported subsequent sibling selector' => array( 'p ~ p' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-select.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-select.php new file mode 100644 index 0000000000000..96bb8e1b4457d --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-select.php @@ -0,0 +1,214 @@ +' ); + $this->assertFalse( $processor->select( 'div' ) ); + } + + /** + * @ticket 62653 + * + * @dataProvider data_selectors + */ + public function test_select( string $html, string $selector, int $match_count ) { + $processor = new WP_HTML_Tag_Processor( $html ); + $count = 0; + while ( $processor->select( $selector ) ) { + $this->assertTrue( + $processor->get_attribute( 'match' ), + "Matched unexpected tag {$processor->get_tag()}" + ); + ++$count; + } + $this->assertSame( $match_count, $count, 'Did not match expected number of tags.' ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_selectors(): array { + return array( + 'simple type' => array( '

', 'div', 2 ), + 'any type' => array( '
', '*', 2 ), + 'simple class' => array( '
', '.x', 2 ), + 'simple id' => array( '
', '#x', 2 ), + + 'attribute presence' => array( '
', '[att]', 2 ), + 'attribute empty string match' => array( '
', '[att=""]', 2 ), + 'attribute value' => array( '

', '[att=val]', 2 ), + 'attribute quoted value' => array( '

', '[att="::"]', 2 ), + 'attribute case insensitive' => array( '

', '[att="VAL"i]', 2 ), + 'attribute case sensitive mod' => array( '

', '[att="val"s]', 2 ), + + 'attribute one of' => array( '

', '[att~="b"]', 3 ), + 'attribute one of insensitive' => array( '

', '[att~="b"i]', 1 ), + 'attribute one of mod sensitive' => array( '
', '[att~="b"s]', 1 ), + 'attribute one of whitespace cases' => array( "
", '[att~="b"]', 1 ), + + 'attribute with-hyphen' => array( '

', '[att|="special"]', 2 ), + 'attribute with-hyphen insensitive' => array( '

', '[att|="special" i]', 2 ), + 'attribute with-hyphen sensitive mod' => array( '

', '[att|="special"s]', 1 ), + + 'attribute prefixed' => array( '

', '[att^="p"]', 2 ), + 'attribute prefixed insensitive' => array( '

', '[att^="p"i]', 1 ), + 'attribute prefixed sensitive mod' => array( '

', '[att^="p"s]', 1 ), + + 'attribute suffixed' => array( '

', '[att$="x"]', 2 ), + 'attribute suffixed insensitive' => array( '

', '[att$="x"i]', 1 ), + 'attribute suffixed sensitive mod' => array( '

', '[att$="x"s]', 1 ), + + 'attribute contains' => array( '

', '[att*="x"]', 2 ), + 'attribute contains insensitive' => array( '

', '[att*="x"i]', 1 ), + 'attribute contains sensitive mod' => array( '

', '[att*="x"s]', 1 ), + + /* + * An escaped trailing whitespace code point is part of the ident, + * not trailing whitespace: `.foo\ ` is the class `foo ` (with a + * space). Class attribute values are whitespace-separated token + * lists, so such a class can never match. It must NOT be confused + * with a backslash at the end of input, which decodes to U+FFFD. + */ + 'escaped space at end' => array( "

", '.foo\\ ', 0 ), + 'escaped tab at end' => array( "
", ".foo\\\t", 0 ), + + /* + * The end of input closes an open attribute selector ( and an + * unterminated string ): tokenization auto-closes simple blocks + * at EOF. + */ + 'EOF-truncated attribute presence' => array( '
', '[att', 1 ), + 'EOF-truncated attribute value' => array( '
', '[att=val', 1 ), + 'EOF-truncated quoted value' => array( '
', '[att="a b', 1 ), + 'EOF-truncated with modifier' => array( '
', '[att=val i', 1 ), + + /* + * HTML defines a set of attributes whose values must match ASCII + * case-insensitively in selectors when no modifier is present. + * An explicit `s` modifier still forces case-sensitive matching. + * Attributes outside the list stay case-sensitive by default. + * + * https://html.spec.whatwg.org/multipage/semantics-other.html#case-sensitivity-of-selectors + */ + 'HTML insensitive attribute =' => array( '', '[type=TEXT]', 2 ), + 'HTML insensitive attribute ~=' => array( '', '[rel~=nofollow]', 1 ), + 'HTML insensitive attribute ^=' => array( '', '[media^=screen]', 1 ), + 'HTML insensitive attribute |=' => array( '', '[hreflang|=en]', 1 ), + 'HTML insensitive attribute s mod' => array( '', '[type=text s]', 1 ), + 'HTML insensitive attribute i mod' => array( '', '[type=text i]', 2 ), + 'unlisted attribute stays sensitive' => array( '', '[data-type=TEXT]', 1 ), + 'listed attribute name is matched case-insensitively in the list' => array( '', '[TYPE=TEXT]', 1 ), + + 'list' => array( '

', 'a, p, .class, #id, [att]', 2 ), + 'compound' => array( '

', 'custom-el[att="bar"][ fruit ~= "banana" i]', 1 ), + ); + } + + /** + * @ticket 62653 + * + * @expectedIncorrectUsage WP_HTML_Tag_Processor::select + * + * @dataProvider data_invalid_selectors + */ + public function test_invalid_selector( string $selector ) { + $processor = new WP_HTML_Tag_Processor( 'irrelevant' ); + $this->assertFalse( $processor->select( $selector ) ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_invalid_selectors(): array { + return array( + 'complex descendant' => array( 'div *' ), + 'complex child' => array( 'div > *' ), + 'invalid selector' => array( '[invalid!selector]' ), + + /* + * A backslash before a newline at the end of input is not a valid + * escape and is not trailing whitespace: the selector is invalid. + * The CR and FF variants are normalized to a newline before + * tokenizing. + */ + 'escape before newline at end' => array( ".foo\\\n" ), + 'escape before CR at end' => array( ".foo\\\r" ), + 'escape before FF at end' => array( ".foo\\\f" ), + + /* + * EOF auto-closes an open attribute selector block, but + * grammar-level truncation is still invalid. + */ + 'truncated matcher without value' => array( '[a=' ), + 'truncated half matcher' => array( '[a~' ), + 'lone open bracket' => array( '[' ), + ); + } + + /** + * Selector strings are UTF-8 text: invalid byte sequences are replaced + * with U+FFFD per maximal subpart before parsing. A selector containing + * invalid bytes therefore matches a literal U+FFFD in the document, and + * an identity escape of an invalid byte is equivalent to the same byte + * unescaped — both are scrubbed before tokenization. + * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_select_scrubbed_selector_matches_replacement_character() { + $html = "
"; + + $processor = new WP_HTML_Tag_Processor( $html ); + $this->assertTrue( + $processor->select( ".a\xC0b" ), + 'Scrubbed selector should match the replacement character in the document.' + ); + + $processor = new WP_HTML_Tag_Processor( $html ); + $this->assertTrue( + $processor->select( ".a\\\xC0b" ), + 'An identity escape of an invalid byte should be equivalent to the unescaped byte.' + ); + } + + /** + * A selector containing invalid bytes can never match those same raw + * bytes in a document: the selector side is scrubbed to U+FFFD while + * the Tag Processor reports raw document bytes untouched. + * + * This pins a deliberate, documented divergence. If the HTML API value + * getters (get_attribute(), class_list(), …) are ever changed to scrub + * invalid UTF-8 in their return values, both sides become U+FFFD and + * this case flips to a match — update this expectation in the same + * change. + * + * The selector byte (0xC1) is unique within this file on purpose: + * select() memoizes the most recently parsed selector string, so the + * scrub notice only fires when this test's selector was not already + * parsed by an earlier test. A unique selector string guarantees a + * fresh parse regardless of test order. + * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_select_scrubbed_selector_does_not_match_raw_invalid_document_bytes() { + $processor = new WP_HTML_Tag_Processor( "
" ); + $this->assertFalse( + $processor->select( ".a\xC1b" ), + 'Scrubbed selector should not match raw invalid bytes in the document.' + ); + } +} diff --git a/tools/css-selector-fuzz/COVERAGE.md b/tools/css-selector-fuzz/COVERAGE.md new file mode 100644 index 0000000000000..e7b715c0a8894 --- /dev/null +++ b/tools/css-selector-fuzz/COVERAGE.md @@ -0,0 +1,119 @@ +# CSS Selector Fuzzer — Coverage Report + +Line coverage of `src/wp-includes/html-api/css/` under the fuzzer, measured +with phpdbg's opcode log over 3000 deterministic seeds: + + phpdbg -qrr tools/css-selector-fuzz/coverage.php --seeds 3000 --list-uncovered + +| file | covered / executable | % | +|---|---|---| +| class-wp-css-attribute-selector.php | 108 / 119 | 90.8% | +| class-wp-css-class-selector.php | 10 / 10 | 100% | +| class-wp-css-complex-selector-list.php | 16 / 16 | 100% | +| class-wp-css-complex-selector.php | 59 / 66 | 89.4% | +| class-wp-css-compound-selector-list.php | 27 / 28 | 96.4% | +| class-wp-css-compound-selector.php | 29 / 32 | 90.6% | +| class-wp-css-id-selector.php | 12 / 12 | 100% | +| class-wp-css-selector-parser-matcher.php | 120 / 124 | 96.8% | +| class-wp-css-type-selector.php | 15 / 17 | 88.2% | +| **TOTAL** | **396 / 424** | **93.4%** | + +The 28 unreached lines are all accounted for below: twelve are a phpdbg +measurement artifact (the code executes), twelve are defensive guards the +public entry points cannot reach, two are the escape decoder's +invalid-byte arm that the input scrub made unreachable through +`from_selectors()`, and two are reachable lines this seed window happens +to miss. Counting the artifact lines as covered, effective coverage is +**408 / 424 = 96.2%**. + +(Executable-line totals grew from 408 to 424 with the case-insensitive +attribute value list and the invalid-UTF-8 scrub changes.) + +## phpdbg `case`-label artifact (12 lines — code executes) + +phpdbg attributes a `switch` arm's execution to the body line, not the bare +`case X:` label line. 
The fuzzer exercises every one of these arms (verified +directly: the body line immediately after each label is covered, and the +lexbor differential + self-check confirm the corresponding behavior). These +are not real gaps: + +- `class-wp-css-attribute-selector.php` + - 378, 382, 386, 390, 394 — the `~= |= ^= $= *=` matcher operators. + - 419, 420, 425, 426 — the `i`/`I`/`s`/`S` case modifiers. +- `class-wp-css-compound-selector.php` + - 120, 122, 124 — the `.` / `#` / `[` subclass-selector dispatch. + +## Defensive guards unreachable from the public API (12 lines) + +These are internal precondition checks that the calling code already +guarantees, or branches for grammar the parser never emits: + +- `class-wp-css-attribute-selector.php:351` — `return null` when the first + byte is not `[`. `parse()` is only ever called by + `parse_subclass_selector()` *after* it has matched `[`, so the guard never + fires. +- `class-wp-css-complex-selector.php:170–179` — the `_doing_it_wrong` + "unsupported combinator" arm in the match walker. The parser only ever + stores `' '` (descendant) or `'>'` (child) combinators, so the match-time + default arm is dead defensively. +- `class-wp-css-compound-selector-list.php:107` — `return false` when the + processor is not on a `#tag` token. `select()` only invokes matching while + positioned on a tag; reachable only by calling `matches()` directly off a + non-tag token. +- `class-wp-css-selector-parser-matcher.php:375` — + `next_two_are_valid_escape()` EOF guard; every caller either bound-checks + first or only calls it on a known backslash byte. +- `class-wp-css-type-selector.php:45` — `return false` when `get_tag()` is + null during matching; matching only runs on resolved element tokens. +- `class-wp-css-type-selector.php:75` — `parse()` EOF guard; the compound + parser checks `offset < strlen` before calling. 
+ +## Un-normalized input only (2 lines — pinned by PHPUnit) + +- `class-wp-css-selector-parser-matcher.php:287–288` — the escape decoder's + invalid-byte arm (consume the maximal subpart `_wp_scan_utf8()` reported, + return one U+FFFD). The invalid-UTF-8 scrub in `normalize_selector_input()` + made this arm structurally unreachable through `from_selectors()` — the + fuzzer's only entry point — and it exists for direct `parse()` callers + with un-normalized input. The escape pins in + `tests/phpunit/tests/html-api/wpCssSelectorParserMatcher.php` exercise it + for every invalid-byte decode class under the U+2603 canary. + +## Reachable, but missed by this seed window (2 lines) + +Both lines are demonstrably reachable (witnesses verified directly under +phpdbg) but sit behind enough generator coin flips that a fixed 3000-seed +window may or may not sample them; earlier revisions of this report saw +them flicker in and out across windows: + +- `class-wp-css-attribute-selector.php:345` — the `[x` minimum-length + guard; witness: `[` (also `.a[`, `a[`) at the end of input. +- `class-wp-css-selector-parser-matcher.php:149` — `parse_string()`'s + break when plain string content runs to end of input; witness: `[a="b` + (the `eof-truncated` edge-escape kind reaches it only when its quote-drop + and no-backslash coins both land, ~1 expected case per 3000 seeds). + +## Notes on what raised coverage + +- The `edge-escape` bucket drives the U+FFFD escape-decoder branch + (`consume_escaped_codepoint` for NUL / surrogate / over-max codepoints) + and the `normalize_selector_input` NUL→U+FFFD and CR/CRLF/FF→LF paths, + which the structural generators cannot reach. Its `eof-escape` kind covers + the backslash-at-end-of-input → U+FFFD decode, and its `eof-truncated` + kind covers the EOF auto-close paths in the attribute parser, including + unterminated strings (with and without a trailing "do nothing" backslash, + which keeps the `parse_string` backslash-at-EOF arm exercised). 
+- The `invalid-utf8` bucket and the `mutated` bucket's raw-byte splice + drive the `wp_scrub_utf8()` replacement branch and its + `_doing_it_wrong()` notice in `normalize_selector_input()` + deterministically (previously hit only organically — `chaos` + byte-slicing its multibyte `unicode` alphabet, `mutated` corrupting the + pools' few multibyte characters), and the splice makes the + unparseable-and-invalid-UTF-8 notice ordering in the worker hot. +- A few `invalid`-bucket templates (`[a=`, `[a~`, `[a="bc`, `a.`) reach + attribute / string / class parse guards that random structural generation + rarely lands on. With them the per-file numbers above are stable + at the documented 3000-seed window (e.g. `class-wp-css-class-selector.php` + reaches 10/10 reliably rather than depending on whether a bare `.` happened + to be sampled) — apart from the two borderline-frequency lines listed + above. diff --git a/tools/css-selector-fuzz/FINDINGS.md b/tools/css-selector-fuzz/FINDINGS.md new file mode 100644 index 0000000000000..16ed2d734e79e --- /dev/null +++ b/tools/css-selector-fuzz/FINDINGS.md @@ -0,0 +1,173 @@ +# CSS Selector Fuzzer — Findings + +Run: branch `html-css-fuzz` @ `5da3afedd0`, PHP 8.4.21. 5000 deterministic +seeds, 0 crashes/timeouts. Three distinct, reproduced WordPress-core correctness +bugs in the new HTML-API CSS selector support. Every selector below is valid, +supported CSS that the API mis-handles **without** reporting lack of support. + +**Status: all three bugs are fixed on this branch** (commit prefix +`CSS selector:` — Bug 1 `aed6cfb4aa`, Bug 2 `989e18da8a`, Bug 3 `0a87b20178`), +each with PHPUnit regression tests that fail pre-fix. A post-fix 5000-seed run +is clean (0 failures, 0 crashes). The repros below no longer trigger; they +remain as regression anchors and Trac-ready minimal test cases. 
+ +No new bugs surfaced beyond these three, and no fuzzer-side (oracle or +generator) defect surfaced: with all three fixes applied a 5000-seed run is +completely clean, and the lexbor differential (third independent oracle) agreed +with the reference matcher on every compared no-quirks case (0 `lexbor-divergence`). +Caveats on the strength of that agreement: roughly half of the `compared` +cases (and ~62% of all match assertions across buckets) are vacuous `[] == []`; +older lexbor builds with #368 exclude quirks-mode class/ID matching from the +differential, though current harnesses include it when the startup probe reports +reliable class/#id behavior in both no-quirks and quirks mode. See `README.md` +for the full disclosure. + +Reproduce any case: `php tools/css-selector-fuzz/replay.php --selector '<selector>' [--html '<html>']`. +Auto-minimize a failing seed: `php tools/css-selector-fuzz/minimize.php --seed <seed>` +(faithful for seeds with a self-contained failure; seeds whose only recorded +failure is generator-side — `ast-mismatch`, `parse-expectation` — are refused +unless a related self-contained signature is opted into with `--signature`). + +--- + +## Bug 1 — Identity escapes mis-decode after a multibyte character (mis-parse) + +**Invariant:** `ast-mismatch` (57 hits, the dominant signature). + +`WP_CSS_Selector_Parser_Matcher::consume_escaped_codepoint()` decodes a +non-hex ("identity") escape with: + +```php +$codepoint_char = mb_substr( $input, $offset, 1, 'UTF-8' ); +``` + +`$offset` is a **byte** offset (threaded by reference through the whole +selector-string parse), but `mb_substr()`'s 2nd argument is a **character** +index. The two diverge by one per multibyte continuation byte seen earlier in +the string, so an identity escape preceded by any multibyte content decodes the +**wrong codepoint** (reads N characters too far right, N = preceding continuation bytes). 
+ +Minimal reproduction (second selector's type should be `sup`): + +| selector | parsed context type | +|---|---| +| `#abc,\sup #x` | `sup` ✅ | +| `#Ü,\sup #x` | `uup` ❌ | +| `#ÜÜ,\sup #x` | `pup` ❌ | +| `#ÜÜÜ,\sup #x` | `" up"` ❌ | + +Hex escapes (`\75 `) use byte-correct `substr` and are unaffected; only the +non-hex identity-escape branch is wrong. Depending on what wrong codepoint is +produced this also causes spurious parse failures (a valid selector returns +`null`). + +**Fix (landed in `aed6cfb4aa`):** read the next codepoint from the byte +offset: `mb_substr( substr( $input, $offset ), 0, 1, 'UTF-8' )`. + +--- + +## Bug 2 — Empty-value `^=` `*=` `$=` match everything instead of nothing (mis-match) + +**Invariant:** `match-mismatch-html` / `match-mismatch-tag` (12 hits). + +Per Selectors-4, `[attr^=""]`, `[attr*=""]`, `[attr$=""]` match **nothing** (an +empty operand never matches). `WP_CSS_Attribute_Selector::matches()` instead: + +- `^=`: `substr_compare($attr,'',0,0) === 0` → always 0 → matches any element with the attribute. +- `*=`: `strpos($attr,'') === 0` (PHP) → matches any element with the attribute. +- `$=`: matches elements whose attribute value is exactly `""`. + +`~=` is handled correctly (returns nothing for an empty/whitespace operand). + +Reproduction against a document containing an `I` element whose `x` attribute is empty and a `B` element whose `x` attribute is non-empty (e.g. `<i x=""></i><b x="y"></b>`): + +| selector | WP matches | spec | +|---|---|---| +| `[x^=""]` | `I, B` | none | +| `[x*=""]` | `I, B` | none | +| `[x$=""]` | `I` | none | +| `[x~=""]` | none ✅ | none | + +**Fix (landed in `989e18da8a`):** in `matches()`, return `false` for `^= $= *=` +when `'' === $this->value`, before the `substr_compare`/`strpos` calls. `~=` +needs no guard — a whitespace-delimited list never yields an empty item — and +a test pins that. (No `substr_compare` length edge exists here: `-strlen('')` +is `0`, and PHP clamps out-of-range negative offsets rather than erroring.) 
+ +--- + +## Bug 3 — Off-by-one length guard rejects `[name=x]` at end of string (false reject) + +**Invariant:** `parse-expectation` (1 hit; valid selector → `null`). + +`WP_CSS_Attribute_Selector::parse()` guards "need at least `=x]` remaining": + +```php +// need to match at least `=x]` at this point +if ( $updated_offset + 3 >= strlen( $input ) ) { + return null; +} +``` + +`>=` is off by one: it also rejects the exact-fit case where `=x]` **is** the +remaining tail. This rejects a valid attribute selector that uses the exact-match +`=` operator with a single-character **unquoted** value when its `]` is the last +character of the selector string. + +| selector | result | +|---|---| +| `[a=b]` | `null` ❌ | +| `div.x[y=z]` | `null` ❌ | +| `[a=bb]` | parsed ✅ (2-char value) | +| `[a="b"]` | parsed ✅ (quoted) | +| `[a^=b]` | parsed ✅ (2-char operator) | +| `[a=b].c` | parsed ✅ (trailing content) | + +**Fix (landed in `0a87b20178`):** change `>=` to `>` (need +`strlen - $updated_offset >= 3`). + +--- + +## Triage of the 5000-seed run (unpatched core) + +427 failures, every one attributable to one of the three bugs above. 
The +signature → bug mapping (and why each is a WP finding, not a fuzzer defect): + +| signature | hits | bug | how it manifests | +|---|---|---|---| +| `metamorphic-ast` (5 variants) | 328 | Bug 1 | a re-rendered / escaped variant of a selector parses to a different AST because an identity escape after multibyte content mis-decodes | +| `ast-mismatch` | 71 | Bug 1 | generated AST ≠ parsed AST, same root cause | +| `path-expectation` | 1 | Bug 1 | a path-directed selector with a multibyte-then-identity-escape value (`Über90\ x`) mis-parses, so the element it was built from no longer matches | +| `metamorphic-parse` (4 variants) | 9 | Bug 3 | a re-rendered variant ending in a single-char unquoted value at EOF is wrongly rejected | +| `parse-expectation` (2 variants) | 7 | Bug 3 | the generated selector itself ends in `=x]` and is wrongly rejected (e.g. `[dir =a]`) | +| `match-mismatch-html` | 7 | Bug 2 | empty-operand `^= *= $=` match elements the spec says they must not | +| `match-mismatch-tag` | 4 | Bug 2 | same, via the tag processor | + +Zero `lexbor-divergence`, zero `model-desync`, zero crashes/timeouts. With all +three fixes applied, the same 5000 seeds run with **0 failures** — confirming +the fuzzer reports exactly these three bugs and nothing spurious. + +## Fuzzer status + +Implemented and validated: + +- Deterministic seeds, seed-based replay, self-check suite + (`php tools/css-selector-fuzz/tests/self-check.php` passes). +- Seven-bucket selector generation including **path-directed** synthesis + (combinator positive-match rate ~68% vs ~14% before) and **edge-escape** + (U+FFFD escape decoder, input normalization). +- Three independent match oracles: the spec-faithful `ReferenceMatcher`, the + AST round-trip, and a **lexbor differential** (liblexbor v3.0.0, no-quirks + documents, tree-equality gated). The three agree on every compared case. 
+- **Metamorphic invariants** (oracle-free): meaning-preserving transforms keep + the match set; AST-preserving transforms keep the AST. +- **Parser-derived oracle tree** (`TreeCapture`): the processor's own parse is + ground truth, so **wild / restructured HTML** and **`<body>` fragments** are + fuzzed, not only clean trees. +- **Line coverage** measured (93.4%, see `COVERAGE.md` — the source of truth + for current numbers; 96.2% effective, remainder justified). +- **Automatic minimizer** (`minimize.php`): delta-debugs selector and HTML to a + minimal reproducer preserving a chosen signature. + +See `README.md` for usage and `NEXT-STEPS.md` for the roadmap this work +completed. diff --git a/tools/css-selector-fuzz/NEXT-STEPS.md b/tools/css-selector-fuzz/NEXT-STEPS.md new file mode 100644 index 0000000000000..322d666ad1fe4 --- /dev/null +++ b/tools/css-selector-fuzz/NEXT-STEPS.md @@ -0,0 +1,363 @@ +# CSS Selector Fuzzer — Next Steps / Improvement Roadmap + +> **Status: all seven work items below are implemented and validated** (see +> `README.md`, `COVERAGE.md`, `FINDINGS.md`). The acceptance bar is met: +> coverage measured (93.4%; 96.2% effective — `COVERAGE.md` is the source +> of truth for the current numbers, remainder justified); +> three oracles agree on no-quirks supported cases with every divergence +> triaged; metamorphic invariants passing; combinator positive-match rate +> raised from 14.5% to ~68% (path-directed bucket); minimizer working; a clean +> 5000-seed run with all signatures triaged to the three known bugs, all of +> which still reproduce. The notes below are retained as the design rationale. +> +> **Core fixes landed:** the three FINDINGS.md bugs are fixed on this branch +> (`CSS selector:` commits `aed6cfb4aa` / `989e18da8a` / `0a87b20178`), each +> with PHPUnit regression tests. A post-fix 5000-seed run is clean. 
+> +> **Fuzzer-side follow-up hardening implemented (2026-06-12):** +> `tests/self-check.php` now allowlists known core parse-bug signatures in its +> fixed seed-window parse-expectation loop, while unknown mismatches still fail. +> The safe and wild document generators now inject NUL into random class tokens +> and expose the decoded U+FFFD token to class-selector generation, without +> leaking raw class values into the generic attribute-value pool. The lexbor +> differential includes quirks documents whenever the startup probe confirms +> class/#id behavior in both no-quirks and quirks mode (local master-built +> harness `3a2d595fe8c50e5076ac79c02b2ded79a777bb52` passes), and `runner.php` +> reports per-bucket/per-target vacuous and non-vacuous match assertion rates +> under `matchStats`. +> +> **Candidate finding 4 — FIXED:** per CSS Syntax 3 §4.3.8, `\` followed by +> EOF is a valid escape (EOF is not a newline), and §4.3.7 says consuming it +> returns U+FFFD — so `.foo\` parses as class `foo\u{FFFD}`. Verified against +> lexbor (agrees: `.foo\` matches class `foo\u{FFFD}`; `\` parses as type +> `\u{FFFD}`). Fixed on this branch (`CSS selector:` commit): EOF guard in +> `consume_escaped_codepoint()` returns U+FFFD, `next_two_are_valid_escape()` +> accepts a backslash as the final byte. Review of the fix surfaced a second +> bug in the same family: `normalize_selector_input()` trimmed *trailing* +> whitespace before tokenizing, so `.foo\ ` (escaped space — valid class +> `foo `, matches nothing) and `.foo\<newline>` (invalid escape — must be +> rejected) both collapsed to `.foo\` and matched class `foo\u{FFFD}` — a +> wrong-match-set bug. Fixed by switching to `ltrim()`; the grammar consumes +> insignificant trailing whitespace. Fuzzer updated to match: the lone `\` +> invalid-bucket entry became `\<newline>` (still invalid), and `edge-escape` +> gained an `eof-escape` kind covering `.name\` / `#name\` / `name\`. 
+> +> **Candidate finding 5 (recorded 2026-06-10, low severity, not fixed):** +> the attribute-selector case modifier is matched byte-wise (`i`/`I`/`s`/`S` +> literals), so an *escaped* modifier ident like `[a=b \69]` (tokenizes to +> the ident `i`) is rejected. Per the Selectors-4 grammar `<attr-modifier> = +> i | s` these are ident tokens, so escapes should arguably be accepted — +> but browsers are themselves inconsistent (Chromium accepts `[a=b \69]` +> and rejects `[a=b \73]`). Fail-safe refusal, not a mis-match; revisit only +> if the matcher ever moves to token-level parsing. +> +> **EOF auto-close for attribute selectors — IMPLEMENTED (2026-06-10):** +> per CSS Syntax 3 §5.4.8/§4.3.5, the end of input closes an unterminated +> simple block (and an unterminated string), so `[att=val`, `[att`, +> `[att="a b`, and `[att=val i` are valid selectors; grammar-level +> truncations (`[`, `[a=`, `[a~`, `[a=b, div`) stay invalid. Verified +> against Chromium form-by-form, including an exhaustive per-byte +> truncation table in review. lexbor rejects all EOF-truncated forms +> (drafted as `lexbor/UPSTREAM-ISSUES.md` issue 4); the differential is +> unaffected because it compares canonical re-renders. Fuzzer gained an +> `eof-truncated` edge-escape kind and the invalid corpus was reshuffled +> along the new validity boundary; COVERAGE.md regenerated. +> +> **Invalid-UTF-8 input policy — IMPLEMENTED as scrub (2026-06-11):** +> selector strings are UTF-8 text; `normalize_selector_input()` now decodes +> the byte stream first via `wp_scrub_utf8()` (WP 6.9, maximal-subpart +> U+FFFD replacement, matching the WHATWG decoder CSS Syntax §3.2 invokes), +> and reports a `_doing_it_wrong()` (named `<class>::from_selectors`) when +> the input changed. 
The `mb_substitute_character()` leak in +> `consume_escaped_codepoint()` is gone structurally: the identity arm's +> `mb_substr()` fallback is replaced by "consume the maximal subpart the +> `_wp_scan_utf8()` scan already reported, return one U+FFFD" — reachable +> only via direct `parse()` calls with un-normalized input, and consistent +> with the scrub when it is. Decision history: reject (`wp_is_valid_utf8()` +> → null) and raw passthrough were rejected after a three-persona +> adversarial panel; scrub is the unique option stable under both the +> current raw value getters and their likely scrubbed future. The U+2603 +> canary in `wpCssSelectorParserMatcher.php` set_up() is retained +> permanently — its job inverted from documenting the leak to proving +> setting-independence. Worker.php learned the notice contract (scrub +> notice expected iff `!wp_is_valid_utf8(selector)`) and flushes the +> `select()` parse caches before each notice-assertion window so the +> once-per-parse notice is deterministic under case re-runs. +> **Linked obligation:** the select-level pin +> `test_select_scrubbed_selector_does_not_match_raw_invalid_document_bytes` +> documents that scrubbed selectors cannot match raw invalid document +> bytes; if the HTML API value getters (`get_attribute()`, `class_list()`, +> …) are ever changed to scrub their return values, that case flips to a +> match and the pin must be updated in the same change. +> **Optional follow-up:** tightening the `parse()` prototype from public to +> protected (the classes are `@access private`) would make un-normalized +> input structurally impossible and let the defensive escape arm be +> deleted. +> +> **HTML case-insensitive attribute value list — IMPLEMENTED (2026-06-10):** +> per https://html.spec.whatwg.org/multipage/semantics-other.html#case-sensitivity-of-selectors +> the values of ~46 listed attributes (`type`, `rel`, `lang`, `dir`, +> `media`, ...) 
match ASCII case-insensitively on HTML elements when the +> selector has no modifier; an explicit `s` still forces sensitivity, and +> elements outside the html namespace are unaffected. Oracle notes from +> verification: +> - **lexbor does not implement the rule at all** (`[rel=nofollow]` does +> not match `rel="NOFOLLOW"`) — compensated in the differential the same +> way as lexbor #368 (lexbor is compared against the reference run with +> the list disabled); drafted as `lexbor/UPSTREAM-ISSUES.md` issue 5. +> - The case-flip generator twist in `path_attr_feature` makes the folding +> load-bearing for `mustMatchFid` (mutation-tested: disabling the core +> branch fires 11 failures in 3000 seeds). Minor leftovers from review: +> the unused `expected_*_processor_matches` back-compat helpers in +> `ReferenceMatcher` silently default rows to the html namespace — fine +> today (the safe model generator emits no foreign content) but a trap +> for a future caller; and `s`-forces-sensitivity only gets differential +> coverage when sampled values happen to differ in case (pinned by unit +> tests instead). +> - **Chromium applies the list to foreign elements too** (`[type=TEXT]` +> matches ``), diverging from the HTML spec's "on an +> HTML element" scoping. WP follows the spec (html namespace only, via +> `get_namespace()`). The standalone Tag Processor has no namespace +> tracking and applies the list to every element — an inherent +> tag-processor approximation, same as its ancestor-blind matching. +> +> **Session decisions (2026-06-10, both since implemented — see the +> IMPLEMENTED entries above):** EOF-truncated selectors (`div[a=b`) are +> spec-conformant (CSS Syntax auto-closes open blocks at EOF) rather than +> documented as an intentional rejection. HTML's default case-insensitive +> attribute value list is implemented (no-modifier + html-namespace + +> listed attribute; explicit `s` keeps forcing case-sensitivity). 
+> Grammar-level truncations (`[`, `[a=`, `div >`, `div,`) stay invalid — +> browsers reject those too. No Trac tickets for any of this. +> +> **O(1) identity-escape decode — IMPLEMENTED (2026-06-11, perf only):** +> `consume_escaped_codepoint()`'s identity arm no longer copies the input +> tail per escape (`mb_substr( substr( … ) )`); it sizes the code point in +> place with `_wp_scan_utf8( $input, $at, $invalid_length, 4, 1 )` +> (`compat-utf8.php`, WP 6.9). 200KB all-escape selector: 180 ms → 45 ms, +> scaling now linear (47/90/180 ms at 200/400/800KB; previously ~4× per +> doubling). Behavior is byte-identical by construction: escapes of +> *invalid* UTF-8 still fall through to the literal old `mb_substr()` line +> (re-verified ~74M differential cases, 0 mismatches, including non-default +> `mb_substitute_character` settings), so the open invalid-UTF-8 policy +> decision is untouched — and that fallback path remains quadratic for +> selectors made of escaped invalid bytes (accepted; developer-supplied +> input). Caution recorded in-code: `_wp_utf8_codepoint_span()` looks like +> the natural helper but passes `max_bytes = null`, making its ASCII +> fast-path O(tail) per call — quadratic again. Escape pin coverage grew to +> 14 cases (2/3/4-byte chars incl. at-EOF, NUL, each invalid-byte class). +> (Superseded the same day for invalid bytes: the `mb_substr()` fallback and +> its quadratic tail were removed by the scrub implementation — see the +> invalid-UTF-8 policy entry above.) +> +> **Fuzzer coverage for the scrub surface — IMPLEMENTED (2026-06-11):** +> the deferred coverage work for the invalid-UTF-8 scrub landed in three +> pieces. 
(1) A dedicated `invalid-utf8` generator bucket injects raw +> ill-formed sequences into class/ID/attribute-name idents and quoted +> string operands and carries the post-scrub AST; the per-class maximal- +> subpart U+FFFD counts are pinned independently of `wp_scrub_utf8()` +> (self-check additionally duplicates the class names and byte values, so +> a deleted or drifted table entry fails instead of shrinking the +> assertion). (2) A `mutated`-bucket splice kind inserts raw ill-formed +> sequences at arbitrary byte offsets — no expectations, but it makes the +> worker's invalid-UTF-8 rejection branch hot (scrub + two `select()` +> notices), which no other bucket reached. (3) The explicit lexbor probe: +> lexbor accepts raw invalid selector bytes and replaces them with U+FFFD, +> but NOT per the WHATWG maximal-subpart rule — one U+FFFD per byte for +> truncated sequences (`E2 8C` → 2, spec 1) and one per whole sequence for +> UTF-8-encoded surrogate halves (`ED A0 80` → 1, spec 3) — drafted as +> `lexbor/UPSTREAM-ISSUES.md` issue 6. The differential is unaffected and +> stays live for the bucket: it feeds lexbor the canonical re-render of +> the post-scrub AST (escaped, pure ASCII), the same mechanism that +> sidesteps lexbor's other byte-level parsing bugs. Doc-side observation: +> lexbor keeps raw invalid bytes in the DOM unchanged (same stance as the +> Tag Processor), so raw doc bytes match nothing in either engine. The +> handoff's optional metamorphic relation `parse(s) === parse(scrub(s))` +> was skipped deliberately: it is near-tautological (it could only catch +> a `from_selectors()` bypass, and no public path bypasses it). +> +> **Still open:** `gen_chaos()`'s whole-codepoint `unicode` branch is dead +> code — it compares the alphabet *string* against the key `'unicode'` after +> the value lookup already happened — so the unicode alphabet is byte-sliced +> by the generic fallback instead. 
That slicing is what makes chaos emit +> invalid UTF-8 organically (~15% of chaos cases), so making the branch live +> is a behavior decision, not just a cleanup: it would remove chaos's organic +> ill-formed-byte production, leaving the deliberate paths (`invalid-utf8` +> bucket, `mutated` splice) plus `mutated`'s residual organic corruption of +> pool multibyte characters (~2% of mutated cases even without the splice). + +Repo: `/Users/jonsurrell/a8c/wordpress-develop/html-css-fuzz`, branch +`html-css-fuzz` (trunk + merged `html-api/add-css-selector-parser`). +PHP 8.4.21. The fuzzer and all fixes are committed on this branch +(`CSS selector:` / `CSS selector fuzz:` prefixed commits). `/artifacts` is +gitignored (runner output lives there). + +## Measured weaknesses driving this plan + +- Match oracle is a hand-reimplementation (`ReferenceMatcher`) by the same author + who could share a spec misreading with WP — no third opinion exists today. +- Positive-match rate is low (measured): supported-compound 39.6% of parseable + cases match ≥1 element; supported-complex only **14.5%**; ~72% of all supported + cases match nothing. Most match assertions are vacuous `[] == []`. +- The "structurally safe element set" restriction (needed so `model == + parse-tree` holds) means combinator/breadcrumb matching is only ever tested on + clean trees — never on foster-parented / adoption-agency / foreign-content / + implied-end-tag restructured trees, which are the hard cases. +- No metamorphic invariants (the cheapest oracle-free signal class) — absent. +- Line coverage never measured. Some target branches are provably unreachable by + the current generator (e.g. `consume_escaped_codepoint`'s U+FFFD path for + null/surrogate/over-max codepoints; the `normalize_selector_input` NUL→U+FFFD + path). +- No automatic minimizer (the sibling `html-api-fuzz` branch ships + `tools/html-api-fuzz/minimize.php` as a pattern to copy). 
+- Match path only exercises `WP_HTML_Processor::create_full_parser`; fragment + contexts and varied quirks-mode triggers (only doctype presence is toggled) + are untested. + +## Work items, in priority order + +### 1. Metamorphic invariants (cheapest, highest signal-per-effort, no deps) + +Add oracle-free relations to `Worker::run_case` that must hold for any parseable +supported selector over any document. For each, transform the selector, assert +the match set (both processors) is unchanged — or for AST-level ones, assert the +extracted AST is unchanged: + +- ASCII-case-fold a type-selector name → identical matches (type names are + case-insensitive). +- Reorder subclass selectors within a compound (`div.a#b[c]` ≡ any permutation of + the subclass part) → identical matches and structurally-equivalent AST. +- Escape an arbitrary ident codepoint that does not require escaping → identical + AST and matches (exercises the escape decoder against the no-op case). +- Append a redundant universal (`sel` vs `sel:where`-free `*sel` where the type + slot is empty → `*` + subclasses) → identical matches. +- Duplicate a selector-list branch (`a, a`) → identical matches. +- Whitespace-insert around combinators and commas where insignificant → identical + matches. + +These need no external engine; they would have independently caught Bug 1. + +### 2. Path-directed generation (fix the 14.5% positive-match rate) + +Add a generation mode that GUARANTEES positive matches and meaningful negatives: + +- Pick a random element in the generated model tree. +- Synthesize a selector that must match it: type from its tag, subclasses from a + subset of its real classes/id/attributes, and (for complex) a context chain + drawn from its real ancestor tags with `>`/descendant combinators matching the + actual nesting. 
+- Emit the matching selector (assert it matches that element) AND near-miss + mutations (swap one ancestor combinator `>`↔descendant, drop/extend a class, + change one attribute operator) and assert the flip. + +This makes the combinator/breadcrumb walker actually exercised with real depth +and real positive/negative boundaries instead of mostly-empty match sets. Keep +the existing buckets; add this as a new bucket. + +### 3. lexbor differential oracle (the match-oracle correctness ceiling) + +Use lexbor as a THIRD, independent oracle — primarily to validate +`ReferenceMatcher`, secondarily to unlock wilder HTML. Build cost is acceptable +(confirmed by maintainer). Refs: https://lexbor.com/modules/selectors/ and the +HTML module for selector matching. + +Design: + +- C harness linking liblexbor: read many `{html, selector}` cases from stdin + (one process, batched — invoked by the runner like the existing PHP worker + subprocess; isolate crashes). Parse HTML with `lxb_html_document_parse`, parse + the selector with the CSS/selectors module, run `lxb_selectors_find`, and via + the callback collect each matched element's unique `data-fid` attribute. Emit + one line of matched-fid sets per case. FFI to `liblexbor.so` is an acceptable + alternative but a standalone CLI isolates crashes better. +- Mark every generated element with a unique `data-fid` (the generator already + does this). +- **Tree-equality gate:** only run the differential on cases where WP's tree and + lexbor's tree agree (compare the fid→tag→breadcrumb sequence from each). This + isolates the SELECTOR layer from HTML tree-construction differences (which are + a different fuzzer's concern — see `html-api-fuzz`). Bonus: this gate lets you + fuzz ARBITRARY/wild HTML and keep any case where the two trees agree, which + relaxes the current "safe element set" restriction and reaches restructured + trees the present generator can't produce. 
+- Three-way verdict: `reference ≠ lexbor` ⇒ fuzzer-oracle bug (fix the fuzzer); + `reference == lexbor ≠ WP` ⇒ high-confidence WP finding. + +**CRITICAL CAVEAT — quirks-mode / case-sensitivity:** lexbor has a known +class/ID case-sensitivity bug — https://github.com/lexbor/lexbor/issues/368. +WP folds class/ID names ASCII-case-insensitively in QUIRKS mode and +case-sensitively in no-quirks (`WP_HTML_Tag_Processor::is_quirks_mode()`); type +names are always case-insensitive. Do NOT trust lexbor on quirks-mode case +behavior. Restrict the lexbor differential to **no-quirks documents** (emit +`<!DOCTYPE html>`), and keep `ReferenceMatcher` as the authority for the +quirks-mode path. Record the exact lexbor master commit used and note whether +#368 is fixed in it. Re-evaluate enabling quirks comparison only after verifying +lexbor's behavior against that issue. + +- Also surface (don't auto-fail) **attribute default case-insensitivity**: + Selectors-4/HTML define a set of attributes matched case-insensitively by + default; WP appears to implement only explicit `i`/`s` modifiers. lexbor may + implement the default set, producing divergence that is either a real WP + conformance gap or an intentional subset limitation — triage per case and + report. + +### 4. Parser-derived oracle tree (decouple "tree right" from "match right") + +Instead of asserting `model == parse-tree`, walk the processor ONCE to capture +the ground-truth tree (fid → tag → breadcrumbs → attributes), then run both the +reference matcher and the lexbor differential against arbitrary/wild HTML using +that captured tree as truth for the selector layer. This is the structural +change that makes #3's wild-HTML mode fully general and lets the generator reuse +`html-api-fuzz`'s nasty-HTML generator. (`model-desync` becomes a separate, +optional sanity check rather than a precondition.) + +### 5. 
Coverage measurement + reach the unreachable branches + +- Wire line/branch coverage (phpdbg is available: `phpdbg -qrr` with + coverage, or install pcov/xdebug) over the `src/wp-includes/html-api/css/` + classes; gate "done" on a coverage target and a written list of intentionally- + unreached lines. +- Add a generator path emitting raw hex escapes for null / surrogate + (U+D800–U+DFFF) / over-max (> U+10FFFF) codepoints and assert they decode to + U+FFFD (currently unreachable — the renderer only escapes real codepoints). +- Fuzz NUL bytes and CR/FF in the selector INPUT to exercise + `normalize_selector_input` (NUL→U+FFFD, CR/CRLF/FF→LF). + +### 6. Automatic minimizer + +Port the delta-debugging pattern from `tools/html-api-fuzz/minimize.php`: given a +failing seed, shrink both the HTML and the selector (byte/structural deletes, +keep-failing) to a minimal reproducer. Wire into `replay.php` or a new +`minimize.php`. Bugs 1 and 3 were hand-minimized; automate it. + +### 7. Broaden match surface + +- Run the match oracle through `create_fragment` with varied fragment contexts, + not just `create_full_parser`. +- Vary all quirks-mode triggers (not only doctype presence): no-doctype, + malformed doctype, ``, limited-quirks doctypes. + +## Acceptance bar for "exacting standards" + +- Coverage measured and reported for the `css/` classes, with a justified list of + any unreached lines. +- Three independent oracles agree on no-quirks supported cases (AST round-trip, + `ReferenceMatcher`, lexbor); divergences are triaged to either a WP finding or + a fuzzer-oracle fix — never left ambiguous. +- Metamorphic invariants in place and passing. +- Positive-match rate for combinator selectors materially raised (path-directed + generation): ~68% in that bucket vs ~14% before, so the combinator/breadcrumb + walker is genuinely exercised. 
(Aggregate across all buckets remains ~62% + vacuous `[] == []`, by design — the negative-oriented and parse-focused + buckets are intentionally mostly empty-set; see `README.md`.) +- Minimizer produces minimal repros automatically. +- A clean multi-thousand-seed run with all signatures triaged; `FINDINGS.md` + updated with any new bugs (each with a minimal repro and a one-line fix + direction), and confirmation that the three known bugs still reproduce. + +## Existing bugs to keep verifying (regression anchors) + +From `FINDINGS.md` — all three are fixed on this branch and pinned by PHPUnit +tests; the minimal repros must now NOT trigger (a clean 5000-seed run confirms): +1. Identity escape after multibyte mis-decodes: `#Ü,\sup #x` → type must be `sup`. +2. Empty-value substring matchers: `[x^=""]`, `[x*=""]`, `[x$=""]` must match nothing. +3. Off-by-one length guard: `[a=b]` (single-char unquoted value, exact `=`, at EOF) must parse. diff --git a/tools/css-selector-fuzz/README.md b/tools/css-selector-fuzz/README.md new file mode 100644 index 0000000000000..655d52293cc29 --- /dev/null +++ b/tools/css-selector-fuzz/README.md @@ -0,0 +1,230 @@ +# CSS Selector Fuzzer + +Generative fuzzer for the HTML API CSS selector support: +`WP_CSS_Compound_Selector_List`, `WP_CSS_Complex_Selector_List`, and the +`select()` methods on `WP_HTML_Tag_Processor` and `WP_HTML_Processor`. + +Every case is fully deterministic from its integer seed: the same seed always +produces the same document, the same selector, and the same verdict. + +## What a case does + +1. Generate a random HTML document — 70% from a structurally "safe" element + set with a known model tree (of these, ~20% are parsed as a `<body>` + fragment via `create_fragment` instead of a full document, exercising the + fragment `select()` path), 30% "wild" (misnested, implied-end-tag, + foreign-content, token soup with one of five doctypes spanning no-quirks, + quirks, and limited-quirks). 
`create_fragment` only accepts the `<body>` + context publicly, so that is the fragment context fuzzed. +2. Capture the processor's own view of the document as the matching oracle's + ground truth (`TreeCapture`): a flat list of rows in visit order, each + carrying the element's tag, attributes, and ancestor tag list (context + selectors are type-only, so that is everything matching can observe). + For safe documents the capture must agree with the generated model + (`model-desync`) — that soundness check is what justifies trusting the + capture on wild documents. Wild documents that hit a construct the + processor bails on (foster parenting, complex adoption-agency runs) are + deterministically regenerated a bounded number of times. +3. Generate a selector in one of nine buckets: + - `supported-compound` — must parse in both grammars; carries intended AST. + - `supported-complex` — uses `>`/descendant combinators; must parse only + in the complex grammar; carries intended AST. + - `path-directed` — synthesized from a real element of the generated tree + (type from its tag, subclasses from its actual classes/id/attributes, + context chain from its actual ancestors), guaranteed by construction to + match that element — or flipped into a near-miss (wrong type/class/attr + guarantees a non-match; loosening `>` to descendant must keep matching). + The guarantee is asserted against the reference matcher + (`path-expectation`). Within this bucket ~67% of match assertions are + non-vacuous (positive-match rate ~68% for combinator selectors, vs ~14% + in `supported-complex`). Across *all* buckets ~38% of match assertions + are non-vacuous: the negative-oriented buckets (`unsupported`, + `invalid`, much of `supported-*`) and `edge-escape` (which targets the + parse/escape-decode path, not matching) are intentionally mostly + empty-set, so the aggregate `[] == []` rate is ~62%. 
The point of + path-directed generation is that the *combinator/breadcrumb* walker — + the part most likely to harbor a matching bug — is now exercised with + real depth, not that every assertion is non-vacuous. + `runner.php` persists per-bucket/per-target match assertion counts and + vacuous/non-vacuous rates under `matchStats` in `state.json`, so this + distribution is reported on every run instead of relying on stale notes. + - `unsupported` — valid CSS the API intentionally rejects (pseudo-classes + and -elements, `+`/`~`/`||` combinators, namespaces, non-type context + selectors); must not parse. + - `invalid` — not valid CSS; must not parse. + - `invalid-utf8` — a small supported selector with a raw ill-formed UTF-8 + byte sequence (lone continuation, truncated 2/3/4-byte, overlong, + surrogate half, beyond U+10FFFF) injected into a class/ID/attribute + ident or string operand; `from_selectors()` scrubs the input first, so + the case must parse and carries the post-scrub AST (one U+FFFD per + maximal subpart, with per-class subpart counts pinned independently of + `wp_scrub_utf8()`). + - `chaos` — arbitrary bytes; no parse expectation. + - `mutated` — a supported selector with random byte mutations, including + raw ill-formed UTF-8 splices at arbitrary byte offsets; no parse + expectation. + - `edge-escape` — selectors that exercise otherwise-unreachable parser + branches: hex escapes for NUL / surrogate / over-max codepoints (must + decode to U+FFFD) and raw NUL / CR / CRLF / FF bytes in the input (must + normalize per `normalize_selector_input`); carries the intended AST. +4. Check invariants: + - No PHP error/warning/exception from parsing or matching, ever. + - Parse result (instance vs `null`) matches the bucket's expectation. + - Anything the compound grammar parses, the complex grammar parses, and + both produce the same AST. + - Parsed AST equals the generated AST (escapes, strings, whitespace and + case randomization must not change meaning). 
+ - For any selector that parses (including chaos/mutated), the `select()` + match set equals an independent spec-faithful reference matcher, on both + processors, including quirks-mode class/ID case-insensitivity. + - For any selector that does not parse, `select()` returns `false`, + `_doing_it_wrong` fires exactly once per call (also via the parse + cache), and the processor remains usable. + - The processor ends with no `get_last_error()`/unsupported state. + - Metamorphic relations (oracle-free, run on otherwise-clean cases whose + selector parsed): meaning-preserving transforms of the selector must + select exactly the same elements as the original, and AST-preserving + transforms must parse to exactly the transformed AST. Transforms: + re-render with fresh whitespace/quoting and aggressive no-op escapes, + ASCII-case-fold of type names, subclass reordering within a compound, + explicit `*` for an omitted type, and selector-list branch duplication. + Skipped for ASTs containing invalid UTF-8 (reachable only from + chaos/mutated inputs), which the renderer cannot round-trip. + - lexbor differential (third, independent oracle; requires the harness — + see below): on full-document cases whose selector parsed, a canonical + re-render of the verified AST is matched by liblexbor and compared, + as a multiset of fids, against the reference matcher. Quirks documents + participate only when the startup probe confirms lexbor's class/#id + folding behavior in both no-quirks and quirks mode. Gated on WP and + lexbor building the same element tree (fid/tag/ancestry), so it tests + the selector layer, not tree construction. Verdicts: + `lexbor-divergence` (lexbor ≠ reference) is a fuzzer-oracle problem; + `match-mismatch-html` with no accompanying divergence means reference + == lexbor ≠ WP — a high-confidence WP finding. + - Repeating a case yields a byte-identical result digest (determinism). 
+ Note the digest covers the WP-under-test surface (selector, html, + parse-nullness, ASTs, failure invariants) but **not** the lexbor + oracle's own output, so it would not flag a flaky lexbor result that + never escalates to a `lexbor-divergence` failure. + +## lexbor harness + +Build with `sh tools/css-selector-fuzz/lexbor/build.sh` (clones and builds +liblexbor from upstream `master`; the build script prints the exact commit). +The worker auto-detects the binary at `tools/css-selector-fuzz/lexbor/harness` +and reports per-batch tallies, persisted to `state.json` under `lexbor`: + +- `compared` — the differential ran and matched fid-multisets. +- `tree-gated` — WP and lexbor built different trees; differential skipped. +- `skipped-quirks` / `skipped-utf8` — quirks document while lexbor class/#id + case behavior is not trusted / non-UTF-8 AST. +- `n/a` — the differential does not apply (unparseable selector, fragment, no + captured tree). +- `unavailable` / `error` — the harness was missing or died. The runner prints + a loud warning if these appear after the harness had run, so a third oracle + that dies mid-run cannot hide behind a green run. + +Known lexbor issues compensated for when present: + +- [#368](https://github.com/lexbor/lexbor/issues/368): + class and `#id` selectors match ASCII case-insensitively even in + no-quirks documents (`[id=…]` attribute matching is correctly + case-sensitive). Detected by a startup probe; when present, lexbor is + compared against the reference matcher run with quirks-style class/ID + folding, and quirks-mode documents are excluded from the differential + entirely. The same startup probe also checks class and `#id` selectors in + quirks mode; only when all four probes pass is quirks-mode class/ID matching + included in the differential. +- lexbor rejects uppercase `I`/`S` attribute-selector modifiers, and its + non-ASCII ident-codepoint table omits U+00B7 and U+00C0–U+00F6 (it + starts at U+00F8), rejecting e.g. 
`.Über` while accepting `.über`. + Both sidestepped by the canonical re-render (lowercase modifiers, all + non-ASCII hex-escaped); both are candidate upstream reports, not WP + findings. +- `lxb_selectors_find` reports a node once per matching selector-list + branch; `LXB_SELECTORS_OPT_MATCH_FIRST` dedupes. +- lexbor matches `[x~=""]` against whitespace-only attribute values + (e.g. `x=" "`); Selectors-4 and Chrome say an empty operand never + matches a list item, and WP agrees with them. Latent + `lexbor-divergence` noise source if the generator ever pairs `~=""` + with whitespace-valued attributes; candidate upstream report, not a + WP finding. + +## Known oracle limitations (document-side decoding) + +The match oracle's independence differs between class and attribute selectors: + +- **Class values are matched by two genuinely independent tokenizers.** WP's + `select('.x')` goes through `WP_HTML_Tag_Processor::class_list()`, which + splits on ASCII whitespace and folds NUL → U+FFFD per token; + `ReferenceMatcher::class_matches()` reimplements that independently (and is + pinned against `class_list()` on NUL/FF boundary inputs by `self-check.php`). + The safe and wild random document generators now inject NUL into class + tokens occasionally and expose the decoded U+FFFD token to class-selector + generation. Raw class attribute values are intentionally kept out of the + generic `attrValues` pool so attribute-selector generation does not inherit + class-list-only decoding semantics. +- **Attribute values are matched through a single shared read.** Both WP's + attribute matcher and `ReferenceMatcher::attr_matches()` read the same + `get_attribute()` output, so a value-decoding bug there would be shared and + invisible regardless of input — a genuine shared-oracle limitation that no + generator change can close (it needs an independent attribute-value decoder, + which lexbor partly provides on no-quirks documents). 
+ +## Usage + +Bounded fuzz run (process-isolated chunks, crash/hang attribution): + + php tools/css-selector-fuzz/runner.php --max-seeds 1000 --duration-seconds 60 + +Artifacts go to `artifacts/css-selector-fuzz/run-*/` and are intentionally +small: `state.json` (counters, per-signature tallies) and `failures.ndjson` +(one line per failure, with base64 selector + document for offline analysis). + +Replay a failure by seed: + + php tools/css-selector-fuzz/replay.php --seed 42 --show-html + php tools/css-selector-fuzz/replay.php --seed 42 --json + +Probe a specific selector: + + php tools/css-selector-fuzz/replay.php --selector 'section > div.cls' --html '
' + +Minimize a failing case to a small reproducer (delta-debugging; shrinks +both the selector and the HTML while preserving a failure signature): + + php tools/css-selector-fuzz/minimize.php --seed 1234 + php tools/css-selector-fuzz/minimize.php --selector 'sel' --html '<…>' --signature match-mismatch + +The minimizer drives `Worker::run_pair`, which checks only **self-contained** +invariants — those computable from the (selector, html) pair without the +generator's intended AST: `match-mismatch-*`, `metamorphic-*`, +`lexbor-divergence`, `parse-error`, `ast-shape`, `ast-cross-grammar`, and the +rejection checks. The generator-side invariants `ast-mismatch`, +`parse-expectation`, `path-expectation`, and `model-desync` are **not** +self-contained and cannot be reproduced from the pair alone. + +So `--seed` faithfully minimizes only seeds whose failure is self-contained. +The three known bugs each *also* surface a self-contained signature (Bug 1 → +`metamorphic-ast`, Bug 2 → `match-mismatch-html`, Bug 3 → `metamorphic-parse`), +but a seed whose recorded failure is *only* the generator-side form (e.g. a +Bug-1 seed that recorded `ast-mismatch` before the metamorphic phase ran) is +**refused by default** rather than silently retargeted — pass `--signature` +to opt into minimizing a related self-contained signature, which is then +clearly labelled as a retarget in the output. + +Run a batch in-process (no isolation, faster): + + php tools/css-selector-fuzz/worker.php --start-seed 1 --count 500 + +Measure line coverage of the `css/` classes (see `COVERAGE.md` for the +current report and a justified list of unreached lines): + + phpdbg -qrr tools/css-selector-fuzz/coverage.php --seeds 3000 --list-uncovered + +Options of note: + +- `runner.php --stop-on-failure` stops at the first failing chunk. +- `worker.php --determinism-every N` re-runs every Nth seed twice (default 16). 
+- `worker.php --max-failures N` stops a batch after N failures (default 200) + to bound artifact size. diff --git a/tools/css-selector-fuzz/coverage.php b/tools/css-selector-fuzz/coverage.php new file mode 100644 index 0000000000000..64c60a7d5e42c --- /dev/null +++ b/tools/css-selector-fuzz/coverage.php @@ -0,0 +1,92 @@ +#!/usr/bin/env php + $lines ) { + foreach ( $lines as $line => $hits ) { + $oplog[ $file ][ $line ] = true; + } + } +} + +$executable = phpdbg_get_executable( array( 'files' => $targets ) ); + +$total_exec = 0; +$total_covered = 0; + +foreach ( $targets as $file ) { + $exec_lines = array_keys( $executable[ $file ] ?? array() ); + $covered_lines = array_keys( $oplog[ $file ] ?? array() ); + $covered_lines = array_intersect( $covered_lines, $exec_lines ); + $uncovered = array_diff( $exec_lines, $covered_lines ); + + $total_exec += count( $exec_lines ); + $total_covered += count( $covered_lines ); + + printf( + "%-55s %4d/%4d lines %5.1f%%\n", + basename( $file ), + count( $covered_lines ), + count( $exec_lines ), + count( $exec_lines ) > 0 ? 100 * count( $covered_lines ) / count( $exec_lines ) : 100 + ); + + if ( $list_uncovered && array() !== $uncovered ) { + $source = file( $file ); + sort( $uncovered ); + foreach ( $uncovered as $line ) { + printf( " !%4d %s\n", $line, rtrim( $source[ $line - 1 ] ?? '' ) ); + } + } +} + +printf( + "%-55s %4d/%4d lines %5.1f%%\n", + 'TOTAL', + $total_covered, + $total_exec, + $total_exec > 0 ? 
100 * $total_covered / $total_exec : 100 +); diff --git a/tools/css-selector-fuzz/lexbor/UPSTREAM-ISSUES.md b/tools/css-selector-fuzz/lexbor/UPSTREAM-ISSUES.md new file mode 100644 index 0000000000000..40c2ccb80f97c --- /dev/null +++ b/tools/css-selector-fuzz/lexbor/UPSTREAM-ISSUES.md @@ -0,0 +1,278 @@ +# lexbor — draft upstream bug reports + +Six spec-conformance bugs in liblexbor's CSS selectors support, found while +using lexbor as a differential oracle for the WordPress HTML-API CSS selector +fuzzer (`tools/css-selector-fuzz/`). Issues 1–3 were re-verified directly +against the harness on 2026-06-10; issues 4–5 surfaced during the WP +conformance-fix session and were re-verified on 2026-06-11; issue 6 came out +of the explicit invalid-byte probe for the WP scrub coverage work +(2026-06-11). + +- **Default build target:** lexbor upstream `master`, built by + `tools/css-selector-fuzz/lexbor/build.sh`. Record the exact commit printed + by the build script when verifying any issue. +- **Upstream repo:** https://github.com/lexbor/lexbor +- **Already filed upstream — do NOT refile:** + [#368](https://github.com/lexbor/lexbor/issues/368) (class/`#id` selectors + match ASCII case-insensitively in no-quirks documents). + +## Instructions for the filing agent + +1. **Re-verify at current lexbor master first.** Any of these may already be + fixed. Run `build.sh` and re-run the repros below. Only file what still + reproduces, and say in the report which commit you tested. +2. **Search for duplicates** before filing (suggested queries: `~=`, + `attr-modifier`, `case insensitive modifier`, `ident code point`, + `U+00B7`, `non-ascii`, `EOF`, `unclosed`, `simple block`, + `case-insensitive attribute`, `querySelector`). #368 shows the + maintainer's preferred repro style. +3. **One issue per bug.** Reduce each to a self-contained C repro (sketch + below); maintainers should not need this repo's harness. +4. 
Reproduction via this repo (fast path): build the harness + (`sh tools/css-selector-fuzz/lexbor/build.sh`), then feed it + `base64(html) TAB base64(selector)` lines on stdin. Response lines per + case: `R<TAB>tag<TAB>fid<TAB>ancestors` (tree rows), `M<TAB>fid` (match), + `X<TAB>reason` (selector parse error), terminated by `D`. See + `lib/LexborOracle.php` for a reference client and `harness.c` for the + exact lexbor API usage (`lxb_html_document_parse`, + `lxb_css_selectors_parse`, `lxb_selectors_find`). + +Minimal C repro skeleton (adapt per issue; `harness.c` is the full reference): + +```c +/* cc repro.c -llexbor */ +#include +#include +#include + +static lxb_status_t cb(lxb_dom_node_t *n, lxb_css_selector_specificity_t s, void *ctx) { + (*(int *)ctx)++; + return LXB_STATUS_OK; +} + +int main(void) { + const lxb_char_t html[] = ""; + const lxb_char_t sel[] = "[x~=\"\"]"; + int hits = 0; + + lxb_html_document_t *doc = lxb_html_document_create(); + lxb_html_document_parse(doc, html, sizeof(html) - 1); + + lxb_css_parser_t *parser = lxb_css_parser_create(); + lxb_css_parser_init(parser, NULL); + lxb_selectors_t *selectors = lxb_selectors_create(); + lxb_selectors_init(selectors); + + lxb_css_selector_list_t *list = + lxb_css_selectors_parse(parser, sel, sizeof(sel) - 1); + if (list == NULL) { printf("selector parse error\n"); return 1; } + + lxb_selectors_find(selectors, lxb_dom_interface_node(doc), + list, cb, &hits); + printf("matches: %d\n", hits); /* spec: 0 */ + return 0; +} +``` + +--- + +## Issue 1 — `[x~=""]` matches whitespace-only attribute values + +Per Selectors Level 4, `[att~=val]` with an empty `val` never matches: + +> If "val" is the empty string, it will never represent anything. +> — https://www.w3.org/TR/selectors-4/#attribute-representation (§6.1) + +lexbor instead matches elements whose attribute value consists only of +whitespace, suggesting its list-splitting yields an empty token for +whitespace-only values. 
Verified at v3.0.0 (`data-fid="a"` on the element): + +| document | selector | lexbor | spec / Chrome 149 | +|---------------------------|-----------|-----------|-------------------| +| `` (space) | `[x~=""]` | matches ❌ | no match | +| `` (tab) | `[x~=""]` | matches ❌ | no match | +| `` | `[x~=""]` | no match ✅ | no match | +| `` | `[x~=""]` | no match ✅ | no match | +| `` (control) | `[x~=a]` | matches ✅ | matches | + +Chrome 149 (`document.querySelectorAll`) returns no match for all `[x~=""]` +rows (verified 2026-06-10 via Playwright during the WordPress fix review). + +## Issue 2 — uppercase `I`/`S` attribute-selector modifiers rejected + +Selectors Level 4 §6.3 defines the modifiers explicitly as case-insensitive: + +> ...adding the identifier `i` (or `I`) ... adding the identifier `s` (or `S`) ... +> — https://www.w3.org/TR/selectors-4/#attribute-case + +lexbor parses the lowercase forms but reports a selector parse error for the +uppercase forms. Verified at v3.0.0: + +| selector | lexbor | spec | +|--------------|---------------|---------| +| `[x=abc i]` | parses ✅ | parses | +| `[x=abc I]` | parse error ❌ | parses | +| `[x=abc s]` | parses ✅ | parses | +| `[x=abc S]` | parse error ❌ | parses | + +Note for browser comparison: Chrome 149 had not shipped the `s` modifier at +all (throws SyntaxError), so compare `I` against Chrome and `S` against the +spec text / Firefox. + +## Issue 3 — non-ASCII ident code points below U+00F8 rejected + +CSS Syntax Level 3 defines the non-ASCII ident code points to include +U+00B7 and U+00C0–U+00D6 / U+00D8–U+00F6: + +> non-ASCII ident code point: U+00B7, U+00C0 to U+00D6, U+00D8 to U+00F6, +> U+00F8 to U+037D, ... +> — https://www.w3.org/TR/css-syntax-3/#non-ascii-ident-code-point + +lexbor's table appears to start at U+00F8: code points in the earlier ranges +are rejected both in ident-start and non-start positions. 
Verified at v3.0.0 +(raw UTF-8 selectors; class attribute contains the same characters): + +| selector | codepoint(s) | lexbor | spec | +|-----------|---------------------|---------------|---------| +| `.über` | U+00FC (≥ U+00F8) | parses ✅ | parses | +| `.øx` | U+00F8 (boundary) | parses ✅ | parses | +| `.Über` | U+00DC (U+00D8–F6) | parse error ❌ | parses | +| `.a·b` | U+00B7 (non-start) | parse error ❌ | parses | +| `.÷x` | U+00F7 (excluded) | parse error ✅ | error | + +The U+00F7 row is a control: the division sign is correctly NOT an ident code +point, so lexbor's boundary is off by exactly the U+00B7 / U+00C0–U+00F6 +ranges. Workaround used by this fuzzer: hex-escape all non-ASCII (`\dc ber` +parses fine), which is why this surfaces only with raw multibyte selectors. + +## Issue 4 — EOF does not auto-close an open attribute selector block + +Per CSS Syntax Level 3, tokenization auto-closes unterminated simple blocks +at the end of input (a parse error, but the block is returned), and an +unterminated string at EOF returns the string token: + +> \: This is a parse error. Return the block. +> — https://www.w3.org/TR/css-syntax-3/#consume-simple-block (§5.4.8) + +> EOF: This is a parse error. Return the \. +> — https://www.w3.org/TR/css-syntax-3/#consume-string-token (§4.3.5) + +So `[att=val` is the same selector as `[att=val]`, and `[att="a b` carries +the string value `a b`. lexbor reports a selector parse error for every +EOF-truncated attribute selector. Verified at v3.0.0 against +`
` / `
`: + +| selector | lexbor | spec / Chrome 149 | +|-----------------|---------------|-------------------| +| `[att]` | parses ✅ | parses | +| `[att=val]` | parses ✅ | parses | +| `[att` | parse error ❌ | parses, matches | +| `[att=val` | parse error ❌ | parses, matches | +| `[att="a b` | parse error ❌ | parses, matches | +| `[att=val i` | parse error ❌ | parses, matches | +| `div[att` | parse error ❌ | parses, matches | +| `[att=` | parse error ✅ | error (grammar) | +| `[att~` | parse error ✅ | error (grammar) | +| `[` | parse error ✅ | error (grammar) | +| `[att=val, div` | parse error ✅ | error (comma is inside the open block) | + +The last four rows are controls: truncation inside the selector *grammar* +(matcher without value, lone bracket) is invalid even after auto-close, and +lexbor correctly rejects those. Chrome 149 (`document.querySelectorAll`) +accepts and rejects exactly per the table (verified 2026-06-10 via +Playwright). Note lexbor's escape handling at EOF is fine — `.foo\` parses +as class `foo\u{FFFD}` per §4.3.7 — the gap is specifically the simple-block +auto-close. + +## Issue 5 — HTML's case-insensitive attribute value list not implemented + +HTML defines 46 attributes (`type`, `rel`, `lang`, `dir`, `media`, +`hreflang`, `http-equiv`, ...) whose values must match ASCII +case-insensitively in attribute selectors on an HTML element when the +selector has no `i`/`s` modifier: + +> Attribute selectors on an HTML element in an HTML document must treat the +> values of attributes with the following names as ASCII case-insensitive: … +> — https://html.spec.whatwg.org/multipage/semantics-other.html#case-sensitivity-of-selectors + +lexbor matches all attribute values case-sensitively unless the selector +carries an explicit `i`. 
Verified at v3.0.0 against +`` (`e1`), `` (`e2`), +`` (`e3`): + +| selector | lexbor | spec / Chrome 149 | +|--------------------|--------------|-------------------| +| `[rel=nofollow]` | `e2` only ❌ | `e1` and `e2` | +| `[rel=NOFOLLOW]` | `e1` only ❌ | `e1` and `e2` | +| `[rel=nofollow i]` | `e1`, `e2` ✅ | `e1` and `e2` | +| `[rel=nofollow s]` | `e2` only ✅ | `e2` only | +| `[data-x=abc]` | no match ✅ | no match (unlisted attribute) | + +The last three rows are controls: explicit modifiers work, and attributes +outside the list stay case-sensitive. Chrome 149 agrees with the spec column +(verified 2026-06-10 via Playwright), with one scoping caveat the report +should mention: the spec restricts the rule to elements in the HTML +namespace, but Chrome also folds on SVG-namespace elements +(`` matches `[type=TEXT]`), so an implementation true +to the spec letter would scope by element namespace. This may be framed as +a feature request rather than a bug if lexbor considers document-language +selector rules out of scope for its selectors module — but lexbor is an +HTML engine and browsers uniformly implement the folding, so matching +against HTML documents diverges from every browser without it. + +## Issue 6 — ill-formed UTF-8 in selectors is not decoded per the Encoding Standard + +CSS Syntax Level 3 decodes the input byte stream via the Encoding Standard +before tokenizing: + +> To decode bytes, ... Otherwise, decode bytes with fallback encoding utf-8. 
+> — https://www.w3.org/TR/css-syntax-3/#input-byte-stream (§3.2) + +The Encoding Standard's UTF-8 decoder replaces each **maximal subpart of an +ill-formed subsequence** with a single U+FFFD (the boundaries follow the +decoder's byte-range tables; see also Unicode §3.9 "U+FFFD Substitution of +Maximal Subparts"): + +> https://encoding.spec.whatwg.org/#utf-8-decoder + +lexbor accepts raw ill-formed bytes in selectors (no parse error) and +replaces them with U+FFFD, but with different boundaries: a truncated +multi-byte sequence yields one U+FFFD **per byte** instead of one per +maximal subpart, and a UTF-8-encoded surrogate half (`ED A0 80`–`ED BF +BF`) is decoded permissively as a **single unit** yielding one U+FFFD +instead of three. Verified at v3.0.0 by matching raw-byte class selectors +against elements whose class attributes contain literal U+FFFD runs +(`
` = 1×U+FFFD ... `
` = 4×U+FFFD; +`�` below is U+FFFD, U+FFFD counts in parentheses): + +| selector bytes | WHATWG decode | lexbor | +|-----------------------|----------------|----------------| +| `.ab` | `a�b` (1) ✅ | `a��b` (2) ❌ | +| `.ab` | `a�b` (1) ✅ | `a���b` (3) ❌ | +| `.ab` | `a���b` (3) ✅ | `a�b` (1) ❌ | +| `.ab` | `a���b` (3) ✅ | `a�b` (1) ❌ | +| `.a<80>b` | `a�b` (1) | `a�b` (1) ✅ | +| `.ab` | `a�b` (1) | `a�b` (1) ✅ | +| `.ab` | `a��b` (2) | `a��b` (2) ✅ | +| `.ab` | `a���b` (3) | `a���b` (3) ✅ | +| `.ab` | `a����b` (4) | `a����b` (4) ✅ | + +The agreeing rows are controls where per-byte replacement coincides with +the maximal-subpart rule (lone continuation/lead bytes, overlongs whose +subparts are all single bytes, beyond-U+10FFFF). The same behavior applies +inside string tokens (`[x="p<80>q"]` matches `x="p�q"`). Two truncated +sequences are exactly where the algorithms separate: `E2 8C` is **one** +maximal subpart (E2 accepts two continuations and 8C is a valid first +continuation), while `ED A0` is **not** a subpart at all (ED restricts its +first continuation to 80–9F), so `ED A0 80` is three. + +Notes for the filing agent: browsers only exercise this decode through the +stylesheet byte stream (JS `querySelectorAll` strings are already UTF-16), +so compare against an external stylesheet with raw bytes, or against +another Encoding Standard implementation (e.g. `TextDecoder('utf-8')`, +whose output for the byte sequences above shows the maximal-subpart +boundaries directly). Document-side context: lexbor stores raw ill-formed +bytes from the HTML byte stream unchanged in the DOM (a raw `<80>` in a +class attribute is matched by no selector, not even one with the same raw +bytes), so the repro must put literal U+FFFD characters in the document +and raw bytes only in the selector. 
diff --git a/tools/css-selector-fuzz/lexbor/build.sh b/tools/css-selector-fuzz/lexbor/build.sh new file mode 100644 index 0000000000000..a1471675bc815 --- /dev/null +++ b/tools/css-selector-fuzz/lexbor/build.sh @@ -0,0 +1,47 @@ +#!/bin/sh +# +# Builds the lexbor differential harness. +# +# Builds against upstream lexbor master. The exact commit is printed after +# each build and recorded in the build cache. +# Note: lexbor issue #368 ("Class/ID selectors are ASCII case-insensitive +# even in no-quirks mode") is detected at startup and compensated for when +# present (see LexborOracle.php). +# +# Usage: +# sh tools/css-selector-fuzz/lexbor/build.sh [lexbor-src-dir] +# +# Produces tools/css-selector-fuzz/lexbor/harness. + +set -e + +HERE="$(cd "$(dirname "$0")" && pwd)" +SRC="${1:-/tmp/lexbor-src}" +BRANCH="master" + +if [ ! -d "$SRC" ]; then + echo "Cloning lexbor into $SRC ..." + git clone https://github.com/lexbor/lexbor "$SRC" +fi + +git -C "$SRC" fetch --quiet origin "$BRANCH" +git -C "$SRC" checkout --quiet -B "$BRANCH" "origin/$BRANCH" +REV="$(git -C "$SRC" rev-parse --verify HEAD)" +STAMP="$SRC/build/.lexbor-rev" + +if [ ! -f "$SRC/build/liblexbor_static.a" ] || [ ! -f "$STAMP" ] || [ "$(cat "$STAMP")" != "$REV" ]; then + echo "Building liblexbor_static ($BRANCH $REV) ..." + mkdir -p "$SRC/build" + cd "$SRC/build" + cmake -DCMAKE_BUILD_TYPE=Release -DLEXBOR_BUILD_SHARED=OFF \ + -DLEXBOR_BUILD_STATIC=ON -DLEXBOR_BUILD_TESTS=OFF \ + -DLEXBOR_BUILD_EXAMPLES=OFF .. 
> /dev/null + make -j8 lexbor_static > /dev/null + printf '%s\n' "$REV" > "$STAMP" + cd "$HERE" +fi + +cc -O2 -Wall -Wextra -o "$HERE/harness" "$HERE/harness.c" \ + -I "$SRC/source" "$SRC/build/liblexbor_static.a" + +echo "Built $HERE/harness (lexbor $BRANCH $REV)" diff --git a/tools/css-selector-fuzz/lexbor/harness.c b/tools/css-selector-fuzz/lexbor/harness.c new file mode 100644 index 0000000000000..582c387a92f86 --- /dev/null +++ b/tools/css-selector-fuzz/lexbor/harness.c @@ -0,0 +1,290 @@ +/* + * lexbor differential harness for the CSS selector fuzzer. + * + * Reads one case per line from stdin: + * + * base64(html) "\t" base64(selector) "\n" + * + * For each case, parses the HTML with lexbor, parses the selector with the + * lexbor CSS selectors module, runs lxb_selectors_find over the whole + * document, and emits: + * + * R "\t" TAG "\t" FID "\t" ANC1,ANC2,... one per element, document + * pre-order; ancestors are + * nearest-first uppercase tags + * M "\t" FID one per match, in find order + * X "\t" parse selector did not parse + * X "\t" html html did not parse + * D end of case (then flush) + * + * FID is the element's data-fid attribute value, or "(missing-fid:TAG)" + * for elements without one (matching the fuzzer's placeholder convention). + * Tags are ASCII-uppercased. + * + * Build: see build.sh next to this file. The script builds upstream lexbor + * master and prints the exact commit used. 
+ */ + +#include +#include +#include + +#include +#include +#include + +#define MAX_DEPTH 512 + +static unsigned char * +b64_decode(const char *in, size_t in_len, size_t *out_len) +{ + static const signed char table[256] = { + ['A'] = 0, ['B'] = 1, ['C'] = 2, ['D'] = 3, ['E'] = 4, + ['F'] = 5, ['G'] = 6, ['H'] = 7, ['I'] = 8, ['J'] = 9, + ['K'] = 10, ['L'] = 11, ['M'] = 12, ['N'] = 13, ['O'] = 14, + ['P'] = 15, ['Q'] = 16, ['R'] = 17, ['S'] = 18, ['T'] = 19, + ['U'] = 20, ['V'] = 21, ['W'] = 22, ['X'] = 23, ['Y'] = 24, + ['Z'] = 25, ['a'] = 26, ['b'] = 27, ['c'] = 28, ['d'] = 29, + ['e'] = 30, ['f'] = 31, ['g'] = 32, ['h'] = 33, ['i'] = 34, + ['j'] = 35, ['k'] = 36, ['l'] = 37, ['m'] = 38, ['n'] = 39, + ['o'] = 40, ['p'] = 41, ['q'] = 42, ['r'] = 43, ['s'] = 44, + ['t'] = 45, ['u'] = 46, ['v'] = 47, ['w'] = 48, ['x'] = 49, + ['y'] = 50, ['z'] = 51, ['0'] = 52, ['1'] = 53, ['2'] = 54, + ['3'] = 55, ['4'] = 56, ['5'] = 57, ['6'] = 58, ['7'] = 59, + ['8'] = 60, ['9'] = 61, ['+'] = 62, ['/'] = 63, + }; + + unsigned char *out = malloc(in_len / 4 * 3 + 4); + size_t o = 0; + unsigned int acc = 0; + int bits = 0; + + if (out == NULL) { + return NULL; + } + + for (size_t i = 0; i < in_len; i++) { + unsigned char c = (unsigned char) in[i]; + /* + * The PHP adapter always feeds well-formed base64_encode() output, + * but guard anyway: skip padding/whitespace, and actually skip any + * byte not in the alphabet ('A' legitimately maps to 0, so test the + * byte itself, not its table value). c is unsigned, so table[c] is + * always in bounds. 
+ */ + if (c == '=' || c == '\n' || c == '\r') { + continue; + } + if (c != 'A' && table[c] == 0) { + continue; + } + acc = (acc << 6) | (unsigned int) table[c]; + bits += 6; + if (bits >= 8) { + bits -= 8; + out[o++] = (unsigned char) ((acc >> bits) & 0xFF); + } + } + + *out_len = o; + return out; +} + +static void +put_upper(const lxb_char_t *name, size_t len) +{ + for (size_t i = 0; i < len; i++) { + unsigned char c = name[i]; + if (c >= 'a' && c <= 'z') { + c = (unsigned char) (c - 'a' + 'A'); + } + putchar(c); + } +} + +/* + * Emit a data-fid value, replacing the framing bytes TAB / LF / CR with '?'. + * Generated documents only ever use fids like "w12" / "e3", so this never + * fires in practice; it guards the line-and-tab protocol against a fid that + * contains a control char (which would otherwise desync row/match parsing on + * the PHP side). LexborOracle applies the identical replacement when reading + * WP's own fids, so a sanitized fid still compares equal — the worst case is + * a benign tree-gated skip, never a false divergence. + */ +static void +put_fid_value(const lxb_char_t *value, size_t value_len) +{ + for (size_t i = 0; i < value_len; i++) { + unsigned char c = value[i]; + putchar((c == '\t' || c == '\n' || c == '\r') ? '?' 
: c); + } +} + +static void +put_fid(lxb_dom_node_t *node) +{ + lxb_dom_element_t *element = lxb_dom_interface_element(node); + size_t value_len = 0; + const lxb_char_t *value = lxb_dom_element_get_attribute( + element, (const lxb_char_t *) "data-fid", 8, &value_len); + + if (value != NULL) { + put_fid_value(value, value_len); + return; + } + + size_t name_len = 0; + const lxb_char_t *name = lxb_dom_element_qualified_name(element, &name_len); + fputs("(missing-fid:", stdout); + put_upper(name, name_len); + putchar(')'); +} + +struct walk_state { + const lxb_char_t *stack[MAX_DEPTH]; /* uppercase emitted on the fly */ + size_t stack_len[MAX_DEPTH]; + int depth; +}; + +static void +walk(lxb_dom_node_t *node, struct walk_state *state) +{ + for (lxb_dom_node_t *child = node->first_child; child != NULL; + child = child->next) { + if (child->type != LXB_DOM_NODE_TYPE_ELEMENT) { + continue; + } + + size_t name_len = 0; + const lxb_char_t *name = lxb_dom_element_qualified_name( + lxb_dom_interface_element(child), &name_len); + + fputs("R\t", stdout); + put_upper(name, name_len); + putchar('\t'); + put_fid(child); + putchar('\t'); + for (int i = state->depth - 1; i >= 0; i--) { + put_upper(state->stack[i], state->stack_len[i]); + if (i > 0) { + putchar(','); + } + } + putchar('\n'); + + if (state->depth < MAX_DEPTH) { + state->stack[state->depth] = name; + state->stack_len[state->depth] = name_len; + state->depth++; + walk(child, state); + state->depth--; + } + } +} + +static lxb_status_t +find_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec, + void *ctx) +{ + (void) spec; + (void) ctx; + fputs("M\t", stdout); + put_fid(node); + putchar('\n'); + return LXB_STATUS_OK; +} + +int +main(void) +{ + char *line = NULL; + size_t line_cap = 0; + ssize_t line_len; + + while ((line_len = getline(&line, &line_cap, stdin)) > 0) { + char *tab = memchr(line, '\t', (size_t) line_len); + if (tab == NULL) { + fputs("X\tprotocol\nD\n", stdout); + fflush(stdout); + continue; 
+ } + + size_t html_len = 0; + size_t selector_len = 0; + unsigned char *html = b64_decode(line, (size_t) (tab - line), &html_len); + unsigned char *selector = b64_decode( + tab + 1, (size_t) (line + line_len - tab - 1), &selector_len); + + if (html == NULL || selector == NULL) { + fputs("X\tprotocol\nD\n", stdout); + fflush(stdout); + free(html); + free(selector); + continue; + } + + /* + * lxb_html_document_create() may return NULL (allocation failure); + * never hand that to the parser. Report the case as an HTML failure + * and only destroy a document that actually exists. + */ + lxb_html_document_t *document = lxb_html_document_create(); + if (document == NULL + || lxb_html_document_parse(document, html, html_len) + != LXB_STATUS_OK) { + fputs("X\thtml\nD\n", stdout); + fflush(stdout); + if (document != NULL) { + lxb_html_document_destroy(document); + } + free(html); + free(selector); + continue; + } + + struct walk_state state = { .depth = 0 }; + walk(lxb_dom_interface_node(document), &state); + + /* + * Parser and selectors engine are created per case: + * lxb_css_selector_list_destroy_memory() releases the parser's + * whole arena, so reuse across cases is unsafe. + */ + lxb_css_parser_t *parser = lxb_css_parser_create(); + lxb_selectors_t *selectors = lxb_selectors_create(); + if (lxb_css_parser_init(parser, NULL) != LXB_STATUS_OK + || lxb_selectors_init(selectors) != LXB_STATUS_OK) { + fputs("X\tinit\nD\n", stdout); + fflush(stdout); + lxb_selectors_destroy(selectors, true); + lxb_css_parser_destroy(parser, true); + lxb_html_document_destroy(document); + free(html); + free(selector); + continue; + } + + /* Report each node once even when several list branches match. 
*/ + lxb_selectors_opt_set(selectors, LXB_SELECTORS_OPT_MATCH_FIRST); + + lxb_css_selector_list_t *list = lxb_css_selectors_parse( + parser, selector, selector_len); + + if (parser->status != LXB_STATUS_OK || list == NULL) { + fputs("X\tparse\n", stdout); + } + else { + lxb_status_t status = lxb_selectors_find( + selectors, lxb_dom_interface_node(document), list, + find_callback, NULL); + if (status != LXB_STATUS_OK) { + fputs("X\tfind\n", stdout); + } + lxb_css_selector_list_destroy_memory(list); + } + + fputs("D\n", stdout); + fflush(stdout); + + lxb_selectors_destroy(selectors, true); + lxb_css_parser_destroy(parser, true); + lxb_html_document_destroy(document); + free(html); + free(selector); + } + + free(line); + return EXIT_SUCCESS; +} diff --git a/tools/css-selector-fuzz/lib/AstExtractor.php b/tools/css-selector-fuzz/lib/AstExtractor.php new file mode 100644 index 0000000000000..ed81beac30c3e --- /dev/null +++ b/tools/css-selector-fuzz/lib/AstExtractor.php @@ -0,0 +1,149 @@ + array(), + 'self' => self::from_compound( $selector ), + ); + } + return $out; + } + + private static function from_complex( \WP_CSS_Complex_Selector $selector ): array { + $context = array(); + foreach ( (array) $selector->context_selectors as $pair ) { + if ( ! is_array( $pair ) || 2 !== count( $pair ) ) { + throw new \UnexpectedValueException( 'Context selector pair has unexpected shape.' ); + } + if ( ! $pair[0] instanceof \WP_CSS_Type_Selector ) { + throw new \UnexpectedValueException( 'Context selector is not a type selector: ' . self::describe( $pair[0] ) ); + } + if ( ! in_array( $pair[1], array( ' ', '>' ), true ) ) { + throw new \UnexpectedValueException( 'Context selector uses unsupported combinator: ' . 
var_export( $pair[1], true ) ); + } + $context[] = array( $pair[0]->type, $pair[1] ); + } + + return array( + 'context' => $context, + 'self' => self::from_compound( $selector->self_selector ), + ); + } + + private static function from_compound( \WP_CSS_Compound_Selector $selector ): array { + $subs = null; + if ( null !== $selector->subclass_selectors ) { + if ( array() === $selector->subclass_selectors ) { + throw new \UnexpectedValueException( 'Compound selector has empty (non-null) subclass selector array.' ); + } + $subs = array(); + foreach ( $selector->subclass_selectors as $sub ) { + $subs[] = self::from_subclass( $sub ); + } + } + + if ( null === $selector->type_selector && null === $subs ) { + throw new \UnexpectedValueException( 'Compound selector has neither type nor subclass selectors.' ); + } + + return array( + 'type' => null === $selector->type_selector ? null : $selector->type_selector->type, + 'subs' => $subs, + ); + } + + private static function from_subclass( $sub ): array { + if ( $sub instanceof \WP_CSS_Class_Selector ) { + return array( + 'kind' => 'class', + 'name' => $sub->class_name, + ); + } + if ( $sub instanceof \WP_CSS_ID_Selector ) { + return array( + 'kind' => 'id', + 'name' => $sub->id, + ); + } + if ( $sub instanceof \WP_CSS_Attribute_Selector ) { + $valid_matchers = array( + null, + \WP_CSS_Attribute_Selector::MATCH_EXACT, + \WP_CSS_Attribute_Selector::MATCH_ONE_OF_EXACT, + \WP_CSS_Attribute_Selector::MATCH_EXACT_OR_HYPHEN_SUFFIXED, + \WP_CSS_Attribute_Selector::MATCH_PREFIXED_BY, + \WP_CSS_Attribute_Selector::MATCH_SUFFIXED_BY, + \WP_CSS_Attribute_Selector::MATCH_CONTAINS, + ); + if ( ! in_array( $sub->matcher, $valid_matchers, true ) ) { + throw new \UnexpectedValueException( 'Attribute selector has unknown matcher: ' . var_export( $sub->matcher, true ) ); + } + $valid_modifiers = array( + null, + \WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, + \WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, + ); + if ( ! 
in_array( $sub->modifier, $valid_modifiers, true ) ) { + throw new \UnexpectedValueException( 'Attribute selector has unknown modifier: ' . var_export( $sub->modifier, true ) ); + } + if ( ( null === $sub->matcher ) !== ( null === $sub->value ) ) { + throw new \UnexpectedValueException( 'Attribute selector matcher/value nullness mismatch.' ); + } + return array( + 'kind' => 'attr', + 'name' => $sub->name, + 'matcher' => $sub->matcher, + 'value' => $sub->value, + 'modifier' => $sub->modifier, + ); + } + throw new \UnexpectedValueException( 'Unknown subclass selector: ' . self::describe( $sub ) ); + } + + private static function get_private( $object, string $property, string $declaring_class ) { + $reflection = new \ReflectionProperty( $declaring_class, $property ); + $reflection->setAccessible( true ); + $value = $reflection->getValue( $object ); + if ( ! is_array( $value ) ) { + throw new \UnexpectedValueException( "Property {$property} is not an array." ); + } + return $value; + } + + private static function describe( $value ): string { + return is_object( $value ) ? 
get_class( $value ) : gettype( $value ); + } +} diff --git a/tools/css-selector-fuzz/lib/Bootstrap.php b/tools/css-selector-fuzz/lib/Bootstrap.php new file mode 100644 index 0000000000000..6d33b4de7c4e9 --- /dev/null +++ b/tools/css-selector-fuzz/lib/Bootstrap.php @@ -0,0 +1,63 @@ + */ + public static function doing_it_wrong_calls(): array { + return $GLOBALS['css_selector_fuzz_doing_it_wrong']; + } +} diff --git a/tools/css-selector-fuzz/lib/DocumentGenerator.php b/tools/css-selector-fuzz/lib/DocumentGenerator.php new file mode 100644 index 0000000000000..20da2825b6ed2 --- /dev/null +++ b/tools/css-selector-fuzz/lib/DocumentGenerator.php @@ -0,0 +1,628 @@ +prng = $prng; + $this->max_elements = $max_elements; + $this->pools = array( + 'tags' => array(), + 'classes' => array(), + 'ids' => array(), + 'attrNames' => array(), + 'attrValues' => array(), + ); + } + + /** + * @return array{model: array, html: string, quirks: bool, pools: array} + */ + public static function generate( Prng $prng ): array { + $generator = new self( $prng, $prng->int( 8, 40 ) ); + return $generator->build(); + } + + /** + * Generates a ``-context fragment: body-level content rendered + * without the document wrapper, parsed via create_fragment. The model's + * top-level elements carry the implicit BODY/HTML ancestors the fragment + * parser reports in breadcrumbs. 
+ * + * @return array{ + * model: null, + * children: array, + * html: string, + * context: string, + * fragment: true, + * quirks: bool, + * pools: array, + * } + */ + public static function generate_fragment( Prng $prng ): array { + $generator = new self( $prng, $prng->int( 6, 30 ) ); + return $generator->build_fragment(); + } + + private function build_fragment(): array { + $children = array(); + $child_budget = $this->prng->int( 1, 6 ); + for ( $i = 0; $i < $child_budget && $this->element_count < $this->max_elements; $i++ ) { + $children[] = $this->random_subtree( 0 ); + } + + $bits = array(); + foreach ( $children as $child ) { + $bits[] = $this->render_element( $child ); + } + $filler = array( '', 'text', ' more ', "\n ", '& x', 'café ✓', '' ); + $html = ''; + foreach ( $bits as $bit ) { + if ( $this->prng->chance( 35 ) ) { + $html .= $this->prng->choice( $filler ); + } + $html .= $bit; + } + + foreach ( $this->pools as $key => $values ) { + $this->pools[ $key ] = array_values( array_unique( $values ) ); + } + + return array( + 'model' => null, + 'children' => $children, + 'html' => $html, + 'context' => '', + 'fragment' => true, + 'quirks' => false, + 'pools' => $this->pools, + ); + } + + /** + * Rows ( TreeCapture shape ) for a ``-context fragment: the + * top-level children flattened with the implicit HTML/BODY ancestors the + * fragment parser reports. 
+ */ + public static function rows_from_fragment( array $children ): array { + $html_root = array( 'tag' => 'html', 'fid' => '(html)', 'attrs' => array(), 'children' => array() ); + $body_root = array( 'tag' => 'body', 'fid' => '(body)', 'attrs' => array(), 'children' => $children ); + + $rows = array(); + foreach ( $children as $child ) { + foreach ( self::flatten_with_ancestors( $child, array( $body_root, $html_root ) ) as $pair ) { + list( $element, $ancestors ) = $pair; + + $attrs = array(); + $seen = array(); + foreach ( $element['attrs'] as $attr ) { + $lower = ascii_strtolower( $attr[0] ); + if ( isset( $seen[ $lower ] ) ) { + continue; + } + $seen[ $lower ] = true; + $attrs[] = array( $lower, $attr[1] ); + } + + $ancestor_tags = array(); + foreach ( $ancestors as $ancestor ) { + $ancestor_tags[] = strtoupper( ascii_strtolower( $ancestor['tag'] ) ); + } + + $rows[] = array( + 'tag' => strtoupper( ascii_strtolower( $element['tag'] ) ), + 'fid' => $element['fid'], + 'attrs' => $attrs, + 'ancestorTags' => $ancestor_tags, + ); + } + } + return $rows; + } + + private function build(): array { + $has_doctype = $this->prng->chance( 85 ); + + $head_children = array(); + if ( $this->prng->chance( 60 ) ) { + $head_children[] = $this->make_element( 'title', array(), array() ); + } + if ( $this->prng->chance( 30 ) ) { + $head_children[] = $this->make_element( 'meta', $this->random_attrs(), array() ); + } + + $body_children = array(); + $child_budget = $this->prng->int( 1, 6 ); + for ( $i = 0; $i < $child_budget && $this->element_count < $this->max_elements; $i++ ) { + $body_children[] = $this->random_subtree( 0 ); + } + + $head = $this->make_element( 'head', array(), $head_children ); + $body = $this->make_element( 'body', $this->prng->chance( 30 ) ? $this->random_attrs() : array(), $body_children ); + $html = $this->make_element( 'html', $this->prng->chance( 20 ) ? $this->random_attrs() : array(), array( $head, $body ) ); + + $rendered = ( $has_doctype ? '' : '' ) . 
$this->render_element( $html ); + + foreach ( $this->pools as $key => $values ) { + $this->pools[ $key ] = array_values( array_unique( $values ) ); + } + + return array( + 'model' => $html, + 'html' => $rendered, + 'quirks' => ! $has_doctype, + 'pools' => $this->pools, + ); + } + + private function random_subtree( int $depth ): array { + ++$this->element_count; + + if ( $depth >= 7 || $this->element_count >= $this->max_elements || $this->prng->chance( 25 ) ) { + // Leaf. + if ( $this->prng->chance( 25 ) ) { + return $this->make_element( $this->prng->choice( self::VOID_TAGS ), $this->random_attrs(), array(), true ); + } + return $this->make_element( $this->prng->choice( self::SAFE_TAGS ), $this->random_attrs(), array() ); + } + + $children = array(); + $child_count = $this->prng->int( 1, 4 ); + for ( $i = 0; $i < $child_count && $this->element_count < $this->max_elements; $i++ ) { + $children[] = $this->random_subtree( $depth + 1 ); + } + + return $this->make_element( $this->prng->choice( self::SAFE_TAGS ), $this->random_attrs(), $children ); + } + + private function make_element( string $tag, array $attrs, array $children, bool $is_void = false ): array { + $fid = 'e' . $this->fid_counter++; + + $written_tag = $this->prng->chance( 15 ) ? $this->random_case( $tag ) : $tag; + + $this->pools['tags'][] = $tag; + + return array( + 'tag' => $written_tag, + 'fid' => $fid, + 'attrs' => $attrs, + 'children' => $children, + 'void' => $is_void || in_array( strtolower( $tag ), array( 'meta', 'br', 'hr', 'img', 'wbr', 'input', 'embed' ), true ), + ); + } + + /** @return array name/value pairs in source order. */ + private function random_attrs(): array { + $attrs = array(); + $count = $this->prng->weighted( + array( + 0 => 15, + 1 => 30, + 2 => 30, + 3 => 15, + 4 => 10, + ) + ); + + $used_names = array(); + for ( $i = 0; $i < $count; $i++ ) { + $name = $this->prng->choice( self::ATTR_NAMES ); + + // Occasionally repeat an attribute name: the processor keeps the first. 
+ $is_duplicate = isset( $used_names[ ascii_strtolower( $name ) ] ); + if ( $is_duplicate && ! $this->prng->chance( 20 ) ) { + continue; + } + $used_names[ ascii_strtolower( $name ) ] = true; + + if ( $this->prng->chance( 12 ) ) { + $name = $this->random_case( $name ); + } + + $lower = ascii_strtolower( $name ); + if ( 'class' === $lower ) { + $value = $this->random_class_value(); + } elseif ( 'id' === $lower ) { + $value = $this->prng->chance( 85 ) ? $this->random_id_value() : ( $this->prng->chance( 50 ) ? '' : true ); + } elseif ( in_array( $lower, array( 'disabled', 'hidden' ), true ) ) { + $value = $this->prng->chance( 70 ) ? true : $this->prng->choice( array( '', 'disabled', 'true' ) ); + } else { + $value = $this->prng->chance( 12 ) ? true : $this->random_attr_value(); + } + + $this->pools['attrNames'][] = ascii_strtolower( $name ); + if ( is_string( $value ) && 'class' !== $lower ) { + $this->pools['attrValues'][] = $value; + } + + $attrs[] = array( $name, $value ); + } + + return $attrs; + } + + private function random_class_value(): string { + $count = $this->prng->int( 1, 4 ); + $classes = array(); + for ( $i = 0; $i < $count; $i++ ) { + $class = $this->random_word( true ); + $raw_class = $this->maybe_inject_class_nul( $class ); + $classes[] = $raw_class; + foreach ( self::class_tokens( $raw_class ) as $token ) { + $this->pools['classes'][] = $token; + } + } + + $ws = array( ' ', ' ', ' ', "\t", "\n", "\f", ' ' ); + $value = $this->prng->chance( 20 ) ? $this->prng->choice( $ws ) : ''; + foreach ( $classes as $i => $class ) { + if ( $i > 0 ) { + $value .= $this->prng->choice( $ws ); + } + $value .= $class; + } + if ( $this->prng->chance( 20 ) ) { + $value .= $this->prng->choice( $ws ); + } + return $value; + } + + private function maybe_inject_class_nul( string $class ): string { + if ( '' === $class || ! 
$this->prng->chance( 12 ) ) { + return $class; + } + + $points = utf8_codepoints( $class ); + $at = $this->prng->int( 0, count( $points ) ); + $out = ''; + foreach ( $points as $i => $point ) { + if ( $i === $at ) { + $out .= "\0"; + } + $out .= $point[0]; + } + return $at === count( $points ) ? $out . "\0" : $out; + } + + private function random_id_value(): string { + $id = $this->random_word( true ); + $this->pools['ids'][] = $id; + return $id; + } + + private function random_attr_value(): string { + $kind = $this->prng->weighted( + array( + 'word' => 35, + 'words' => 20, + 'hyphenated' => 15, + 'empty' => 8, + 'spicy' => 12, + 'unicode' => 10, + ) + ); + + switch ( $kind ) { + case 'word': + return $this->random_word( true ); + case 'words': + $parts = array(); + $n = $this->prng->int( 2, 4 ); + for ( $i = 0; $i < $n; $i++ ) { + $parts[] = $this->random_word( true ); + } + return implode( $this->prng->choice( array( ' ', ' ', "\t", "\n" ) ), $parts ); + case 'hyphenated': + return $this->random_word( false ) . '-' . 
$this->random_word( false ); + case 'empty': + return ''; + case 'spicy': + $spice = array( 'a"b', "a'b", 'a&b', 'ab', 'a=b', 'a b c', '&', '"x', '100%', 'semi;colon', 'a,b' ); + return $this->prng->choice( $spice ); + case 'unicode': + $unicode = array( 'héllo', 'ÄÖÜ', '✓done', 'naïve', 'Ωmega', '\u{1F600}smile' ); + $value = $this->prng->choice( $unicode ); + return str_replace( '\u{1F600}', "\u{1F600}", $value ); + } + return 'fallback'; + } + + private function random_word( bool $allow_mixed_case ): string { + $stems = array( 'alpha', 'beta', 'gamma', 'delta', 'box', 'col', 'item', 'note', 'wide', 'main-item', 'x', 'a', '-lead', '--var', '_under', 'Über', 'mixedCase' ); + $word = $this->prng->choice( $stems ); + if ( $this->prng->chance( 30 ) ) { + $word .= (string) $this->prng->int( 0, 99 ); + } + if ( $allow_mixed_case && $this->prng->chance( 15 ) ) { + $word = $this->random_case( $word ); + } + return $word; + } + + /** + * Randomly flips the case of each ASCII letter in $input, byte by byte. + * + * Only A-Z / a-z are touched. Before PHP 8.2, strtoupper() and + * strtolower() follow LC_CTYPE, so under a single-byte locale they + * could rewrite the bytes of multibyte input ( e.g. the 'Über' stem ) + * into invalid UTF-8 and make a seed's output locale-dependent. + * One PRNG draw is consumed per input byte. + * + * @param string $input Bytes to re-case. + * @return string Same length as $input; non-ASCII bytes are unchanged. + */ + private function random_case( string $input ): string { + $lower = 'abcdefghijklmnopqrstuvwxyz'; + $upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; + $out = ''; + $length = strlen( $input ); + for ( $i = 0; $i < $length; $i++ ) { + $c = $input[ $i ]; + $out .= $this->prng->chance( 50 ) ? strtr( $c, $lower, $upper ) : strtr( $c, $upper, $lower ); + } + return $out; + } + + /* + * --------- + * Rendering + * --------- + */ + + private function render_element( array $element ): string { + $out = '<' . $element['tag']; + + $rendered_attrs = array( ' data-fid="' . $element['fid'] . '"' ); + foreach ( $element['attrs'] as $attr ) { + $rendered_attrs[] = ' ' . $this->render_attr( $attr[0], $attr[1] ); + } + $out .= implode( '', $rendered_attrs ); + + if ( $element['void'] ) { + $out .= $this->prng->chance( 25 ) ? 
' />' : '>'; + return $out; + } + + $out .= '>'; + + $child_bits = array(); + foreach ( $element['children'] as $child ) { + $child_bits[] = $this->render_element( $child ); + } + + /* + * Sprinkle text and comments between children — but never directly + * inside `html` or `head`, where character tokens would trigger + * insertion-mode changes (early body creation, head popping) that + * desynchronize the model from the parsed tree. + */ + $lower_tag = strtolower( $element['tag'] ); + $may_have_filler = ! in_array( $lower_tag, array( 'html', 'head' ), true ); + $filler_options = array( + '', + 'text', + ' more text ', + "\n ", + '& <escaped>', + '', + 'café ✓', + ); + $content = ''; + foreach ( $child_bits as $bit ) { + if ( $may_have_filler && $this->prng->chance( 40 ) ) { + $content .= $this->prng->choice( $filler_options ); + } + $content .= $bit; + } + if ( $may_have_filler && $this->prng->chance( 40 ) ) { + $content .= $this->prng->choice( $filler_options ); + } + if ( 'title' === $lower_tag ) { + // RCDATA (not RAWTEXT): the HTML parser tokenizes `title` content as RCDATA, so keep it plain. + $content = $this->prng->chance( 60 ) ? 'Fuzz Title' : ''; + } + + return $out . $content . ''; + } + + /** @param string|true $value */ + private function render_attr( string $name, $value ): string { + if ( true === $value ) { + return $name; + } + + $style = $this->prng->weighted( + array( + 'double' => 60, + 'single' => 20, + 'unquoted' => 20, + ) + ); + + if ( 'unquoted' === $style && ( '' === $value || strlen( $value ) !== strspn( $value, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._:-' ) ) ) { + $style = 'double'; + } + + switch ( $style ) { + case 'unquoted': + return $name . '=' . $value; + case 'single': + return $name . "='" . str_replace( array( '&', "'", '<' ), array( '&', ''', '<' ), $value ) . "'"; + default: + return $name . '="' . str_replace( array( '&', '"', '<' ), array( '&', '"', '<' ), $value ) . 
'"'; + } + } + + /* + * ---------------- + * Model utilities + * ---------------- + */ + + /** Pre-order (document order) list of elements. */ + public static function flatten( array $element ): array { + $out = array( $element ); + foreach ( $element['children'] as $child ) { + foreach ( self::flatten( $child ) as $descendant ) { + $out[] = $descendant; + } + } + return $out; + } + + /** + * Pre-order list of ( element, ancestors ) pairs where ancestors is the + * chain from nearest ancestor to root — the same orientation as + * WP_HTML_Processor::get_breadcrumbs() reversed past self. + */ + public static function flatten_with_ancestors( array $element, array $ancestors = array() ): array { + $out = array( array( $element, $ancestors ) ); + $next_ancestors = array_merge( array( $element ), $ancestors ); + foreach ( $element['children'] as $child ) { + foreach ( self::flatten_with_ancestors( $child, $next_ancestors ) as $pair ) { + $out[] = $pair; + } + } + return $out; + } + + /** + * Flat element rows ( the TreeCapture row shape ) derived from a model: + * pre-order, tags uppercased, attribute names lowercased with the first + * of duplicates winning — directly comparable to a TreeCapture of the + * rendered document. 
+ */ + public static function rows_from_model( array $model ): array { + $rows = array(); + foreach ( self::flatten_with_ancestors( $model ) as $pair ) { + list( $element, $ancestors ) = $pair; + + $attrs = array(); + $seen = array(); + foreach ( $element['attrs'] as $attr ) { + $lower = ascii_strtolower( $attr[0] ); + if ( isset( $seen[ $lower ] ) ) { + continue; + } + $seen[ $lower ] = true; + $attrs[] = array( $lower, $attr[1] ); + } + + $ancestor_tags = array(); + foreach ( $ancestors as $ancestor ) { + $ancestor_tags[] = strtoupper( ascii_strtolower( $ancestor['tag'] ) ); + } + + $rows[] = array( + 'tag' => strtoupper( ascii_strtolower( $element['tag'] ) ), + 'fid' => $element['fid'], + 'attrs' => $attrs, + 'ancestorTags' => $ancestor_tags, + ); + } + return $rows; + } + + /** First attribute value for a name, ASCII case-insensitive; null if absent. */ + public static function get_attribute_value( array $element, string $name ) { + $comparable = ascii_strtolower( $name ); + foreach ( $element['attrs'] as $attr ) { + if ( ascii_strtolower( $attr[0] ) === $comparable ) { + return $attr[1]; + } + } + return null; + } + + /** + * Class tokens as seen by selector matching: ASCII whitespace separates + * tokens, and NUL inside a token is exposed as U+FFFD by class_list(). 
+ * + * @return string[] + */ + public static function class_tokens( string $class_value ): array { + $tokens = array(); + $length = strlen( $class_value ); + $at = 0; + $ws = " \t\r\n\f"; + while ( $at < $length ) { + $at += strspn( $class_value, $ws, $at ); + if ( $at >= $length ) { + break; + } + + $token_length = strcspn( $class_value, $ws, $at ); + $tokens[] = str_replace( "\0", "\u{FFFD}", substr( $class_value, $at, $token_length ) ); + $at += $token_length; + } + return $tokens; + } +} diff --git a/tools/css-selector-fuzz/lib/LexborOracle.php b/tools/css-selector-fuzz/lib/LexborOracle.php new file mode 100644 index 0000000000000..afb508ddc1278 --- /dev/null +++ b/tools/css-selector-fuzz/lib/LexborOracle.php @@ -0,0 +1,215 @@ + fuzzer-oracle problem ( investigate + * the fuzzer, 'lexbor-divergence' ). + * reference == lexbor != WP => high-confidence WP finding ( the + * regular match-mismatch-html failure + * with no accompanying divergence ). + * + * Known bug compensated for: lexbor #368 — class and #id selectors match + * ASCII case-insensitively even in no-quirks mode ( attribute selectors + * like [id=x] are correctly case-sensitive ). Detected by probe at startup; + * when present, lexbor is compared against the reference matcher run with + * quirks-style class/ID folding. Quirks documents are compared only when + * the probe also confirms class and #id selectors fold in quirks mode. + */ +class LexborOracle { + + const READ_TIMEOUT_SECONDS = 5; + + /** @var resource|null */ + private static $process = null; + /** @var array|null */ + private static $pipes = null; + /** @var bool|null */ + private static $available = null; + /** @var bool */ + private static $issue368 = false; + /** @var bool */ + private static $quirks_class_id_reliable = false; + + public static function harness_path(): string { + return dirname( __DIR__ ) . '/lexbor/harness'; + } + + /** Whether the harness is built, starts, and answered the probes. 
*/ + public static function available(): bool { + if ( null !== self::$available ) { + return self::$available; + } + + self::$available = false; + if ( ! is_executable( self::harness_path() ) || ! self::start() ) { + return false; + } + + // Probe: sanity plus class/#id case-sensitivity behavior. + $sane = self::query( '
', 'div.a' ); + if ( null === $sane || array( 'x' ) !== $sane['matches'] ) { + self::stop(); + return false; + } + + $no_quirks_class = self::query( '
', '.A' ); + $no_quirks_id = self::query( '
', '#A' ); + $quirks_class = self::query( '
', '.A' ); + $quirks_id = self::query( '
', '#A' ); + foreach ( array( $no_quirks_class, $no_quirks_id, $quirks_class, $quirks_id ) as $probe ) { + if ( null === $probe || null !== $probe['error'] ) { + self::stop(); + return false; + } + } + + self::$issue368 = array( 'x' ) === $no_quirks_class['matches'] + || array( 'x' ) === $no_quirks_id['matches']; + self::$quirks_class_id_reliable = ! self::$issue368 + && array() === $no_quirks_class['matches'] + && array() === $no_quirks_id['matches'] + && array( 'x' ) === $quirks_class['matches'] + && array( 'x' ) === $quirks_id['matches']; + self::$available = true; + return true; + } + + /** Whether the built lexbor exhibits issue #368 ( class/ID case folding ). */ + public static function has_issue_368(): bool { + return self::$issue368; + } + + /** Whether lexbor can be trusted on quirks class/#id case folding. */ + public static function quirks_class_id_reliable(): bool { + return self::$quirks_class_id_reliable; + } + + /** + * Runs one case through lexbor. + * + * @return array{ + * rows: array, + * matches: string[], + * error: string|null, + * }|null Null when the harness is unavailable or misbehaved ( the + * harness is stopped; the caller should skip the differential ). + */ + public static function query( string $html, string $selector ): ?array { + if ( null === self::$process && ! self::start() ) { + return null; + } + + $line = base64_encode( $html ) . "\t" . base64_encode( $selector ) . "\n"; + $written = fwrite( self::$pipes[0], $line ); + fflush( self::$pipes[0] ); + if ( strlen( $line ) !== $written ) { + self::stop(); + self::$available = false; + return null; + } + + $rows = array(); + $matches = array(); + $error = null; + + while ( true ) { + $response = self::read_line(); + if ( null === $response ) { + self::stop(); + self::$available = false; + return null; + } + if ( 'D' === $response ) { + break; + } + + $parts = explode( "\t", $response ); + switch ( $parts[0] ) { + case 'R': + $rows[] = array( + 'tag' => $parts[1] ?? 
'', + 'fid' => $parts[2] ?? '', + 'ancestorTags' => '' === ( $parts[3] ?? '' ) ? array() : explode( ',', $parts[3] ), + ); + break; + case 'M': + $matches[] = $parts[1] ?? ''; + break; + case 'X': + $error = $parts[1] ?? 'unknown'; + break; + } + } + + return array( + 'rows' => $rows, + 'matches' => $matches, + 'error' => $error, + ); + } + + private static function start(): bool { + $descriptors = array( + 0 => array( 'pipe', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'file', '/dev/null', 'w' ), + ); + + $process = proc_open( array( self::harness_path() ), $descriptors, $pipes ); + if ( ! is_resource( $process ) ) { + return false; + } + + self::$process = $process; + self::$pipes = $pipes; + stream_set_blocking( $pipes[1], false ); + return true; + } + + private static function stop(): void { + if ( null === self::$process ) { + return; + } + @fclose( self::$pipes[0] ); + @fclose( self::$pipes[1] ); + @proc_terminate( self::$process, 9 ); + @proc_close( self::$process ); + self::$process = null; + self::$pipes = null; + } + + /** Reads one newline-terminated line with a timeout; null on failure. 
*/
+	private static function read_line(): ?string {
+		$line     = '';
+		$deadline = microtime( true ) + self::READ_TIMEOUT_SECONDS;
+
+		while ( true ) {
+			$read   = array( self::$pipes[1] );
+			$write  = null;
+			$except = null;
+			// Remaining time budget for the whole line, not per chunk.
+			$left = $deadline - microtime( true );
+			if ( $left <= 0 ) {
+				return null;
+			}
+			$ready = stream_select( $read, $write, $except, 0, (int) ( $left * 1e6 ) );
+			if ( false === $ready || 0 === $ready ) {
+				return null;
+			}
+			// The pipe is non-blocking, so fgets() may hand back a partial line; keep accumulating.
+			$chunk = fgets( self::$pipes[1] );
+			if ( false === $chunk ) {
+				return null;
+			}
+			$line .= $chunk;
+			if ( str_ends_with( $line, "\n" ) ) {
+				return substr( $line, 0, -1 );
+			}
+		}
+	}
+}
diff --git a/tools/css-selector-fuzz/lib/Metamorph.php b/tools/css-selector-fuzz/lib/Metamorph.php
new file mode 100644
index 0000000000000..159d8e8bedd52
--- /dev/null
+++ b/tools/css-selector-fuzz/lib/Metamorph.php
@@ -0,0 +1,150 @@
+	/**
+	 * Builds variant descriptors for a canonical complex-selector-list AST.
+	 *
+	 * Variants produced, in order:
+	 *  - 'rerender':     the same AST rendered with escape boosting enabled.
+	 *  - 'typecase':     type-selector names with randomized ASCII letter case.
+	 *  - 'subs-reorder': subclass selectors rotated within each compound.
+	 *  - 'universal':    an explicit `*` type where the compound had none.
+	 *  - 'dup-branch':   one randomly chosen branch appended to the list again.
+	 *
+	 * The typecase, subs-reorder and universal variants are only emitted when
+	 * the rewrite actually changed the AST. Returns an empty array when the
+	 * AST contains strings that are not valid UTF-8.
+	 *
+	 * @param array $list_ast Canonical complex selector list AST.
+	 * @param Prng  $prng     Random source; forked once per emitted variant.
+	 * @return array List of arrays with keys 'name', 'selector', 'ast' and 'astMustMatch'.
+	 */
+	public static function variants( array $list_ast, Prng $prng ): array {
+		/*
+		 * from_selectors() scrubs invalid UTF-8 to U+FFFD before parsing, so
+		 * parsed AST names are always valid UTF-8 and this guard should be
+		 * unreachable. It stays as defense in depth: the renderer can only
+		 * round-trip valid UTF-8 names, and a future AST source that skips
+		 * normalization would otherwise corrupt the variants silently.
+		 */
+		if ( ! ast_strings_are_utf8( $list_ast ) ) {
+			return array();
+		}
+
+		$out = array();
+
+		$out[] = array(
+			'name'         => 'rerender',
+			'selector'     => SelectorGenerator::render( $prng->fork( 'rerender' ), $list_ast, true ),
+			'ast'          => $list_ast,
+			'astMustMatch' => true,
+		);
+
+		$typecase = self::map_types(
+			$list_ast,
+			static function ( string $type ) use ( $prng ): string {
+				if ( '*' === $type ) {
+					return $type;
+				}
+
+				/*
+				 * Flip ASCII letters only. strtoupper()/strtolower() are
+				 * locale-sensitive before PHP 8.2 and may rewrite bytes >= 0x80,
+				 * which would corrupt multi-byte UTF-8 names. One PRNG draw is
+				 * still made per byte so the random stream is consumed as before.
+				 */
+				$lower  = 'abcdefghijklmnopqrstuvwxyz';
+				$upper  = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+				$out    = '';
+				$length = strlen( $type );
+				for ( $i = 0; $i < $length; $i++ ) {
+					$c    = $type[ $i ];
+					$out .= $prng->chance( 50 ) ? strtr( $c, $lower, $upper ) : strtr( $c, $upper, $lower );
+				}
+				return $out;
+			}
+		);
+		if ( $typecase !== $list_ast ) {
+			$out[] = array(
+				'name'         => 'typecase',
+				'selector'     => SelectorGenerator::render( $prng->fork( 'typecase' ), $typecase ),
+				'ast'          => $typecase,
+				'astMustMatch' => true,
+			);
+		}
+
+		$reordered = self::rotate_subs( $list_ast );
+		if ( $reordered !== $list_ast ) {
+			$out[] = array(
+				'name'         => 'subs-reorder',
+				'selector'     => SelectorGenerator::render( $prng->fork( 'subs-reorder' ), $reordered ),
+				'ast'          => $reordered,
+				'astMustMatch' => true,
+			);
+		}
+
+		$universal = self::explicit_universal( $list_ast );
+		if ( $universal !== $list_ast ) {
+			$out[] = array(
+				'name'         => 'universal',
+				'selector'     => SelectorGenerator::render( $prng->fork( 'universal' ), $universal ),
+				'ast'          => $universal,
+				'astMustMatch' => true,
+			);
+		}
+
+		// An empty list has no branch to duplicate; indexing it would append null.
+		if ( array() !== $list_ast ) {
+			$duplicated   = $list_ast;
+			$duplicated[] = $list_ast[ $prng->int( 0, count( $list_ast ) - 1 ) ];
+			$out[]        = array(
+				'name'         => 'dup-branch',
+				'selector'     => SelectorGenerator::render( $prng->fork( 'dup-branch' ), $duplicated ),
+				'ast'          => $duplicated,
+				'astMustMatch' => true,
+			);
+		}
+
+		return $out;
+	}
+
+	/** Applies $fn to every type-selector name: compound types and context types. */
+	private static function map_types( array $list_ast, callable $fn ): array {
+		foreach ( $list_ast as &$complex ) {
+			foreach ( $complex['context'] as &$pair ) {
+				$pair[0] = $fn( $pair[0] );
+			}
+			unset( $pair );
+			if ( null !== $complex['self']['type'] ) {
+				$complex['self']['type'] = $fn( $complex['self']['type'] );
+			}
+		}
+		unset( $complex );
+		return $list_ast;
+	}
+
+	/** Rotates the subclass list of every compound that has two or more.
*/ + private static function rotate_subs( array $list_ast ): array { + foreach ( $list_ast as &$complex ) { + $subs = $complex['self']['subs']; + if ( is_array( $subs ) && count( $subs ) >= 2 ) { + $subs[] = array_shift( $subs ); + $complex['self']['subs'] = $subs; + } + } + unset( $complex ); + return $list_ast; + } + + /** Writes an explicit `*` wherever a compound omitted its type selector. */ + private static function explicit_universal( array $list_ast ): array { + foreach ( $list_ast as &$complex ) { + if ( null === $complex['self']['type'] && null !== $complex['self']['subs'] ) { + $complex['self']['type'] = '*'; + } + } + unset( $complex ); + return $list_ast; + } +} diff --git a/tools/css-selector-fuzz/lib/Prng.php b/tools/css-selector-fuzz/lib/Prng.php new file mode 100644 index 0000000000000..b8d8737304277 --- /dev/null +++ b/tools/css-selector-fuzz/lib/Prng.php @@ -0,0 +1,65 @@ +key = $seed . "\x1f" . $label; + } + + /** Derives an independent child stream; consuming it does not affect this stream. */ + public function fork( string $label ): Prng { + return new Prng( $this->key, $label . ':' . $this->uint32() ); + } + + public function bytes( int $length ): string { + while ( strlen( $this->buffer ) < $length ) { + $this->buffer .= hash( 'sha256', $this->key . ':' . 
$this->counter++, true ); + } + $out = substr( $this->buffer, 0, $length ); + $this->buffer = substr( $this->buffer, $length ); + return $out; + } + + public function uint32(): int { + $parts = unpack( 'Nvalue', $this->bytes( 4 ) ); + return (int) $parts['value']; + } + + public function int( int $min, int $max ): int { + if ( $max <= $min ) { + return $min; + } + return $min + ( $this->uint32() % ( $max - $min + 1 ) ); + } + + public function chance( int $numerator, int $denominator = 100 ): bool { + return $this->int( 1, $denominator ) <= $numerator; + } + + public function choice( array $values ) { + return $values[ $this->int( 0, count( $values ) - 1 ) ]; + } + + /** @param array $weights value => weight */ + public function weighted( array $weights ) { + $total = array_sum( $weights ); + $pick = $this->int( 1, max( 1, (int) $total ) ); + foreach ( $weights as $value => $weight ) { + $pick -= $weight; + if ( $pick <= 0 ) { + return $value; + } + } + return array_key_first( $weights ); + } +} diff --git a/tools/css-selector-fuzz/lib/ReferenceMatcher.php b/tools/css-selector-fuzz/lib/ReferenceMatcher.php new file mode 100644 index 0000000000000..ea422078e9893 --- /dev/null +++ b/tools/css-selector-fuzz/lib/ReferenceMatcher.php @@ -0,0 +1,323 @@ + true, + 'accept-charset' => true, + 'align' => true, + 'alink' => true, + 'axis' => true, + 'bgcolor' => true, + 'charset' => true, + 'checked' => true, + 'clear' => true, + 'codetype' => true, + 'color' => true, + 'compact' => true, + 'declare' => true, + 'defer' => true, + 'dir' => true, + 'direction' => true, + 'disabled' => true, + 'enctype' => true, + 'face' => true, + 'frame' => true, + 'hreflang' => true, + 'http-equiv' => true, + 'lang' => true, + 'language' => true, + 'link' => true, + 'media' => true, + 'method' => true, + 'multiple' => true, + 'nohref' => true, + 'noresize' => true, + 'noshade' => true, + 'nowrap' => true, + 'readonly' => true, + 'rel' => true, + 'rev' => true, + 'rules' => true, + 'scope' => 
true, + 'scrolling' => true, + 'selected' => true, + 'shape' => true, + 'target' => true, + 'text' => true, + 'type' => true, + 'valign' => true, + 'valuetype' => true, + 'vlink' => true, + ); + + /** + * Expected match list for WP_HTML_Processor::select(). + * + * @param array $list_ast Canonical complex selector list AST. + * @param array $rows Element rows in visit order, with ancestorTags. + * @param bool $quirks Whether the document parses in quirks mode. + * @param bool $html_attr_ci Whether HTML's case-insensitive attribute value + * list applies. True models WP/browsers; false + * models an engine without the rule ( lexbor ). + * @return string[] data-fid values in visit order. + */ + public static function expected_html_matches_rows( array $list_ast, array $rows, bool $quirks, bool $html_attr_ci = true ): array { + $out = array(); + foreach ( $rows as $row ) { + if ( self::list_matches_row( $list_ast, $row, $quirks, $html_attr_ci ) ) { + $out[] = $row['fid']; + } + } + return $out; + } + + /** + * Expected match list for WP_HTML_Tag_Processor::select() over the same + * markup. The tag processor never enters quirks mode on its own and a + * compound selector list never inspects ancestors. + * + * @param array $list_ast Canonical complex selector list AST ( contexts must be empty ). + * @param array $rows Tag-view element rows in token order. + * @return string[] data-fid values in token order. + */ + public static function expected_tag_matches_rows( array $list_ast, array $rows ): array { + $out = array(); + foreach ( $rows as $row ) { + $matched = false; + foreach ( $list_ast as $complex ) { + if ( self::compound_matches( $complex['self'], $row, false, true ) ) { + $matched = true; + break; + } + } + if ( $matched ) { + $out[] = $row['fid']; + } + } + return $out; + } + + /** Back-compat: expected html-processor matches from a generated model. 
*/ + public static function expected_html_processor_matches( array $list_ast, array $model, bool $quirks ): array { + return self::expected_html_matches_rows( $list_ast, DocumentGenerator::rows_from_model( $model ), $quirks ); + } + + /** Back-compat: expected tag-processor matches from a generated model. */ + public static function expected_tag_processor_matches( array $list_ast, array $model ): array { + return self::expected_tag_matches_rows( $list_ast, DocumentGenerator::rows_from_model( $model ) ); + } + + public static function list_matches_row( array $list_ast, array $row, bool $quirks, bool $html_attr_ci = true ): bool { + foreach ( $list_ast as $complex ) { + if ( + self::compound_matches( $complex['self'], $row, $quirks, $html_attr_ci ) && + self::explore_context( $complex['context'], $row['ancestorTags'] ) + ) { + return true; + } + } + return false; + } + + /** + * @param array $context Right-to-left ( type, combinator ) pairs. + * @param string[] $ancestor_tags Nearest-ancestor-first tag names. + */ + private static function explore_context( array $context, array $ancestor_tags ): bool { + if ( array() === $context ) { + return true; + } + if ( array() === $ancestor_tags ) { + return false; + } + + list( $type, $combinator ) = $context[0]; + $rest = array_slice( $context, 1 ); + + if ( '>' === $combinator ) { + return self::type_matches( $type, $ancestor_tags[0] ) + && self::explore_context( $rest, array_slice( $ancestor_tags, 1 ) ); + } + + // Descendant: try every matching ancestor. + $count = count( $ancestor_tags ); + for ( $i = 0; $i < $count; $i++ ) { + if ( + self::type_matches( $type, $ancestor_tags[ $i ] ) && + self::explore_context( $rest, array_slice( $ancestor_tags, $i + 1 ) ) + ) { + return true; + } + } + return false; + } + + public static function compound_matches( array $compound, array $row, bool $quirks, bool $html_attr_ci = true ): bool { + if ( null !== $compound['type'] && ! 
self::type_matches( $compound['type'], $row['tag'] ) ) { + return false; + } + foreach ( (array) $compound['subs'] as $sub ) { + if ( ! self::sub_matches( $sub, $row, $quirks, $html_attr_ci ) ) { + return false; + } + } + return true; + } + + private static function type_matches( string $type, string $tag ): bool { + return '*' === $type || ascii_strtolower( $type ) === ascii_strtolower( $tag ); + } + + private static function sub_matches( array $sub, array $row, bool $quirks, bool $html_attr_ci ): bool { + switch ( $sub['kind'] ) { + case 'class': + return self::class_matches( $sub['name'], $row, $quirks ); + case 'id': + return self::id_matches( $sub['name'], $row, $quirks ); + case 'attr': + return self::attr_matches( $sub, $row, $html_attr_ci ); + } + return false; + } + + private static function class_matches( string $wanted, array $row, bool $quirks ): bool { + $class_value = DocumentGenerator::get_attribute_value( $row, 'class' ); + if ( ! is_string( $class_value ) ) { + return false; + } + + foreach ( DocumentGenerator::class_tokens( $class_value ) as $word ) { + if ( + $quirks + ? ascii_strtolower( $word ) === ascii_strtolower( $wanted ) + : $word === $wanted + ) { + return true; + } + } + return false; + } + + private static function id_matches( string $wanted, array $row, bool $quirks ): bool { + $id = DocumentGenerator::get_attribute_value( $row, 'id' ); + if ( ! is_string( $id ) ) { + return false; + } + return $quirks + ? 
ascii_strtolower( $id ) === ascii_strtolower( $wanted ) + : $id === $wanted; + } + + private static function attr_matches( array $sub, array $row, bool $html_attr_ci ): bool { + $attr_value = DocumentGenerator::get_attribute_value( $row, $sub['name'] ); + if ( null === $attr_value ) { + return false; + } + if ( null === $sub['matcher'] ) { + return true; + } + if ( true === $attr_value ) { + $attr_value = ''; + } + + $wanted = (string) $sub['value']; + $case_insensitive = 'case-insensitive' === $sub['modifier'] || ( + $html_attr_ci && + null === $sub['modifier'] && + 'html' === ( $row['namespace'] ?? 'html' ) && + isset( self::HTML_CASE_INSENSITIVE_ATTRIBUTES[ ascii_strtolower( $sub['name'] ) ] ) + ); + if ( $case_insensitive ) { + $attr_value = ascii_strtolower( $attr_value ); + $wanted = ascii_strtolower( $wanted ); + } + + switch ( $sub['matcher'] ) { + case 'exact': + return $attr_value === $wanted; + + case 'one-of': + if ( '' === $wanted || strlen( $wanted ) !== strcspn( $wanted, self::WHITESPACE ) ) { + return false; + } + $length = strlen( $attr_value ); + $at = 0; + while ( $at < $length ) { + $at += strspn( $attr_value, self::WHITESPACE, $at ); + if ( $at >= $length ) { + break; + } + $word_length = strcspn( $attr_value, self::WHITESPACE, $at ); + if ( substr( $attr_value, $at, $word_length ) === $wanted ) { + return true; + } + $at += $word_length; + } + return false; + + case 'exact-or-hyphen-suffixed': + if ( $attr_value === $wanted ) { + return true; + } + return 0 === strncmp( $attr_value, $wanted . 
'-', strlen( $wanted ) + 1 ); + + case 'prefixed': + if ( '' === $wanted ) { + return false; + } + return 0 === strncmp( $attr_value, $wanted, strlen( $wanted ) ); + + case 'suffixed': + if ( '' === $wanted ) { + return false; + } + return strlen( $attr_value ) >= strlen( $wanted ) + && substr( $attr_value, -strlen( $wanted ) ) === $wanted; + + case 'contains': + if ( '' === $wanted ) { + return false; + } + return false !== strpos( $attr_value, $wanted ); + } + + return false; + } +} diff --git a/tools/css-selector-fuzz/lib/SelectorGenerator.php b/tools/css-selector-fuzz/lib/SelectorGenerator.php new file mode 100644 index 0000000000000..668ad57bcace2 --- /dev/null +++ b/tools/css-selector-fuzz/lib/SelectorGenerator.php @@ -0,0 +1,1737 @@ + array( "\x80", 1 ), + 'truncated-2-byte' => array( "\xC3", 1 ), + 'truncated-3-byte' => array( "\xE2\x8C", 1 ), + 'truncated-4-byte' => array( "\xF0\x9F\x82", 1 ), + 'invalid-lead-f5' => array( "\xF5", 1 ), + 'invalid-lead-ff' => array( "\xFF", 1 ), + 'overlong-min' => array( "\xC0\x80", 2 ), + 'overlong-max' => array( "\xC1\xBF", 2 ), + 'surrogate-half' => array( "\xED\xA0\x80", 3 ), + 'beyond-max' => array( "\xF4\x90\x80\x80", 4 ), + ); + + /** @var Prng */ + private $prng; + /** @var array */ + private $pools; + /** @var bool Escape ident codepoints aggressively when rendering. */ + private $escape_boost = false; + + private function __construct( Prng $prng, array $pools ) { + $this->prng = $prng; + $this->pools = $pools; + } + + /** + * Renders a canonical complex-list AST to a selector string. Parsing the + * result must yield exactly the given AST. With $escape_boost, idents are + * escaped far more often (exercises the escape decoder on no-op escapes). 
+	 */
+	public static function render( Prng $prng, array $list_ast, bool $escape_boost = false ): string {
+		$renderer               = new self( $prng, array() );
+		$renderer->escape_boost = $escape_boost;
+
+		return $renderer->render_complex_list( $list_ast );
+	}
+
+	/**
+	 * Deterministic, minimally escaped rendering of a canonical complex-list
+	 * AST: single spaces around combinators, `, ` between branches,
+	 * double-quoted attribute values, lowercase `i`/`s` modifiers, and all
+	 * non-ASCII codepoints hex-escaped. Used to hand a semantically-identical
+	 * selector to external engines: lexbor rejects some byte-level forms WP
+	 * correctly accepts ( uppercase I/S attribute modifiers; raw non-ASCII
+	 * ident codepoints in U+00B7, U+00C0-U+00F6 — its non-ASCII ident table
+	 * starts at U+00F8 ). Escaping sidesteps codepoint classification.
+	 */
+	public static function render_canonical( array $list_ast ): string {
+		// AST matcher name => selector token.
+		$matcher_tokens = array(
+			'exact'                    => '=',
+			'one-of'                   => '~=',
+			'exact-or-hyphen-suffixed' => '|=',
+			'prefixed'                 => '^=',
+			'suffixed'                 => '$=',
+			'contains'                 => '*=',
+		);
+
+		$rendered = array();
+		foreach ( $list_ast as $complex ) {
+			$text = '';
+
+			// Context pairs are stored right-to-left; emit them left-to-right.
+			foreach ( array_reverse( $complex['context'] ) as list( $type, $combinator ) ) {
+				$text .= '*' === $type ? '*' : self::canonical_ident( $type );
+				$text .= '>' === $combinator ? ' > ' : ' ';
+			}
+
+			$self = $complex['self'];
+			if ( null !== $self['type'] ) {
+				$text .= '*' === $self['type'] ? '*' : self::canonical_ident( $self['type'] );
+			}
+
+			foreach ( (array) $self['subs'] as $sub ) {
+				switch ( $sub['kind'] ) {
+					case 'class':
+						$text .= '.' . self::canonical_ident( $sub['name'] );
+						break;
+
+					case 'id':
+						$text .= '#' . self::canonical_ident( $sub['name'] );
+						break;
+
+					case 'attr':
+						$text .= '[' . self::canonical_ident( $sub['name'] );
+						if ( null !== $sub['matcher'] ) {
+							$text .= $matcher_tokens[ $sub['matcher'] ] . self::canonical_string( (string) $sub['value'] );
+							if ( 'case-insensitive' === $sub['modifier'] ) {
+								$text .= ' i';
+							} elseif ( 'case-sensitive' === $sub['modifier'] ) {
+								$text .= ' s';
+							}
+						}
+						$text .= ']';
+						break;
+				}
+			}
+
+			$rendered[] = $text;
+		}
+
+		return implode( ', ', $rendered );
+	}
+
+	/**
+	 * Renders an identifier using only ASCII letters, digits, `-` and `_`
+	 * verbatim; every other codepoint becomes a space-terminated hex escape.
+	 * A leading digit, a digit directly after a leading `-`, and a lone `-`
+	 * are escaped as well.
+	 */
+	private static function canonical_ident( string $name ): string {
+		$points    = utf8_codepoints( $name );
+		$is_single = 1 === count( $points );
+		$escaped   = '';
+
+		foreach ( $points as $index => $point ) {
+			list( $char, $cp ) = $point;
+
+			$digit  = $cp >= 0x30 && $cp <= 0x39;
+			$letter = ( $cp >= 0x41 && $cp <= 0x5A ) || ( $cp >= 0x61 && $cp <= 0x7A );
+			$plain  = $digit || $letter || '-' === $char || '_' === $char;
+
+			if ( ! $plain ) {
+				$needs_escape = true;
+			} elseif ( $digit ) {
+				$needs_escape = 0 === $index || ( 1 === $index && '-' === $points[0][0] );
+			} else {
+				$needs_escape = $is_single && '-' === $char;
+			}
+
+			$escaped .= $needs_escape ? '\\' . dechex( $cp ) . ' ' : $char;
+		}
+
+		return $escaped;
+	}
+
+	/**
+	 * Renders a double-quoted string. Quotes, backslashes and every codepoint
+	 * outside U+0020–U+007E are written as space-terminated hex escapes.
+	 */
+	private static function canonical_string( string $value ): string {
+		$body = '';
+		foreach ( utf8_codepoints( $value ) as list( $char, $cp ) ) {
+			$verbatim = $cp >= 0x20 && $cp <= 0x7E && '"' !== $char && '\\' !== $char;
+			$body    .= $verbatim ? $char : '\\' . dechex( $cp ) . ' ';
+		}
+
+		return '"' . $body . '"';
+	}
+
+	/**
+	 * @param array      $pools Pools from DocumentGenerator ( tags, classes, ids, attrNames, attrValues ).
+	 * @param array|null $rows  Element rows ( TreeCapture shape ) with real
+	 *                          fids; enables the path-directed bucket.
+	 * @return array{
+	 *     bucket: string,
+	 *     selector: string,
+	 *     expectCompound: bool|null,
+	 *     expectComplex: bool|null,
+	 *     ast: array|null,
+	 *     mustMatchFid: string|null,
+	 *     mustNotMatchFid: string|null,
+	 * }
+	 */
+	public static function generate( Prng $prng, array $pools, ?array $rows = null, ?string $bucket = null ): array {
+		$generator = new self( $prng, $pools );
+
+		// No bucket requested: draw one. 'path-directed' is only offered when rows exist.
+		if ( null === $bucket ) {
+			$bucket = $prng->weighted(
+				null === $rows || array() === $rows
+					? array(
+						'supported-compound' => 28,
+						'supported-complex'  => 24,
+						'unsupported'        => 14,
+						'invalid'            => 11,
+						'invalid-utf8'       => 5,
+						'chaos'              => 8,
+						'mutated'            => 10,
+						'edge-escape'        => 5,
+					)
+					: array(
+						'supported-compound' => 23,
+						'supported-complex'  => 19,
+						'path-directed'      => 21,
+						'unsupported'        => 11,
+						'invalid'            => 9,
+						'invalid-utf8'       => 5,
+						'chaos'              => 6,
+						'mutated'            => 6,
+						'edge-escape'        => 5,
+					)
+			);
+		}
+
+		// An explicitly requested 'path-directed' bucket needs rows; fall back without them.
+		if ( 'path-directed' === $bucket && ( null === $rows || array() === $rows ) ) {
+			$bucket = 'supported-complex';
+		}
+
+		switch ( $bucket ) {
+			case 'supported-compound':
+				$ast  = $generator->gen_complex_list( false );
+				$case = array(
+					'bucket'         => $bucket,
+					'selector'       => $generator->render_complex_list( $ast ),
+					'expectCompound' => true,
+					'expectComplex'  => true,
+					'ast'            => $ast,
+				);
+				break;
+
+			case 'supported-complex':
+				$ast  = $generator->gen_complex_list( true );
+				$case = array(
+					'bucket'         => $bucket,
+					'selector'       => $generator->render_complex_list( $ast ),
+					'expectCompound' => false,
+					'expectComplex'  => true,
+					'ast'            => $ast,
+				);
+				break;
+
+			case 'path-directed':
+				$case = $generator->gen_path_directed( $rows );
+				break;
+
+			case 'edge-escape':
+				$case = $generator->gen_edge_escape();
+				break;
+
+			case 'invalid-utf8':
+				$case = $generator->gen_invalid_utf8();
+				break;
+
+			case 'unsupported':
+				$case = array(
+					'bucket'         => $bucket,
+					'selector'       => $generator->gen_unsupported(),
+					'expectCompound' => false,
+					'expectComplex'  => false,
+					'ast'            => null,
+				);
+				break;
+
+			case 'invalid':
+				$case = array(
+					'bucket'         => $bucket,
+					'selector'       => $generator->gen_invalid(),
+					'expectCompound' => false,
+					'expectComplex'  => false,
+					'ast'            => null,
+				);
+				break;
+
+			case 'chaos':
+				$case = array(
+					'bucket'         => $bucket,
+					'selector'       => $generator->gen_chaos(),
+					'expectCompound' => null,
+					'expectComplex'  => null,
+					'ast'            => null,
+				);
+				break;
+
+			case 'mutated':
+			default:
+				// Any unrecognized bucket name is treated as 'mutated'.
+				$ast      = $generator->gen_complex_list( $generator->prng->chance( 50 ) );
+				$rendered = $generator->render_complex_list( $ast );
+				$case     = array(
+					'bucket'         => 'mutated',
+					'selector'       => $generator->mutate( $rendered ),
+					'expectCompound' => null,
+					'expectComplex'  => null,
+					'ast'            => null,
+				);
+				break;
+		}
+
+		/*
+		 * The documented return shape includes mustMatchFid / mustNotMatchFid,
+		 * but the buckets built above do not set them. Array union keeps every
+		 * key a bucket did provide and fills in only the missing ones.
+		 */
+		return $case + array(
+			'mustMatchFid'    => null,
+			'mustNotMatchFid' => null,
+		);
+	}
+
+	/*
+	 * --------------
+	 * AST generation
+	 * --------------
+	 *
+	 * Canonical AST shapes (matching what AstExtractor produces from
+	 * parsed WP_CSS_* objects):
+	 *
+	 *   list:     array of complex
+	 *   complex:  array( 'context' => array( array( type, combinator ) ... right-to-left ), 'self' => compound )
+	 *   compound: array( 'type' => string|null, 'subs' => array|null )
+	 *   sub:      array( 'kind' => 'class'|'id', 'name' => string )
+	 *           | array( 'kind' => 'attr', 'name' => string, 'matcher' => string|null,
+	 *                    'value' => string|null, 'modifier' => string|null )
+	 */
+
+	/**
+	 * Generates a list of one to three complex selectors.
+	 *
+	 * With $require_combinator, one randomly chosen branch always gets a
+	 * context and every other branch gets one with a 30% chance. Without it,
+	 * no branch gets a context.
+	 */
+	private function gen_complex_list( bool $require_combinator ): array {
+		$count = $this->prng->weighted(
+			array(
+				1 => 55,
+				2 => 30,
+				3 => 15,
+			)
+		);
+
+		$list          = array();
+		$combinator_at = $require_combinator ? $this->prng->int( 0, $count - 1 ) : -1;
+		for ( $i = 0; $i < $count; $i++ ) {
+			// Always false without $require_combinator: the index is -1 and the chance is never drawn.
+			$wants_combinators = $i === $combinator_at || ( $require_combinator && $this->prng->chance( 30 ) );
+			$list[]            = $this->gen_complex( $wants_combinators );
+		}
+		return $list;
+	}
+
+	private function gen_complex( bool $with_combinators ): array {
+		$context = array();
+		if ( $with_combinators ) {
+			$context_count = $this->prng->int( 1, 3 );
+			for ( $i = 0; $i < $context_count; $i++ ) {
+				$context[] = array(
+					$this->gen_type_name( true ),
+					$this->prng->chance( 50 ) ?
' ' : '>', + ); + } + } + + return array( + 'context' => $context, + 'self' => $this->gen_compound(), + ); + } + + private function gen_compound(): array { + $has_type = $this->prng->chance( 65 ); + $sub_count = $this->prng->weighted( + array( + 0 => 30, + 1 => 40, + 2 => 20, + 3 => 10, + ) + ); + if ( ! $has_type && 0 === $sub_count ) { + if ( $this->prng->chance( 50 ) ) { + $has_type = true; + } else { + $sub_count = 1; + } + } + + $subs = array(); + for ( $i = 0; $i < $sub_count; $i++ ) { + $subs[] = $this->gen_subclass(); + } + + return array( + 'type' => $has_type ? $this->gen_type_name( false ) : null, + 'subs' => array() === $subs ? null : $subs, + ); + } + + private function gen_type_name( bool $for_context ): string { + if ( $this->prng->chance( $for_context ? 25 : 12 ) ) { + return '*'; + } + $pool = $this->pools['tags'] ?? array(); + if ( array() !== $pool && $this->prng->chance( 70 ) ) { + $name = $this->prng->choice( $pool ); + return $this->prng->chance( 25 ) ? $this->random_case( $name ) : $name; + } + return $this->prng->choice( array( 'video', 'table', 'x-absent', 'object', 'span' ) ); + } + + private function gen_subclass(): array { + $kind = $this->prng->weighted( + array( + 'class' => 40, + 'id' => 25, + 'attr' => 35, + ) + ); + + switch ( $kind ) { + case 'class': + return array( + 'kind' => 'class', + 'name' => $this->pick_name( 'classes' ), + ); + case 'id': + return array( + 'kind' => 'id', + 'name' => $this->pick_name( 'ids' ), + ); + default: + return $this->gen_attr_selector(); + } + } + + private function gen_attr_selector(): array { + $name = $this->pick_name( 'attrNames' ); + + $matcher = $this->prng->weighted( + array( + '' => 25, + 'exact' => 20, + 'one-of' => 12, + 'exact-or-hyphen-suffixed' => 11, + 'prefixed' => 11, + 'suffixed' => 11, + 'contains' => 10, + ) + ); + $matcher = '' === $matcher ? 
null : $matcher; + + if ( null === $matcher ) { + return array( + 'kind' => 'attr', + 'name' => $name, + 'matcher' => null, + 'value' => null, + 'modifier' => null, + ); + } + + $modifier = $this->prng->weighted( + array( + '' => 70, + 'case-insensitive' => 18, + 'case-sensitive' => 12, + ) + ); + + $value = $this->gen_attr_value(); + + /* + * HTML's case-insensitive attribute value list: with no modifier, + * the values of listed attributes ( type, rel, lang, dir, ... ) + * match ASCII case-insensitively on HTML elements. Sometimes flip + * the case of the selector value for a listed attribute so the + * differential exercises that rule rather than relying on sampled + * values happening to differ in case. + */ + if ( + '' === $modifier && + isset( ReferenceMatcher::HTML_CASE_INSENSITIVE_ATTRIBUTES[ ascii_strtolower( $name ) ] ) && + $this->prng->chance( 40 ) + ) { + $value = $this->prng->chance( 50 ) ? ascii_strtoupper( $value ) : str_shuffle_case( $value, $this->prng ); + } + + return array( + 'kind' => 'attr', + 'name' => $name, + 'matcher' => $matcher, + 'value' => $value, + 'modifier' => '' === $modifier ? null : $modifier, + ); + } + + private function gen_attr_value(): string { + $pool = $this->pools['attrValues'] ?? 
array(); + + $kind = $this->prng->weighted( + array( + 'pool' => 35, + 'pool-part' => 20, + 'pool-case' => 10, + 'empty' => 10, + 'word' => 15, + 'tricky' => 10, + ) + ); + + if ( in_array( $kind, array( 'pool', 'pool-part', 'pool-case' ), true ) && array() === $pool ) { + $kind = 'word'; + } + + switch ( $kind ) { + case 'pool': + return $this->prng->choice( $pool ); + + case 'pool-part': + $value = $this->prng->choice( $pool ); + if ( '' === $value ) { + return ''; + } + $points = utf8_codepoints( $value ); + $total = count( $points ); + $start = $this->prng->int( 0, max( 0, $total - 1 ) ); + $length = $this->prng->int( 1, $total - $start ); + $part = ''; + for ( $i = $start; $i < $start + $length; $i++ ) { + $part .= $points[ $i ][0]; + } + return $part; + + case 'pool-case': + return $this->random_case( $this->prng->choice( $pool ) ); + + case 'empty': + return ''; + + case 'word': + return $this->prng->choice( array( 'alpha', 'beta9', 'value', 'main-item', 'Z', 'i', 's', 'one two', 'x-y-z' ) ); + + case 'tricky': + default: + return $this->prng->choice( + array( + 'a b', + " lead", + "trail ", + "tab\there", + "line\nbreak", + 'quote"inside', + "apos'inside", + 'back\\slash', + '-', + '--', + '0digit', + 'ünïcode', + ) + ); + } + } + + private function pick_name( string $pool_key ): string { + $pool = $this->pools[ $pool_key ] ?? 
array(); + if ( array() !== $pool && $this->prng->chance( 65 ) ) { + $name = $this->prng->choice( $pool ); + if ( '' !== $name && $this->prng->chance( 20 ) ) { + $name = $this->random_case( $name ); + } + if ( '' !== $name ) { + return $name; + } + } + return $this->prng->choice( + array( + 'absent', + 'no-such-thing', + 'x', + '-lead', + '--double', + '_under', + 'Ünïcode', + 'with space', + '9starts-with-digit', + '-9hyphen-digit', + 'mixedCase', + ) + ); + } + + /* + * --------------------------- + * Edge-case escapes and input + * --------------------------- + * + * Targets parser branches the structural generators can't reach: + * - hex escapes whose codepoint is NUL / a surrogate / over-max, which + * `consume_escaped_codepoint` must decode to U+FFFD; + * - raw NUL / CR / CRLF / FF bytes in the selector input, which + * `normalize_selector_input` rewrites ( NUL→U+FFFD, the rest→LF ). + * + * These carry a known intended AST: the decoded ident is the U+FFFD + * replacement character ( or, for input normalization, the same selector + * with whitespace normalized ), so the AST round-trip still applies. + */ + private function gen_edge_escape(): array { + $kind = $this->prng->weighted( + array( + 'fffd-ident' => 35, + 'eof-escape' => 20, + 'eof-truncated' => 15, + 'nul-input' => 15, + 'ws-input' => 15, + ) + ); + + if ( 'eof-truncated' === $kind ) { + /* + * The end of input auto-closes an unterminated attribute selector + * block ( and an unterminated string inside it ): `[a=b` is the + * same selector as `[a=b]`. + * + * https://www.w3.org/TR/css-syntax-3/#consume-simple-block + */ + $matcher = $this->prng->choice( array( null, 'exact', 'one-of', 'exact-or-hyphen-suffixed', 'prefixed', 'suffixed', 'contains' ) ); + $value = null === $matcher ? null : $this->prng->choice( array( 'v' . $this->prng->int( 0, 99 ), 'a b', '', 'x,y', "caf\u{E9}" ) ); + $modifier = null !== $matcher && $this->prng->chance( 30 ) + ? 
$this->prng->choice( array( 'case-insensitive', 'case-sensitive' ) ) + : null; + $compound = array( + 'type' => $this->prng->chance( 50 ) ? 'div' : null, + 'subs' => array( + array( + 'kind' => 'attr', + 'name' => 'a' . $this->prng->int( 0, 99 ), + 'matcher' => $matcher, + 'value' => $value, + 'modifier' => $modifier, + ), + ), + ); + + // The attribute selector is the final rendered unit, so the render always ends with ']'. + $rendered = $this->render_compound( $compound ); + $truncated = substr( $rendered, 0, -1 ); + + // Sometimes also drop a closing string quote: EOF terminates the string, then closes the block. + $last_byte = substr( $truncated, -1 ); + if ( ( '"' === $last_byte || "'" === $last_byte ) && $this->prng->chance( 50 ) ) { + $truncated = substr( $truncated, 0, -1 ); + + // A backslash at the end of an unterminated string "does nothing": the value is unchanged. + if ( $this->prng->chance( 40 ) ) { + $truncated .= '\\'; + } + } + + return array( + 'bucket' => 'edge-escape', + 'selector' => $truncated, + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => array( + array( + 'context' => array(), + 'self' => $compound, + ), + ), + ); + } + + if ( 'eof-escape' === $kind ) { + /* + * A backslash at the end of input is a valid escape ( EOF is not + * a newline ) and decodes to U+FFFD, in ident context only: + * `.foo\` is the class `foo\u{FFFD}`. + * + * https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point + */ + $name = $this->prng->chance( 30 ) ? '' : 'a' . $this->prng->int( 0, 99 ); + list( $selector, $self ) = $this->prng->choice( + array( + array( + '.' . $name . '\\', + array( + 'type' => null, + 'subs' => array( array( 'kind' => 'class', 'name' => $name . "\u{FFFD}" ) ), + ), + ), + array( + '#' . $name . '\\', + array( + 'type' => null, + 'subs' => array( array( 'kind' => 'id', 'name' => $name . "\u{FFFD}" ) ), + ), + ), + array( + $name . '\\', + array( + 'type' => $name . 
"\u{FFFD}", + 'subs' => null, + ), + ), + ) + ); + return array( + 'bucket' => 'edge-escape', + 'selector' => $selector, + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => array( + array( + 'context' => array(), + 'self' => $self, + ), + ), + ); + } + + if ( 'fffd-ident' === $kind ) { + // A class selector whose name is a single U+FFFD, produced by a + // hex escape for an out-of-range codepoint. + $hex = $this->prng->choice( + array( + '0', + '00', + '000000', + dechex( $this->prng->int( 0xD800, 0xDFFF ) ), // surrogate + dechex( $this->prng->int( 0x110000, 0xFFFFFF ) ), // over-max + ) + ); + if ( $this->prng->chance( 40 ) ) { + $hex = strtoupper( $hex ); + } + $selector = '.\\' . $hex . ' '; + $ast = array( + array( + 'context' => array(), + 'self' => array( + 'type' => null, + 'subs' => array( array( 'kind' => 'class', 'name' => "\u{FFFD}" ) ), + ), + ), + ); + return array( + 'bucket' => 'edge-escape', + 'selector' => $selector, + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => $ast, + ); + } + + /* + * Raw control bytes in the selector input. A small fixed compound + * keeps the case focused on normalize_selector_input and avoids + * entangling with unrelated attribute-selector edge cases. + */ + $compound = array( + 'type' => $this->prng->chance( 50 ) ? 'span' : null, + 'subs' => array( + array( 'kind' => 'class', 'name' => 'foo' ), + array( 'kind' => 'id', 'name' => 'bar' ), + ), + ); + if ( null === $compound['type'] && $this->prng->chance( 50 ) ) { + array_pop( $compound['subs'] ); + } + $rendered = $this->render_compound( $compound ); + + if ( 'nul-input' === $kind ) { + // A NUL between a class dot's selectors becomes part of an ident + // only in limited spots; simplest reliable case: a class whose + // name contains a NUL ( → U+FFFD ). 
+ $ast = array( + array( + 'context' => array(), + 'self' => array( + 'type' => null, + 'subs' => array( array( 'kind' => 'class', 'name' => "a\u{FFFD}b" ) ), + ), + ), + ); + return array( + 'bucket' => 'edge-escape', + 'selector' => ".a\0b", + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => $ast, + ); + } + + // ws-input: wrap/insert CR, CRLF, FF as insignificant whitespace. + $lead = $this->prng->choice( array( "\r", "\f", "\r\n", "\r\r", "\f\f" ) ); + $trail = $this->prng->choice( array( "\r", "\f", "\r\n", '' ) ); + return array( + 'bucket' => 'edge-escape', + 'selector' => $lead . $rendered . $trail, + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => array( + array( + 'context' => array(), + 'self' => $compound, + ), + ), + ); + } + + /* + * ----------------------- + * Invalid-UTF-8 injection + * ----------------------- + * + * Raw ill-formed UTF-8 byte sequences in the selector input, mirroring + * the nul-input pattern: a small fixed simple selector keeps the case + * focused on the normalize_selector_input() scrub. Each maximal subpart + * of the injected sequence decodes to one U+FFFD ( per-class counts + * pinned in INVALID_UTF8_CLASSES ), and U+FFFD is a valid ident + * codepoint — including in start position — so the scrubbed selector + * must parse and the post-scrub AST is known by construction. + */ + private function gen_invalid_utf8(): array { + list( $bytes, $subparts ) = $this->prng->choice( array_values( self::INVALID_UTF8_CLASSES ) ); + + $position = $this->prng->choice( array( 'lead', 'mid', 'trail', 'whole' ) ); + $prefix = in_array( $position, array( 'lead', 'whole' ), true ) ? '' : 'a' . $this->prng->int( 0, 9 ); + $suffix = in_array( $position, array( 'trail', 'whole' ), true ) ? '' : 'z' . $this->prng->int( 0, 9 ); + $raw = $prefix . $bytes . $suffix; + $decoded = $prefix . str_repeat( "\u{FFFD}", $subparts ) . 
$suffix; + + switch ( $this->prng->choice( array( 'class', 'id', 'attr-name', 'attr-value' ) ) ) { + case 'class': + $rendered = '.' . $raw; + $sub = array( + 'kind' => 'class', + 'name' => $decoded, + ); + break; + + case 'id': + $rendered = '#' . $raw; + $sub = array( + 'kind' => 'id', + 'name' => $decoded, + ); + break; + + case 'attr-name': + $rendered = '[' . $raw . ']'; + $sub = array( + 'kind' => 'attr', + 'name' => $decoded, + 'matcher' => null, + 'value' => null, + 'modifier' => null, + ); + break; + + case 'attr-value': + default: + $name = 'a' . $this->prng->int( 0, 99 ); + $quote = $this->prng->chance( 50 ) ? '"' : "'"; + $rendered = '[' . $name . '=' . $quote . $raw . $quote . ']'; + $sub = array( + 'kind' => 'attr', + 'name' => $name, + 'matcher' => 'exact', + 'value' => $decoded, + 'modifier' => null, + ); + break; + } + + $type = $this->prng->chance( 40 ) ? 'span' : null; + + return array( + 'bucket' => 'invalid-utf8', + 'selector' => ( null === $type ? '' : $type ) . $rendered, + 'expectCompound' => true, + 'expectComplex' => true, + 'ast' => array( + array( + 'context' => array(), + 'self' => array( + 'type' => $type, + 'subs' => array( $sub ), + ), + ), + ), + ); + } + + /* + * ------------------------ + * Path-directed generation + * ------------------------ + * + * Synthesizes a selector from a real element of the model tree so that + * the selector is guaranteed (by construction) to match that element: + * the type comes from its tag, subclasses from its actual classes / id / + * attributes, and the context chain from its actual ancestor tags with + * combinators consistent with the real nesting. Optionally one feature + * is then flipped into a "near-miss" that is guaranteed NOT to match + * the element ( or, for combinator loosening, still guaranteed to ). + */ + + private function gen_path_directed( array $rows ): array { + // Bias toward elements deep enough for a meaningful context chain. 
+ $deep = array(); + foreach ( $rows as $row ) { + if ( count( $row['ancestorTags'] ) >= 2 ) { + $deep[] = $row; + } + } + $element = array() !== $deep && $this->prng->chance( 75 ) + ? $this->prng->choice( $deep ) + : $this->prng->choice( $rows ); + + $compound = $this->path_compound_for( $element ); + $context = array() !== $element['ancestorTags'] && $this->prng->chance( 75 ) + ? $this->path_context_for( $element['ancestorTags'] ) + : array(); + + $list = array( + array( + 'context' => $context, + 'self' => $compound, + ), + ); + + $must_match = $element['fid']; + $must_not_match = null; + + if ( $this->prng->chance( 40 ) ) { + list( $list, $must_match, $must_not_match ) = $this->path_near_miss( $list, $element ); + } elseif ( $this->prng->chance( 20 ) ) { + // Extra unrelated branch: a list union can only add matches. + $list[] = $this->gen_complex( $this->prng->chance( 30 ) ); + } + + $has_context = false; + foreach ( $list as $complex ) { + if ( array() !== $complex['context'] ) { + $has_context = true; + break; + } + } + + return array( + 'bucket' => 'path-directed', + 'selector' => $this->render_complex_list( $list ), + 'expectCompound' => ! $has_context, + 'expectComplex' => true, + 'ast' => $list, + 'mustMatchFid' => $must_match, + 'mustNotMatchFid' => $must_not_match, + ); + } + + /** A compound selector built only from features the element row really has. 
*/ + private function path_compound_for( array $element ): array { + $tag = ascii_strtolower( $element['tag'] ); + + $features = array(); + + $class_value = DocumentGenerator::get_attribute_value( $element, 'class' ); + if ( is_string( $class_value ) ) { + foreach ( DocumentGenerator::class_tokens( $class_value ) as $word ) { + $features[] = array( 'kind' => 'class', 'name' => $word ); + } + } + + $id_value = DocumentGenerator::get_attribute_value( $element, 'id' ); + if ( is_string( $id_value ) && '' !== $id_value ) { + $features[] = array( 'kind' => 'id', 'name' => $id_value ); + } + + $seen_attrs = array(); + foreach ( $element['attrs'] as $attr ) { + $lower = ascii_strtolower( $attr[0] ); + if ( isset( $seen_attrs[ $lower ] ) ) { + continue; + } + $seen_attrs[ $lower ] = true; + if ( 'class' === $lower && is_string( $attr[1] ) && false !== strpos( $attr[1], "\0" ) ) { + continue; + } + $features[] = $this->path_attr_feature( $lower, $attr[1], 'html' === ( $element['namespace'] ?? 'html' ) ); + } + + $subs = array(); + $available = count( $features ); + if ( $available > 0 ) { + $want = min( $available, $this->prng->weighted( array( 0 => 25, 1 => 40, 2 => 25, 3 => 10 ) ) ); + for ( $i = 0; $i < $want; $i++ ) { + $at = $this->prng->int( 0, count( $features ) - 1 ); + $subs[] = $features[ $at ]; + array_splice( $features, $at, 1 ); + } + } + + $type = null; + if ( array() === $subs || $this->prng->chance( 70 ) ) { + $type = $this->prng->chance( 12 ) ? '*' : ( $this->prng->chance( 30 ) ? $this->random_case( $tag ) : $tag ); + } + + return array( + 'type' => $type, + 'subs' => array() === $subs ? null : $subs, + ); + } + + /** An attribute selector that the (name, value) pair satisfies. */ + private function path_attr_feature( string $name, $value, bool $is_html_namespace = true ): array { + $presence = array( + 'kind' => 'attr', + 'name' => $this->prng->chance( 15 ) ? 
$this->random_case( $name ) : $name, + 'matcher' => null, + 'value' => null, + 'modifier' => null, + ); + + if ( true === $value ) { + // A boolean attribute has the empty string as its value. + $value = ''; + } + if ( ! is_string( $value ) || $this->prng->chance( 30 ) ) { + return $presence; + } + + $points = utf8_codepoints( $value ); + $total = count( $points ); + + $candidates = array( array( 'exact', $value ) ); + + foreach ( preg_split( '/[ \t\n\f\r]+/', $value, -1, PREG_SPLIT_NO_EMPTY ) as $word ) { + $candidates[] = array( 'one-of', $word ); + break; + } + + $hyphen_at = strpos( $value, '-' ); + $candidates[] = array( 'exact-or-hyphen-suffixed', false === $hyphen_at ? $value : substr( $value, 0, $hyphen_at ) ); + + if ( $total > 0 ) { + $slice = static function ( array $points, int $start, int $length ): string { + $out = ''; + for ( $i = $start; $i < $start + $length; $i++ ) { + $out .= $points[ $i ][0]; + } + return $out; + }; + + $candidates[] = array( 'prefixed', $slice( $points, 0, $this->prng->int( 1, $total ) ) ); + $length = $this->prng->int( 1, $total ); + $candidates[] = array( 'suffixed', $slice( $points, $total - $length, $length ) ); + $start = $this->prng->int( 0, $total - 1 ); + $candidates[] = array( 'contains', $slice( $points, $start, $this->prng->int( 1, $total - $start ) ) ); + } + + list( $matcher, $operand ) = $this->prng->choice( $candidates ); + + /* + * `|=` with an operand cut at a hyphen only matches when the operand + * is non-empty and actually a value prefix; an operand equal to the + * value always matches. Guard the degenerate empty-operand cases. 
+ */ + if ( 'exact-or-hyphen-suffixed' === $matcher && '' === $operand && '' !== $value ) { + $matcher = 'exact'; + $operand = $value; + } + if ( in_array( $matcher, array( 'one-of', 'prefixed', 'suffixed', 'contains' ), true ) && '' === $operand ) { + return $presence; + } + + $modifier = null; + if ( $this->prng->chance( 25 ) ) { + if ( $this->prng->chance( 60 ) ) { + $modifier = 'case-insensitive'; + $operand = $this->random_case( $operand ); + } else { + $modifier = 'case-sensitive'; + } + } elseif ( + $is_html_namespace && + isset( ReferenceMatcher::HTML_CASE_INSENSITIVE_ATTRIBUTES[ $name ] ) && + $this->prng->chance( 50 ) + ) { + /* + * HTML's case-insensitive attribute value list: with no modifier + * the flipped operand still satisfies the (name, value) pair on + * an html-namespace element, which makes the folding rule + * load-bearing for the mustMatchFid invariant — name and value + * here come from the same real element, unlike the independent + * pools in gen_attr_selector. + */ + $operand = $this->random_case( $operand ); + } + + return array( + 'kind' => 'attr', + 'name' => $presence['name'], + 'matcher' => $matcher, + 'value' => $operand, + 'modifier' => $modifier, + ); + } + + /** + * A context chain ( right-to-left ( type, combinator ) pairs ) drawn from + * the element's real ancestors so the chain is satisfied by construction: + * `>` is only used for the immediately-next ancestor, descendant + * combinators may skip generations. + * + * @param string[] $ancestor_tags Nearest-first ancestor tag names. + */ + private function path_context_for( array $ancestor_tags ): array { + $chain = array(); + $pos = 0; + $count = count( $ancestor_tags ); + + while ( $pos < $count && ( array() === $chain || $this->prng->chance( 45 ) ) ) { + $jump = $this->prng->chance( 65 ) ? 0 : $this->prng->int( 0, $count - 1 - $pos ); + $at = $pos + $jump; + + $combinator = ( 0 === $jump && $this->prng->chance( 55 ) ) ? 
'>' : ' '; + $tag = ascii_strtolower( $ancestor_tags[ $at ] ); + $type = $this->prng->chance( 12 ) + ? '*' + : ( $this->prng->chance( 25 ) ? $this->random_case( $tag ) : $tag ); + + $chain[] = array( $type, $combinator ); + $pos = $at + 1; + } + + return $chain; + } + + /** + * Flips one feature of the guaranteed-match selector. Most flips + * guarantee the element no longer matches; loosening a `>` to a + * descendant combinator must keep it matching. + * + * @return array{0: array, 1: string|null, 2: string|null} list, mustMatchFid, mustNotMatchFid. + */ + private function path_near_miss( array $list, array $element ): array { + $complex = $list[0]; + $compound = $complex['self']; + $fid = $element['fid']; + + $flips = array( 'wrong-class', 'wrong-attr' ); + if ( null !== $compound['type'] && '*' !== $compound['type'] ) { + $flips[] = 'wrong-type'; + } + foreach ( $complex['context'] as $pair ) { + if ( '>' === $pair[1] ) { + $flips[] = 'loosen-combinator'; + } + $flips[] = 'tighten-combinator'; + break; + } + + switch ( $this->prng->choice( $flips ) ) { + case 'wrong-type': + $tag = ascii_strtolower( $element['tag'] ); + do { + $other = $this->prng->choice( DocumentGenerator::SAFE_TAGS ); + } while ( $other === $tag ); + $complex['self']['type'] = $this->prng->chance( 25 ) ? $this->random_case( $other ) : $other; + return array( array( $complex ), null, $fid ); + + case 'wrong-attr': + $subs = (array) $complex['self']['subs']; + $subs[] = array( + 'kind' => 'attr', + 'name' => 'zz-no-such-attr', + 'matcher' => null, + 'value' => null, + 'modifier' => null, + ); + $complex['self']['subs'] = $subs; + return array( array( $complex ), null, $fid ); + + case 'loosen-combinator': + // Replacing every `>` with a descendant combinator can only + // widen the context; the element must still match. 
+ foreach ( $complex['context'] as &$pair ) { + $pair[1] = ' '; + } + unset( $pair ); + $list[0] = $complex; + return array( $list, $fid, null ); + + case 'tighten-combinator': + // May or may not still match; no membership expectation. + $at = $this->prng->int( 0, count( $complex['context'] ) - 1 ); + $complex['context'][ $at ][1] = '>'; + $list[0] = $complex; + return array( $list, null, null ); + + case 'wrong-class': + default: + $subs = (array) $complex['self']['subs']; + $subs[] = array( + 'kind' => 'class', + 'name' => 'zz-no-such-class', + ); + $complex['self']['subs'] = $subs; + return array( array( $complex ), null, $fid ); + } + } + + /* + * --------- + * Rendering + * --------- + */ + + private function render_complex_list( array $list ): string { + $bits = array(); + foreach ( $list as $complex ) { + $bits[] = $this->render_complex( $complex ); + } + + $out = $this->maybe_ws( 25 ); + foreach ( $bits as $i => $bit ) { + if ( $i > 0 ) { + $out .= $this->maybe_ws( 40 ) . ',' . $this->maybe_ws( 60 ); + } + $out .= $bit; + } + return $out . $this->maybe_ws( 25 ); + } + + private function render_complex( array $complex ): string { + $out = ''; + // Context selectors are stored right-to-left; render left-to-right. + $reversed = array_reverse( $complex['context'] ); + foreach ( $reversed as $pair ) { + list( $type, $combinator ) = $pair; + $out .= '*' === $type ? '*' : $this->render_ident( $type ); + if ( '>' === $combinator ) { + $out .= $this->maybe_ws( 50 ) . '>' . $this->maybe_ws( 50 ); + } else { + $out .= $this->ws(); + } + } + return $out . $this->render_compound( $complex['self'] ); + } + + private function render_compound( array $compound ): string { + $out = ''; + if ( null !== $compound['type'] ) { + $out .= '*' === $compound['type'] ? '*' : $this->render_ident( $compound['type'] ); + } + foreach ( (array) $compound['subs'] as $sub ) { + switch ( $sub['kind'] ) { + case 'class': + $out .= '.' . 
$this->render_ident( $sub['name'] ); + break; + case 'id': + $out .= '#' . $this->render_ident( $sub['name'] ); + break; + case 'attr': + $out .= $this->render_attr_selector( $sub ); + break; + } + } + return $out; + } + + private function render_attr_selector( array $sub ): string { + $out = '[' . $this->maybe_ws( 20 ) . $this->render_ident( $sub['name'] ) . $this->maybe_ws( 20 ); + + if ( null === $sub['matcher'] ) { + return $out . ']'; + } + + $matcher_strings = array( + 'exact' => '=', + 'one-of' => '~=', + 'exact-or-hyphen-suffixed' => '|=', + 'prefixed' => '^=', + 'suffixed' => '$=', + 'contains' => '*=', + ); + $out .= $matcher_strings[ $sub['matcher'] ] . $this->maybe_ws( 25 ); + + $value = $sub['value']; + $value_as_ident = '' !== $value && $this->can_render_as_ident( $value ) && $this->prng->chance( 45 ); + if ( $value_as_ident ) { + $out .= $this->render_ident( $value ); + } else { + $out .= $this->render_string( $value ); + } + + if ( null !== $sub['modifier'] ) { + // After an ident value, whitespace is mandatory before the modifier. + $out .= $value_as_ident ? $this->ws() : $this->maybe_ws( 60 ); + + if ( 'case-insensitive' === $sub['modifier'] ) { + $out .= $this->prng->chance( 70 ) ? 'i' : 'I'; + } else { + $out .= $this->prng->chance( 70 ) ? 's' : 'S'; + } + } + + return $out . $this->maybe_ws( 25 ) . ']'; + } + + /** + * Whether a value may be rendered as an ident token instead of a string. + * + * render_ident() escapes every codepoint that cannot appear literally, so + * the only value rejected here is the empty string. + * + * @param string $value Attribute value under consideration. + * @return bool True for any non-empty value. + */ + private function can_render_as_ident( string $value ): bool { + return '' !== $value; + } + + /** + * Renders a name as a CSS ident token, escaping wherever required and + * sometimes where merely allowed. Parsing the result must yield $name. 
+ */ + private function render_ident( string $name ): string { + $points = utf8_codepoints( $name ); + $count = count( $points ); + $out = ''; + + foreach ( $points as $i => $point ) { + list( $char, $cp ) = $point; + + $is_digit = $cp >= 0x30 && $cp <= 0x39; + $is_ident_char = ( + '-' === $char || + '_' === $char || + $is_digit || + ( $cp >= 0x41 && $cp <= 0x5A ) || + ( $cp >= 0x61 && $cp <= 0x7A ) || + $cp > 0x7F + ); + + $must_escape = ! $is_ident_char + || ( 0 === $i && $is_digit ) + || ( 1 === $i && '-' === $points[0][0] && $is_digit ) + || ( 1 === $count && '-' === $char ); + + if ( $must_escape || $this->prng->chance( $this->escape_boost ? 50 : 8 ) ) { + $out .= $this->render_escape( $char, $cp ); + } else { + $out .= $char; + } + } + + return $out; + } + + /** + * Renders one codepoint as a CSS escape sequence that decodes back to it. + */ + private function render_escape( string $char, int $cp ): string { + $is_hex_digit = ( $cp >= 0x30 && $cp <= 0x39 ) + || ( $cp >= 0x41 && $cp <= 0x46 ) + || ( $cp >= 0x61 && $cp <= 0x66 ); + $is_newline_like = "\n" === $char || "\r" === $char || "\f" === $char; + + /* + * Identity escapes are only safe for single-byte chars that are not + * hex digits (they would start a hex escape) and not newlines + * (backslash-newline is not a valid escape). + */ + $identity_ok = ! $is_hex_digit && ! $is_newline_like && $cp >= 0x20; + + if ( $identity_ok && $this->prng->chance( 35 ) ) { + return '\\' . $char; + } + + $hex = dechex( $cp ); + if ( $this->prng->chance( 25 ) && strlen( $hex ) < 6 ) { + $hex = str_pad( $hex, $this->prng->int( strlen( $hex ), 6 ), '0', STR_PAD_LEFT ); + } + if ( $this->prng->chance( 30 ) ) { + $hex = strtoupper( $hex ); + } + + // The trailing space is always emitted; it is consumed by the escape. + return '\\' . $hex . ' '; + } + + /** + * Renders a value as a CSS string token. Parsing must yield $value. 
+ */ + private function render_string( string $value ): string { + $quote = $this->prng->chance( 60 ) ? '"' : "'"; + $out = $quote; + $points = utf8_codepoints( $value ); + + foreach ( $points as $point ) { + list( $char, $cp ) = $point; + + if ( "\n" === $char || "\r" === $char || "\f" === $char ) { + // Literal newlines end (break) the string; always hex-escape. + $out .= '\\' . dechex( $cp ) . ' '; + continue; + } + if ( $char === $quote || '\\' === $char ) { + $out .= $this->prng->chance( 60 ) ? '\\' . $char : '\\' . dechex( $cp ) . ' '; + continue; + } + if ( $this->prng->chance( 5 ) ) { + $out .= $this->render_escape( $char, $cp ); + continue; + } + $out .= $char; + } + + // Rarely add a backslash-newline line continuation (decodes to nothing). + if ( $this->prng->chance( 4 ) ) { + $out .= "\\\n"; + } + + return $out . $quote; + } + + private function ws(): string { + $options = array( ' ', ' ', ' ', "\t", "\n", "\f", "\r", ' ', " \t " ); + return $this->prng->choice( $options ); + } + + private function maybe_ws( int $percent ): string { + return $this->prng->chance( $percent ) ? $this->ws() : ''; + } + + private function random_case( string $input ): string { + $out = ''; + for ( $i = 0; $i < strlen( $input ); $i++ ) { + $c = $input[ $i ]; + $out .= $this->prng->chance( 50 ) ? 
strtoupper( $c ) : strtolower( $c ); + } + return $out; + } + + /* + * ------------------- + * Unsupported selectors + * ------------------- + */ + + private function gen_unsupported(): string { + $kind = $this->prng->weighted( + array( + 'pseudo-class' => 25, + 'pseudo-element' => 15, + 'sibling-combinator' => 20, + 'column-combinator' => 8, + 'namespace-type' => 12, + 'namespace-attr' => 8, + 'non-type-context' => 12, + ) + ); + + switch ( $kind ) { + case 'pseudo-class': + $pseudo = $this->prng->choice( + array( + ':hover', + ':focus', + ':first-child', + ':last-child', + ':nth-child(2n+1)', + ':nth-of-type(3)', + ':not(.excluded)', + ':is(div, span)', + ':where(*)', + ':root', + ':empty', + ':checked', + ':lang(en)', + ':has(> img)', + ) + ); + return $this->render_compound( $this->gen_compound() ) . $pseudo; + + case 'pseudo-element': + $pseudo = $this->prng->choice( array( '::before', '::after', '::first-line', '::first-letter', '::marker', '::placeholder' ) ); + return $this->render_compound( $this->gen_compound() ) . $pseudo; + + case 'sibling-combinator': + $combinator = $this->prng->choice( array( '+', '~' ) ); + return $this->render_compound( $this->gen_compound() ) + . $this->maybe_ws( 60 ) . $combinator . $this->maybe_ws( 60 ) + . $this->render_compound( $this->gen_compound() ); + + case 'column-combinator': + return $this->gen_type_name( true ) + . $this->maybe_ws( 50 ) . '||' . $this->maybe_ws( 50 ) + . $this->gen_type_name( true ); + + case 'namespace-type': + $ns = $this->prng->choice( array( 'svg', 'html', '*', '' ) ); + return $ns . '|' . $this->prng->choice( array( 'title', 'a', 'circle', 'div' ) ); + + case 'namespace-attr': + // `[ns|name]` — must not be confused with the `|=` matcher, + // so the char after `|` must not be `=`. + $ns = $this->prng->choice( array( 'xlink', 'svg', 'xml' ) ); + return '[' . $ns . '|href]'; + + case 'non-type-context': + default: + // A context selector that is not a bare type selector. 
+ $context = $this->prng->choice( array( '.ctx', '#ctx', '[ctx]', 'div.ctx', 'div#ctx', 'div[ctx]', '*.ctx' ) ); + $joiner = $this->prng->chance( 50 ) + ? $this->ws() + : $this->maybe_ws( 50 ) . '>' . $this->maybe_ws( 50 ); + return $context . $joiner . $this->render_compound( $this->gen_compound() ); + } + } + + /* + * ----------------- + * Invalid selectors + * ----------------- + */ + + private function gen_invalid(): string { + $kind = $this->prng->weighted( + array( + 'template' => 45, + 'trailing-garbage' => 25, + 'leading-garbage' => 15, + 'comma-trouble' => 15, + ) + ); + + switch ( $kind ) { + case 'template': + return $this->prng->choice( + array( + '', + ' ', + "\t\n\f ", + '.', + 'a.', + '#', + '[', + ']', + '[]', + '[ ]', + '.5x', + '#5', + '. x', + '..a', + '.#a', + /* + * EOF auto-closes an open attribute selector block + * ( '[a', '[a=b', '[a="b]', '[a=b i' are valid ), but + * grammar-level truncation is still invalid. + */ + '[a=', + '[a= ', + '[a~', + '[a^', + '[a=]', + '[=b]', + '[a==b]', + '[a~b]', + '[a!=b]', + "[a=\"b\nc\"]", + "[a=\"b\nc", + '[a=b x]', + '[a=b x', + '[a=b ix]', + '[a=b ix', + '[a=b i x', + '[5=b]', + '[5=b', + 'a >', + '> a', + 'a > > b', + 'a >> b', + '>', + '-', + // A lone '\' is a valid escape at EOF ( type selector U+FFFD ); + // '\' before a newline is not a valid escape. + "\\\n", + "a\\\nb", + 'a/**/b', + '/* comment */ a', + '!important', + '@media screen', + '{}', + ';', + 'a;b', + 'a{color:red}', + '()', + 'a()', + '*5', + '%', + 'a%', + ) + ); + + case 'trailing-garbage': + $garbage = $this->prng->choice( array( ':', '(', ')', '{', '}', ';', '!', '@', '%', '/', '=', '|', '^', '$' ) ); + return $this->render_compound( $this->gen_compound() ) . $garbage; + + case 'leading-garbage': + $garbage = $this->prng->choice( array( '%', ';', ')', '}', '=', '~', '+', '/', ',' ) ); + return $garbage . 
$this->render_compound( $this->gen_compound() ); + + case 'comma-trouble': + default: + $compound = $this->render_compound( $this->gen_compound() ); + return $this->prng->choice( + array( + $compound . ',', + ',' . $compound, + $compound . ',,' . $compound, + $compound . ', ,' . $compound, + $compound . ' , ', + ) + ); + } + } + + /* + * ----- + * Chaos + * ----- + */ + + /** + * Builds an unstructured random string from one of several alphabets. + * + * Returns a bare string ( no AST or match expectations ). The 'unicode' + * alphabet is sampled per codepoint so its output stays well-formed UTF-8; + * the remaining alphabets are ASCII-only and are sampled per byte. + * + * @return string Random selector-like input, possibly empty. + */ + private function gen_chaos(): string { + $alphabets = array( + 'css' => '.#[]=~|^$*>+,:()"\'\\ ' . "\t\n" . '-_', + 'ident' => 'abcXYZ019-_', + 'mixed' => '.#[]=~|^$*>+,:()"\'\\ abcXYZ019-_iIsS', + 'unicode' => '✓Ωé🙂', + ); + + $alphabet_kind = $this->prng->weighted( + array( + 'css' => 25, + 'ident' => 15, + 'mixed' => 45, + 'unicode' => 15, + ) + ); + $alphabet = $alphabets[ $alphabet_kind ]; + + /* + * Test the chosen key, not the alphabet string: byte-indexing the + * multibyte alphabet below would emit ill-formed UTF-8. + */ + if ( 'unicode' === $alphabet_kind ) { + $points = utf8_codepoints( $alphabet . '.#[]= aZ9' ); + $length = $this->prng->int( 0, 24 ); + $out = ''; + for ( $i = 0; $i < $length; $i++ ) { + $out .= $this->prng->choice( $points )[0]; + } + return $out; + } + + $length = $this->prng->int( 0, 40 ); + $out = ''; + for ( $i = 0; $i < $length; $i++ ) { + $out .= $alphabet[ $this->prng->int( 0, strlen( $alphabet ) - 1 ) ]; + } + return $out; + } + + /* + * -------- + * Mutation + * -------- + */ + + private function mutate( string $selector ): string { + $mutation_count = $this->prng->int( 1, 4 ); + $alphabet = '.#[]=~|^$*>+,:()"\'\\ ' . "\t\n" . 'iIsSabcXYZ019-_'; + + for ( $m = 0; $m < $mutation_count; $m++ ) { + $length = strlen( $selector ); + $kind = $this->prng->weighted( + array( + 'insert' => 30, + 'delete' => 25, + 'replace' => 25, + 'duplicate' => 10, + 'case-flip' => 10, + 'invalid-utf8' => 12, + ) + ); + + switch ( $kind ) { + case 'insert': + $at = $this->prng->int( 0, $length ); + $char = $alphabet[ $this->prng->int( 0, strlen( $alphabet ) - 1 ) ]; + $selector = substr( $selector, 0, $at ) . $char . 
substr( $selector, $at ); + break; + + case 'delete': + if ( $length > 0 ) { + $at = $this->prng->int( 0, $length - 1 ); + $selector = substr( $selector, 0, $at ) . substr( $selector, $at + 1 ); + } + break; + + case 'replace': + if ( $length > 0 ) { + $at = $this->prng->int( 0, $length - 1 ); + $char = $alphabet[ $this->prng->int( 0, strlen( $alphabet ) - 1 ) ]; + $selector = substr( $selector, 0, $at ) . $char . substr( $selector, $at + 1 ); + } + break; + + case 'duplicate': + if ( $length > 0 ) { + $start = $this->prng->int( 0, $length - 1 ); + $span = $this->prng->int( 1, min( 6, $length - $start ) ); + $selector = substr( $selector, 0, $start + $span ) + . substr( $selector, $start, $span ) + . substr( $selector, $start + $span ); + } + break; + + case 'case-flip': + if ( $length > 0 ) { + $at = $this->prng->int( 0, $length - 1 ); + $char = $selector[ $at ]; + $flip = ctype_lower( $char ) ? strtoupper( $char ) : strtolower( $char ); + $selector = substr( $selector, 0, $at ) . $flip . substr( $selector, $at + 1 ); + } + break; + + case 'invalid-utf8': + // Splice a raw ill-formed sequence at an arbitrary byte + // offset — possibly splitting an existing multibyte + // character or landing before a continuation byte that + // completes a truncated lead. No expectations here; these + // exercise crash / scrub-notice / differential paths. + $bytes = $this->prng->choice( array_column( self::INVALID_UTF8_CLASSES, 0 ) ); + $at = $this->prng->int( 0, $length ); + $selector = substr( $selector, 0, $at ) . $bytes . substr( $selector, $at ); + break; + } + } + + return $selector; + } +} diff --git a/tools/css-selector-fuzz/lib/TreeCapture.php b/tools/css-selector-fuzz/lib/TreeCapture.php new file mode 100644 index 0000000000000..2350db024c8ad --- /dev/null +++ b/tools/css-selector-fuzz/lib/TreeCapture.php @@ -0,0 +1,145 @@ +, + * ancestorTags: string[] nearest-first ) + * tag row: same without ancestorTags. 
+ */ +class TreeCapture { + + const CAPTURE_ITERATION_LIMIT = 20000; + + /** + * Captures the processor's view of a document or a fragment. + * + * @param string $html The markup ( full document or fragment ). + * @param string|null $context When set, parse as a fragment in this + * context ( e.g. '' ); the tag + * processor has no fragment mode, so tagRows + * is null in that case. + * @return array{ + * htmlRows: array|null, + * tagRows: array|null, + * quirks: bool, + * error: string|null, + * } + */ + public static function capture( string $html, ?string $context = null ): array { + $out = array( + 'htmlRows' => null, + 'tagRows' => null, + 'quirks' => false, + 'error' => null, + ); + + $processor = null === $context + ? \WP_HTML_Processor::create_full_parser( $html ) + : \WP_HTML_Processor::create_fragment( $html, $context ); + if ( null === $processor ) { + $out['error'] = 'fragment-context-unsupported'; + return $out; + } + $rows = array(); + $iterations = 0; + while ( $processor->next_tag() ) { + if ( ++$iterations > self::CAPTURE_ITERATION_LIMIT ) { + $out['error'] = 'html-capture-iteration-limit'; + return $out; + } + $breadcrumbs = $processor->get_breadcrumbs(); + array_pop( $breadcrumbs ); + $rows[] = array( + 'tag' => (string) $processor->get_tag(), + 'fid' => self::fid_of( $processor ), + 'attrs' => self::attrs_of( $processor ), + 'ancestorTags' => array_reverse( $breadcrumbs ), + 'namespace' => $processor->get_namespace(), + ); + } + + if ( null !== $processor->get_last_error() ) { + $out['error'] = 'html-processor-error: ' . $processor->get_last_error(); + return $out; + } + if ( null !== $processor->get_unsupported_exception() ) { + $out['error'] = 'html-processor-unsupported: ' . $processor->get_unsupported_exception()->getMessage(); + return $out; + } + + $out['htmlRows'] = $rows; + $out['quirks'] = $processor->is_quirks_mode(); + + // The tag processor has no fragment mode; a fragment case exercises + // the html processor's select() only. 
+ if ( null !== $context ) { + return $out; + } + + $tag_processor = new \WP_HTML_Tag_Processor( $html ); + $tag_rows = array(); + $iterations = 0; + while ( $tag_processor->next_tag() ) { + if ( ++$iterations > self::CAPTURE_ITERATION_LIMIT ) { + $out['error'] = 'tag-capture-iteration-limit'; + return $out; + } + $tag_rows[] = array( + 'tag' => (string) $tag_processor->get_tag(), + 'fid' => self::fid_of( $tag_processor ), + 'attrs' => self::attrs_of( $tag_processor ), + ); + } + $out['tagRows'] = $tag_rows; + + return $out; + } + + /** The element's data-fid, or the same placeholder collect_matches() uses. */ + private static function fid_of( $processor ): string { + $fid = $processor->get_attribute( 'data-fid' ); + return is_string( $fid ) ? self::sanitize_fid( $fid ) : '(missing-fid:' . $processor->get_tag() . ')'; + } + + /** + * Replaces the lexbor protocol framing bytes ( TAB / LF / CR ) in a fid + * with '?'. Generated fids never contain these, but the lexbor harness + * applies the same replacement, so matching this here keeps the two trees + * comparable even for a hypothetical control-char fid ( the worst case is + * a benign tree-gated skip, never a false divergence ). + */ + public static function sanitize_fid( string $fid ): string { + return strtr( $fid, "\t\n\r", '???' ); + } + + /** + * All attributes as ( lowercase name, decoded value ) pairs, excluding + * data-fid ( stored separately, mirroring the generated model's shape ). + * + * @return array + */ + private static function attrs_of( $processor ): array { + $attrs = array(); + foreach ( (array) $processor->get_attribute_names_with_prefix( '' ) as $name ) { + if ( 'data-fid' === $name ) { + continue; + } + $value = $processor->get_attribute( $name ); + $attrs[] = array( $name, true === $value ? 
true : (string) $value ); + } + return $attrs; + } +} diff --git a/tools/css-selector-fuzz/lib/WildDocumentGenerator.php b/tools/css-selector-fuzz/lib/WildDocumentGenerator.php new file mode 100644 index 0000000000000..3cd7dada17cc6 --- /dev/null +++ b/tools/css-selector-fuzz/lib/WildDocumentGenerator.php @@ -0,0 +1,381 @@ + '', + 'html' => '', + 'legacy-compat' => '', + 'quirky' => '', + 'limited' => '', + ); + + /** @var Prng */ + private $prng; + private $fid_counter = 0; + private $pools; + + private function __construct( Prng $prng ) { + $this->prng = $prng; + $this->pools = array( + 'tags' => array( 'html', 'head', 'body' ), + 'classes' => array(), + 'ids' => array(), + 'attrNames' => array(), + 'attrValues' => array(), + ); + } + + /** + * @return array{model: null, html: string, pools: array, wild: true, doctype: string} + */ + public static function generate( Prng $prng ): array { + $generator = new self( $prng ); + return $generator->build(); + } + + private function build(): array { + $doctype_kind = $this->prng->weighted( + array( + 'none' => 25, + 'html' => 45, + 'legacy-compat' => 10, + 'quirky' => 12, + 'limited' => 8, + ) + ); + + $out = self::DOCTYPES[ $doctype_kind ]; + + if ( $this->prng->chance( 15 ) ) { + $out .= 'render_attrs( $this->random_attrs() ) . '>'; + } + if ( $this->prng->chance( 10 ) ) { + $out .= 'render_attrs( $this->random_attrs() ) . '>'; + } + + $max_elements = $this->prng->int( 4, 35 ); + $token_budget = $this->prng->int( 8, 70 ); + $open = array(); + + for ( $i = 0; $i < $token_budget; $i++ ) { + $in_table = $this->in_table_context( $open ); + + $kind = $this->prng->weighted( + array( + 'start' => 42, + 'void' => $in_table ? 0 : 8, + 'end' => 24, + 'text' => 16, + 'comment' => 5, + 'stray' => $in_table ? 0 : 5, + ) + ); + + switch ( $kind ) { + case 'start': + if ( $this->fid_counter >= $max_elements ) { + break; + } + $tag = $in_table + ? 
$this->prng->choice( array( 'caption', 'colgroup', 'thead', 'tbody', 'tfoot', 'tr', 'tr', 'td', 'td', 'th' ) ) + : $this->prng->choice( self::TAGS ); + if ( 'a' === $tag && in_array( 'a', $open, true ) ) { + // A nested
immediately runs the adoption agency. + $tag = 'span'; + } + $this->pools['tags'][] = $tag; + $out .= '<' . $this->maybe_case( $tag ) + . ' data-fid="w' . $this->fid_counter++ . '"' + . $this->render_attrs( $this->random_attrs() ) . '>'; + $open[] = $tag; + break; + + case 'void': + if ( $this->fid_counter >= $max_elements ) { + break; + } + $tag = $this->prng->choice( self::VOID_TAGS ); + $this->pools['tags'][] = $tag; + $out .= '<' . $this->maybe_case( $tag ) + . ' data-fid="w' . $this->fid_counter++ . '"' + . $this->render_attrs( $this->random_attrs() ) + . ( $this->prng->chance( 25 ) ? ' />' : '>' ); + break; + + case 'end': + if ( array() === $open ) { + break; + } + $pick = $this->prng->weighted( + array( + 'top' => 60, + 'random' => 40, + ) + ); + if ( 'top' === $pick ) { + $tag = array_pop( $open ); + } else { + /* + * Close a non-top open element: misnesting. Never + * across a formatting element — the processor only + * supports the trivial adoption-agency cases and + * bails on the rest ( "any other end tag" / + * "common ancestor" / reconstruction-with-rewind ). + */ + $formatting = array( 'a', 'b', 'i', 'em', 'strong', 'u', 's', 'code', 'small' ); + $lowest = count( $open ) - 1; + while ( $lowest > 0 && ! in_array( $open[ $lowest ], $formatting, true ) ) { + $lowest--; + } + if ( in_array( $open[ $lowest ], $formatting, true ) ) { + $lowest++; + } + if ( $lowest > count( $open ) - 1 ) { + $tag = array_pop( $open ); + } else { + $at = $this->prng->int( $lowest, count( $open ) - 1 ); + $tag = $open[ $at ]; + array_splice( $open, $at, 1 ); + } + } + $out .= 'maybe_case( $tag ) . '>'; + break; + + case 'text': + // Non-whitespace text in table context is unsupported + // (pending-table-character-tokens), keep it whitespace. + $out .= $in_table + ? 
"\n " + : $this->prng->choice( + array( + 'text', + ' wild text ', + "\n", + '& <x>', + 'café ✓', + 'a < b', + ) + ); + break; + + case 'comment': + $out .= ''; + break; + + case 'stray': + // An end tag for something that is not open. + // No formatting tags here: a stray formatting end tag + // runs the adoption agency's unsupported branches. + $out .= 'prng->choice( array( 'div', 'p', 'table', 'tr', 'li', 'span', 'x-wild' ) ) . '>'; + break; + } + } + + // Leave roughly half of the still-open elements unclosed. + foreach ( array_reverse( $open ) as $tag ) { + if ( $this->prng->chance( 50 ) ) { + $out .= 'maybe_case( $tag ) . '>'; + } + } + + foreach ( $this->pools as $key => $values ) { + $this->pools[ $key ] = array_values( array_unique( $values ) ); + } + + return array( + 'model' => null, + 'html' => $out, + 'pools' => $this->pools, + 'wild' => true, + 'doctype' => $doctype_kind, + ); + } + + /** + * Whether the insertion point is in table context outside any cell or + * caption — where arbitrary content would foster-parent (unsupported). 
+ */ + private function in_table_context( array $open ): bool { + for ( $i = count( $open ) - 1; $i >= 0; $i-- ) { + $tag = $open[ $i ]; + if ( in_array( $tag, array( 'td', 'th', 'caption' ), true ) ) { + return false; + } + if ( in_array( $tag, array( 'table', 'thead', 'tbody', 'tfoot', 'tr', 'colgroup' ), true ) ) { + return true; + } + } + return false; + } + + /** @return array */ + private function random_attrs(): array { + $attrs = array(); + $count = $this->prng->weighted( array( 0 => 30, 1 => 35, 2 => 25, 3 => 10 ) ); + + for ( $i = 0; $i < $count; $i++ ) { + $name = $this->prng->choice( DocumentGenerator::ATTR_NAMES ); + + $lower = ascii_strtolower( $name ); + if ( 'class' === $lower ) { + $words = array(); + $n = $this->prng->int( 1, 3 ); + for ( $j = 0; $j < $n; $j++ ) { + $word = $this->maybe_inject_class_nul( $this->random_word() ); + $words[] = $word; + foreach ( DocumentGenerator::class_tokens( $word ) as $token ) { + $this->pools['classes'][] = $token; + } + } + $value = implode( ' ', $words ); + } elseif ( 'id' === $lower ) { + $value = $this->random_word(); + $this->pools['ids'][] = $value; + } elseif ( $this->prng->chance( 15 ) ) { + $value = true; + } else { + $value = $this->random_word(); + if ( $this->prng->chance( 20 ) ) { + $value .= ' ' . $this->random_word(); + } + } + + $this->pools['attrNames'][] = $lower; + if ( is_string( $value ) && 'class' !== $lower ) { + $this->pools['attrValues'][] = $value; + } + $attrs[] = array( $name, $value ); + } + + return $attrs; + } + + private function maybe_inject_class_nul( string $class ): string { + if ( '' === $class || ! $this->prng->chance( 12 ) ) { + return $class; + } + + $points = utf8_codepoints( $class ); + $at = $this->prng->int( 0, count( $points ) ); + $out = ''; + foreach ( $points as $i => $point ) { + if ( $i === $at ) { + $out .= "\0"; + } + $out .= $point[0]; + } + return $at === count( $points ) ? $out . 
"\0" : $out; + } + + private function render_attrs( array $attrs ): string { + $out = ''; + foreach ( $attrs as $attr ) { + list( $name, $value ) = $attr; + if ( true === $value ) { + $out .= ' ' . $name; + continue; + } + $out .= ' ' . $name . '="' . str_replace( array( '&', '"', '<' ), array( '&', '"', '<' ), $value ) . '"'; + } + return $out; + } + + private function random_word(): string { + $stems = array( 'wild', 'soup', 'alpha', 'beta', 'item', 'note', 'x', 'mixedCase', 'Über', 'main-thing', '--var', '_u' ); + $word = $this->prng->choice( $stems ); + if ( $this->prng->chance( 30 ) ) { + $word .= (string) $this->prng->int( 0, 99 ); + } + return $word; + } + + private function maybe_case( string $tag ): string { + if ( ! $this->prng->chance( 15 ) ) { + return $tag; + } + $out = ''; + for ( $i = 0; $i < strlen( $tag ); $i++ ) { + $c = $tag[ $i ]; + $out .= $this->prng->chance( 50 ) ? strtoupper( $c ) : strtolower( $c ); + } + return $out; + } +} diff --git a/tools/css-selector-fuzz/lib/Worker.php b/tools/css-selector-fuzz/lib/Worker.php new file mode 100644 index 0000000000000..f701d87974657 --- /dev/null +++ b/tools/css-selector-fuzz/lib/Worker.php @@ -0,0 +1,1308 @@ +chance( 30 ); + $is_fragment = ! $is_wild && $prng->chance( 20 ); + + $failures = array(); + $record = static function ( string $invariant, array $detail ) use ( &$failures ) { + $failures[] = array( + 'invariant' => $invariant, + 'detail' => $detail, + ); + }; + $match_stats = array(); + + /* + * The processor's own parse is the matching oracle's ground truth. + * For safe (model-built) documents the model must agree with the + * capture — that soundness check is what lets the capture be trusted + * on wild documents, where no model exists. 
+ * + * Wild documents that hit one of the processor's unsupported + * constructs (it bails on foster parenting, complex adoption-agency + * runs, …) are deterministically regenerated a bounded number of + * times so nearly every wild case carries a usable ground truth. + */ + $document = null; + $capture = null; + $capture_error = null; + $attempts = $is_wild ? 8 : 1; + for ( $attempt = 0; $attempt < $attempts; $attempt++ ) { + if ( $is_wild ) { + $document = WildDocumentGenerator::generate( $prng->fork( "wild-document:{$attempt}" ) ); + } elseif ( $is_fragment ) { + $document = DocumentGenerator::generate_fragment( $prng->fork( 'fragment' ) ); + } else { + $document = DocumentGenerator::generate( $prng->fork( 'document' ) ); + } + + $context = ( $document['fragment'] ?? false ) ? $document['context'] : null; + list( $capture, $capture_error ) = self::guard( + static function () use ( $document, $context ) { + return TreeCapture::capture( $document['html'], $context ); + } + ); + + if ( null === $capture_error && null === $capture['error'] ) { + break; + } + } + + $rows = null; + $tag_rows = null; + $quirks = false; + + if ( null !== $capture_error ) { + $record( 'model-desync', array( 'phase' => 'capture', 'error' => self::describe_throwable( $capture_error ) ) ); + } elseif ( null !== $capture['error'] ) { + if ( ! $is_wild ) { + $record( 'model-desync', array( 'phase' => 'capture', 'error' => $capture['error'] ) ); + } + // Wild markup the processor cannot fully visit is skipped: + // parsing invariants still run, matching has no ground truth. + } else { + $rows = $capture['htmlRows']; + $tag_rows = $capture['tagRows']; + $quirks = $capture['quirks']; + + if ( $is_fragment ) { + self::check_fragment_capture_against_model( $document, $capture, $record ); + } elseif ( ! 
$is_wild ) { + self::check_capture_against_model( $document, $capture, $record ); + } + } + + $path_rows = null; + if ( null !== $rows ) { + $path_rows = array(); + foreach ( $rows as $row ) { + if ( 0 !== strpos( $row['fid'], '(missing-fid:' ) ) { + $path_rows[] = $row; + } + } + } + + $selector = SelectorGenerator::generate( $prng->fork( 'selector' ), $document['pools'], $path_rows ); + + $selector_string = $selector['selector']; + + // --- Parse phase ------------------------------------------------- + + list( $compound_list, $compound_error ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Compound_Selector_List::from_selectors( $selector_string ); + } + ); + list( $complex_list, $complex_error ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Complex_Selector_List::from_selectors( $selector_string ); + } + ); + + if ( null !== $compound_error ) { + $record( 'parse-error', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $compound_error ) ) ); + } + if ( null !== $complex_error ) { + $record( 'parse-error', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $complex_error ) ) ); + } + + if ( null === $compound_error && null !== $selector['expectCompound'] && $selector['expectCompound'] !== ( null !== $compound_list ) ) { + $record( + 'parse-expectation', + array( + 'grammar' => 'compound', + 'expected' => $selector['expectCompound'] ? 'parse' : 'null', + 'actual' => null !== $compound_list ? 'parse' : 'null', + ) + ); + } + if ( null === $complex_error && null !== $selector['expectComplex'] && $selector['expectComplex'] !== ( null !== $complex_list ) ) { + $record( + 'parse-expectation', + array( + 'grammar' => 'complex', + 'expected' => $selector['expectComplex'] ? 'parse' : 'null', + 'actual' => null !== $complex_list ? 
'parse' : 'null', + ) + ); + } + + if ( null !== $compound_list && null === $complex_list && null === $complex_error ) { + $record( 'compound-implies-complex', array() ); + } + + // Parse determinism: a second parse must agree with the first. + list( $compound_again, ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Compound_Selector_List::from_selectors( $selector_string ); + } + ); + list( $complex_again, ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Complex_Selector_List::from_selectors( $selector_string ); + } + ); + if ( ( null === $compound_list ) !== ( null === $compound_again ) || ( null === $complex_list ) !== ( null === $complex_again ) ) { + $record( 'parse-determinism', array( 'note' => 'null-ness changed between identical parses' ) ); + } + + // --- AST extraction ---------------------------------------------- + + $compound_ast = null; + $complex_ast = null; + + if ( null !== $compound_list ) { + list( $compound_ast, $shape_error ) = self::guard( + static function () use ( $compound_list ) { + return AstExtractor::from_compound_list( $compound_list ); + } + ); + if ( null !== $shape_error ) { + $record( 'ast-shape', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $shape_error ) ) ); + } + } + if ( null !== $complex_list ) { + list( $complex_ast, $shape_error ) = self::guard( + static function () use ( $complex_list ) { + return AstExtractor::from_complex_list( $complex_list ); + } + ); + if ( null !== $shape_error ) { + $record( 'ast-shape', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $shape_error ) ) ); + } + } + + if ( null !== $compound_ast && null !== $complex_ast && $compound_ast !== $complex_ast ) { + $record( + 'ast-cross-grammar', + array( + 'compoundAst' => $compound_ast, + 'complexAst' => $complex_ast, + ) + ); + } + + if ( null !== $selector['ast'] && null !== $complex_ast && $selector['ast'] !== $complex_ast ) { + $record( + 
'ast-mismatch', + array( + 'generatedAst' => $selector['ast'], + 'parsedAst' => $complex_ast, + ) + ); + } + + // --- Match phase --------------------------------------------------- + + $html_matches = null; + // 'n/a' = the lexbor differential does not apply to this case + // ( unparseable selector, fragment, no captured tree ). Distinct from + // 'unavailable', which check_lexbor_differential reports only when the + // harness itself is missing or died — so a silently-dropped third + // oracle shows up in the per-batch tally instead of hiding in 'off'. + $lexbor_state = 'n/a'; + if ( null !== $complex_ast && null !== $rows ) { + $expected = ReferenceMatcher::expected_html_matches_rows( $complex_ast, $rows, $quirks ); + + /* + * Path-directed selectors are guaranteed by construction to match + * ( or, for near-misses, not to match ) a specific element. The + * reference matcher disagreeing means the generator or the + * reference matcher itself is wrong — a fuzzer-side defect. + */ + $must_match = $selector['mustMatchFid'] ?? null; + $must_not_match = $selector['mustNotMatchFid'] ?? null; + if ( null !== $must_match && ! in_array( $must_match, $expected, true ) ) { + $record( + 'path-expectation', + array( + 'expectation' => 'must-match', + 'fid' => $must_match, + 'expected' => $expected, + ) + ); + } + if ( null !== $must_not_match && in_array( $must_not_match, $expected, true ) ) { + $record( + 'path-expectation', + array( + 'expectation' => 'must-not-match', + 'fid' => $must_not_match, + 'expected' => $expected, + ) + ); + } + + $html_matches = self::check_select_matches( 'html', $selector_string, $document, $expected, $record ); + if ( null !== $html_matches ) { + self::note_match_assertion( $match_stats, 'html', $expected, $html_matches ); + } + + // lexbor parses full documents only; fragments skip it. + if ( ! ( $document['fragment'] ?? 
false ) ) { + $lexbor_state = self::check_lexbor_differential( $complex_ast, $selector_string, $document, $rows, $quirks, $expected, $record ); + } + } elseif ( null === $complex_list && null === $complex_error ) { + self::check_select_rejection( 'html', $selector_string, $document, $record ); + } + + if ( null !== $compound_ast && null !== $tag_rows ) { + $expected = ReferenceMatcher::expected_tag_matches_rows( $compound_ast, $tag_rows ); + $tag_matches = self::check_select_matches( 'tag', $selector_string, $document, $expected, $record ); + if ( null !== $tag_matches ) { + self::note_match_assertion( $match_stats, 'tag', $expected, $tag_matches ); + } + } elseif ( null === $compound_list && null === $compound_error ) { + self::check_select_rejection( 'tag', $selector_string, $document, $record ); + } + + // --- Metamorphic phase ---------------------------------------------- + // Oracle-free relations: meaning-preserving transforms of the selector + // must select exactly the same elements. Run only on otherwise-clean + // cases so a single root cause does not multiply into noise. 
+ + if ( null !== $complex_ast && null !== $html_matches && array() === $failures ) { + self::check_metamorphic( $complex_ast, $html_matches, $document, $prng->fork( 'metamorph' ), $record ); + } + + $digest = sha1( + json_encode_safe( + array( + $selector_string, + $document['html'], + null !== $compound_list, + null !== $complex_list, + $compound_ast, + $complex_ast, + array_map( + static function ( $failure ) { + return $failure['invariant']; + }, + $failures + ), + ) + ) + ); + + $signatures = array(); + foreach ( $failures as $failure ) { + $signatures[] = self::signature( $failure ); + } + + return array( + 'seed' => $seed, + 'bucket' => $selector['bucket'], + 'digest' => $digest, + 'failures' => $failures, + 'signatures' => array_values( array_unique( $signatures ) ), + 'selector' => $selector_string, + 'html' => $document['html'], + 'lexbor' => $lexbor_state, + 'matchStats' => $match_stats, + ); + } + + /** + * Runs the SELF-CONTAINED invariants on an explicit ( selector, html ) + * pair — no generated model, intended AST, or parse expectation. This is + * what the minimizer drives: every checked property is computable from + * the pair alone ( WP select() vs the reference matcher over WP's own + * parsed AST and the captured tree; metamorphic relations; the lexbor + * differential; parse/shape/cross-grammar invariants; rejection + * bookkeeping for unparseable selectors ). + * + * Bug 1 surfaces here as metamorphic-ast, Bug 2 as match-mismatch-*, + * Bug 3 as metamorphic-parse — so all three known bugs are minimizable + * without the generator. 
+ * + * @return array{ + * failures: array, + * signatures: string[], + * } + */ + public static function run_pair( string $selector_string, string $html, ?string $target = null ): array { + Bootstrap::load(); + + $failures = array(); + $record = static function ( string $invariant, array $detail ) use ( &$failures ) { + $failures[] = array( + 'invariant' => $invariant, + 'detail' => $detail, + ); + }; + + // When the minimizer fixes a target signature, the metamorphic loop + // ( the only expensive, multi-draw stage ) is only worth running if + // the target is itself a metamorphic signature. + $target_invariant = null === $target ? null : substr( strrchr( $target, ':' ), 1 ); + $target_is_metamorph = null !== $target_invariant && 0 === strpos( $target_invariant, 'metamorphic' ); + $has_target_signature = static function () use ( &$failures, $target ) { + if ( null === $target ) { + return false; + } + foreach ( $failures as $failure ) { + if ( self::signature( $failure ) === $target ) { + return true; + } + } + return false; + }; + + list( $capture, $capture_error ) = self::guard( + static function () use ( $html ) { + return TreeCapture::capture( $html ); + } + ); + + $rows = null; + $tag_rows = null; + $quirks = false; + if ( null === $capture_error && null === $capture['error'] ) { + $rows = $capture['htmlRows']; + $tag_rows = $capture['tagRows']; + $quirks = $capture['quirks']; + } + + $document = array( 'html' => $html ); + + list( $compound_list, $compound_error ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Compound_Selector_List::from_selectors( $selector_string ); + } + ); + list( $complex_list, $complex_error ) = self::guard( + static function () use ( $selector_string ) { + return \WP_CSS_Complex_Selector_List::from_selectors( $selector_string ); + } + ); + + if ( null !== $compound_error ) { + $record( 'parse-error', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $compound_error ) ) ); + } + if 
( null !== $complex_error ) { + $record( 'parse-error', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $complex_error ) ) ); + } + if ( null !== $compound_list && null === $complex_list && null === $complex_error ) { + $record( 'compound-implies-complex', array() ); + } + + $compound_ast = null; + $complex_ast = null; + if ( null !== $compound_list ) { + list( $compound_ast, $shape_error ) = self::guard( + static function () use ( $compound_list ) { + return AstExtractor::from_compound_list( $compound_list ); + } + ); + if ( null !== $shape_error ) { + $record( 'ast-shape', array( 'grammar' => 'compound', 'error' => self::describe_throwable( $shape_error ) ) ); + } + } + if ( null !== $complex_list ) { + list( $complex_ast, $shape_error ) = self::guard( + static function () use ( $complex_list ) { + return AstExtractor::from_complex_list( $complex_list ); + } + ); + if ( null !== $shape_error ) { + $record( 'ast-shape', array( 'grammar' => 'complex', 'error' => self::describe_throwable( $shape_error ) ) ); + } + } + if ( null !== $compound_ast && null !== $complex_ast && $compound_ast !== $complex_ast ) { + $record( 'ast-cross-grammar', array( 'compoundAst' => $compound_ast, 'complexAst' => $complex_ast ) ); + } + + $html_matches = null; + if ( null !== $complex_ast && null !== $rows ) { + $expected = ReferenceMatcher::expected_html_matches_rows( $complex_ast, $rows, $quirks ); + $html_matches = self::check_select_matches( 'html', $selector_string, $document, $expected, $record ); + self::check_lexbor_differential( $complex_ast, $selector_string, $document, $rows, $quirks, $expected, $record ); + } elseif ( null === $complex_list && null === $complex_error && null !== $rows ) { + self::check_select_rejection( 'html', $selector_string, $document, $record ); + } + + if ( null !== $compound_ast && null !== $tag_rows ) { + $expected = ReferenceMatcher::expected_tag_matches_rows( $compound_ast, $tag_rows ); + self::check_select_matches( 'tag', 
$selector_string, $document, $expected, $record ); + } elseif ( null === $compound_list && null === $compound_error && null !== $tag_rows ) { + self::check_select_rejection( 'tag', $selector_string, $document, $record ); + } + + $run_metamorph = ( null === $target || $target_is_metamorph ) + && null !== $complex_ast && null !== $html_matches && array() === $failures; + if ( $run_metamorph ) { + /* + * Metamorphic transforms randomize escapes / case / order, so a + * transform-sensitive bug ( e.g. Bug 1 and Bug 3 ) only fires for + * some PRNG draws. run_case sees one draw; here several fixed + * draws are tried so minimization can reliably preserve such a + * signature regardless of which draw first exposed it. With a + * target fixed, stop at the first draw that reproduces it. + */ + for ( $i = 0; $i < self::PAIR_METAMORPH_DRAWS && array() === $failures; $i++ ) { + // A FIXED draw seed ( not derived from the pair ) keeps the + // test monotonic under shrinking: the same coin-flips apply to + // whatever AST survives, so a smaller selector that still has + // the bug reproduces the same transform signature. + $metamorph_prng = new Prng( 'css-selector-fuzz-minimize', "metamorph:{$i}" ); + self::check_metamorphic( $complex_ast, $html_matches, $document, $metamorph_prng, $record ); + if ( $has_target_signature() ) { + break; + } + } + } + + $signatures = array(); + foreach ( $failures as $failure ) { + $signatures[] = self::signature( $failure ); + } + + return array( + 'failures' => $failures, + 'signatures' => array_values( array_unique( $signatures ) ), + ); + } + + /** + * Fragment analogue of check_capture_against_model: the ``-context + * fragment capture must equal the model rows built from the body-level + * children ( with the implicit HTML/BODY ancestors ). 
+ */ + private static function check_fragment_capture_against_model( array $document, array $capture, callable $record ): void { + $model_rows = DocumentGenerator::rows_from_fragment( $document['children'] ); + + $normalize = static function ( array $rows ): array { + $out = array(); + foreach ( $rows as $row ) { + $attrs = array(); + foreach ( $row['attrs'] as $attr ) { + $attrs[ $attr[0] ] = $attr[1]; + } + ksort( $attrs ); + $out[] = array( + 'tag' => $row['tag'], + 'fid' => $row['fid'], + 'attrs' => $attrs, + 'ancestorTags' => $row['ancestorTags'], + ); + } + return $out; + }; + + $expected = $normalize( $model_rows ); + $actual = $normalize( $capture['htmlRows'] ); + if ( $expected !== $actual ) { + $record( + 'model-desync', + array( + 'processor' => 'fragment', + 'expected' => $expected, + 'actual' => $actual, + ) + ); + } + } + + /** + * Verifies that the processor's captured view of a safe (model-built) + * document agrees with the generated model — this guards the oracle + * itself against renderer/model drift, and is what justifies trusting + * the capture on wild documents. 
+ */ + private static function check_capture_against_model( array $document, array $capture, callable $record ): void { + $model_rows = DocumentGenerator::rows_from_model( $document['model'] ); + + $normalize = static function ( array $rows, bool $with_ancestors ): array { + $out = array(); + foreach ( $rows as $row ) { + $attrs = array(); + foreach ( $row['attrs'] as $attr ) { + $attrs[ $attr[0] ] = $attr[1]; + } + ksort( $attrs ); + $normalized = array( + 'tag' => $row['tag'], + 'fid' => $row['fid'], + 'attrs' => $attrs, + ); + if ( $with_ancestors ) { + $normalized['ancestorTags'] = $row['ancestorTags']; + } + $out[] = $normalized; + } + return $out; + }; + + $expected = $normalize( $model_rows, true ); + $actual = $normalize( $capture['htmlRows'], true ); + if ( $expected !== $actual ) { + $record( + 'model-desync', + array( + 'processor' => 'html', + 'expected' => $expected, + 'actual' => $actual, + ) + ); + } + + $expected_tags = $normalize( $model_rows, false ); + $actual_tags = $normalize( $capture['tagRows'], false ); + if ( $expected_tags !== $actual_tags ) { + $record( + 'model-desync', + array( + 'processor' => 'tag', + 'expected' => $expected_tags, + 'actual' => $actual_tags, + ) + ); + } + + if ( $document['quirks'] !== $capture['quirks'] ) { + $record( + 'model-desync', + array( + 'processor' => 'quirks', + 'expected' => $document['quirks'], + 'actual' => $capture['quirks'], + ) + ); + } + } + + /** + * Runs a select() loop over the document, collecting matched data-fids. + * + * @param string $target 'html' or 'tag'. + * @param array $document The case document ( may request fragment mode ). + * @return array{0: string[]|null, 1: \Throwable|null} + */ + private static function collect_matches( string $target, string $selector_string, array $document ): array { + $html = $document['html']; + $context = ( $document['fragment'] ?? false ) ? 
$document['context'] : null; + return self::guard( + static function () use ( $target, $selector_string, $html, $context ) { + if ( 'tag' === $target ) { + $processor = new \WP_HTML_Tag_Processor( $html ); + } elseif ( null !== $context ) { + $processor = \WP_HTML_Processor::create_fragment( $html, $context ); + } else { + $processor = \WP_HTML_Processor::create_full_parser( $html ); + } + + $matches = array(); + $iterations = 0; + while ( $processor->select( $selector_string ) ) { + $fid = $processor->get_attribute( 'data-fid' ); + // Sanitize identically to TreeCapture/lexbor so a fid with + // a control char can never produce a false divergence on + // the match path ( unreachable today: fids are integers ). + $matches[] = is_string( $fid ) ? TreeCapture::sanitize_fid( $fid ) : '(missing-fid:' . $processor->get_tag() . ')'; + if ( ++$iterations > self::SELECT_ITERATION_LIMIT ) { + throw new \RuntimeException( 'select() did not terminate within the iteration limit.' ); + } + } + + if ( $processor instanceof \WP_HTML_Processor ) { + if ( null !== $processor->get_last_error() ) { + throw new \RuntimeException( 'Processor error state: ' . $processor->get_last_error() ); + } + if ( null !== $processor->get_unsupported_exception() ) { + throw new \RuntimeException( 'Processor unsupported state: ' . $processor->get_unsupported_exception()->getMessage() ); + } + } + + return $matches; + } + ); + } + + /** + * Flushes the select() parse caches. + * + * Both select() implementations memoize the most recently parsed selector + * string in a function-static cache, so whether a select() call re-parses + * — and therefore whether parse-time notices ( the invalid-UTF-8 scrub + * notice from from_selectors() ) fire — depends on what the worker + * happened to parse before. 
Parsing a sentinel selector first makes the + * next select() call for the case selector deterministic: it always + * re-parses, so exactly one parse happens inside each notice-assertion + * window regardless of worker history or case re-runs. + */ + private static function flush_select_parse_caches(): void { + ( new \WP_HTML_Tag_Processor( '' ) )->select( '#-fuzz-cache-flush-' ); + \WP_HTML_Processor::create_full_parser( '' )->select( '#-fuzz-cache-flush-' ); + } + + /** + * The _doing_it_wrong() name under which from_selectors() reports that an + * invalid-UTF-8 selector string was scrubbed to U+FFFD before parsing. + * + * @param string $target 'html' or 'tag'. + */ + private static function scrub_notice_name( string $target ): string { + return ( 'tag' === $target ? 'WP_CSS_Compound_Selector_List' : 'WP_CSS_Complex_Selector_List' ) . '::from_selectors'; + } + + /** + * Runs a select() loop on a parseable selector and compares the match set + * against the reference matcher. + * + * @param string $target 'html' or 'tag'. + * @return string[]|null The actual match set, or null when matching failed. + */ + private static function check_select_matches( string $target, string $selector_string, array $document, array $expected, callable $record ): ?array { + self::flush_select_parse_caches(); + Bootstrap::reset_doing_it_wrong(); + + list( $actual, $error ) = self::collect_matches( $target, $selector_string, $document ); + + if ( null !== $error ) { + $record( + 'match-error', + array( + 'target' => $target, + 'error' => self::describe_throwable( $error ), + ) + ); + return null; + } + + /* + * A selector string containing invalid UTF-8 is scrubbed to U+FFFD by + * from_selectors(), which reports the replacement with exactly one + * notice on the (single, cache-flushed) parse. Anything else is + * unexpected for a selector that parses. + */ + $expected_calls = \wp_is_valid_utf8( $selector_string ) + ? 
array() + : array( + array( + 'function' => self::scrub_notice_name( $target ), + ), + ); + + $doing_it_wrong = Bootstrap::doing_it_wrong_calls(); + if ( ! self::notices_match( $expected_calls, $doing_it_wrong ) ) { + $record( + 'doing-it-wrong-unexpected', + array( + 'target' => $target, + 'expectedCalls' => $expected_calls, + 'calls' => $doing_it_wrong, + ) + ); + } + + if ( $actual !== $expected ) { + $record( + 'match-mismatch-' . $target, + array( + 'expected' => $expected, + 'actual' => $actual, + ) + ); + } + + return $actual; + } + + private static function note_match_assertion( array &$match_stats, string $target, array $expected, array $actual ): void { + if ( ! isset( $match_stats[ $target ] ) ) { + $match_stats[ $target ] = array( + 'assertions' => 0, + 'nonVacuous' => 0, + ); + } + + ++$match_stats[ $target ]['assertions']; + if ( array() !== $expected || array() !== $actual ) { + ++$match_stats[ $target ]['nonVacuous']; + } + } + + private static function finalize_match_stats( array $match_stats ): array { + foreach ( $match_stats as $bucket => $targets ) { + foreach ( $targets as $target => $counts ) { + $assertions = (int) ( $counts['assertions'] ?? 0 ); + $non_vacuous = (int) ( $counts['nonVacuous'] ?? 0 ); + $vacuous = max( 0, $assertions - $non_vacuous ); + + $match_stats[ $bucket ][ $target ]['vacuous'] = $vacuous; + $match_stats[ $bucket ][ $target ]['nonVacuousRate'] = $assertions > 0 ? round( $non_vacuous / $assertions, 4 ) : 0.0; + $match_stats[ $bucket ][ $target ]['vacuousRate'] = $assertions > 0 ? round( $vacuous / $assertions, 4 ) : 0.0; + } + } + return $match_stats; + } + + /** + * Runs the lexbor differential — the THIRD, independent matching opinion. + * + * Quirks-mode documents are excluded unless the startup probe confirms + * lexbor has reliable class/#id case folding in both no-quirks and quirks + * mode. 
The comparison only runs when lexbor built the same element tree + * as WP ( fid/tag/ancestry multiset ), so it tests the selector layer, + * not tree construction. + * + * Verdict triage: + * - 'lexbor-divergence' lexbor != reference: a fuzzer-oracle problem + * ( or an un-compensated lexbor bug ) — never a + * WP verdict on its own. + * - 'lexbor-parse-reject' lexbor refused a selector WP accepted. + * - match-mismatch-html with NO lexbor-divergence on the same case + * means reference == lexbor != WP: a + * high-confidence WP finding. + * + * @return string Tally state: + * unavailable|skipped-quirks|skipped-utf8|error|tree-gated|compared. + */ + private static function check_lexbor_differential( array $complex_ast, string $selector_string, array $document, array $rows, bool $quirks, array $expected, callable $record ): string { + if ( ! LexborOracle::available() ) { + return 'unavailable'; + } + if ( $quirks && ! LexborOracle::quirks_class_id_reliable() ) { + return 'skipped-quirks'; + } + + /* + * lexbor receives a canonical re-render of the (already verified) + * AST rather than the original byte form: the differential targets + * matching semantics, while byte-level parsing (escapes, whitespace, + * modifier case — lexbor e.g. rejects uppercase I/S modifiers) is + * covered by the AST round-trip and metamorphic invariants. ASTs + * containing invalid UTF-8 cannot be re-rendered; since + * from_selectors() scrubs input to U+FFFD before parsing, none should + * exist and this skip is defensive ( a nonzero skipped-utf8 tally + * indicates a normalization bypass ). + */ + if ( ! 
ast_strings_are_utf8( $complex_ast ) ) { + return 'skipped-utf8'; + } + $canonical = SelectorGenerator::render_canonical( $complex_ast ); + + $lex = LexborOracle::query( $document['html'], $canonical ); + if ( null === $lex ) { + return 'error'; + } + + if ( 'parse' === $lex['error'] ) { + $record( + 'lexbor-parse-reject', + array( + 'note' => 'lexbor rejected the canonical form of a selector the WP parser accepted', + 'canonical' => printable_bytes( $canonical ), + ) + ); + return 'compared'; + } + if ( null !== $lex['error'] ) { + return 'error'; + } + + if ( ! self::trees_agree( $rows, $lex['rows'] ) ) { + return 'tree-gated'; + } + + /* + * Two known lexbor deviations are compensated for so the rest of the + * semantics still get differential coverage; WP itself is still held + * to the strict expectation: + * + * - lexbor #368: class/#id match ASCII case-insensitively even in + * no-quirks documents. Compare lexbor against the reference run + * with quirks-style class/ID folding. + * - lexbor does not implement HTML's case-insensitive attribute + * value list ( [rel=NOFOLLOW] does not match rel="nofollow" ), + * where browsers and WP do. Compare lexbor against the reference + * run with that list disabled. + */ + $expected_for_lexbor = ReferenceMatcher::expected_html_matches_rows( + $complex_ast, + $rows, + LexborOracle::has_issue_368() ? true : $quirks, + false + ); + + // lexbor reports in document order, WP/reference in visit order — + // compare as multisets. + $lex_matches = $lex['matches']; + sort( $lex_matches ); + sort( $expected_for_lexbor ); + + if ( $lex_matches !== $expected_for_lexbor ) { + $record( + 'lexbor-divergence', + array( + 'reference' => $expected_for_lexbor, + 'lexbor' => $lex_matches, + 'issue368' => LexborOracle::has_issue_368(), + ) + ); + } + + return 'compared'; + } + + /** Multiset equality of ( tag, fid, ancestry ) between WP and lexbor rows. 
*/ + private static function trees_agree( array $wp_rows, array $lexbor_rows ): bool { + $serialize = static function ( array $rows ): array { + $out = array(); + foreach ( $rows as $row ) { + $out[] = $row['tag'] . '|' . $row['fid'] . '|' . implode( ',', $row['ancestorTags'] ); + } + sort( $out ); + return $out; + }; + + return $serialize( $wp_rows ) === $serialize( $lexbor_rows ); + } + + /** + * Checks the metamorphic relations: each meaning-preserving transform of + * the parsed selector must parse, must (for AST-preserving transforms) + * parse to exactly the transformed AST, and must select exactly the same + * elements the original selector selected. + * + * @param array $complex_ast Canonical AST of the original selector. + * @param string[] $html_matches The original's WP_HTML_Processor match set. + */ + private static function check_metamorphic( array $complex_ast, array $html_matches, array $document, Prng $prng, callable $record ): void { + foreach ( Metamorph::variants( $complex_ast, $prng ) as $variant ) { + $transform = $variant['name']; + $variant_selector = $variant['selector']; + + list( $variant_list, $parse_error ) = self::guard( + static function () use ( $variant_selector ) { + return \WP_CSS_Complex_Selector_List::from_selectors( $variant_selector ); + } + ); + + if ( null !== $parse_error ) { + $record( + 'metamorphic-error', + array( + 'transform' => $transform, + 'selector' => printable_bytes( $variant_selector ), + 'error' => self::describe_throwable( $parse_error ), + ) + ); + continue; + } + + if ( null === $variant_list ) { + $record( + 'metamorphic-parse', + array( + 'transform' => $transform, + 'selector' => printable_bytes( $variant_selector ), + ) + ); + continue; + } + + if ( $variant['astMustMatch'] ) { + list( $variant_ast, $shape_error ) = self::guard( + static function () use ( $variant_list ) { + return AstExtractor::from_complex_list( $variant_list ); + } + ); + if ( null !== $shape_error ) { + $record( + 
'metamorphic-error', + array( + 'transform' => $transform, + 'selector' => printable_bytes( $variant_selector ), + 'error' => self::describe_throwable( $shape_error ), + ) + ); + continue; + } + if ( $variant_ast !== $variant['ast'] ) { + $record( + 'metamorphic-ast', + array( + 'transform' => $transform, + 'selector' => printable_bytes( $variant_selector ), + 'expectedAst' => $variant['ast'], + 'parsedAst' => $variant_ast, + ) + ); + continue; + } + } + + Bootstrap::reset_doing_it_wrong(); + list( $variant_matches, $match_error ) = self::collect_matches( 'html', $variant_selector, $document ); + + if ( null !== $match_error ) { + $record( + 'metamorphic-error', + array( + 'transform' => $transform, + 'selector' => printable_bytes( $variant_selector ), + 'error' => self::describe_throwable( $match_error ), + ) + ); + continue; + } + + if ( $variant_matches !== $html_matches ) { + $record( + 'metamorphic-mismatch', + array( + 'transform' => $transform, + 'selector' => printable_bytes( $variant_selector ), + 'expected' => $html_matches, + 'actual' => $variant_matches, + ) + ); + } + } + } + + /** + * For unparseable selectors: select() must return false, leave the + * processor usable, and report misuse exactly once per call. + */ + private static function check_select_rejection( string $target, string $selector_string, array $document, callable $record ): void { + self::flush_select_parse_caches(); + Bootstrap::reset_doing_it_wrong(); + + $context = ( $document['fragment'] ?? false ) ? 
$document['context'] : null; + list( $results, $error ) = self::guard( + static function () use ( $target, $selector_string, $document, $context ) { + if ( 'tag' === $target ) { + $processor = new \WP_HTML_Tag_Processor( $document['html'] ); + } elseif ( null !== $context ) { + $processor = \WP_HTML_Processor::create_fragment( $document['html'], $context ); + } else { + $processor = \WP_HTML_Processor::create_full_parser( $document['html'] ); + } + + // Two calls: the second exercises the parse cache. + return array( $processor->select( $selector_string ), $processor->select( $selector_string ) ); + } + ); + + if ( null !== $error ) { + $record( + 'match-error', + array( + 'target' => $target, + 'rejected' => true, + 'error' => self::describe_throwable( $error ), + ) + ); + return; + } + + if ( array( false, false ) !== $results ) { + $record( + 'select-on-null', + array( + 'target' => $target, + 'results' => $results, + ) + ); + } + + /* + * Two select() calls report the unparseable selector once each; the + * parse cache only skips re-parsing, never the per-call notice. An + * invalid-UTF-8 selector additionally reports the U+FFFD scrub once, + * on the first call ( the only one that parses after the flush ). + */ + $select_notice_name = ( 'tag' === $target ? 'WP_HTML_Tag_Processor' : 'WP_HTML_Processor' ) . '::select'; + $expected_calls = array( + array( 'function' => $select_notice_name ), + array( 'function' => $select_notice_name ), + ); + if ( ! \wp_is_valid_utf8( $selector_string ) ) { + array_unshift( $expected_calls, array( 'function' => self::scrub_notice_name( $target ) ) ); + } + + $doing_it_wrong = Bootstrap::doing_it_wrong_calls(); + if ( ! 
self::notices_match( $expected_calls, $doing_it_wrong ) ) { + $record( + 'doing-it-wrong-missing', + array( + 'target' => $target, + 'expectedCalls' => $expected_calls, + 'calls' => $doing_it_wrong, + ) + ); + } + } + + /** + * Compares recorded _doing_it_wrong() calls against expectations: same + * count, in order, matching on every key the expectation specifies + * ( recorded calls also carry 'message', which expectations omit ). + * + * @param array[] $expected_calls Expected calls, each a subset of record keys. + * @param array[] $actual_calls Recorded calls. + */ + private static function notices_match( array $expected_calls, array $actual_calls ): bool { + if ( count( $expected_calls ) !== count( $actual_calls ) ) { + return false; + } + foreach ( $expected_calls as $i => $expected_call ) { + foreach ( $expected_call as $key => $value ) { + if ( ( $actual_calls[ $i ][ $key ] ?? null ) !== $value ) { + return false; + } + } + } + return true; + } + + /* + * ------------- + * Batch running + * ------------- + */ + + /** + * Runs a batch of sequential seeds. + * + * @return array Summary. 
+ */ + public static function run_batch( array $options ): array { + Bootstrap::load(); + + $start_seed = option_int( $options, 'start-seed', 1 ); + $count = option_int( $options, 'count', 100 ); + $failures_out = option_string( $options, 'failures-out', null ); + $progress_file = option_string( $options, 'progress-file', null ); + $determinism_every = option_int( $options, 'determinism-every', 16 ); + $max_failures = option_int( $options, 'max-failures', 200 ); + + $started_at = microtime( true ); + $failures = 0; + $buckets = array(); + $signatures = array(); + $lexbor = array(); + $match_stats = array(); + $last_seed = null; + $stop_reason = 'completed'; + + for ( $seed = $start_seed; $seed < $start_seed + $count; $seed++ ) { + if ( $max_failures > 0 && $failures >= $max_failures ) { + $stop_reason = 'max-failures'; + break; + } + if ( null !== $progress_file ) { + file_put_contents( $progress_file, (string) $seed ); + } + + $result = self::run_case( $seed ); + + if ( $determinism_every > 0 && 0 === $seed % $determinism_every ) { + $repeat = self::run_case( $seed ); + if ( $repeat['digest'] !== $result['digest'] ) { + $result['failures'][] = array( + 'invariant' => 'case-determinism', + 'detail' => array( + 'firstDigest' => $result['digest'], + 'secondDigest' => $repeat['digest'], + ), + ); + } + } + + $buckets[ $result['bucket'] ] = ( $buckets[ $result['bucket'] ] ?? 0 ) + 1; + $lexbor[ $result['lexbor'] ] = ( $lexbor[ $result['lexbor'] ] ?? 0 ) + 1; + $last_seed = $seed; + foreach ( $result['matchStats'] as $target => $stats ) { + if ( ! 
isset( $match_stats[ $result['bucket'] ][ $target ] ) ) { + $match_stats[ $result['bucket'] ][ $target ] = array( + 'assertions' => 0, + 'nonVacuous' => 0, + ); + } + $match_stats[ $result['bucket'] ][ $target ]['assertions'] += $stats['assertions']; + $match_stats[ $result['bucket'] ][ $target ]['nonVacuous'] += $stats['nonVacuous']; + } + + foreach ( $result['failures'] as $failure ) { + ++$failures; + $signature = self::signature( $failure ); + $signatures[ $signature ] = ( $signatures[ $signature ] ?? 0 ) + 1; + + $entry = array( + 'kind' => 'css-selector-fuzz-failure', + 'seed' => $result['seed'], + 'bucket' => $result['bucket'], + 'invariant' => $failure['invariant'], + 'signature' => $signature, + 'selector' => printable_bytes( $result['selector'] ), + 'selectorBase64' => base64_encode( $result['selector'] ), + 'htmlBase64' => base64_encode( $result['html'] ), + 'detail' => $failure['detail'], + ); + if ( null !== $failures_out ) { + append_ndjson( $failures_out, $entry ); + } else { + fwrite( STDERR, json_encode_safe( $entry ) . "\n" ); + } + } + } + + return array( + 'kind' => 'css-selector-fuzz-batch-summary', + 'startSeed' => $start_seed, + 'count' => $count, + 'lastSeed' => $last_seed, + 'failures' => $failures, + 'buckets' => $buckets, + 'signatures' => $signatures, + 'lexbor' => $lexbor, + 'matchStats' => self::finalize_match_stats( $match_stats ), + 'stopReason' => $stop_reason, + 'durationMs' => (int) round( 1000 * ( microtime( true ) - $started_at ) ), + ); + } + + /** Stable identity for de-duplicating equivalent failures. 
*/ + private static function signature( array $failure ): string { + $parts = array( $failure['invariant'] ); + if ( isset( $failure['detail']['grammar'] ) ) { + $parts[] = $failure['detail']['grammar']; + } + if ( isset( $failure['detail']['target'] ) ) { + $parts[] = $failure['detail']['target']; + } + if ( isset( $failure['detail']['transform'] ) ) { + $parts[] = $failure['detail']['transform']; + } + if ( isset( $failure['detail']['error']['class'] ) ) { + $parts[] = $failure['detail']['error']['class']; + $parts[] = preg_replace( '/[0-9]+/', 'N', (string) ( $failure['detail']['error']['message'] ?? '' ) ); + } + return substr( sha1( implode( '|', $parts ) ), 0, 12 ) . ':' . $failure['invariant']; + } + + /* + * ------- + * Helpers + * ------- + */ + + /** + * Calls $fn with PHP warnings/notices converted to exceptions. + * + * @return array{0: mixed, 1: \Throwable|null} + */ + private static function guard( callable $fn ): array { + set_error_handler( + static function ( $severity, $message, $file, $line ) { + if ( E_DEPRECATED === $severity || E_USER_DEPRECATED === $severity ) { + return true; + } + throw new \ErrorException( $message, 0, $severity, $file, $line ); + } + ); + try { + return array( $fn(), null ); + } catch ( \Throwable $e ) { + return array( null, $e ); + } finally { + restore_error_handler(); + } + } + + public static function describe_throwable( \Throwable $e ): array { + $root = repo_root() . DIRECTORY_SEPARATOR; + return array( + 'class' => get_class( $e ), + 'message' => $e->getMessage(), + 'at' => str_replace( $root, '', $e->getFile() ) . ':' . $e->getLine(), + 'trace' => array_slice( + array_map( + static function ( $frame ) use ( $root ) { + $location = isset( $frame['file'] ) + ? str_replace( $root, '', $frame['file'] ) . ':' . ( $frame['line'] ?? '?' ) + : '[internal]'; + $callable = ( $frame['class'] ?? '' ) . ( $frame['type'] ?? '' ) . ( $frame['function'] ?? '' ); + return $location . ' ' . 
/**
 * Parses `--name value`, `--name=value` and bare `--flag` arguments; anything
 * else is collected as a positional argument under the '_' key.
 *
 * NOTE(review): the signature line and the initial `$options` assignment were
 * missing from the text under review and are restored here from the surviving
 * `array() );` tail and the `parse_cli_options( $argv )` call sites — confirm
 * against the upstream file.
 *
 * @param array $argv Raw argument vector; index 0 ( the script name ) is skipped.
 * @return array Option map; flags without a value map to true.
 */
function parse_cli_options( array $argv ): array {
	$options = array( '_' => array() );
	$total   = count( $argv );

	for ( $i = 1; $i < $total; $i++ ) {
		$arg = $argv[ $i ];

		if ( 0 !== strpos( $arg, '--' ) ) {
			$options['_'][] = $arg;
			continue;
		}

		$name = substr( $arg, 2 );
		if ( false !== strpos( $name, '=' ) ) {
			list( $name, $value ) = explode( '=', $name, 2 );
			$options[ $name ]     = $value;
		} elseif ( $i + 1 < $total && 0 !== strpos( $argv[ $i + 1 ], '--' ) ) {
			// The next argument is this option's value; consume it.
			$options[ $name ] = $argv[ ++$i ];
		} else {
			$options[ $name ] = true;
		}
	}

	return $options;
}

/**
 * Reads an option as a string.
 *
 * @return string|null $default when the option is absent or was given as a
 *                     bare flag ( true ); otherwise its value cast to string.
 */
function option_string( array $options, string $name, ?string $default = null ): ?string {
	$has_value = array_key_exists( $name, $options ) && true !== $options[ $name ];
	if ( $has_value ) {
		return (string) $options[ $name ];
	}
	return $default;
}

/** Reads an option as an int, falling back to $default when it has no value. */
function option_int( array $options, string $name, int $default ): int {
	$raw = option_string( $options, $name, null );
	if ( null === $raw ) {
		return $default;
	}
	return (int) $raw;
}

/** Reads an option as a float, falling back to $default when it has no value. */
function option_float( array $options, string $name, float $default ): float {
	$raw = option_string( $options, $name, null );
	if ( null === $raw ) {
		return $default;
	}
	return (float) $raw;
}

/**
 * Reads an option as a boolean: a bare flag is true, and a value is true
 * only when it is one of 1/true/yes/on ( case-insensitive ).
 */
function option_bool( array $options, string $name, bool $default ): bool {
	if ( ! array_key_exists( $name, $options ) ) {
		return $default;
	}

	$value = $options[ $name ];

	return true === $value
		|| in_array( strtolower( (string) $value ), array( '1', 'true', 'yes', 'on' ), true );
}

/**
 * Creates a directory ( recursively ) unless it already exists.
 *
 * @throws \RuntimeException When the directory is still missing afterwards.
 */
function ensure_dir( string $dir ): void {
	if ( is_dir( $dir ) ) {
		return;
	}
	if ( mkdir( $dir, 0777, true ) ) {
		return;
	}
	// mkdir() failed; accept the directory if it exists now regardless.
	if ( is_dir( $dir ) ) {
		return;
	}
	throw new \RuntimeException( "Could not create directory: {$dir}" );
}

/**
 * JSON-encodes a value, substituting invalid UTF-8; when encoding still
 * fails, the returned JSON carries the error message instead.
 */
function json_encode_safe( $value ): string {
	$encoded = json_encode( $value, JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE );
	if ( false !== $encoded ) {
		return $encoded;
	}
	return json_encode( array( 'jsonError' => json_last_error_msg() ) );
}

/** Writes a value to $path as one line of JSON. */
function write_json_file( string $path, $value ): void {
	$line = json_encode_safe( $value ) . "\n";
	file_put_contents( $path, $line );
}

/** Reads a JSON file; null when it is missing or does not decode to an array. */
function read_json_file( string $path ): ?array {
	if ( ! is_file( $path ) ) {
		return null;
	}

	$decoded = json_decode( (string) file_get_contents( $path ), true );
	if ( ! is_array( $decoded ) ) {
		return null;
	}
	return $decoded;
}

/** Appends a value to $path as one NDJSON line, holding an exclusive lock. */
function append_ndjson( string $path, array $value ): void {
	$line = json_encode_safe( $value ) . "\n";
	file_put_contents( $path, $line, FILE_APPEND | LOCK_EX );
}

/** Current UTC time formatted as Ymd-His. */
function timestamp(): string {
	return gmdate( 'Ymd-His' );
}

/**
 * Renders bytes for human inspection: printable ASCII passes through
 * ( a backslash is doubled ), everything else becomes \xHH. Input longer
 * than $max_length bytes is cut and marked as truncated.
 */
function printable_bytes( string $bytes, int $max_length = 4096 ): string {
	$was_cut  = strlen( $bytes ) > $max_length;
	$visible  = substr( $bytes, 0, $max_length );
	$length   = strlen( $visible );
	$rendered = '';

	for ( $at = 0; $at < $length; $at++ ) {
		$char = $visible[ $at ];
		$code = ord( $char );

		if ( $code < 0x20 || $code > 0x7E ) {
			$rendered .= sprintf( '\\x%02X', $code );
			continue;
		}

		$rendered .= '\\' === $char ? '\\\\' : $char;
	}

	if ( $was_cut ) {
		$rendered .= '…(truncated)';
	}

	return $rendered;
}

/**
 * Reports the repository's current commit and branch as printed by git.
 *
 * @return array 'head' and 'branch'; each is null when git printed nothing.
 */
function git_metadata(): array {
	$ask_git = static function ( string $arguments ): ?string {
		$command = 'git -C ' . escapeshellarg( repo_root() ) . ' ' . $arguments . ' 2>/dev/null';
		$output  = trim( (string) shell_exec( $command ) );

		return '' !== $output ? $output : null;
	};

	return array(
		'head'   => $ask_git( 'rev-parse HEAD' ),
		'branch' => $ask_git( 'rev-parse --abbrev-ref HEAD' ),
	);
}
/**
 * Whether every string value anywhere in a nested array is valid UTF-8.
 *
 * Non-string, non-array values are accepted as-is; array keys are not
 * inspected.
 */
function ast_strings_are_utf8( $node ): bool {
	if ( is_string( $node ) ) {
		// An empty pattern with the /u flag matches only valid UTF-8 subjects.
		return 1 === preg_match( '//u', $node );
	}

	if ( ! is_array( $node ) ) {
		return true;
	}

	foreach ( $node as $child ) {
		if ( ! ast_strings_are_utf8( $child ) ) {
			return false;
		}
	}

	return true;
}

/** Lowercases A-Z only; every other byte is left untouched. */
function ascii_strtolower( string $input ): string {
	$upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
	$lower = 'abcdefghijklmnopqrstuvwxyz';

	return strtr( $input, $upper, $lower );
}

/** Uppercases a-z only; every other byte is left untouched. */
function ascii_strtoupper( string $input ): string {
	$lower = 'abcdefghijklmnopqrstuvwxyz';
	$upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';

	return strtr( $input, $lower, $upper );
}

/**
 * Flips the case of each ASCII letter independently with 50% probability.
 *
 * One `$prng->chance( 50 )` draw is made per input byte, in order.
 */
function str_shuffle_case( string $input, Prng $prng ): string {
	$length   = strlen( $input );
	$shuffled = '';

	for ( $at = 0; $at < $length; $at++ ) {
		$byte = $input[ $at ];

		if ( $prng->chance( 50 ) ) {
			if ( ctype_lower( $byte ) ) {
				$byte = ascii_strtoupper( $byte );
			} else {
				$byte = ascii_strtolower( $byte );
			}
		}

		$shuffled .= $byte;
	}

	return $shuffled;
}
+ */ +function utf8_codepoints( string $input ): array { + $out = array(); + $len = strlen( $input ); + $i = 0; + while ( $i < $len ) { + $byte = ord( $input[ $i ] ); + if ( $byte < 0x80 ) { + $size = 1; + $cp = $byte; + } elseif ( 0xC0 === ( $byte & 0xE0 ) ) { + $size = 2; + $cp = $byte & 0x1F; + } elseif ( 0xE0 === ( $byte & 0xF0 ) ) { + $size = 3; + $cp = $byte & 0x0F; + } else { + $size = 4; + $cp = $byte & 0x07; + } + $size = min( $size, $len - $i ); + for ( $j = 1; $j < $size; $j++ ) { + $cp = ( $cp << 6 ) | ( ord( $input[ $i + $j ] ) & 0x3F ); + } + $out[] = array( substr( $input, $i, $size ), $cp ); + $i += $size; + } + return $out; +} diff --git a/tools/css-selector-fuzz/lib/wp-stubs.php b/tools/css-selector-fuzz/lib/wp-stubs.php new file mode 100644 index 0000000000000..ec9b154ee58d6 --- /dev/null +++ b/tools/css-selector-fuzz/lib/wp-stubs.php @@ -0,0 +1,62 @@ + (string) $function_name, + 'message' => (string) $message, + ); + } +} + +if ( ! function_exists( '_deprecated_argument' ) ) { + function _deprecated_argument( $function_name, $version, $message = '' ) { + } +} + +if ( ! function_exists( 'wp_trigger_error' ) ) { + function wp_trigger_error( $function_name, $message, $error_level = E_USER_NOTICE ) { + $GLOBALS['css_selector_fuzz_doing_it_wrong'][] = array( + 'function' => (string) $function_name, + 'message' => (string) $message, + ); + } +} + +if ( ! 
function_exists( 'wp_kses_uri_attributes' ) ) { + function wp_kses_uri_attributes() { + return array( + 'action', + 'archive', + 'background', + 'cite', + 'classid', + 'codebase', + 'data', + 'formaction', + 'href', + 'icon', + 'longdesc', + 'manifest', + 'poster', + 'profile', + 'src', + 'usemap', + 'xmlns', + ); + } +} diff --git a/tools/css-selector-fuzz/minimize.php b/tools/css-selector-fuzz/minimize.php new file mode 100644 index 0000000000000..c7fe1f1cb3dda --- /dev/null +++ b/tools/css-selector-fuzz/minimize.php @@ -0,0 +1,268 @@ +#!/usr/bin/env php + metamorphic-ast, Bug 2 -> match-mismatch-html, + * Bug 3 -> metamorphic-parse — reachable via --signature). + * + * Usage: + * php tools/css-selector-fuzz/minimize.php --seed 1234 [--signature SUBSTR] + * php tools/css-selector-fuzz/minimize.php --selector 'sel' --html '<…>' [--signature SUBSTR] + * + * Options: + * --signature SUBSTR Target a signature whose id or invariant contains + * SUBSTR. For --seed, also the way to opt into a + * related self-contained signature when the seed's own + * failure is generator-side (printed as a retarget). + * --max-attempts N Cap test evaluations (default 4000). + * --json Emit the reproducer as JSON. + */ + +require_once __DIR__ . '/lib/autoload.php'; + +use CssSelectorFuzz\Worker; +use function CssSelectorFuzz\json_encode_safe; +use function CssSelectorFuzz\option_bool; +use function CssSelectorFuzz\option_int; +use function CssSelectorFuzz\option_string; +use function CssSelectorFuzz\parse_cli_options; +use function CssSelectorFuzz\printable_bytes; + +$options = parse_cli_options( $argv ); +$max_attempts = option_int( $options, 'max-attempts', 20000 ); +$sig_filter = option_string( $options, 'signature', null ); + +/* + * In --seed mode, the seed's OWN failures ( from run_case ) are the source + * of truth. 
/*
 * Seed-mode targeting. The minimizer can only preserve "self-contained"
 * signatures ( those run_pair re-checks without the generator's intended
 * AST ); the generator-side ones ( ast-mismatch, parse-expectation,
 * path-expectation, model-desync ) are invisible to run_pair. Targeting must
 * therefore be restricted to the intersection of the seed's failures and
 * run_pair's view — otherwise the minimizer could silently retarget to an
 * unrelated incidental signature and report a false "reproduced".
 */
$seed            = option_int( $options, 'seed', -1 );
$seed_signatures = null;
if ( $seed >= 0 ) {
	$case            = Worker::run_case( $seed );
	$selector        = $case['selector'];
	$html            = $case['html'];
	$seed_signatures = $case['signatures'];
	if ( array() === $seed_signatures ) {
		fwrite( STDERR, "Seed {$seed} produced no failure; nothing to minimize.\n" );
		exit( 1 );
	}
} else {
	$selector = option_string( $options, 'selector', null );
	$html     = option_string( $options, 'html', null );
	if ( null === $selector || null === $html ) {
		fwrite( STDERR, "Provide --seed N, or both --selector and --html.\n" );
		exit( 1 );
	}
}

/** Signatures produced by a pair ( $target lets run_pair short-circuit ). */
$signatures_of = static function ( string $selector, string $html, ?string $target = null ): array {
	return Worker::run_pair( $selector, $html, $target )['signatures'];
};

$baseline = $signatures_of( $selector, $html );
if ( array() === $baseline ) {
	fwrite( STDERR, "The starting pair does not reproduce any self-contained failure.\n" );
	if ( null !== $seed_signatures ) {
		fwrite( STDERR, 'Seed failure(s): ' . implode( ', ', $seed_signatures ) . "\n" );
		fwrite( STDERR, "These are generator-side signatures the minimizer cannot reproduce from the\n" );
		fwrite( STDERR, "pair alone. Minimize a seed whose failure is self-contained, or pass\n" );
		fwrite( STDERR, "--selector/--html directly.\n" );
	}
	fwrite( STDERR, 'selector: ' . printable_bytes( $selector ) . "\n" );
	exit( 1 );
}

/*
 * Candidate targets are matched at the INVARIANT level, not the exact
 * signature hash: a signature embeds transform-specific detail ( e.g.
 * metamorphic-parse via `rerender` vs via `dup-branch` ), and run_pair's
 * fixed metamorphic draws may expose the same invariant through a
 * different transform than run_case did. Same invariant == same bug class,
 * so that is faithful. A DIFFERENT invariant ( e.g. the seed's generator-
 * side ast-mismatch vs an incidental self-contained metamorphic-ast ) is a
 * genuine retarget and must be opted into.
 */
$invariant_of = static function ( string $signature ): string {
	// Signatures have the form "<hash>:<invariant>".
	$pos = strrpos( $signature, ':' );
	return false === $pos ? $signature : substr( $signature, $pos + 1 );
};

$retargeted = false;
if ( null === $seed_signatures ) {
	$candidates = $baseline;
} else {
	$seed_invariants = array_map( $invariant_of, $seed_signatures );
	$candidates      = array();
	foreach ( $baseline as $signature ) {
		if ( in_array( $invariant_of( $signature ), $seed_invariants, true ) ) {
			$candidates[] = $signature;
		}
	}
}

if ( array() === $candidates ) {
	// The seed's failures are all generator-side ( no self-contained
	// invariant in common ); refuse to silently minimize an unrelated
	// incidental signature.
	fwrite( STDERR, "Seed {$seed}'s failures are not self-contained, so the minimizer cannot\n" );
	fwrite( STDERR, "faithfully reproduce them.\n" );
	fwrite( STDERR, 'Seed failure(s): ' . implode( ', ', $seed_signatures ) . "\n" );
	fwrite( STDERR, 'Self-contained nearby: ' . implode( ', ', $baseline ) . "\n" );
	fwrite( STDERR, "Re-run with --signature to minimize one of the nearby signatures\n" );
	fwrite( STDERR, "explicitly ( understanding it is a related, not identical, failure ).\n" );
	if ( null === $sig_filter ) {
		exit( 1 );
	}
	// User explicitly opted into a nearby signature.
	$candidates = $baseline;
	$retargeted = true;
}

// Pick the target signature from the eligible candidates: the first one
// containing --signature, otherwise the first candidate.
$target = $candidates[0];
if ( null !== $sig_filter ) {
	$filter_matched = false;
	foreach ( $candidates as $candidate ) {
		if ( false !== strpos( $candidate, $sig_filter ) ) {
			$target         = $candidate;
			$filter_matched = true;
			break;
		}
	}
	if ( ! $filter_matched ) {
		// Falling back without a word would be exactly the silent retarget
		// this script is meant to avoid, so say which signature is used.
		fwrite( STDERR, "No eligible signature contains \"{$sig_filter}\"; minimizing {$target} instead.\n" );
	}
}

/*
 * Budgeted reproduction test: every evaluation counts against
 * --max-attempts, and once the budget is spent every candidate is reported
 * as not reproducing so the shrink loops wind down.
 */
$attempts   = 0;
$reproduces = static function ( string $selector, string $html ) use ( $signatures_of, $target, &$attempts, $max_attempts ): bool {
	if ( $attempts >= $max_attempts ) {
		return false;
	}
	++$attempts;
	return in_array( $target, $signatures_of( $selector, $html, $target ), true );
};

/**
 * Delta-debugging shrink of one byte string: ddmin chunk removal followed
 * by per-position single-byte simplification. $test( candidate ) decides
 * whether a candidate still reproduces.
 */
$shrink = static function ( string $current, callable $test ) use ( &$attempts, $max_attempts ): string {
	// Phase 1: remove chunks, refining the granularity whenever a whole
	// pass removes nothing and coarsening it again after a success.
	$chunks = 2;
	while ( strlen( $current ) > 0 && $attempts < $max_attempts ) {
		$length     = strlen( $current );
		$chunk_size = (int) ceil( $length / $chunks );
		$changed    = false;

		for ( $offset = 0; $offset < $length && $attempts < $max_attempts; $offset += $chunk_size ) {
			$candidate = substr( $current, 0, $offset ) . substr( $current, min( $length, $offset + $chunk_size ) );
			if ( $candidate === $current ) {
				continue;
			}
			if ( $test( $candidate ) ) {
				$current = $candidate;
				$chunks  = max( 2, $chunks - 1 );
				$changed = true;
				break;
			}
		}

		if ( ! $changed ) {
			if ( $chunks >= $length ) {
				// Already at single-byte granularity: nothing more to remove.
				break;
			}
			$chunks = min( $length, $chunks * 2 );
		}
	}

	/*
	 * Phase 2, per-byte canonicalization: replace each byte with a simpler
	 * stand-in ( 'a', then ' ' ) or delete it.
	 *
	 * A byte that already IS a stand-in may only be deleted. Offering it the
	 * other stand-in as well would let any position where both 'a' and ' '
	 * reproduce flip between the two on every revisit, which burns the whole
	 * attempt budget without making progress. With this rule each accepted
	 * step either shortens the string or turns a non-stand-in byte into a
	 * stand-in, so the phase always terminates on its own.
	 */
	$stand_ins = array( 'a', ' ' );
	for ( $i = 0; $i < strlen( $current ) && $attempts < $max_attempts; $i++ ) {
		$replacements = in_array( $current[ $i ], $stand_ins, true )
			? array( '' )
			: array( 'a', ' ', '' );

		foreach ( $replacements as $replacement ) {
			$candidate = substr( $current, 0, $i ) . $replacement . substr( $current, $i + 1 );
			if ( $candidate === $current ) {
				continue;
			}
			if ( $test( $candidate ) ) {
				$current = $candidate;
				// Step back one position ( the loop increment adds one ) so
				// the neighbour is re-examined in its new context.
				$i = max( -1, $i - 2 );
				break;
			}
		}
	}

	return $current;
};

// Alternate shrinking the HTML and the selector until neither moves.
// HTML first: when the signature is selector-only (e.g. metamorphic-parse)
// the document collapses cheaply before the costlier selector pass.
$prev = null;
while ( $attempts < $max_attempts && ( $selector . "\0" . $html ) !== $prev ) {
	$prev = $selector . "\0" . $html;

	$html     = $shrink(
		$html,
		static function ( string $candidate ) use ( $reproduces, &$selector ): bool {
			return $reproduces( $selector, $candidate );
		}
	);
	$selector = $shrink(
		$selector,
		static function ( string $candidate ) use ( $reproduces, &$html ): bool {
			return $reproduces( $candidate, $html );
		}
	);
}

// Final confirmation is not charged against the attempt budget.
$final = $signatures_of( $selector, $html );
$ok    = in_array( $target, $final, true );

if ( option_bool( $options, 'json', false ) ) {
	echo json_encode_safe(
		array(
			'target'         => $target,
			'retargeted'     => $retargeted,
			'seedSignatures' => $seed_signatures,
			'reproduced'     => $ok,
			'attempts'       => $attempts,
			'selector'       => printable_bytes( $selector ),
			'selectorBytes'  => strlen( $selector ),
			'html'           => printable_bytes( $html ),
			'htmlBytes'      => strlen( $html ),
			'selectorBase64' => base64_encode( $selector ),
			'htmlBase64'     => base64_encode( $html ),
		)
	) . "\n";
	exit( $ok ? 0 : 2 );
}

echo "target: {$target}\n";
if ( $retargeted ) {
	echo 'NOTE: seed failure(s) ' . implode( ', ', $seed_signatures ) . " are generator-side;\n";
	echo " minimized the related self-contained signature above instead.\n";
}
echo 'reproduced: ' . ( $ok ? 'yes' : 'NO' ) . "\n";
echo "attempts: {$attempts}\n";
echo 'selector: ' . printable_bytes( $selector ) . ' (' . strlen( $selector ) . " bytes)\n";
echo 'html: ' . printable_bytes( $html ) . ' (' . strlen( $html ) . " bytes)\n";
echo "\nreplay:\n";
echo ' php tools/css-selector-fuzz/replay.php --selector ' . escapeshellarg( $selector )
	. ' --html ' . escapeshellarg( $html ) . "\n";
exit( $ok ? 0 : 2 );
'] + */ + +require_once __DIR__ . '/lib/autoload.php'; + +use CssSelectorFuzz\Bootstrap; +use CssSelectorFuzz\Worker; +use function CssSelectorFuzz\json_encode_safe; +use function CssSelectorFuzz\option_bool; +use function CssSelectorFuzz\option_int; +use function CssSelectorFuzz\option_string; +use function CssSelectorFuzz\parse_cli_options; +use function CssSelectorFuzz\printable_bytes; + +$options = parse_cli_options( $argv ); + +$probe_selector = option_string( $options, 'selector', null ); +if ( null !== $probe_selector ) { + // Quick probe mode: parse a selector and report what the API does with it. + Bootstrap::load(); + + $compound = \WP_CSS_Compound_Selector_List::from_selectors( $probe_selector ); + $complex = \WP_CSS_Complex_Selector_List::from_selectors( $probe_selector ); + + $report = array( + 'selector' => printable_bytes( $probe_selector ), + 'compoundList' => null === $compound ? null : \CssSelectorFuzz\AstExtractor::from_compound_list( $compound ), + 'complexList' => null === $complex ? null : \CssSelectorFuzz\AstExtractor::from_complex_list( $complex ), + ); + + $html = option_string( $options, 'html', null ); + if ( null !== $html && null !== $complex ) { + $processor = \WP_HTML_Processor::create_full_parser( $html ); + $matches = array(); + while ( $processor->select( $probe_selector ) ) { + $matches[] = array( + 'tag' => $processor->get_tag(), + 'breadcrumbs' => $processor->get_breadcrumbs(), + ); + } + $report['htmlProcessorMatches'] = $matches; + } + + echo json_encode_safe( $report ) . "\n"; + exit( 0 ); +} + +$seed = option_int( $options, 'seed', -1 ); +if ( $seed < 0 ) { + echo "Usage: php tools/css-selector-fuzz/replay.php --seed N [--json] [--show-html]\n"; + echo " php tools/css-selector-fuzz/replay.php --selector 'div > .cls' [--html '
']\n"; + exit( 1 ); +} + +$result = Worker::run_case( $seed ); + +if ( option_bool( $options, 'json', false ) ) { + echo json_encode_safe( $result ) . "\n"; + exit( array() === $result['failures'] ? 0 : 2 ); +} + +echo "seed: {$result['seed']}\n"; +echo "bucket: {$result['bucket']}\n"; +echo 'selector: ' . printable_bytes( $result['selector'] ) . "\n"; +echo "digest: {$result['digest']}\n"; + +if ( option_bool( $options, 'show-html', false ) ) { + echo "html: " . printable_bytes( $result['html'] ) . "\n"; +} + +if ( array() === $result['failures'] ) { + echo "failures: none\n"; + exit( 0 ); +} + +echo 'failures: ' . count( $result['failures'] ) . "\n"; +foreach ( $result['failures'] as $i => $failure ) { + echo "--- failure {$i}: {$failure['invariant']} ---\n"; + echo json_encode_safe( $failure['detail'] ) . "\n"; +} +exit( 2 ); diff --git a/tools/css-selector-fuzz/runner.php b/tools/css-selector-fuzz/runner.php new file mode 100644 index 0000000000000..414eb167a660e --- /dev/null +++ b/tools/css-selector-fuzz/runner.php @@ -0,0 +1,337 @@ +#!/usr/bin/env php + array( 'pipe', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'pipe', 'w' ), + ); + + $started = microtime( true ); + $proc = proc_open( $command, $descriptors, $pipes, repo_root() ); + if ( ! is_resource( $proc ) ) { + return array( + 'code' => null, + 'timedOut' => false, + 'stdout' => '', + 'stderr' => 'proc_open failed', + 'durationMs' => 0, + ); + } + + fclose( $pipes[0] ); + stream_set_blocking( $pipes[1], false ); + stream_set_blocking( $pipes[2], false ); + + $stdout = ''; + $stderr = ''; + $timed_out = false; + $deadline = $started + $timeout_ms / 1000; + + while ( true ) { + $status = proc_get_status( $proc ); + $stdout .= (string) stream_get_contents( $pipes[1] ); + $stderr .= (string) stream_get_contents( $pipes[2] ); + + if ( ! 
$status['running'] ) { + $code = $status['exitcode']; + break; + } + if ( microtime( true ) > $deadline ) { + $timed_out = true; + proc_terminate( $proc, 9 ); + $code = null; + break; + } + usleep( 10000 ); + } + + $stdout .= (string) stream_get_contents( $pipes[1] ); + $stderr .= (string) stream_get_contents( $pipes[2] ); + fclose( $pipes[1] ); + fclose( $pipes[2] ); + proc_close( $proc ); + + return array( + 'code' => $code, + 'timedOut' => $timed_out, + 'stdout' => $stdout, + 'stderr' => $stderr, + 'durationMs' => (int) round( 1000 * ( microtime( true ) - $started ) ), + ); +} + +/** Extracts the batch summary from worker stdout, or null. */ +function css_selector_fuzz_worker_summary( string $stdout ): ?array { + foreach ( array_reverse( explode( "\n", trim( $stdout ) ) ) as $line ) { + $decoded = json_decode( $line, true ); + if ( is_array( $decoded ) && 'css-selector-fuzz-batch-summary' === ( $decoded['kind'] ?? null ) ) { + return $decoded; + } + } + return null; +} + +/** Merges per-bucket/per-target match assertion counts. */ +function css_selector_fuzz_merge_match_stats( array &$target, array $source ): void { + foreach ( $source as $bucket => $targets ) { + foreach ( $targets as $match_target => $stats ) { + if ( ! isset( $target[ $bucket ][ $match_target ] ) ) { + $target[ $bucket ][ $match_target ] = array( + 'assertions' => 0, + 'nonVacuous' => 0, + ); + } + $target[ $bucket ][ $match_target ]['assertions'] += (int) ( $stats['assertions'] ?? 0 ); + $target[ $bucket ][ $match_target ]['nonVacuous'] += (int) ( $stats['nonVacuous'] ?? 0 ); + } + } +} + +/** Adds derived rates after all count aggregation is finished. */ +function css_selector_fuzz_finalize_match_stats( array $stats ): array { + foreach ( $stats as $bucket => $targets ) { + foreach ( $targets as $match_target => $counts ) { + $assertions = (int) ( $counts['assertions'] ?? 0 ); + $non_vacuous = (int) ( $counts['nonVacuous'] ?? 
0 ); + $vacuous = max( 0, $assertions - $non_vacuous ); + + $stats[ $bucket ][ $match_target ]['vacuous'] = $vacuous; + $stats[ $bucket ][ $match_target ]['nonVacuousRate'] = $assertions > 0 ? round( $non_vacuous / $assertions, 4 ) : 0.0; + $stats[ $bucket ][ $match_target ]['vacuousRate'] = $assertions > 0 ? round( $vacuous / $assertions, 4 ) : 0.0; + } + } + return $stats; +} + +function css_selector_fuzz_write_state( string $state_path, array $state ): void { + $state['matchStats'] = css_selector_fuzz_finalize_match_stats( $state['matchStats'] ?? array() ); + write_json_file( $state_path, $state ); +} + +function css_selector_fuzz_state_for_output( array $state ): array { + $state['matchStats'] = css_selector_fuzz_finalize_match_stats( $state['matchStats'] ?? array() ); + return $state; +} + +$options = parse_cli_options( $argv ); +if ( option_bool( $options, 'help', false ) || option_bool( $options, 'h', false ) ) { + echo "Usage: php tools/css-selector-fuzz/runner.php [--start-seed N] [--max-seeds N] [--duration-seconds N] [--chunk-size N] [--timeout-ms N] [--output-dir DIR] [--stop-on-failure]\n"; + exit( 0 ); +} + +$start_seed = option_int( $options, 'start-seed', 1 ); +$max_seeds = option_int( $options, 'max-seeds', 1000 ); +$duration_seconds = option_int( $options, 'duration-seconds', 120 ); +$chunk_size = max( 1, option_int( $options, 'chunk-size', 200 ) ); +$timeout_ms = option_int( $options, 'timeout-ms', 0 ); +$stop_on_failure = option_bool( $options, 'stop-on-failure', false ); +$output_dir = option_string( $options, 'output-dir', repo_root() . '/artifacts/css-selector-fuzz/run-' . timestamp() ); + +if ( $max_seeds < 1 ) { + fwrite( STDERR, "--max-seeds must be at least 1; refusing to run unbounded.\n" ); + exit( 1 ); +} +if ( 0 === $timeout_ms ) { + // Generous per-chunk budget: ~50ms per case plus startup. + $timeout_ms = $chunk_size * 50 + 10000; +} + +ensure_dir( $output_dir ); +$failures_path = $output_dir . 
'/failures.ndjson'; +$state_path = $output_dir . '/state.json'; +$worker_script = __DIR__ . '/worker.php'; + +$state = array( + 'kind' => 'css-selector-fuzz-runner-state', + 'startedAt' => gmdate( 'c' ), + 'updatedAt' => gmdate( 'c' ), + 'git' => git_metadata(), + 'phpVersion' => PHP_VERSION, + 'outputDir' => $output_dir, + 'startSeed' => $start_seed, + 'maxSeeds' => $max_seeds, + 'durationSeconds' => $duration_seconds, + 'chunkSize' => $chunk_size, + 'casesCompleted' => 0, + 'failures' => 0, + 'crashes' => 0, + 'buckets' => array(), + 'signatures' => array(), + 'lexbor' => array(), + 'matchStats' => array(), + 'nextSeed' => $start_seed, + 'stopReason' => null, +); +css_selector_fuzz_write_state( $state_path, $state ); + +$deadline = $duration_seconds > 0 ? microtime( true ) + $duration_seconds : null; +$seed = $start_seed; +$end_seed = $start_seed + $max_seeds; + +while ( $seed < $end_seed ) { + if ( null !== $deadline && microtime( true ) > $deadline ) { + $state['stopReason'] = 'duration-elapsed'; + break; + } + + $count = min( $chunk_size, $end_seed - $seed ); + $args = array( + $worker_script, + '--start-seed', + (string) $seed, + '--count', + (string) $count, + '--failures-out', + $failures_path, + '--progress-file', + $output_dir . '/progress.txt', + ); + + $proc = css_selector_fuzz_run_php( $args, $timeout_ms ); + $summary = css_selector_fuzz_worker_summary( $proc['stdout'] ); + + if ( null === $summary ) { + /* + * The worker crashed, hung, or died fatally. Re-run each seed of the + * chunk in its own process to attribute the crash. 
+ */ + fwrite( STDERR, "chunk seed={$seed} count={$count}: worker crashed/hung; isolating…\n" ); + for ( $isolated = $seed; $isolated < $seed + $count; $isolated++ ) { + $single = css_selector_fuzz_run_php( + array( + $worker_script, + '--start-seed', + (string) $isolated, + '--count', + '1', + '--failures-out', + $failures_path, + '--determinism-every', + '0', + ), + max( 5000, (int) ( $timeout_ms / $count ) + 5000 ) + ); + $single_summary = css_selector_fuzz_worker_summary( $single['stdout'] ); + if ( null === $single_summary ) { + ++$state['crashes']; + ++$state['failures']; + append_ndjson( + $failures_path, + array( + 'kind' => 'css-selector-fuzz-failure', + 'seed' => $isolated, + 'invariant' => $single['timedOut'] ? 'worker-timeout' : 'worker-crash', + 'signature' => $single['timedOut'] ? 'worker-timeout' : 'worker-crash', + 'exitCode' => $single['code'], + 'stderrTail' => substr( $single['stderr'], -2000 ), + ) + ); + $key = $single['timedOut'] ? 'worker-timeout' : 'worker-crash'; + $state['signatures'][ $key ] = ( $state['signatures'][ $key ] ?? 0 ) + 1; + } else { + ++$state['casesCompleted']; + $state['failures'] += $single_summary['failures']; + foreach ( $single_summary['buckets'] as $bucket => $bucket_count ) { + $state['buckets'][ $bucket ] = ( $state['buckets'][ $bucket ] ?? 0 ) + $bucket_count; + } + foreach ( $single_summary['signatures'] as $signature => $signature_count ) { + $state['signatures'][ $signature ] = ( $state['signatures'][ $signature ] ?? 0 ) + $signature_count; + } + foreach ( $single_summary['lexbor'] ?? array() as $lexbor_state => $lexbor_count ) { + $state['lexbor'][ $lexbor_state ] = ( $state['lexbor'][ $lexbor_state ] ?? 0 ) + $lexbor_count; + } + css_selector_fuzz_merge_match_stats( $state['matchStats'], $single_summary['matchStats'] ?? 
array() ); + } + } + } else { + $state['casesCompleted'] += array_sum( $summary['buckets'] ); + $state['failures'] += $summary['failures']; + foreach ( $summary['buckets'] as $bucket => $bucket_count ) { + $state['buckets'][ $bucket ] = ( $state['buckets'][ $bucket ] ?? 0 ) + $bucket_count; + } + foreach ( $summary['signatures'] as $signature => $signature_count ) { + $state['signatures'][ $signature ] = ( $state['signatures'][ $signature ] ?? 0 ) + $signature_count; + } + foreach ( $summary['lexbor'] ?? array() as $lexbor_state => $lexbor_count ) { + $state['lexbor'][ $lexbor_state ] = ( $state['lexbor'][ $lexbor_state ] ?? 0 ) + $lexbor_count; + } + css_selector_fuzz_merge_match_stats( $state['matchStats'], $summary['matchStats'] ?? array() ); + } + + $seed += $count; + $state['nextSeed'] = $seed; + $state['updatedAt'] = gmdate( 'c' ); + css_selector_fuzz_write_state( $state_path, $state ); + + if ( $stop_on_failure && $state['failures'] > 0 ) { + $state['stopReason'] = 'stop-on-failure'; + break; + } +} + +if ( null === $state['stopReason'] ) { + $state['stopReason'] = 'max-seeds'; +} +$state['updatedAt'] = gmdate( 'c' ); +css_selector_fuzz_write_state( $state_path, $state ); + +/* + * The lexbor differential is the third oracle. If it ever ran ( 'compared' ) + * it was built and live; any 'unavailable' or 'error' tally then means it + * was missing for some cases or died mid-run, so part of the run had only + * two oracles. Surface that loudly rather than letting a green run hide it. + */ +$lexbor = $state['lexbor']; +$lexbor_ran = ( $lexbor['compared'] ?? 0 ) > 0; +$lexbor_lost = ( $lexbor['unavailable'] ?? 0 ) + ( $lexbor['error'] ?? 0 ); +if ( $lexbor_ran && $lexbor_lost > 0 ) { + fwrite( STDERR, "WARNING: lexbor third oracle was unavailable/errored for {$lexbor_lost} case(s); those ran with two oracles.\n" ); +} elseif ( ! 
$lexbor_ran ) { + fwrite( STDERR, "NOTE: lexbor third oracle never ran (harness not built?); run `sh tools/css-selector-fuzz/lexbor/build.sh` for the differential.\n" ); +} + +echo json_encode_safe( css_selector_fuzz_state_for_output( $state ) ) . "\n"; +exit( 0 === $state['failures'] ? 0 : 2 ); diff --git a/tools/css-selector-fuzz/tests/self-check.php b/tools/css-selector-fuzz/tests/self-check.php new file mode 100644 index 0000000000000..9664367f1e300 --- /dev/null +++ b/tools/css-selector-fuzz/tests/self-check.php @@ -0,0 +1,368 @@ +#!/usr/bin/env php + substr_count( $selector, ']' ) ) { + return 'eof-auto-closes-attribute-selector'; + } + if ( preg_match( '/\\[[^\\]]*=\\s*[-_a-zA-Z0-9]\\]$/', $selector ) ) { + return 'single-char-unquoted-attribute-value-at-eof'; + } + if ( has_identity_escape_after_multibyte( $selector ) ) { + return 'identity-escape-after-multibyte'; + } + + return null; +} + +function has_identity_escape_after_multibyte( string $selector ): bool { + $seen_multibyte = false; + $length = strlen( $selector ); + for ( $i = 0; $i < $length; $i++ ) { + $byte = ord( $selector[ $i ] ); + if ( $byte > 0x7F ) { + $seen_multibyte = true; + continue; + } + if ( ! $seen_multibyte || '\\' !== $selector[ $i ] || $i + 1 >= $length ) { + continue; + } + + $next = $selector[ $i + 1 ]; + if ( "\n" === $next || "\r" === $next || "\f" === $next || ctype_xdigit( $next ) ) { + continue; + } + return true; + } + return false; +} + +Bootstrap::load(); + +// --- Prng determinism and independence ------------------------------------- + +$a = new Prng( '42', 'label' ); +$b = new Prng( '42', 'label' ); +check( $a->bytes( 64 ) === $b->bytes( 64 ), 'Identical seeds produce identical streams.' ); + +$c = new Prng( '42', 'label' ); +$d = new Prng( '43', 'label' ); +check( $c->bytes( 64 ) !== $d->bytes( 64 ), 'Different seeds produce different streams.' 
); + +$e = new Prng( '42', 'fork-test' ); +$f = new Prng( '42', 'fork-test' ); +$fork1 = $e->fork( 'x' ); +$fork2 = $f->fork( 'x' ); +check( $fork1->bytes( 32 ) === $fork2->bytes( 32 ), 'Forked streams are deterministic.' ); + +// --- utf8_codepoints -------------------------------------------------------- + +$points = utf8_codepoints( "a\u{E9}\u{1F600}" ); +check( 3 === count( $points ), 'utf8_codepoints splits into 3 codepoints.' ); +check( 0x61 === $points[0][1] && 0xE9 === $points[1][1] && 0x1F600 === $points[2][1], 'utf8_codepoints decodes values.' ); + +// --- Document generator: model matches parse for many seeds --------------- +// ( Worker::run_case checks this per case as model-desync; here only a couple +// of seeds are sampled for a fast signal. ) + +for ( $seed = 1; $seed <= 3; $seed++ ) { + $document = DocumentGenerator::generate( new Prng( (string) $seed, 'self-check-doc' ) ); + check( is_string( $document['html'] ) && '' !== $document['html'], "Document {$seed} renders." ); + check( str_contains( $document['html'], 'data-fid' ) || false !== strpos( $document['html'], 'data-fid' ), "Document {$seed} has fids." ); +} + +// --- Selector generator expectations over many seeds ----------------------- + +$by_bucket = array(); +$allowed_parse_mismatches = array(); +for ( $seed = 1; $seed <= 400; $seed++ ) { + $prng = new Prng( (string) $seed, 'self-check-selector' ); + $document = DocumentGenerator::generate( $prng->fork( 'doc' ) ); + $selector = SelectorGenerator::generate( $prng->fork( 'sel' ), $document['pools'] ); + + $by_bucket[ $selector['bucket'] ] = ( $by_bucket[ $selector['bucket'] ] ?? 
0 ) + 1; + + $compound = WP_CSS_Compound_Selector_List::from_selectors( $selector['selector'] ); + $complex = WP_CSS_Complex_Selector_List::from_selectors( $selector['selector'] ); + + if ( null !== $selector['expectCompound'] ) { + $expected = $selector['expectCompound']; + $actual = null !== $compound; + $known = known_core_parse_mismatch( $selector['selector'], $expected, $actual ); + if ( null !== $known ) { + $allowed_parse_mismatches[ "compound:{$known}" ] = ( $allowed_parse_mismatches[ "compound:{$known}" ] ?? 0 ) + 1; + } else { + check( + $expected === $actual, + "Seed {$seed} ({$selector['bucket']}): compound parse expectation for: " . \CssSelectorFuzz\printable_bytes( $selector['selector'] ) + ); + } + } + if ( null !== $selector['expectComplex'] ) { + $expected = $selector['expectComplex']; + $actual = null !== $complex; + $known = known_core_parse_mismatch( $selector['selector'], $expected, $actual ); + if ( null !== $known ) { + $allowed_parse_mismatches[ "complex:{$known}" ] = ( $allowed_parse_mismatches[ "complex:{$known}" ] ?? 0 ) + 1; + } else { + check( + $expected === $actual, + "Seed {$seed} ({$selector['bucket']}): complex parse expectation for: " . \CssSelectorFuzz\printable_bytes( $selector['selector'] ) + ); + } + } +} + +check( count( $by_bucket ) >= 5, 'Bucket variety: saw ' . count( $by_bucket ) . ' buckets.' ); +if ( array() !== $allowed_parse_mismatches ) { + fwrite( STDERR, 'Allowed known core parse bug signatures: ' . \CssSelectorFuzz\json_encode_safe( $allowed_parse_mismatches ) . 
"\n" ); +} + +// --- Document generator: randomized class NUL injection -------------------- + +$safe_class_nul = 0; +for ( $seed = 1; $seed <= 200; $seed++ ) { + $document = DocumentGenerator::generate( new Prng( (string) $seed, 'self-check-class-nul-safe' ) ); + if ( false !== strpos( $document['html'], "\0" ) ) { + ++$safe_class_nul; + check( false === str_contains( implode( "\n", $document['pools']['attrValues'] ), "\0" ), "Safe document {$seed}: class NUL does not leak into attrValues pool." ); + check( \CssSelectorFuzz\ast_strings_are_utf8( $document['pools']['classes'] ), "Safe document {$seed}: class pool strings stay valid UTF-8." ); + check( in_array( true, array_map( static function ( string $class ): bool { + return false !== strpos( $class, "\u{FFFD}" ); + }, $document['pools']['classes'] ), true ), "Safe document {$seed}: class pool contains decoded U+FFFD token." ); + } +} +check( $safe_class_nul > 0, "Safe document generator emits randomized class NUL values ({$safe_class_nul} of 200)." ); + +$wild_class_nul = 0; +for ( $seed = 1; $seed <= 200; $seed++ ) { + $document = WildDocumentGenerator::generate( new Prng( (string) $seed, 'self-check-class-nul-wild' ) ); + if ( false !== strpos( $document['html'], "\0" ) ) { + ++$wild_class_nul; + check( false === str_contains( implode( "\n", $document['pools']['attrValues'] ), "\0" ), "Wild document {$seed}: class NUL does not leak into attrValues pool." ); + check( \CssSelectorFuzz\ast_strings_are_utf8( $document['pools']['classes'] ), "Wild document {$seed}: class pool strings stay valid UTF-8." ); + check( in_array( true, array_map( static function ( string $class ): bool { + return false !== strpos( $class, "\u{FFFD}" ); + }, $document['pools']['classes'] ), true ), "Wild document {$seed}: class pool contains decoded U+FFFD token." ); + } +} +check( $wild_class_nul > 0, "Wild document generator emits randomized class NUL values ({$wild_class_nul} of 200)." 
); + +// --- Invalid-UTF-8 bucket: post-scrub AST expectations by construction ------ +// from_selectors() replaces each maximal subpart of an ill-formed UTF-8 +// sequence with one U+FFFD before parsing ( CSS Syntax §3.2 via the WHATWG +// decoder ). The bucket injects raw ill-formed sequences and carries the +// post-scrub AST, with the per-class subpart counts hard-coded in the +// generator — independent of wp_scrub_utf8(), so this loop is a real +// differential between the generator's WHATWG expectations and the core +// scrub + parse pipeline. + +$fffd_ast_counts = array(); +$injection_sites = array(); +$byte_classes = array(); + +// The class names AND byte values are duplicated here on purpose: tallying +// from the generator's own table would silently shrink the assertion with a +// deleted entry and self-validate on a drifted byte value. +$expected_byte_classes = array( + 'lone-continuation' => "\x80", + 'truncated-2-byte' => "\xC3", + 'truncated-3-byte' => "\xE2\x8C", + 'truncated-4-byte' => "\xF0\x9F\x82", + 'invalid-lead-f5' => "\xF5", + 'invalid-lead-ff' => "\xFF", + 'overlong-min' => "\xC0\x80", + 'overlong-max' => "\xC1\xBF", + 'surrogate-half' => "\xED\xA0\x80", + 'beyond-max' => "\xF4\x90\x80\x80", +); + +$count_fffd = static function ( $node ) use ( &$count_fffd ): int { + if ( is_string( $node ) ) { + return substr_count( $node, "\u{FFFD}" ); + } + $total = 0; + if ( is_array( $node ) ) { + foreach ( $node as $child ) { + $total += $count_fffd( $child ); + } + } + return $total; +}; + +for ( $seed = 1; $seed <= 150; $seed++ ) { + $prng = new Prng( (string) $seed, 'self-check-invalid-utf8' ); + $document = DocumentGenerator::generate( $prng->fork( 'doc' ) ); + $case = SelectorGenerator::generate( $prng->fork( 'sel' ), $document['pools'], null, 'invalid-utf8' ); + $printable = \CssSelectorFuzz\printable_bytes( $case['selector'] ); + + check( 'invalid-utf8' === $case['bucket'], "Seed {$seed}: forced invalid-utf8 bucket, got {$case['bucket']}." 
); + check( ! wp_is_valid_utf8( $case['selector'] ), "Seed {$seed}: selector must contain invalid UTF-8: {$printable}" ); + check( true === $case['expectCompound'] && true === $case['expectComplex'], "Seed {$seed}: invalid-utf8 cases must expect to parse in both grammars." ); + check( is_array( $case['ast'] ) && \CssSelectorFuzz\ast_strings_are_utf8( $case['ast'] ), "Seed {$seed}: expected AST must be valid UTF-8." ); + + $compound = WP_CSS_Compound_Selector_List::from_selectors( $case['selector'] ); + $complex = WP_CSS_Complex_Selector_List::from_selectors( $case['selector'] ); + check( null !== $compound, "Seed {$seed}: compound parse after scrub for: {$printable}" ); + check( null !== $complex, "Seed {$seed}: complex parse after scrub for: {$printable}" ); + if ( null === $complex || ! is_array( $case['ast'] ) ) { + continue; + } + + $parsed_ast = \CssSelectorFuzz\AstExtractor::from_complex_list( $complex ); + check( $case['ast'] === $parsed_ast, "Seed {$seed}: parsed AST equals maximal-subpart scrub expectation for: {$printable}" ); + + $fffd_ast_counts[ $count_fffd( $case['ast'] ) ] = true; + foreach ( (array) $case['ast'][0]['self']['subs'] as $sub ) { + $injection_sites[ 'attr' === $sub['kind'] && null !== $sub['matcher'] ? 'attr-value' : $sub['kind'] ] = true; + } + foreach ( $expected_byte_classes as $class_name => $class_bytes ) { + // Substring attribution is ambiguous only for lone-continuation, + // whose byte occurs inside three longer classes — good enough for + // an at-least-once variety tally. + if ( str_contains( $case['selector'], $class_bytes ) ) { + $byte_classes[ $class_name ] = true; + } + } +} + +foreach ( array( 1, 2, 3, 4 ) as $expected_count ) { + check( isset( $fffd_ast_counts[ $expected_count ] ), "Invalid-utf8 variety: a {$expected_count}-subpart byte class was generated." 
); +} +foreach ( array( 'class', 'id', 'attr', 'attr-value' ) as $site ) { + check( isset( $injection_sites[ $site ] ), "Invalid-utf8 variety: injection site {$site} was generated." ); +} +foreach ( array_keys( $expected_byte_classes ) as $class_name ) { + check( isset( $byte_classes[ $class_name ] ), "Invalid-utf8 variety: byte class {$class_name} was generated." ); +} + +// --- Mutated bucket: raw invalid-byte splicing ------------------------------- +// mutate() must be able to splice raw ill-formed UTF-8 into a selector at +// arbitrary byte offsets; these cases carry no AST expectation and exercise +// crash / scrub-notice / differential paths only. The marker bytes here can +// appear in NO rendered selector (the pools' multibyte characters use other +// lead bytes), so their presence proves the mutation operation fired. + +$mutated_with_invalid = 0; +for ( $seed = 1; $seed <= 200; $seed++ ) { + $prng = new Prng( (string) $seed, 'self-check-mutated-utf8' ); + $document = DocumentGenerator::generate( $prng->fork( 'doc' ) ); + $case = SelectorGenerator::generate( $prng->fork( 'sel' ), $document['pools'], null, 'mutated' ); + if ( false !== strpbrk( $case['selector'], "\xC0\xC1\xED\xF4\xF5\xFF" ) ) { + ++$mutated_with_invalid; + } +} +check( $mutated_with_invalid >= 10, "Mutated bucket splices raw invalid bytes ({$mutated_with_invalid} of 200 seeds)." ); + +// --- Known-answer matching cases ------------------------------------------- + +$known_html = '' + . '
' + . '
' + . ''; + +function select_fids( string $html, string $selector ): array { + $processor = WP_HTML_Processor::create_full_parser( $html ); + $out = array(); + while ( $processor->select( $selector ) ) { + $out[] = $processor->get_attribute( 'data-fid' ); + } + return $out; +} + +check( array( 'e4' ) === select_fids( $known_html, '#x' ), 'Known: #x.' ); +check( array( 'e3', 'e4' ) === select_fids( $known_html, '.b' ), 'Known: .b.' ); +check( array( 'e4' ) === select_fids( $known_html, 'div > span.b' ), 'Known: div > span.b.' ); +check( array( 'e7' ) === select_fids( $known_html, 'section em' ), 'Known: section em.' ); +check( array() === select_fids( $known_html, 'section > em' ), 'Known: section > em matches nothing.' ); +check( array( 'e4' ) === select_fids( $known_html, '[data-v|="hello"]' ), 'Known: [data-v|=hello].' ); +check( array( 'e7' ) === select_fids( $known_html, '[lang^="en"]' ), 'Known: [lang^=en].' ); + +// --- Class-value decode boundary (ReferenceMatcher vs WP class_list) -------- +// WP's class_list() folds NUL -> U+FFFD and treats FF as a separator; the +// reference matcher reimplements tokenization independently. Pin both engines +// against each other on these boundary inputs; randomized generator sampling +// above verifies that the same NUL boundary is present in the hot path. Each +// case also checks the reference matcher agrees with select() over a +// TreeCapture of the same markup. 
+ +function ref_fids( string $html, string $selector ): array { + $capture = \CssSelectorFuzz\TreeCapture::capture( $html ); + $list = WP_CSS_Complex_Selector_List::from_selectors( $selector ); + if ( null !== $capture['error'] || null === $list ) { + return array( '(error)' ); + } + $ast = \CssSelectorFuzz\AstExtractor::from_complex_list( $list ); + return \CssSelectorFuzz\ReferenceMatcher::expected_html_matches_rows( $ast, $capture['htmlRows'], $capture['quirks'] ); +} + +$nul_html = ""; +$ff_html = ""; + +$nul_cases = array( + array( "class NUL -> FFFD", $nul_html, ".foo\u{FFFD}bar", array( 'n0' ) ), + array( "class trailing NUL", $nul_html, ".x\u{FFFD}", array( 'n1' ) ), + array( "class raw NUL no-match", $nul_html, '.foobar', array() ), + array( "class FF separator (first)", $ff_html, '.alpha', array( 'f0' ) ), + array( "class FF separator (second)", $ff_html, '.beta', array( 'f0' ) ), +); +foreach ( $nul_cases as $case ) { + list( $label, $html, $selector, $expected ) = $case; + $wp = select_fids( $html, $selector ); + $ref = ref_fids( $html, $selector ); + check( $expected === $wp, "Decode boundary ({$label}): select() == expected." ); + check( $ref === $wp, "Decode boundary ({$label}): ReferenceMatcher == select()." ); +} + +// --- Worker end-to-end on a few seeds --------------------------------------- + +for ( $seed = 1; $seed <= 5; $seed++ ) { + $first = Worker::run_case( $seed ); + $second = Worker::run_case( $seed ); + check( $first['digest'] === $second['digest'], "Seed {$seed}: case digest is deterministic." 
); +} + +if ( 0 === $failures ) { + echo "self-check OK\n"; + exit( 0 ); +} +echo "self-check FAILED: {$failures} failure(s)\n"; +exit( 1 ); diff --git a/tools/css-selector-fuzz/worker.php b/tools/css-selector-fuzz/worker.php new file mode 100644 index 0000000000000..bdcde442aa943 --- /dev/null +++ b/tools/css-selector-fuzz/worker.php @@ -0,0 +1,34 @@ +#!/usr/bin/env php + 'css-selector-fuzz-worker-fatal', + 'error' => \CssSelectorFuzz\Worker::describe_throwable( $e ), + ) + ) . "\n" + ); + exit( 1 ); +}