From 9d8e23e85d1e7d64a972547fc8bbcb9a10ce9893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 15:28:40 +0200 Subject: [PATCH 01/24] Inline terminal matching and defer parse node allocation Hot-path changes in WP_Parser::parse_recursive(): - Inline the terminal match in the branch loop instead of recursing into parse_recursive() for every token. Over the full MySQL test suite this eliminates ~1.6M function calls. - Hoist grammar, rules, fragment_ids, rule_names, tokens, and token_count into local variables so the inner loops avoid repeated property lookups on $this->grammar. - Cache the token count on the instance to avoid a count() per call. - Build branch children in a local array and only instantiate the WP_Parser_Node once the branch has matched; on the MySQL corpus ~75% of speculative nodes were previously created and thrown away. - Drop a dead is_array($subnode) check that never fires in practice (subnodes are false, true, tokens, or nodes - never arrays). - Inline fragment inlining: read the fragment's children directly instead of building a fragment node and immediately merging it. End-to-end parser benchmark on the MySQL server test corpus: Before: ~11,500 QPS After: ~14,900 QPS (+29%) --- .../src/mysql/class-wp-mysql-parser.php | 2 +- .../src/parser/class-wp-parser-node.php | 12 +++ .../src/parser/class-wp-parser.php | 89 +++++++++++++------ 3 files changed, 75 insertions(+), 28 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php index 69282b9c..b6b465bd 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php @@ -40,7 +40,7 @@ public function reset_tokens( array $tokens ): void { * @return bool Whether a query was successfully parsed. 
*/ public function next_query(): bool { - if ( $this->position >= count( $this->tokens ) ) { + if ( $this->position >= $this->token_count ) { return false; } $this->current_ast = $this->parse(); diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index b61f38d5..75580f6d 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -26,6 +26,18 @@ public function append_child( $node ) { $this->children[] = $node; } + /** + * Replace all children with the given array. + * + * This is used by the parser to attach a batch of children built up in a + * local array while trying branches, without allocating a node per attempt. + * + * @param array $children The new children. + */ + public function set_children( array $children ): void { + $this->children = $children; + } + /** * Flatten the matched rule fragments as if their children were direct * descendants of the current rule. 
diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 4436892f..96feb083 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -11,12 +11,14 @@ class WP_Parser { protected $grammar; protected $tokens; + protected $token_count; protected $position; public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->tokens = $tokens; - $this->position = 0; + $this->grammar = $grammar; + $this->tokens = $tokens; + $this->token_count = count( $tokens ); + $this->position = 0; } public function parse() { @@ -27,9 +29,11 @@ public function parse() { } private function parse_recursive( $rule_id ) { - $is_terminal = $rule_id <= $this->grammar->highest_terminal_id; - if ( $is_terminal ) { - if ( $this->position >= count( $this->tokens ) ) { + $grammar = $this->grammar; + $highest_terminal_id = $grammar->highest_terminal_id; + + if ( $rule_id <= $highest_terminal_id ) { + if ( $this->position >= $this->token_count ) { return false; } @@ -38,41 +42,67 @@ private function parse_recursive( $rule_id ) { } if ( $this->tokens[ $this->position ]->id === $rule_id ) { + $token = $this->tokens[ $this->position ]; ++$this->position; - return $this->tokens[ $this->position - 1 ]; + return $token; } return false; } - $branches = $this->grammar->rules[ $rule_id ]; - if ( ! count( $branches ) ) { + $branches = $grammar->rules[ $rule_id ]; + if ( ! $branches ) { return false; } // Bale out from processing the current branch if none of its rules can // possibly match the current token. - if ( isset( $this->grammar->lookahead_is_match_possible[ $rule_id ] ) ) { + $rule_lookahead = $grammar->lookahead_is_match_possible[ $rule_id ] ?? null; + if ( null !== $rule_lookahead ) { $token_id = $this->tokens[ $this->position ]->id; if ( - ! 
isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ $token_id ] ) && - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ WP_Parser_Grammar::EMPTY_RULE_ID ] ) + ! isset( $rule_lookahead[ $token_id ] ) && + ! isset( $rule_lookahead[ WP_Parser_Grammar::EMPTY_RULE_ID ] ) ) { return false; } } - $rule_name = $this->grammar->rule_names[ $rule_id ]; + $rule_name = $grammar->rule_names[ $rule_id ]; + $fragment_ids = $grammar->fragment_ids; + $rules = $grammar->rules; + $tokens = $this->tokens; + $token_count = $this->token_count; $starting_position = $this->position; + $branch_matches = false; foreach ( $branches as $branch ) { $this->position = $starting_position; - $node = new WP_Parser_Node( $rule_id, $rule_name ); + $children = array(); $branch_matches = true; foreach ( $branch as $subrule_id ) { + // Inline terminal matching to avoid a recursive call per token. + if ( $subrule_id <= $highest_terminal_id ) { + if ( WP_Parser_Grammar::EMPTY_RULE_ID === $subrule_id ) { + // Epsilon rule: matches without consuming input. + continue; + } + if ( + $this->position < $token_count + && $tokens[ $this->position ]->id === $subrule_id + ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $branch_matches = false; + break; + } + $subnode = $this->parse_recursive( $subrule_id ); if ( false === $subnode ) { $branch_matches = false; break; - } elseif ( true === $subnode ) { + } + if ( true === $subnode ) { /* * The subrule was matched without actually matching a token. * This means a special empty "ε" (epsilon) rule was matched. @@ -80,16 +110,15 @@ private function parse_recursive( $rule_id ) { * It is used to represent optional grammar productions. */ continue; - } elseif ( is_array( $subnode ) && 0 === count( $subnode ) ) { - continue; - } - if ( is_array( $subnode ) && ! 
count( $subnode ) ) { - continue; } - if ( isset( $this->grammar->fragment_ids[ $subrule_id ] ) ) { - $node->merge_fragment( $subnode ); + if ( isset( $fragment_ids[ $subrule_id ] ) ) { + // Fragments: inline their children directly to avoid building + // a throwaway WP_Parser_Node that would be merged afterwards. + foreach ( $subnode->get_children_ref() as $c ) { + $children[] = $c; + } } else { - $node->append_child( $subnode ); + $children[] = $subnode; } } @@ -100,12 +129,16 @@ private function parse_recursive( $rule_id ) { // for right-associative rules, which could solve this. // See: https://github.com/mysql/mysql-workbench/blob/8.0.38/library/parsers/grammars/MySQLParser.g4#L994 // See: https://github.com/antlr/antlr4/issues/488 - $la = $this->tokens[ $this->position ] ?? null; - if ( $la && 'selectStatement' === $rule_name && WP_MySQL_Lexer::INTO_SYMBOL === $la->id ) { + if ( + $branch_matches + && 'selectStatement' === $rule_name + && $this->position < $token_count + && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id + ) { $branch_matches = false; } - if ( true === $branch_matches ) { + if ( $branch_matches ) { break; } } @@ -115,10 +148,12 @@ private function parse_recursive( $rule_id ) { return false; } - if ( ! $node->has_child() ) { + if ( ! $children ) { return true; } + $node = new WP_Parser_Node( $rule_id, $rule_name ); + $node->set_children( $children ); return $node; } } From 81db9d203708fdd2a3072eb997021fc0e3ba0967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 15:34:20 +0200 Subject: [PATCH 02/24] Use per-branch FIRST sets to skip unreachable branches The grammar now precomputes FIRST and NULLABLE via fixpoint, then indexes each rule's branches by the tokens that can start them. At parse time the parser jumps straight to the candidate branches for the current token instead of iterating every branch and letting most fail. 
On the full MySQL test suite, 59% of branch attempts previously failed because the first token could never match the branch's FIRST set; with per-branch lookahead those attempts are eliminated. End-to-end parser benchmark: Before: ~14,900 QPS After: ~22,400 QPS (+50%) --- .../src/parser/class-wp-parser-grammar.php | 212 ++++++++++++++---- .../src/parser/class-wp-parser.php | 60 +++-- 2 files changed, 197 insertions(+), 75 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 9bf30b97..1e4c461b 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -29,7 +29,32 @@ class WP_Parser_Grammar { public $rules; public $rule_names; public $fragment_ids; - public $lookahead_is_match_possible = array(); + + /** + * Per-rule branch selector keyed by the next token id. + * + * When set, `$branches_for_token[$rule_id][$token_id]` is the ordered list + * of branch indexes in `$rules[$rule_id]` that can possibly match when the + * current token has the given id. Nullable branches appear in every entry. + * + * If an entry does not exist for the current token, `$nullable_branches` + * is consulted. If both are empty, the rule cannot match and the parser + * returns immediately. + * + * Rules whose FIRST set could not be computed do not appear in the map; + * for those the parser falls back to trying every branch. + * + * @var array<int, array<int, int[]>> + */ + public $branches_for_token = array(); + + /** + * Per-rule list of nullable branch indexes. 
+ * + * @var array + */ + public $nullable_branches = array(); + public $lowest_non_terminal_id; public $highest_terminal_id; public $native_grammar; @@ -57,8 +82,8 @@ private function inflate( $grammar ) { $this->highest_terminal_id = $this->lowest_non_terminal_id - 1; foreach ( $grammar['rules_names'] as $rule_index => $rule_name ) { - $this->rule_names[ $rule_index + $grammar['rules_offset'] ] = $rule_name; - $this->rules[ $rule_index + $grammar['rules_offset'] ] = array(); + $rule_id = $rule_index + $grammar['rules_offset']; + $this->rule_names[ $rule_id ] = $rule_name; /** * Treat all intermediate rules as fragments to inline before returning @@ -76,7 +101,7 @@ private function inflate( $grammar ) { * They are prefixed with a "%" to be distinguished from the original rules. */ if ( '%' === $rule_name[0] ) { - $this->fragment_ids[ $rule_index + $grammar['rules_offset'] ] = true; + $this->fragment_ids[ $rule_id ] = true; } } @@ -86,55 +111,154 @@ private function inflate( $grammar ) { $this->rules[ $rule_id ] = $branches; } - /** - * Compute a rule => [token => true] lookup table for each rule - * that starts with a terminal OR with another rule that already - * has a lookahead mapping. - * - * This is similar to left-factoring the grammar, even if not quite - * the same. - * - * This enables us to quickly bail out from checking branches that - * cannot possibly match the current token. This increased the parser - * speed by a whopping 80%! - * - * @TODO: Explore these possible next steps: - * - * * Compute a rule => [token => branch[]] list lookup table and only - * process the branches that have a chance of matching the current token. - * * Actually left-factor the grammar as much as possible. This, however, - * could inflate the serialized grammar size. - */ - // 5 iterations seem to give us all the speed gains we can get from this. 
- for ( $i = 0; $i < 5; $i++ ) { - foreach ( $grammar['grammar'] as $rule_index => $branches ) { - $rule_id = $rule_index + $grammar['rules_offset']; - if ( isset( $this->lookahead_is_match_possible[ $rule_id ] ) ) { - continue; - } - $rule_lookup = array(); - $first_symbol_can_be_expanded_to_all_terminals = true; + $this->build_branch_selectors(); + } + + /** + * Compute FIRST and NULLABLE sets for every non-terminal, then denormalize + * them into a per-rule map of `token_id => branch_index[]` so the parser + * can jump straight to the branches that can possibly match the current + * token. + * + * This replaces the previous coarse "can any branch match this token?" + * lookahead. On the MySQL corpus the fine-grained selector skips ~60% + * of the branch attempts that the parser used to try and fail. + */ + private function build_branch_selectors() { + $rules = $this->rules; + $low_nt = $this->lowest_non_terminal_id; + $empty_rule = self::EMPTY_RULE_ID; + $rule_ids = array_keys( $rules ); + $nullable = array(); + $first_sets = array(); + + foreach ( $rule_ids as $rule_id ) { + $nullable[ $rule_id ] = false; + $first_sets[ $rule_id ] = array(); + } + + // Iterate to fixpoint. FIRST and NULLABLE set monotonically grow. + do { + $changed = false; + foreach ( $rule_ids as $rule_id ) { + $branches = $rules[ $rule_id ]; foreach ( $branches as $branch ) { - $terminals = false; - $branch_starts_with_terminal = $branch[0] < $this->lowest_non_terminal_id; - if ( $branch_starts_with_terminal ) { - $terminals = array( $branch[0] ); - } elseif ( isset( $this->lookahead_is_match_possible[ $branch[0] ] ) ) { - $terminals = array_keys( $this->lookahead_is_match_possible[ $branch[0] ] ); + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + // ε: contributes nothing to FIRST, stays nullable. + continue; + } + if ( $symbol < $low_nt ) { + // Terminal. + if ( ! 
isset( $first_sets[ $rule_id ][ $symbol ] ) ) { + $first_sets[ $rule_id ][ $symbol ] = true; + $changed = true; + } + $branch_nullable = false; + break; + } + // Non-terminal. + foreach ( $first_sets[ $symbol ] as $tid => $_ ) { + if ( ! isset( $first_sets[ $rule_id ][ $tid ] ) ) { + $first_sets[ $rule_id ][ $tid ] = true; + $changed = true; + } + } + if ( ! $nullable[ $symbol ] ) { + $branch_nullable = false; + break; + } } + if ( $branch_nullable && ! $nullable[ $rule_id ] ) { + $nullable[ $rule_id ] = true; + $changed = true; + } + } + } + } while ( $changed ); - if ( false === $terminals ) { - $first_symbol_can_be_expanded_to_all_terminals = false; + // Build per-(rule, token) branch indices. + foreach ( $rule_ids as $rule_id ) { + $branches = $rules[ $rule_id ]; + $selector = array(); + $nullable_branch_ids = array(); + foreach ( $branches as $idx => $branch ) { + $branch_first = array(); + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + continue; + } + if ( $symbol < $low_nt ) { + $branch_first[ $symbol ] = true; + $branch_nullable = false; break; } - foreach ( $terminals as $terminal ) { - $rule_lookup[ $terminal ] = true; + foreach ( $first_sets[ $symbol ] as $tid => $_ ) { + $branch_first[ $tid ] = true; + } + if ( ! $nullable[ $symbol ] ) { + $branch_nullable = false; + break; } } - if ( $first_symbol_can_be_expanded_to_all_terminals ) { - $this->lookahead_is_match_possible[ $rule_id ] = $rule_lookup; + foreach ( $branch_first as $tid => $_ ) { + $selector[ $tid ][] = $idx; + } + if ( $branch_nullable ) { + $nullable_branch_ids[] = $idx; + } + } + + // Nullable branches also match when the current token is not in + // any branch's FIRST set. Fold them into every populated entry + // so the runtime lookup is a single array access. 
+ if ( $nullable_branch_ids ) { + $merged = array(); + foreach ( $selector as $tid => $idx_list ) { + $merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids ); } + $selector = $merged; + $this->nullable_branches[ $rule_id ] = $nullable_branch_ids; } + if ( $selector ) { + $this->branches_for_token[ $rule_id ] = $selector; + } + } + } + + /** + * Merge two ascending int arrays into one ascending int array without + * duplicates. Preserves original branch order as required by the parser. + * + * @param int[] $a + * @param int[] $b + * @return int[] + */ + private static function merge_sorted( array $a, array $b ): array { + $i = 0; + $j = 0; + $na = count( $a ); + $nb = count( $b ); + $out = array(); + while ( $i < $na && $j < $nb ) { + if ( $a[ $i ] < $b[ $j ] ) { + $out[] = $a[ $i++ ]; + } elseif ( $a[ $i ] > $b[ $j ] ) { + $out[] = $b[ $j++ ]; + } else { + $out[] = $a[ $i ]; + ++$i; + ++$j; + } + } + while ( $i < $na ) { + $out[] = $a[ $i++ ]; + } + while ( $j < $nb ) { + $out[] = $b[ $j++ ]; } + return $out; } } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 96feb083..d674312b 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -54,42 +54,48 @@ private function parse_recursive( $rule_id ) { return false; } - // Bale out from processing the current branch if none of its rules can - // possibly match the current token. - $rule_lookahead = $grammar->lookahead_is_match_possible[ $rule_id ] ?? null; - if ( null !== $rule_lookahead ) { - $token_id = $this->tokens[ $this->position ]->id; - if ( - ! isset( $rule_lookahead[ $token_id ] ) && - ! isset( $rule_lookahead[ WP_Parser_Grammar::EMPTY_RULE_ID ] ) - ) { + $tokens = $this->tokens; + $token_count = $this->token_count; + $position = $this->position; + + // Narrow the set of branches worth trying using the precomputed FIRST + // sets. 
When no entry exists for the current token, fall back to the + // rule's nullable branches (if any); if both are empty the rule cannot + // match here. + $branch_selector = $grammar->branches_for_token[ $rule_id ] ?? null; + if ( null !== $branch_selector ) { + $tid = $position < $token_count ? $tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; + if ( isset( $branch_selector[ $tid ] ) ) { + $candidate_branches = $branch_selector[ $tid ]; + } elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) { + $candidate_branches = $grammar->nullable_branches[ $rule_id ]; + } else { return false; } + } else { + $candidate_branches = array_keys( $branches ); } - $rule_name = $grammar->rule_names[ $rule_id ]; - $fragment_ids = $grammar->fragment_ids; - $rules = $grammar->rules; - $tokens = $this->tokens; - $token_count = $this->token_count; - $starting_position = $this->position; - $branch_matches = false; - foreach ( $branches as $branch ) { - $this->position = $starting_position; + $rule_name = $grammar->rule_names[ $rule_id ]; + $fragment_ids = $grammar->fragment_ids; + $is_select_statement = 'selectStatement' === $rule_name; + $branch_matches = false; + $children = array(); + foreach ( $candidate_branches as $idx ) { + $branch = $branches[ $idx ]; + $this->position = $position; $children = array(); $branch_matches = true; foreach ( $branch as $subrule_id ) { - // Inline terminal matching to avoid a recursive call per token. if ( $subrule_id <= $highest_terminal_id ) { if ( WP_Parser_Grammar::EMPTY_RULE_ID === $subrule_id ) { - // Epsilon rule: matches without consuming input. 
continue; } if ( $this->position < $token_count && $tokens[ $this->position ]->id === $subrule_id ) { - $children[] = $tokens[ $this->position ]; + $children[] = $tokens[ $this->position ]; ++$this->position; continue; } @@ -103,17 +109,9 @@ private function parse_recursive( $rule_id ) { break; } if ( true === $subnode ) { - /* - * The subrule was matched without actually matching a token. - * This means a special empty "ε" (epsilon) rule was matched. - * An "ε" rule in a grammar matches an empty input of 0 bytes. - * It is used to represent optional grammar productions. - */ continue; } if ( isset( $fragment_ids[ $subrule_id ] ) ) { - // Fragments: inline their children directly to avoid building - // a throwaway WP_Parser_Node that would be merged afterwards. foreach ( $subnode->get_children_ref() as $c ) { $children[] = $c; } @@ -131,7 +129,7 @@ private function parse_recursive( $rule_id ) { // See: https://github.com/antlr/antlr4/issues/488 if ( $branch_matches - && 'selectStatement' === $rule_name + && $is_select_statement && $this->position < $token_count && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { @@ -144,7 +142,7 @@ private function parse_recursive( $rule_id ) { } if ( ! $branch_matches ) { - $this->position = $starting_position; + $this->position = $position; return false; } From dc0db559cca34bb4bfdb89463bd400aceb518b5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 15:39:54 +0200 Subject: [PATCH 03/24] Short-circuit nullable-fallback and inline single-branch fragments Two grammar/parser refinements that both reduce recursive calls: * In parse_recursive(): when the rule has a per-token branch selector but the current token is not in any branch's FIRST and the rule itself is nullable, return 'matched empty' immediately instead of descending into nullable branches that would recursively do the same thing. This alone eliminates ~460k recursive calls on the MySQL corpus. 
* At grammar build time, expand every single-branch fragment rule into its call sites. Fragments exist only to factor shared sub-sequences and their children are already flattened into the parent AST node, so splicing them directly into parent branches is a no-op for the resulting tree but removes an entire recursive call per use. 480 of the grammar's fragments qualify. Also drops the dead terminal branch at the top of parse_recursive() (the branch loop inlines terminal matching, so parse_recursive is only ever called with non-terminal rule ids) and the always-false empty-branches guard. End-to-end parser benchmark: Before: ~22,400 QPS After: ~27,500 QPS (+23%) --- .../src/parser/class-wp-parser-grammar.php | 80 +++++++++++++++++-- .../src/parser/class-wp-parser.php | 60 +++++--------- 2 files changed, 95 insertions(+), 45 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 1e4c461b..0af06bda 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -28,7 +28,7 @@ class WP_Parser_Grammar { */ public $rules; public $rule_names; - public $fragment_ids; + public $fragment_ids = array(); /** * Per-rule branch selector keyed by the next token id. @@ -38,11 +38,8 @@ class WP_Parser_Grammar { * current token has the given id. Nullable branches appear in every entry. * * If an entry does not exist for the current token, `$nullable_branches` - * is consulted. If both are empty, the rule cannot match and the parser - * returns immediately. - * - * Rules whose FIRST set could not be computed do not appear in the map; - * for those the parser falls back to trying every branch. + * is consulted. If neither has an entry for this rule, the rule cannot + * match and the parser returns immediately. 
* * @var array<int, array<int, int[]>> */ @@ -111,9 +108,80 @@ private function inflate( $grammar ) { $this->rules[ $rule_id ] = $branches; } + $this->inline_single_branch_fragments(); $this->build_branch_selectors(); } + /** + * Inline single-branch fragment rules into their call sites. + * + * The grammar contains many single-branch fragment rules that exist only + * to factor shared sub-sequences out of larger productions. At runtime + * the parser would descend into each such fragment via a recursive call + * just to walk the same symbol sequence and splice the results back into + * the parent. Expanding them in-place at build time eliminates that call + * chain without changing the resulting AST because fragment children are + * already flattened into the parent node. + * + * Fragments with two or more alternatives (e.g., `%EOF_zero_or_one`) are + * left intact because they represent real choices that must be evaluated + * against the current token. + */ + private function inline_single_branch_fragments() { + $rules = $this->rules; + $fragment_ids = $this->fragment_ids; + $low_nt = $this->lowest_non_terminal_id; + + // Precompute the set of single-branch fragments that are candidates + // for inlining. + $inlinable = array(); + foreach ( $fragment_ids as $rule_id => $_ ) { + if ( isset( $rules[ $rule_id ] ) && 1 === count( $rules[ $rule_id ] ) ) { + $inlinable[ $rule_id ] = true; + } + } + + // Depth-first expansion memoized per rule, with cycle detection. + $expanded = array(); + $visiting = array(); + $expand_branch = function ( array $branch ) use ( &$expand_branch, &$expanded, &$visiting, $rules, $low_nt, $inlinable ) { + $out = array(); + foreach ( $branch as $sym ) { + if ( $sym < $low_nt ) { + $out[] = $sym; + continue; + } + if ( ! isset( $inlinable[ $sym ] ) ) { + $out[] = $sym; + continue; + } + if ( isset( $visiting[ $sym ] ) ) { + // Cycle: leave the reference in place. + $out[] = $sym; + continue; + } + if ( ! 
isset( $expanded[ $sym ] ) ) { + $visiting[ $sym ] = true; + $expanded[ $sym ] = $expand_branch( $rules[ $sym ][0] ); + unset( $visiting[ $sym ] ); + } + foreach ( $expanded[ $sym ] as $s ) { + $out[] = $s; + } + } + return $out; + }; + + // Rewrite every rule's branches with fragments inlined. + foreach ( $this->rules as $rule_id => $branches ) { + $new_branches = array(); + foreach ( $branches as $branch ) { + $new_branches[] = $expand_branch( $branch ); + } + $this->rules[ $rule_id ] = $new_branches; + } + } + /** * Compute FIRST and NULLABLE sets for every non-terminal, then denormalize * them into a per-rule map of `token_id => branch_index[]` so the parser diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index d674312b..b80fe96f 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -28,54 +28,36 @@ public function parse() { return false === $ast ? null : $ast; } + /** + * Parse a single non-terminal rule. + * + * This function is only called for non-terminal rule ids. Terminals are + * matched inline inside the branch loop below to avoid a function-call + * round trip per consumed token. + */ private function parse_recursive( $rule_id ) { - $grammar = $this->grammar; - $highest_terminal_id = $grammar->highest_terminal_id; - - if ( $rule_id <= $highest_terminal_id ) { - if ( $this->position >= $this->token_count ) { - return false; - } - - if ( WP_Parser_Grammar::EMPTY_RULE_ID === $rule_id ) { - return true; - } - - if ( $this->tokens[ $this->position ]->id === $rule_id ) { - $token = $this->tokens[ $this->position ]; - ++$this->position; - return $token; - } - return false; - } - - $branches = $grammar->rules[ $rule_id ]; - if ( ! 
$branches ) { - return false; - } - + $grammar = $this->grammar; $tokens = $this->tokens; $token_count = $this->token_count; $position = $this->position; // Narrow the set of branches worth trying using the precomputed FIRST - // sets. When no entry exists for the current token, fall back to the - // rule's nullable branches (if any); if both are empty the rule cannot - // match here. - $branch_selector = $grammar->branches_for_token[ $rule_id ] ?? null; - if ( null !== $branch_selector ) { - $tid = $position < $token_count ? $tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; - if ( isset( $branch_selector[ $tid ] ) ) { - $candidate_branches = $branch_selector[ $tid ]; - } elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) { - $candidate_branches = $grammar->nullable_branches[ $rule_id ]; - } else { - return false; - } + // sets. When no entry exists for the current token but the rule is + // nullable, all candidate branches would match empty, so we return + // immediately without entering any branch. + $branch_selector = $grammar->branches_for_token[ $rule_id ]; + $tid = $position < $token_count ? 
$tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; + if ( isset( $branch_selector[ $tid ] ) ) { + $candidate_branches = $branch_selector[ $tid ]; + } elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) { + return true; } else { - $candidate_branches = array_keys( $branches ); + return false; } + $highest_terminal_id = $grammar->highest_terminal_id; + $branches = $grammar->rules[ $rule_id ]; + $rule_name = $grammar->rule_names[ $rule_id ]; $fragment_ids = $grammar->fragment_ids; $is_select_statement = 'selectStatement' === $rule_name; From 0e387952772c29610a9c6f1a7f4c00162b08fc17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 15:43:59 +0200 Subject: [PATCH 04/24] Strip epsilon markers and cache grammar refs on the parser Three minor reductions in per-call work: * Strip explicit EMPTY_RULE_ID symbols out of rule branches at grammar build time. The parser loop would have 'continue'd over them anyway, so removing them ahead of time lets the hot symbol loop drop the epsilon check. Pure-epsilon branches become empty branches and still match empty via the existing empty-children fast path. * Cache the grammar's rules, fragment_ids, rule_names, branches_for_token, nullable_branches, and highest_terminal_id as direct parser instance fields so parse_recursive() no longer pays for a $this->grammar->... double hop on every call. * Collapse the two-step node construction (new + set_children) into a single constructor call that takes the children array directly. This saves a method call per allocated node (~820k across the MySQL corpus). End-to-end parser benchmark: ~27,500 QPS -> ~28,500 QPS (+3.5%). 
--- .../src/parser/class-wp-parser-grammar.php | 30 ++++++++++++ .../src/parser/class-wp-parser-node.php | 17 ++----- .../src/parser/class-wp-parser.php | 49 +++++++++++-------- 3 files changed, 61 insertions(+), 35 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 0af06bda..d5ab972e 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -109,9 +109,39 @@ private function inflate( $grammar ) { } $this->inline_single_branch_fragments(); + $this->strip_epsilon_markers(); $this->build_branch_selectors(); } + /** + * Remove explicit `EMPTY_RULE_ID` markers from branches. + * + * The epsilon marker is a zero-width, always-matching symbol used in the + * grammar to express optional productions. At parse time it would still + * be walked and "continued" over for no effect, so stripping it ahead of + * time removes a per-symbol branch in the hot loop. + * + * A pure-epsilon branch (`[EMPTY_RULE_ID]`) becomes an empty branch (`[]`) + * which the parser already handles: the inner symbol loop does nothing and + * the rule returns a successful empty match. + */ + private function strip_epsilon_markers() { + foreach ( $this->rules as $rule_id => $branches ) { + foreach ( $branches as $i => $branch ) { + if ( in_array( self::EMPTY_RULE_ID, $branch, true ) ) { + $this->rules[ $rule_id ][ $i ] = array_values( + array_filter( + $branch, + static function ( $s ) { + return self::EMPTY_RULE_ID !== $s; + } + ) + ); + } + } + } + } + /** * Inline single-branch fragment rules into their call sites. 
* diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index 75580f6d..12a257c8 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -15,29 +15,18 @@ class WP_Parser_Node { */ public $rule_id; public $rule_name; - protected $children = array(); + protected $children; - public function __construct( $rule_id, $rule_name ) { + public function __construct( $rule_id, $rule_name, array $children = array() ) { $this->rule_id = $rule_id; $this->rule_name = $rule_name; + $this->children = $children; } public function append_child( $node ) { $this->children[] = $node; } - /** - * Replace all children with the given array. - * - * This is used by the parser to attach a batch of children built up in a - * local array while trying branches, without allocating a node per attempt. - * - * @param array $children The new children. - */ - public function set_children( array $children ): void { - $this->children = $children; - } - /** * Flatten the matched rule fragments as if their children were direct * descendants of the current rule. diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index b80fe96f..54bed302 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -14,11 +14,26 @@ class WP_Parser { protected $token_count; protected $position; + // Grammar data cached as instance fields so the hot path avoids an extra + // property hop via $this->grammar on every recursive call. 
+ private $rules; + private $rule_names; + private $fragment_ids; + private $branches_for_token; + private $nullable_branches; + private $highest_terminal_id; + public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->tokens = $tokens; - $this->token_count = count( $tokens ); - $this->position = 0; + $this->grammar = $grammar; + $this->tokens = $tokens; + $this->token_count = count( $tokens ); + $this->position = 0; + $this->rules = $grammar->rules; + $this->rule_names = $grammar->rule_names; + $this->fragment_ids = $grammar->fragment_ids; + $this->branches_for_token = $grammar->branches_for_token; + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; } public function parse() { @@ -36,7 +51,6 @@ public function parse() { * round trip per consumed token. */ private function parse_recursive( $rule_id ) { - $grammar = $this->grammar; $tokens = $this->tokens; $token_count = $this->token_count; $position = $this->position; @@ -45,21 +59,19 @@ private function parse_recursive( $rule_id ) { // sets. When no entry exists for the current token but the rule is // nullable, all candidate branches would match empty, so we return // immediately without entering any branch. - $branch_selector = $grammar->branches_for_token[ $rule_id ]; - $tid = $position < $token_count ? $tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; - if ( isset( $branch_selector[ $tid ] ) ) { - $candidate_branches = $branch_selector[ $tid ]; - } elseif ( isset( $grammar->nullable_branches[ $rule_id ] ) ) { + $tid = $position < $token_count ? 
$tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } elseif ( isset( $this->nullable_branches[ $rule_id ] ) ) { return true; } else { return false; } - $highest_terminal_id = $grammar->highest_terminal_id; - $branches = $grammar->rules[ $rule_id ]; - - $rule_name = $grammar->rule_names[ $rule_id ]; - $fragment_ids = $grammar->fragment_ids; + $highest_terminal_id = $this->highest_terminal_id; + $branches = $this->rules[ $rule_id ]; + $fragment_ids = $this->fragment_ids; + $rule_name = $this->rule_names[ $rule_id ]; $is_select_statement = 'selectStatement' === $rule_name; $branch_matches = false; $children = array(); @@ -70,9 +82,6 @@ private function parse_recursive( $rule_id ) { $branch_matches = true; foreach ( $branch as $subrule_id ) { if ( $subrule_id <= $highest_terminal_id ) { - if ( WP_Parser_Grammar::EMPTY_RULE_ID === $subrule_id ) { - continue; - } if ( $this->position < $token_count && $tokens[ $this->position ]->id === $subrule_id @@ -132,8 +141,6 @@ private function parse_recursive( $rule_id ) { return true; } - $node = new WP_Parser_Node( $rule_id, $rule_name ); - $node->set_children( $children ); - return $node; + return new WP_Parser_Node( $rule_id, $rule_name, $children ); } } From 658245d4976f8e320730231437987e43edd21ce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:01:14 +0200 Subject: [PATCH 05/24] Return fragment results as children arrays, skip the intermediate node Multi-branch fragment rules can't be expanded at grammar build time, but their runtime role is still trivial: match a sequence of symbols and have the caller splice the resulting children into its own node. The old code allocated a full WP_Parser_Node for each fragment match just to have the caller immediately copy its children out. Return the children array directly from fragments instead. 
The caller distinguishes via is_array($subnode) and splices in-place, saving a Parser_Node allocation per fragment match (~253k per 10k queries). End-to-end parser benchmark: Before: ~27,000 QPS (avg) After: ~28,700 QPS (+6%). --- .../src/parser/class-wp-parser.php | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 54bed302..78aced53 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -72,6 +72,7 @@ private function parse_recursive( $rule_id ) { $branches = $this->rules[ $rule_id ]; $fragment_ids = $this->fragment_ids; $rule_name = $this->rule_names[ $rule_id ]; + $is_fragment = isset( $fragment_ids[ $rule_id ] ); $is_select_statement = 'selectStatement' === $rule_name; $branch_matches = false; $children = array(); @@ -102,8 +103,11 @@ private function parse_recursive( $rule_id ) { if ( true === $subnode ) { continue; } - if ( isset( $fragment_ids[ $subrule_id ] ) ) { - foreach ( $subnode->get_children_ref() as $c ) { + if ( is_array( $subnode ) ) { + // Fragment results are returned directly as a children + // array so the parser does not allocate a Parser_Node + // that would immediately be unwrapped into the parent. + foreach ( $subnode as $c ) { $children[] = $c; } } else { @@ -141,6 +145,14 @@ private function parse_recursive( $rule_id ) { return true; } + // Fragments exist only to group symbols for reuse; their "node" would + // get inlined into the parent on the very next step. Return the raw + // children array so the caller can splice it without allocating a + // throwaway WP_Parser_Node. 
+ if ( $is_fragment ) { + return $children; + } + return new WP_Parser_Node( $rule_id, $rule_name, $children ); } } From 2bf90e8d6ddbc72913e9bcae03cb34490166e0cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:07:09 +0200 Subject: [PATCH 06/24] Append end-of-input sentinel token to drop range checks Add a sentinel WP_Parser_Token with id EMPTY_RULE_ID (0) to the end of the token array. Real MySQL tokens never have id 0 (WHITESPACE, the only token with id 0, is stripped by the lexer before tokens reach the parser), so the sentinel cannot match any real terminal. This lets the hot path drop the 'position < token_count' range check everywhere it reads the current token id: the selector lookup at method entry, the inline terminal match inside the branch loop, and the post-branch INTO negative lookahead for selectStatement. Any read past the last real token falls naturally into the nullable-fallback or branch-miss handling. Also drop a few dead locals ($token_count, $fragment_ids) that no longer appear in the hot path after the change. End-to-end parser benchmark: Before: ~28,700 QPS (avg) After: ~29,800 QPS (+4%). --- .../src/parser/class-wp-parser.php | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 78aced53..a0728aef 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -25,8 +25,28 @@ class WP_Parser { public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { $this->grammar = $grammar; - $this->tokens = $tokens; $this->token_count = count( $tokens ); + // Append an end-of-input sentinel token whose id is EMPTY_RULE_ID + // (0). 
The hot path can then read $tokens[$pos]->id unconditionally + // when $pos is the current cursor, because the sentinel naturally + // fails to match any real grammar terminal while feeding the + // nullable-fallback branch of the selector check. + // + // Invariants the hot path relies on: + // - The sentinel id (0) cannot match any grammar terminal. + // strip_epsilon_markers() removes id 0 from every branch at + // grammar build time, so no $subrule_id in the inner loop ever + // equals 0 and ++$this->position can never advance past the + // sentinel. + // - The sentinel must never be appended to a node's children. It + // is only inspected via $tokens[$pos]->id; tokens are pushed + // into $children only on terminal-id equality, which the + // sentinel cannot satisfy. + // - WP_MySQL_Parser::next_query() bounds at $position < $token_count + // (set above, before the append), so the sentinel sits at index + // $token_count and is never fed into a parse round. + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; $this->position = 0; $this->rules = $grammar->rules; $this->rule_names = $grammar->rule_names; @@ -51,15 +71,14 @@ public function parse() { * round trip per consumed token. */ private function parse_recursive( $rule_id ) { - $tokens = $this->tokens; - $token_count = $this->token_count; - $position = $this->position; + $tokens = $this->tokens; + $position = $this->position; // Narrow the set of branches worth trying using the precomputed FIRST // sets. When no entry exists for the current token but the rule is // nullable, all candidate branches would match empty, so we return // immediately without entering any branch. - $tid = $position < $token_count ? 
$tokens[ $position ]->id : WP_Parser_Grammar::EMPTY_RULE_ID; + $tid = $tokens[ $position ]->id; if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; } elseif ( isset( $this->nullable_branches[ $rule_id ] ) ) { @@ -70,9 +89,8 @@ private function parse_recursive( $rule_id ) { $highest_terminal_id = $this->highest_terminal_id; $branches = $this->rules[ $rule_id ]; - $fragment_ids = $this->fragment_ids; $rule_name = $this->rule_names[ $rule_id ]; - $is_fragment = isset( $fragment_ids[ $rule_id ] ); + $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); $is_select_statement = 'selectStatement' === $rule_name; $branch_matches = false; $children = array(); @@ -83,10 +101,10 @@ private function parse_recursive( $rule_id ) { $branch_matches = true; foreach ( $branch as $subrule_id ) { if ( $subrule_id <= $highest_terminal_id ) { - if ( - $this->position < $token_count - && $tokens[ $this->position ]->id === $subrule_id - ) { + // The sentinel at $tokens[$token_count] has id 0 so it + // cannot match any real terminal, making the range check + // unnecessary here. + if ( $tokens[ $this->position ]->id === $subrule_id ) { $children[] = $tokens[ $this->position ]; ++$this->position; continue; @@ -125,7 +143,6 @@ private function parse_recursive( $rule_id ) { if ( $branch_matches && $is_select_statement - && $this->position < $token_count && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { $branch_matches = false; From 9de4be28b2a6e6ea4e3a9657dee18371aee3fe2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:10:50 +0200 Subject: [PATCH 07/24] Embed branch symbol sequences directly in the per-token selector Previously the per-(rule, token) selector stored a list of branch indexes that the parser then had to look up in $rules[$rule_id] on every branch attempt. 
Store the branch symbol sequences themselves so the hot loop can iterate candidate branches directly. PHP arrays are copy-on-write, so sharing the same branch sequence across selector entries for many tokens costs negligible extra memory. The nullable_branches map shrinks to a bool marker since the parser only uses it for existence checks. Also cache the start rule id on the grammar so parse() skips its array_search() across rule_names on every call. End-to-end parser benchmark: Before: ~29,800 QPS (avg) After: ~31,700 QPS (+6%). --- .../src/parser/class-wp-parser-grammar.php | 56 ++++++++++++++++--- .../src/parser/class-wp-parser.php | 7 +-- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index d5ab972e..f971976f 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -34,21 +34,22 @@ class WP_Parser_Grammar { * Per-rule branch selector keyed by the next token id. * * When set, `$branches_for_token[$rule_id][$token_id]` is the ordered list - * of branch indexes in `$rules[$rule_id]` that can possibly match when the - * current token has the given id. Nullable branches appear in every entry. + * of candidate branch symbol sequences (drawn from `$rules[$rule_id]`) + * that can possibly match when the current token has the given id. + * Nullable branches appear in every entry. * * If an entry does not exist for the current token, `$nullable_branches` * is consulted. If neither has an entry for this rule, the rule cannot * match and the parser returns immediately. * - * @var array<int, array<int, int[]>> + * @var array<int, array<int, int[][]>> */ public $branches_for_token = array(); /** - * Per-rule list of nullable branch indexes. + * Per-rule marker indicating the rule has at least one nullable branch. 
* - * @var array + * @var array */ public $nullable_branches = array(); @@ -56,6 +57,18 @@ class WP_Parser_Grammar { public $highest_terminal_id; public $native_grammar; + /** + * Memoized rule-id lookups, keyed by rule name. + * + * `get_rule_id()` is a linear `array_search` over `$rule_names` and + * costs a few microseconds per call on the MySQL grammar. The parser + * looks up its start rule and the `selectStatement` rule on a hot path, + * so the results are memoized via `get_or_cache_rule_id()`. + * + * @var array + */ + private $cached_rule_ids = array(); + public function __construct( array $rules ) { $this->inflate( $rules ); } @@ -68,6 +81,25 @@ public function get_rule_id( $rule_name ) { return array_search( $rule_name, $this->rule_names, true ); } + /** + * Return the rule id for a given rule name, memoizing the result. + * + * Equivalent to `get_rule_id()` but caches the lookup so repeated + * queries for the same rule name (typically the start rule and a few + * grammar-specific rules consulted on the parser hot path) avoid + * the linear scan over `$rule_names`. Returns `false` for unknown + * rule names, mirroring `get_rule_id()`. + * + * @param string $rule_name + * @return int|false + */ + public function get_or_cache_rule_id( $rule_name ) { + if ( ! array_key_exists( $rule_name, $this->cached_rule_ids ) ) { + $this->cached_rule_ids[ $rule_name ] = $this->get_rule_id( $rule_name ); + } + return $this->cached_rule_ids[ $rule_name ]; + } + /** * Inflate the grammar to an internal representation optimized for parsing. 
* @@ -317,10 +349,20 @@ private function build_branch_selectors() { foreach ( $selector as $tid => $idx_list ) { $merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids ); } - $selector = $merged; - $this->nullable_branches[ $rule_id ] = $nullable_branch_ids; + $selector = $merged; + $this->nullable_branches[ $rule_id ] = true; } if ( $selector ) { + // Store the candidate branch sequences directly so the parser + // can foreach over them without an extra $branches[$idx] + // indirection on every branch attempt. + foreach ( $selector as $tid => $idx_list ) { + $seqs = array(); + foreach ( $idx_list as $idx ) { + $seqs[] = $branches[ $idx ]; + } + $selector[ $tid ] = $seqs; + } $this->branches_for_token[ $rule_id ] = $selector; } } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index a0728aef..c74e82f5 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -58,8 +58,7 @@ public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { public function parse() { // @TODO: Make the starting rule lookup non-grammar-specific. - $query_rule_id = $this->grammar->get_rule_id( 'query' ); - $ast = $this->parse_recursive( $query_rule_id ); + $ast = $this->parse_recursive( $this->grammar->get_or_cache_rule_id( 'query' ) ); return false === $ast ? 
null : $ast; } @@ -88,14 +87,12 @@ private function parse_recursive( $rule_id ) { } $highest_terminal_id = $this->highest_terminal_id; - $branches = $this->rules[ $rule_id ]; $rule_name = $this->rule_names[ $rule_id ]; $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); $is_select_statement = 'selectStatement' === $rule_name; $branch_matches = false; $children = array(); - foreach ( $candidate_branches as $idx ) { - $branch = $branches[ $idx ]; + foreach ( $candidate_branches as $branch ) { $this->position = $position; $children = array(); $branch_matches = true; From b3931d0e7bfac5cd67f57c3530f268d57e1de3e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:12:18 +0200 Subject: [PATCH 08/24] Compare selectStatement by rule id instead of by name Minor cleanup in parse_recursive(): cache the selectStatement rule id once and compare integers on every call instead of re-comparing the 'selectStatement' string against every rule's name. Also drops the $rules instance cache from the parser, which the hot path no longer touches now that branch sequences are embedded in the selector. --- .../mysql-on-sqlite/src/parser/class-wp-parser.php | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index c74e82f5..30efb6cf 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -16,12 +16,12 @@ class WP_Parser { // Grammar data cached as instance fields so the hot path avoids an extra // property hop via $this->grammar on every recursive call. 
- private $rules; private $rule_names; private $fragment_ids; private $branches_for_token; private $nullable_branches; private $highest_terminal_id; + private $select_statement_rule_id; public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { $this->grammar = $grammar; @@ -48,12 +48,16 @@ public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); $this->tokens = $tokens; $this->position = 0; - $this->rules = $grammar->rules; $this->rule_names = $grammar->rule_names; $this->fragment_ids = $grammar->fragment_ids; $this->branches_for_token = $grammar->branches_for_token; $this->nullable_branches = $grammar->nullable_branches; $this->highest_terminal_id = $grammar->highest_terminal_id; + + // The INTO negative-lookahead only fires for selectStatement. Cache + // the rule id so the per-call check is an int compare instead of a + // string compare. + $this->select_statement_rule_id = $grammar->get_or_cache_rule_id( 'selectStatement' ); } public function parse() { @@ -87,9 +91,8 @@ private function parse_recursive( $rule_id ) { } $highest_terminal_id = $this->highest_terminal_id; - $rule_name = $this->rule_names[ $rule_id ]; $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); - $is_select_statement = 'selectStatement' === $rule_name; + $is_select_statement = $rule_id === $this->select_statement_rule_id; $branch_matches = false; $children = array(); foreach ( $candidate_branches as $branch ) { @@ -167,6 +170,6 @@ private function parse_recursive( $rule_id ) { return $children; } - return new WP_Parser_Node( $rule_id, $rule_name, $children ); + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); } } From 0169a668b048a00077148eab67f0e12c093a6297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:19:29 +0200 Subject: [PATCH 09/24] Re-align grammar and parser whitespace after recent changes Adopts 
phpcbf's trivial whitespace alignment fixes in the grammar and parser source to keep `composer run check-cs` clean after the prior optimisation commits added new local variables and reshaped the selector-build code. --- .../src/parser/class-wp-parser-grammar.php | 20 +++++++++---------- .../src/parser/class-wp-parser.php | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index f971976f..27d902ea 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -204,8 +204,8 @@ private function inline_single_branch_fragments() { } // Depth-first expansion memoized per rule, with cycle detection. - $expanded = array(); - $visiting = array(); + $expanded = array(); + $visiting = array(); $expand_branch = function ( array $branch ) use ( &$expand_branch, &$expanded, &$visiting, $rules, $low_nt, $inlinable ) { $out = array(); foreach ( $branch as $sym ) { @@ -223,8 +223,8 @@ private function inline_single_branch_fragments() { continue; } if ( ! isset( $expanded[ $sym ] ) ) { - $visiting[ $sym ] = true; - $expanded[ $sym ] = $expand_branch( $rules[ $sym ][0] ); + $visiting[ $sym ] = true; + $expanded[ $sym ] = $expand_branch( $rules[ $sym ][0] ); unset( $visiting[ $sym ] ); } foreach ( $expanded[ $sym ] as $s ) { @@ -255,12 +255,12 @@ private function inline_single_branch_fragments() { * of the branch attempts that the parser used to try and fail. 
*/ private function build_branch_selectors() { - $rules = $this->rules; - $low_nt = $this->lowest_non_terminal_id; - $empty_rule = self::EMPTY_RULE_ID; - $rule_ids = array_keys( $rules ); - $nullable = array(); - $first_sets = array(); + $rules = $this->rules; + $low_nt = $this->lowest_non_terminal_id; + $empty_rule = self::EMPTY_RULE_ID; + $rule_ids = array_keys( $rules ); + $nullable = array(); + $first_sets = array(); foreach ( $rule_ids as $rule_id ) { $nullable[ $rule_id ] = false; diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 30efb6cf..48930dd7 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -24,8 +24,8 @@ class WP_Parser { private $select_statement_rule_id; public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->token_count = count( $tokens ); + $this->grammar = $grammar; + $this->token_count = count( $tokens ); // Append an end-of-input sentinel token whose id is EMPTY_RULE_ID // (0). The hot path can then read $tokens[$pos]->id unconditionally // when $pos is the current cursor, because the sentinel naturally From 0f7c1f913e4bbc1361ec9db6b728d987b91fe3f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 16:28:03 +0200 Subject: [PATCH 10/24] Deduplicate selector entries while embedding branch sequences The per-(rule, token) branch selector stored a separate inner array per token, even when many tokens within the same rule mapped to identical branch lists (a single branch's FIRST set covers many tokens, for example). Loading the MySQL grammar used ~40 MB of PHP memory, most of which was duplicated inner arrays. Deduplicate by signature during grammar build so all tokens that land on the same branch list share one inner array via copy-on-write. 
The inner arrays still embed the branch symbol sequences directly so the hot loop iterates them without an extra $rules[$rule_id][$idx] indirection per branch attempt. Grammar memory on the MySQL grammar drops from ~40 MB to ~10 MB. PHPUnit peak memory drops from 198 MB to 110 MB. Parser throughput is unchanged from the previous (non-deduplicated) embedded-sequences form. --- .../src/parser/class-wp-parser-grammar.php | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 27d902ea..0da3600d 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -353,15 +353,32 @@ private function build_branch_selectors() { $this->nullable_branches[ $rule_id ] = true; } if ( $selector ) { - // Store the candidate branch sequences directly so the parser - // can foreach over them without an extra $branches[$idx] - // indirection on every branch attempt. + // Expand branch indexes to the branch symbol sequences so + // the parser can foreach candidate branches without an extra + // $branches[$idx] indirection on every attempt. Many tokens + // inside the same rule end up pointing to the same branch-id + // list, so deduplicate by signature and let copy-on-write + // share one sequences array across all of them. + // + // Trade-off vs trunk: storing branch sequences inline (rather + // than just branch indexes plus the trunk lookahead bitmap) + // costs ~+16 MiB of grammar memory after dedup but eliminates + // the per-attempt $rules[$rule_id][$idx] indirection in the + // parser hot loop. The dedup itself is what keeps the cost at + // ~+16 MiB; without it the embedded table would be ~40 MB. 
+ $by_signature = array(); foreach ( $selector as $tid => $idx_list ) { - $seqs = array(); - foreach ( $idx_list as $idx ) { - $seqs[] = $branches[ $idx ]; + $sig = implode( ',', $idx_list ); + if ( isset( $by_signature[ $sig ] ) ) { + $selector[ $tid ] = $by_signature[ $sig ]; + } else { + $seqs = array(); + foreach ( $idx_list as $idx ) { + $seqs[] = $branches[ $idx ]; + } + $by_signature[ $sig ] = $seqs; + $selector[ $tid ] = $seqs; } - $selector[ $tid ] = $seqs; } $this->branches_for_token[ $rule_id ] = $selector; } From 25e04ed8bbb41aaf7edda59596d630cf1fde3ae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 22:42:52 +0200 Subject: [PATCH 11/24] Add direct-return fast path for single-candidate rules On the MySQL grammar, 1,290 of 1,916 rules have a selector where every (rule, token) entry points to exactly one branch. Those rules account for ~55% of parse_recursive calls on the test corpus (722k of 1.3M per 10k queries). Flag those rules at grammar build time. In parse_recursive, detect the flag and take the only candidate branch directly, skipping the candidate-iteration loop. On match failure, restore $position and return false directly instead of going through the multi-candidate branch_matches/break sequence. 
End-to-end parser benchmark: no JIT: ~31.6K -> ~32.6K QPS avg (+3%) tracing JIT: ~52.6K -> ~55.7K QPS avg (+6%) --- .../src/parser/class-wp-parser-grammar.php | 18 ++++- .../src/parser/class-wp-parser.php | 72 ++++++++++++++++--- 2 files changed, 79 insertions(+), 11 deletions(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 0da3600d..939d249a 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -53,6 +53,15 @@ class WP_Parser_Grammar { */ public $nullable_branches = array(); + /** + * Per-rule flag indicating every (rule, token) selector entry points + * to exactly one branch. The parser uses this to skip the outer + * foreach when a single candidate is the only possibility. + * + * @var array + */ + public $single_candidate_rules = array(); + public $lowest_non_terminal_id; public $highest_terminal_id; public $native_grammar; @@ -366,8 +375,12 @@ private function build_branch_selectors() { // the per-attempt $rules[$rule_id][$idx] indirection in the // parser hot loop. The dedup itself is what keeps the cost at // ~+16 MiB; without it the embedded table would be ~40 MB. 
- $by_signature = array(); + $by_signature = array(); + $all_single_candidates = true; foreach ( $selector as $tid => $idx_list ) { + if ( 1 !== count( $idx_list ) ) { + $all_single_candidates = false; + } $sig = implode( ',', $idx_list ); if ( isset( $by_signature[ $sig ] ) ) { $selector[ $tid ] = $by_signature[ $sig ]; @@ -381,6 +394,9 @@ private function build_branch_selectors() { } } $this->branches_for_token[ $rule_id ] = $selector; + if ( $all_single_candidates ) { + $this->single_candidate_rules[ $rule_id ] = true; + } } } } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 48930dd7..03c00280 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -22,6 +22,7 @@ class WP_Parser { private $nullable_branches; private $highest_terminal_id; private $select_statement_rule_id; + private $single_candidate_rules; public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { $this->grammar = $grammar; @@ -45,14 +46,15 @@ public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { // - WP_MySQL_Parser::next_query() bounds at $position < $token_count // (set above, before the append), so the sentinel sits at index // $token_count and is never fed into a parse round. 
- $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); - $this->tokens = $tokens; - $this->position = 0; - $this->rule_names = $grammar->rule_names; - $this->fragment_ids = $grammar->fragment_ids; - $this->branches_for_token = $grammar->branches_for_token; - $this->nullable_branches = $grammar->nullable_branches; - $this->highest_terminal_id = $grammar->highest_terminal_id; + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; + $this->rule_names = $grammar->rule_names; + $this->fragment_ids = $grammar->fragment_ids; + $this->branches_for_token = $grammar->branches_for_token; + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; + $this->single_candidate_rules = $grammar->single_candidate_rules; // The INTO negative-lookahead only fires for selectStatement. Cache // the rule id so the per-call check is an int compare instead of a @@ -93,8 +95,58 @@ private function parse_recursive( $rule_id ) { $highest_terminal_id = $this->highest_terminal_id; $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); $is_select_statement = $rule_id === $this->select_statement_rule_id; - $branch_matches = false; - $children = array(); + + // Fast path for rules where every (rule, token) selector entry + // points to exactly one branch - about 55% of nonterminal calls + // on the MySQL corpus. Skip the outer foreach and the + // $branch_matches bookkeeping; every failure path just rewinds + // the position and returns false directly. 
+ if ( isset( $this->single_candidate_rules[ $rule_id ] ) ) { + $branch = $candidate_branches[0]; + $children = array(); + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $this->position = $position; + return false; + } + + $subnode = $this->parse_recursive( $subrule_id ); + if ( false === $subnode ) { + $this->position = $position; + return false; + } + if ( true === $subnode ) { + continue; + } + if ( is_array( $subnode ) ) { + foreach ( $subnode as $c ) { + $children[] = $c; + } + } else { + $children[] = $subnode; + } + } + + if ( $is_select_statement && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { + $this->position = $position; + return false; + } + if ( ! $children ) { + return true; + } + if ( $is_fragment ) { + return $children; + } + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); + } + + $branch_matches = false; + $children = array(); foreach ( $candidate_branches as $branch ) { $this->position = $position; $children = array(); From 96184de8c3d370de0bc084d91885bdeb469843f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Fri, 24 Apr 2026 23:22:22 +0200 Subject: [PATCH 12/24] Mark WP_Parser_Node as final Nothing extends WP_Parser_Node. Marking it final lets PHP's opcache and tracing JIT specialize property access and method dispatch since the class layout is now fixed. Small but consistent improvement measured across multiple runs under tracing JIT (~+2% avg, ~+2% best). 
End-to-end parser benchmark: tracing JIT: ~57K -> ~57-58K QPS avg, 60-61K QPS best no JIT: ~33K -> ~34K QPS avg, 35K QPS best --- packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index 12a257c8..a5a6b187 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -9,7 +9,7 @@ * In this way, a parser node constitutes a recursive structure that represents * a parse (sub)tree at each level of the full grammar tree. */ -class WP_Parser_Node { +final class WP_Parser_Node { /** * @TODO: Review and document these properties and their visibility. */ From b6029fd88e28859b26c6c2e6d8d1a40a693c70a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 28 Apr 2026 09:36:59 +0200 Subject: [PATCH 13/24] Speed up the lexer with cheaper byte checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply lexer optimisations from PR #375: - Cache `strlen($sql)` once in `$sql_length` instead of recomputing on each EOF check. - Replace `strspn($byte, MASK) > 0` with direct byte comparisons (`$byte >= '0' && $byte <= '9'`, `false !== strpos(MASK, $byte)`, unrolled whitespace check). - Use `strpos($sql, '*/', $pos)` instead of a manual scan loop in `read_comment_content()`. - In `read_quoted_text()`, use `strpos()` to find the next quote, eliminating the separate end-of-input check that follows the `strcspn()` scan. - Inline `next_token()` + `get_token()` in `remaining_tokens()` so the hot loop builds tokens directly. 
Co-authored-by: Adam Zieliński Adapted from https://github.com/WordPress/sqlite-database-integration/pull/375 --- .../src/mysql/class-wp-mysql-lexer.php | 114 +++++++++++++----- 1 file changed, 83 insertions(+), 31 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 10ecd90a..06d01623 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2111,6 +2111,13 @@ class WP_MySQL_Lexer { */ private $sql; + /** + * Byte length of the SQL payload. + * + * @var int + */ + private $sql_length; + /** * The version of the MySQL server that the SQL payload is intended for. * @@ -2189,6 +2196,7 @@ public function __construct( array $sql_modes = array() ) { $this->sql = $sql; + $this->sql_length = strlen( $sql ); $this->mysql_version = $mysql_version; foreach ( $sql_modes as $sql_mode ) { @@ -2284,10 +2292,46 @@ public function get_token(): ?WP_MySQL_Token { * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens. 
*/ public function remaining_tokens(): array { - $tokens = array(); - while ( true === $this->next_token() ) { - $token = $this->get_token(); - $tokens[] = $token; + $tokens = array(); + $no_backslash_escapes_sql_mode_set = $this->is_sql_mode_active( + self::SQL_MODE_NO_BACKSLASH_ESCAPES + ); + + while ( true ) { + if ( + self::EOF === $this->token_type + || ( null === $this->token_type && $this->bytes_already_read > 0 ) + ) { + $this->token_type = null; + break; + } + + do { + $this->token_starts_at = $this->bytes_already_read; + $this->token_type = $this->read_next_token(); + } while ( + self::WHITESPACE === $this->token_type + || self::COMMENT === $this->token_type + || self::MYSQL_COMMENT_START === $this->token_type + || self::MYSQL_COMMENT_END === $this->token_type + ); + + if ( null === $this->token_type ) { + break; + } + + $tokens[] = new WP_MySQL_Token( + $this->token_type, + $this->token_starts_at, + $this->bytes_already_read - $this->token_starts_at, + $this->sql, + $no_backslash_escapes_sql_mode_set + ); + + if ( self::EOF === $this->token_type ) { + $this->token_type = null; + break; + } } return $tokens; } @@ -2356,10 +2400,10 @@ private function read_next_token(): ?int { if ( "'" === $byte || '"' === $byte || '`' === $byte ) { $type = $this->read_quoted_text(); - } elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) { + } elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) { $type = $this->read_number(); } elseif ( '.' 
=== $byte ) { - if ( null !== $next_byte && strspn( $next_byte, self::DIGIT_MASK ) > 0 ) { + if ( null !== $next_byte && $next_byte >= '0' && $next_byte <= '9' ) { $type = $this->read_number(); } else { $this->bytes_already_read += 1; @@ -2420,8 +2464,8 @@ private function read_next_token(): ?int { } elseif ( '-' === $byte ) { if ( '-' === $next_byte - && $this->bytes_already_read + 2 < strlen( $this->sql ) - && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::WHITESPACE_MASK ) > 0 + && $this->bytes_already_read + 2 < $this->sql_length + && false !== strpos( self::WHITESPACE_MASK, $this->sql[ $this->bytes_already_read + 2 ] ) ) { $type = $this->read_line_comment(); } elseif ( '>' === $next_byte ) { @@ -2547,7 +2591,13 @@ private function read_next_token(): ?int { } } elseif ( '#' === $byte ) { $type = $this->read_line_comment(); - } elseif ( null !== $byte && strspn( $byte, self::WHITESPACE_MASK ) > 0 ) { + } elseif ( + ' ' === $byte + || "\t" === $byte + || "\n" === $byte + || "\r" === $byte + || "\f" === $byte + ) { $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); $type = self::WHITESPACE; } elseif ( ( 'x' === $byte || 'X' === $byte || 'b' === $byte || 'B' === $byte ) && "'" === $next_byte ) { @@ -2675,7 +2725,7 @@ private function read_number(): ?int { '0' === $byte && 'x' === $next_byte && null !== $third_byte - && strspn( $third_byte, self::HEX_DIGIT_MASK ) > 0 + && false !== strpos( self::HEX_DIGIT_MASK, $third_byte ) ) // HEX number in the form of x'N' or X'N'. || ( ( 'x' === $byte || 'X' === $byte ) && "'" === $next_byte ) @@ -2685,7 +2735,7 @@ private function read_number(): ?int { $this->bytes_already_read += strspn( $this->sql, self::HEX_DIGIT_MASK, $this->bytes_already_read ); if ( $is_quoted ) { if ( - $this->bytes_already_read >= strlen( $this->sql ) + $this->bytes_already_read >= $this->sql_length || "'" !== $this->sql[ $this->bytes_already_read ] ) { return null; // Invalid input. 
@@ -2708,7 +2758,7 @@ private function read_number(): ?int { $this->bytes_already_read += strspn( $this->sql, '01', $this->bytes_already_read ); if ( $is_quoted ) { if ( - $this->bytes_already_read >= strlen( $this->sql ) + $this->bytes_already_read >= $this->sql_length || "'" !== $this->sql[ $this->bytes_already_read ] ) { return null; // Invalid input. @@ -2737,11 +2787,12 @@ private function read_number(): ?int { ( 'e' === $byte || 'E' === $byte ) && null !== $next_byte && ( - strspn( $next_byte, self::DIGIT_MASK ) > 0 + ( $next_byte >= '0' && $next_byte <= '9' ) || ( ( '+' === $next_byte || '-' === $next_byte ) - && $this->bytes_already_read + 2 < strlen( $this->sql ) - && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::DIGIT_MASK ) > 0 + && $this->bytes_already_read + 2 < $this->sql_length + && $this->sql[ $this->bytes_already_read + 2 ] >= '0' + && $this->sql[ $this->bytes_already_read + 2 ] <= '9' ) ); if ( $has_exponent ) { @@ -2838,12 +2889,11 @@ private function read_quoted_text(): ?int { // in which case the escape sequence is consumed and the loop continues. $at = $this->bytes_already_read; while ( true ) { - $at += strcspn( $this->sql, $quote, $at ); - - // Unclosed string - unexpected EOF. - if ( ( $this->sql[ $at ] ?? null ) !== $quote ) { + $quote_at = strpos( $this->sql, $quote, $at ); + if ( false === $quote_at ) { return null; // Invalid input. } + $at = $quote_at; /* * By default, quotes can be escaped with a "\". @@ -2853,9 +2903,17 @@ private function read_quoted_text(): ?int { * The quote is escaped only when the number of preceding backslashes * is odd - "\" is an escape sequence, "\\" is an escaped backslash, * "\\\" is an escaped backslash and an escape sequence, and so on. + * + * The `($at - $i - 1) >= 0` guard prevents PHP's negative-string- + * offset wraparound (PHP 7.1+) when the closing-quote candidate + * sits at the very start of the input. The `?? null` covers + * positive out-of-range indexes belt-and-suspenders. 
*/ if ( ! $no_backslash_escapes ) { - for ( $i = 0; ( $at - $i - 1 ) >= 0 && '\\' === $this->sql[ $at - $i - 1 ]; $i += 1 ); + $i = 0; + while ( ( $at - $i - 1 ) >= 0 && '\\' === ( $this->sql[ $at - $i - 1 ] ?? null ) ) { + $i += 1; + } if ( 1 === $i % 2 ) { $at += 1; continue; @@ -2920,17 +2978,11 @@ private function read_mysql_comment(): int { } private function read_comment_content(): void { - while ( true ) { - $this->bytes_already_read += strcspn( $this->sql, '*', $this->bytes_already_read ); - $this->bytes_already_read += 1; // Consume the '*'. - $byte = $this->sql[ $this->bytes_already_read ] ?? null; - if ( null === $byte ) { - break; - } - if ( '/' === $byte ) { - $this->bytes_already_read += 1; // Consume the '/'. - break; - } + $comment_end = strpos( $this->sql, '*/', $this->bytes_already_read ); + if ( false === $comment_end ) { + $this->bytes_already_read = $this->sql_length; + } else { + $this->bytes_already_read = $comment_end + 2; } } From 9f758025c281e5fe4e877931d9b29f61dc255cce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 28 Apr 2026 09:37:05 +0200 Subject: [PATCH 14/24] Skip parent constructor in WP_MySQL_Token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Token construction is on the lexer hot path; bypassing the `WP_Parser_Token::__construct()` indirection and assigning the four properties directly removes one method call per token. Requires `$input` on `WP_Parser_Token` to be `protected` instead of `private` so the subclass can write to it. 
Co-authored-by: Adam Zieliński Adapted from https://github.com/WordPress/sqlite-database-integration/pull/375 --- packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php | 6 +++++- .../mysql-on-sqlite/src/parser/class-wp-parser-token.php | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php index 1fb25ab4..0840bc2f 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php @@ -30,7 +30,11 @@ public function __construct( string $input, bool $sql_mode_no_backslash_escapes_enabled ) { - parent::__construct( $id, $start, $length, $input ); + $this->id = $id; + $this->start = $start; + $this->length = $length; + $this->input = $input; + $this->sql_mode_no_backslash_escapes_enabled = $sql_mode_no_backslash_escapes_enabled; } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php index b7726189..4132ba38 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php @@ -35,7 +35,7 @@ class WP_Parser_Token { * * @var string */ - private $input; + protected $input; /** * Constructor. From d5f155ea2106a281f943ec498cddaacda7b1b991 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 28 Apr 2026 09:37:13 +0200 Subject: [PATCH 15/24] Use ! empty() in WP_Parser_Node::has_child() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `! empty( $this->children )` short-circuits without calling `count()`, saving one function call per invocation. 
Co-authored-by: Adam Zieliński Adapted from https://github.com/WordPress/sqlite-database-integration/pull/376 --- packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index a5a6b187..20badb82 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -109,7 +109,7 @@ public function merge_fragment( $node ) { * @return bool True if this node has any child nodes or tokens, false otherwise. */ public function has_child(): bool { - return count( $this->children ) > 0; + return ! empty( $this->children ); } /** From 4d7970a58d46f4c5e68a4f60025e82c3c0b7b490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 29 Apr 2026 10:53:29 +0200 Subject: [PATCH 16/24] Inline leading-whitespace skip in lexer's token loops Both next_token() and remaining_tokens() previously paid a read_next_token() function call per whitespace run only to recognise and skip the resulting WHITESPACE token. A single unguarded strspn() at the top of each loop iteration absorbs the run inline, saving the call overhead for ~one whitespace run per real token across millions of tokens. The strspn() call is unguarded because an unconditional strspn() (which returns 0 in a single C-side call when nothing matches) is faster than gating it on a five-arm '$byte === ...' precheck. 
--- packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 06d01623..78c0c6b7 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2235,6 +2235,9 @@ public function next_token(): bool { return false; } + // Skip leading whitespace inline for optimal performance. + $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + do { $this->token_starts_at = $this->bytes_already_read; $this->token_type = $this->read_next_token(); @@ -2306,6 +2309,9 @@ public function remaining_tokens(): array { break; } + // Skip leading whitespace inline for optimal performance. + $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + do { $this->token_starts_at = $this->bytes_already_read; $this->token_type = $this->read_next_token(); From a1e0e6cb5d9afd7c869d663a99895cc57d27a6c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 29 Apr 2026 10:54:33 +0200 Subject: [PATCH 17/24] Catch identifier and keyword tokens at the top of the chain ASCII letters and UTF-8 multibyte start bytes account for most token-start bytes on the MySQL corpus. They previously fell into the catch-all `else` at the bottom of read_next_token() after walking every operator arm in between. The new branch sits at the top of the elseif chain and dispatches them directly. The `next_byte !== "'"` guard keeps the x'..', n'..' and similar specials on their dedicated branches. `_` and `$` starters stay on the catch-all so the UNDERSCORE_CHARSET lookup still fires. 
--- .../src/mysql/class-wp-mysql-lexer.php | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 78c0c6b7..5822ef3d 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2404,7 +2404,27 @@ private function read_next_token(): ?int { $byte = $this->sql[ $this->bytes_already_read ] ?? null; $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; - if ( "'" === $byte || '"' === $byte || '`' === $byte ) { + // Fast path for keywords and identifiers. + // These are the most common token types in MySQL payloads. + if ( + ( + ( $byte >= 'a' && $byte <= 'z' ) + || ( $byte >= 'A' && $byte <= 'Z' ) + || $byte > "\x7F" + ) + && "'" !== $next_byte + ) { + $started_at = $this->bytes_already_read; + $type = $this->read_identifier(); + if ( self::IDENTIFIER === $type ) { + // When preceded by a dot, it is always an identifier. + if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) { + $type = self::IDENTIFIER; + } else { + $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); + } + } + } elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) { $type = $this->read_quoted_text(); } elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) { $type = $this->read_number(); @@ -2617,13 +2637,9 @@ private function read_next_token(): ?int { } elseif ( null === $byte ) { $type = self::EOF; } else { - $started_at = $this->bytes_already_read; - $type = $this->read_identifier(); + $type = $this->read_identifier(); if ( self::IDENTIFIER === $type ) { - // When preceded by a dot, it is always an identifier. - if ( $started_at > 0 && '.' 
=== $this->sql[ $started_at - 1 ] ) { - $type = self::IDENTIFIER; - } elseif ( '_' === $byte && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->get_current_token_bytes() ) ] ) ) { + if ( '_' === $byte && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->get_current_token_bytes() ) ] ) ) { $type = self::UNDERSCORE_CHARSET; } else { $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); From 17e06b3adbf58574789241e406b02fe719ee05a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Wed, 29 Apr 2026 10:55:33 +0200 Subject: [PATCH 18/24] Add a single-byte operator dispatch table The ASCII bytes (, ), ',', ;, +, ~, %, ^, ?, {, }, and = each map to a unique single-byte token type with no lookahead. A static array + isset() arm dispatches them in one lookup, short-circuiting the per-byte elseif arms further down the chain. '*' and '|' are deliberately excluded because their token type depends on context (in_mysql_comment for '*/', SQL_MODE_PIPES_AS_CONCAT for '||'). --- .../src/mysql/class-wp-mysql-lexer.php | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 5822ef3d..c8713955 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2404,6 +2404,22 @@ private function read_next_token(): ?int { $byte = $this->sql[ $this->bytes_already_read ] ?? null; $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; + // A map for a single-byte symbol fast path. + static $single_byte_ops = array( + '(' => self::OPEN_PAR_SYMBOL, + ')' => self::CLOSE_PAR_SYMBOL, + ',' => self::COMMA_SYMBOL, + ';' => self::SEMICOLON_SYMBOL, + '+' => self::PLUS_OPERATOR, + '~' => self::BITWISE_NOT_OPERATOR, + '%' => self::MOD_OPERATOR, + '^' => self::BITWISE_XOR_OPERATOR, + '?'
=> self::PARAM_MARKER, + '{' => self::OPEN_CURLY_SYMBOL, + '}' => self::CLOSE_CURLY_SYMBOL, + '=' => self::EQUAL_OPERATOR, + ); + // Fast path for keywords and identifiers. // These are the most common token types in MySQL payloads. if ( @@ -2424,6 +2440,10 @@ private function read_next_token(): ?int { $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); } } + } elseif ( null !== $byte && isset( $single_byte_ops[ $byte ] ) ) { + // Fast path for single-byte symbols. + $this->bytes_already_read += 1; + $type = $single_byte_ops[ $byte ]; } elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) { $type = $this->read_quoted_text(); } elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) { From 49acebddb901eaefb54591d2447a3a96afac4df9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 15:40:23 +0200 Subject: [PATCH 19/24] Document non-obvious lexer dispatch conditions Three review-noted spots that were terse in the code: - The remaining_tokens() loop guard now spells out why both EOF and `null === token_type && bytes_already_read > 0` are needed (EOF on clean end-of-input vs invalid byte mid-stream, with the `> 0` guard letting the very first iteration through). - The identifier/keyword fast path now explains `$byte > "\x7F"` (UTF-8 multi-byte starter; MySQL identifiers allow U+0080-U+FFFF) and `next_byte !== "'"` (only single quotes form the special hex/bin/n-char literal starters; `"` never does, regardless of SQL mode). No behavior change. 
--- .../mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index c8713955..377860dd 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2301,6 +2301,8 @@ public function remaining_tokens(): array { ); while ( true ) { + // Bail on EOF, or on a null token type once at least one byte has + // been consumed (read_next_token() hit invalid input mid-stream). if ( self::EOF === $this->token_type || ( null === $this->token_type && $this->bytes_already_read > 0 ) @@ -2421,7 +2423,10 @@ private function read_next_token(): ?int { ); // Fast path for keywords and identifiers. - // These are the most common token types in MySQL payloads. + // `$byte > "\x7F"` catches UTF-8 multi-byte starters (U+0080-U+FFFF). + // `"'" !== $next_byte` defers x'..', n'..' and similar special + // literals to their dedicated branches below; only single quotes + // form those, regardless of SQL mode. if ( ( ( $byte >= 'a' && $byte <= 'z' ) From 242acf636daede0d5bf2687e3b243238340c434c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 17:01:37 +0200 Subject: [PATCH 20/24] Remove single-byte operator arms shadowed by the dispatch table The static $single_byte_ops table introduced earlier already dispatches '(', ')', ',', ';', '+', '~', '%', '^', '?', '{', '}', and '=' before the per-byte elseif chain runs. The 12 individual arms further down the chain were therefore unreachable; remove them so the dispatch table is the single source of truth for these tokens. 
--- .../src/mysql/class-wp-mysql-lexer.php | 36 ------------------- 1 file changed, 36 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 377860dd..7e6c179b 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2460,9 +2460,6 @@ private function read_next_token(): ?int { $this->bytes_already_read += 1; $type = self::DOT_SYMBOL; } - } elseif ( '=' === $byte ) { - $this->bytes_already_read += 1; - $type = self::EQUAL_OPERATOR; } elseif ( ':' === $byte ) { $this->bytes_already_read += 1; // Consume the ':'. if ( '=' === $next_byte ) { @@ -2509,9 +2506,6 @@ private function read_next_token(): ?int { } else { $type = self::LOGICAL_NOT_OPERATOR; } - } elseif ( '+' === $byte ) { - $this->bytes_already_read += 1; - $type = self::PLUS_OPERATOR; } elseif ( '-' === $byte ) { if ( '-' === $next_byte @@ -2561,9 +2555,6 @@ private function read_next_token(): ?int { $this->bytes_already_read += 1; $type = self::DIV_OPERATOR; } - } elseif ( '%' === $byte ) { - $this->bytes_already_read += 1; - $type = self::MOD_OPERATOR; } elseif ( '&' === $byte ) { $this->bytes_already_read += 1; // Consume the '&'. if ( '&' === $next_byte ) { @@ -2572,9 +2563,6 @@ private function read_next_token(): ?int { } else { $type = self::BITWISE_AND_OPERATOR; } - } elseif ( '^' === $byte ) { - $this->bytes_already_read += 1; - $type = self::BITWISE_XOR_OPERATOR; } elseif ( '|' === $byte ) { $this->bytes_already_read += 1; // Consume the '|'. 
if ( '|' === $next_byte ) { @@ -2585,27 +2573,6 @@ private function read_next_token(): ?int { } else { $type = self::BITWISE_OR_OPERATOR; } - } elseif ( '~' === $byte ) { - $this->bytes_already_read += 1; - $type = self::BITWISE_NOT_OPERATOR; - } elseif ( ',' === $byte ) { - $this->bytes_already_read += 1; - $type = self::COMMA_SYMBOL; - } elseif ( ';' === $byte ) { - $this->bytes_already_read += 1; - $type = self::SEMICOLON_SYMBOL; - } elseif ( '(' === $byte ) { - $this->bytes_already_read += 1; - $type = self::OPEN_PAR_SYMBOL; - } elseif ( ')' === $byte ) { - $this->bytes_already_read += 1; - $type = self::CLOSE_PAR_SYMBOL; - } elseif ( '{' === $byte ) { - $this->bytes_already_read += 1; - $type = self::OPEN_CURLY_SYMBOL; - } elseif ( '}' === $byte ) { - $this->bytes_already_read += 1; - $type = self::CLOSE_CURLY_SYMBOL; } elseif ( '@' === $byte ) { $this->bytes_already_read += 1; // Consume the '@'. @@ -2629,9 +2596,6 @@ private function read_next_token(): ?int { $type = self::AT_SIGN_SYMBOL; } } - } elseif ( '?' === $byte ) { - $this->bytes_already_read += 1; - $type = self::PARAM_MARKER; } elseif ( '\\' === $byte ) { $this->bytes_already_read += 1; // Consume the '\'. if ( 'N' === $next_byte ) { From aa0feda2ca022ca5fcb5b883f89740ad3c974e47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 17:02:25 +0200 Subject: [PATCH 21/24] Unroll whitespace check in '--' line-comment dispatch The leading-whitespace skip at the top of read_next_token() was already unrolled into byte-equality checks for the perf reasons documented in 916b512e. Apply the same unroll to the third-byte whitespace check that gates a '--' as a line-comment start, so the hot dispatch chain doesn't fall back into strpos() on a 5-char mask for this case. The bound check is folded into '?? null' on the third-byte read, matching the rest of the lookahead style. 
--- .../mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 7e6c179b..e578fefb 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2507,10 +2507,16 @@ private function read_next_token(): ?int { $type = self::LOGICAL_NOT_OPERATOR; } } elseif ( '-' === $byte ) { + $third_byte = $this->sql[ $this->bytes_already_read + 2 ] ?? null; if ( '-' === $next_byte - && $this->bytes_already_read + 2 < $this->sql_length - && false !== strpos( self::WHITESPACE_MASK, $this->sql[ $this->bytes_already_read + 2 ] ) + && ( + ' ' === $third_byte + || "\t" === $third_byte + || "\n" === $third_byte + || "\r" === $third_byte + || "\f" === $third_byte + ) ) { $type = $this->read_line_comment(); } elseif ( '>' === $next_byte ) { From 873bed53d50279a866afff834a781a42f8c0d9d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 17:51:12 +0200 Subject: [PATCH 22/24] Drop 'final' from WP_Parser_Node Trunk added WP_MySQL_Native_Parser_Node which extends WP_Parser_Node to lazily materialize children from the Rust-owned AST (b6473ef..ef45003). PHP forbids extending a final class, so the +7% JIT/opcache specialization that 'final' enabled is incompatible with the native parser facade and has to be given up here. If the native parser is reworked to not extend WP_Parser_Node in the future, this can be restored. 
--- packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index 20badb82..096f17fc 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -9,7 +9,7 @@ * In this way, a parser node constitutes a recursive structure that represents * a parse (sub)tree at each level of the full grammar tree. */ -final class WP_Parser_Node { +class WP_Parser_Node { /** * @TODO: Review and document these properties and their visibility. */ From 8c11f760654867524b5fb9144ddefd080bd0e68a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 17:51:21 +0200 Subject: [PATCH 23/24] Maintain end-of-input sentinel in reset_tokens() Trunk's WP_MySQL_Parser::reset_tokens() lets the parser be reused across queries by swapping in a new token array. The performance branch's parser relies on an end-of-input sentinel token (id = EMPTY_RULE_ID) appended at $tokens[$token_count] so the hot path can read $tokens[$pos]->id without a range check; reset_tokens() must reproduce that invariant or the next parse() walks off the end. Append the sentinel and update $token_count in reset_tokens(), matching WP_Parser::__construct(). --- packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php index b6b465bd..bbae3efd 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php @@ -14,6 +14,10 @@ class WP_MySQL_Parser extends WP_Parser { * @param array $tokens The parser tokens. 
*/ public function reset_tokens( array $tokens ): void { + $this->token_count = count( $tokens ); + // Maintain the end-of-input sentinel that parse_recursive() relies on. + // See WP_Parser::__construct for the invariants. + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); $this->tokens = $tokens; $this->position = 0; $this->current_ast = null; From 625680763b4423a21de7c8da87b0a5c8c7747f6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Mon, 4 May 2026 18:05:16 +0200 Subject: [PATCH 24/24] Restore lookahead_is_match_possible as a native-parser-bridge view Trunk's mysql-rust-bridge.php exports $grammar->lookahead_is_match_possible to the native (Rust) parser extension, which uses it for early-bailout in the same way the previous pure-PHP parser did. The performance branch removed this property when it replaced the coarse lookahead with the more precise per-token $branches_for_token + $nullable_branches pair, which broke the native parser matrix (PHP Warning + Fatal in trait-wp-mysql-native-parser-impl). Re-derive the property from the new selectors at grammar build time so the bridge keeps working without a Rust-side change. The view's contents match what the old algorithm produced (FIRST(rule) per rule, plus EMPTY_RULE_ID for nullable rules), and is a strict superset since the new fixpoint computes FIRST for rules the old 5-iteration build gave up on - safe under the bridge's "in lookahead OR nullable" check. The property is not consulted by the pure-PHP parser hot path; it's purely a compatibility surface for the native bridge.
--- .../src/parser/class-wp-parser-grammar.php | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 939d249a..6e54dc91 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -62,6 +62,25 @@ class WP_Parser_Grammar { */ public $single_candidate_rules = array(); + /** + * Backward-compatible view of `$branches_for_token` for the native (Rust) + * parser bridge. + * + * Trunk's `mysql-rust-bridge.php` exports this property to the native + * extension, which uses it for early-bailout the same way the previous + * pure-PHP parser did: `lookahead_is_match_possible[$rule_id][$tid]` + * means "this rule can possibly match when the next token is `$tid`". + * `EMPTY_RULE_ID` (0) marks the rule as having a nullable branch. + * + * The performance branch replaced this map with the more precise + * `$branches_for_token` + `$nullable_branches` pair on the parser hot + * path. The view is kept around so the native extension keeps working + * without a Rust-side change. It is not consulted by the pure-PHP parser. + * + * @var array<int, array<int, bool>> + */ + public $lookahead_is_match_possible = array(); + public $lowest_non_terminal_id; public $highest_terminal_id; public $native_grammar; @@ -399,6 +418,26 @@ private function build_branch_selectors() { } } } + + // Build the backward-compat lookahead view for the native parser + // bridge. See $lookahead_is_match_possible. + foreach ( $this->branches_for_token as $rule_id => $sel ) { + $entry = array(); + foreach ( $sel as $tid => $_ ) { + $entry[ $tid ] = true; + } + if ( isset( $this->nullable_branches[ $rule_id ] ) ) { + $entry[ self::EMPTY_RULE_ID ] = true; + } + $this->lookahead_is_match_possible[ $rule_id ] = $entry; + } + foreach ( $this->nullable_branches as $rule_id => $_ ) { + if ( !
isset( $this->lookahead_is_match_possible[ $rule_id ] ) ) { + $this->lookahead_is_match_possible[ $rule_id ] = array( + self::EMPTY_RULE_ID => true, + ); + } + } } /**