-
Notifications
You must be signed in to change notification settings - Fork 59
Parser + lexer performance: consolidated 2–3× end-to-end speedup #378
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from all commits
9d8e23e
81db9d2
dc0db55
0e38795
658245d
2bf90e8
9de4be2
b3931d0
0169a66
0f7c1f9
25e04ed
96184de
b6029fd
9f75802
d5f155e
4d7970a
a1e0e6c
17e06b3
49acebd
242acf6
aa0feda
873bed5
8c11f76
6256807
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2111,6 +2111,13 @@ class WP_MySQL_Lexer { | |
| */ | ||
| private $sql; | ||
|
|
||
| /** | ||
| * Byte length of the SQL payload. | ||
| * | ||
| * @var int | ||
| */ | ||
| private $sql_length; | ||
|
|
||
| /** | ||
| * The version of the MySQL server that the SQL payload is intended for. | ||
| * | ||
|
|
@@ -2189,6 +2196,7 @@ public function __construct( | |
| array $sql_modes = array() | ||
| ) { | ||
| $this->sql = $sql; | ||
| $this->sql_length = strlen( $sql ); | ||
| $this->mysql_version = $mysql_version; | ||
|
|
||
| foreach ( $sql_modes as $sql_mode ) { | ||
|
|
@@ -2227,6 +2235,9 @@ public function next_token(): bool { | |
| return false; | ||
| } | ||
|
|
||
| // Skip leading whitespace inline for optimal performance. | ||
| $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); | ||
|
|
||
| do { | ||
| $this->token_starts_at = $this->bytes_already_read; | ||
| $this->token_type = $this->read_next_token(); | ||
|
|
@@ -2284,10 +2295,51 @@ public function get_token(): ?WP_MySQL_Token { | |
| * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens. | ||
| */ | ||
| public function remaining_tokens(): array { | ||
| $tokens = array(); | ||
| while ( true === $this->next_token() ) { | ||
| $token = $this->get_token(); | ||
| $tokens[] = $token; | ||
| $tokens = array(); | ||
| $no_backslash_escapes_sql_mode_set = $this->is_sql_mode_active( | ||
| self::SQL_MODE_NO_BACKSLASH_ESCAPES | ||
| ); | ||
|
|
||
| while ( true ) { | ||
| // Bail on EOF, or on a null token type once at least one byte has | ||
| // been consumed (read_next_token() hit invalid input mid-stream). | ||
| if ( | ||
| self::EOF === $this->token_type | ||
| || ( null === $this->token_type && $this->bytes_already_read > 0 ) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Shouldn't EOF cover that?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Addressed in f9172e1. EOF and the second arm catch different cases: |
||
| ) { | ||
| $this->token_type = null; | ||
| break; | ||
| } | ||
|
|
||
| // Skip leading whitespace inline for optimal performance. | ||
| $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); | ||
|
|
||
| do { | ||
| $this->token_starts_at = $this->bytes_already_read; | ||
| $this->token_type = $this->read_next_token(); | ||
| } while ( | ||
| self::WHITESPACE === $this->token_type | ||
| || self::COMMENT === $this->token_type | ||
| || self::MYSQL_COMMENT_START === $this->token_type | ||
| || self::MYSQL_COMMENT_END === $this->token_type | ||
| ); | ||
|
|
||
| if ( null === $this->token_type ) { | ||
| break; | ||
| } | ||
|
|
||
| $tokens[] = new WP_MySQL_Token( | ||
| $this->token_type, | ||
| $this->token_starts_at, | ||
| $this->bytes_already_read - $this->token_starts_at, | ||
| $this->sql, | ||
| $no_backslash_escapes_sql_mode_set | ||
| ); | ||
|
|
||
| if ( self::EOF === $this->token_type ) { | ||
| $this->token_type = null; | ||
| break; | ||
| } | ||
| } | ||
| return $tokens; | ||
| } | ||
|
|
@@ -2354,20 +2406,60 @@ private function read_next_token(): ?int { | |
| $byte = $this->sql[ $this->bytes_already_read ] ?? null; | ||
| $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; | ||
|
|
||
| if ( "'" === $byte || '"' === $byte || '`' === $byte ) { | ||
| // A map for a single-byte symbol fast path. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nice |
||
| static $single_byte_ops = array( | ||
| '(' => self::OPEN_PAR_SYMBOL, | ||
| ')' => self::CLOSE_PAR_SYMBOL, | ||
| ',' => self::COMMA_SYMBOL, | ||
| ';' => self::SEMICOLON_SYMBOL, | ||
| '+' => self::PLUS_OPERATOR, | ||
| '~' => self::BITWISE_NOT_OPERATOR, | ||
| '%' => self::MOD_OPERATOR, | ||
| '^' => self::BITWISE_XOR_OPERATOR, | ||
| '?' => self::PARAM_MARKER, | ||
| '{' => self::OPEN_CURLY_SYMBOL, | ||
| '}' => self::CLOSE_CURLY_SYMBOL, | ||
| '=' => self::EQUAL_OPERATOR, | ||
| ); | ||
|
|
||
| // Fast path for keywords and identifiers. | ||
| // `$byte > "\x7F"` catches UTF-8 multi-byte starters (U+0080-U+FFFF). | ||
| // `"'" !== $next_byte` defers x'..', n'..' and similar special | ||
| // literals to their dedicated branches below; only single quotes | ||
| // form those, regardless of SQL mode. | ||
| if ( | ||
| ( | ||
| ( $byte >= 'a' && $byte <= 'z' ) | ||
| || ( $byte >= 'A' && $byte <= 'Z' ) | ||
| || $byte > "\x7F" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd leave a comment on why `$byte > "\x7F"`
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Addressed in f9172e1. |
||
| ) | ||
| && "'" !== $next_byte | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why just `'`?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Addressed in f9172e1. |
||
| ) { | ||
| $started_at = $this->bytes_already_read; | ||
| $type = $this->read_identifier(); | ||
| if ( self::IDENTIFIER === $type ) { | ||
| // When preceded by a dot, it is always an identifier. | ||
| if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) { | ||
| $type = self::IDENTIFIER; | ||
| } else { | ||
| $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); | ||
| } | ||
| } | ||
| } elseif ( null !== $byte && isset( $single_byte_ops[ $byte ] ) ) { | ||
| // Fast path for single-byte symbols. | ||
| $this->bytes_already_read += 1; | ||
| $type = $single_byte_ops[ $byte ]; | ||
| } elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) { | ||
| $type = $this->read_quoted_text(); | ||
| } elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) { | ||
| } elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) { | ||
| $type = $this->read_number(); | ||
| } elseif ( '.' === $byte ) { | ||
| if ( null !== $next_byte && strspn( $next_byte, self::DIGIT_MASK ) > 0 ) { | ||
| if ( null !== $next_byte && $next_byte >= '0' && $next_byte <= '9' ) { | ||
| $type = $this->read_number(); | ||
| } else { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::DOT_SYMBOL; | ||
| } | ||
| } elseif ( '=' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::EQUAL_OPERATOR; | ||
| } elseif ( ':' === $byte ) { | ||
| $this->bytes_already_read += 1; // Consume the ':'. | ||
| if ( '=' === $next_byte ) { | ||
|
|
@@ -2414,14 +2506,17 @@ private function read_next_token(): ?int { | |
| } else { | ||
| $type = self::LOGICAL_NOT_OPERATOR; | ||
| } | ||
| } elseif ( '+' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::PLUS_OPERATOR; | ||
| } elseif ( '-' === $byte ) { | ||
| $third_byte = $this->sql[ $this->bytes_already_read + 2 ] ?? null; | ||
| if ( | ||
| '-' === $next_byte | ||
| && $this->bytes_already_read + 2 < strlen( $this->sql ) | ||
| && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::WHITESPACE_MASK ) > 0 | ||
| && ( | ||
| ' ' === $third_byte | ||
| || "\t" === $third_byte | ||
| || "\n" === $third_byte | ||
| || "\r" === $third_byte | ||
| || "\f" === $third_byte | ||
| ) | ||
| ) { | ||
| $type = $this->read_line_comment(); | ||
| } elseif ( '>' === $next_byte ) { | ||
|
|
@@ -2466,9 +2561,6 @@ private function read_next_token(): ?int { | |
| $this->bytes_already_read += 1; | ||
| $type = self::DIV_OPERATOR; | ||
| } | ||
| } elseif ( '%' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::MOD_OPERATOR; | ||
| } elseif ( '&' === $byte ) { | ||
| $this->bytes_already_read += 1; // Consume the '&'. | ||
| if ( '&' === $next_byte ) { | ||
|
|
@@ -2477,9 +2569,6 @@ private function read_next_token(): ?int { | |
| } else { | ||
| $type = self::BITWISE_AND_OPERATOR; | ||
| } | ||
| } elseif ( '^' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::BITWISE_XOR_OPERATOR; | ||
| } elseif ( '|' === $byte ) { | ||
| $this->bytes_already_read += 1; // Consume the '|'. | ||
| if ( '|' === $next_byte ) { | ||
|
|
@@ -2490,27 +2579,6 @@ private function read_next_token(): ?int { | |
| } else { | ||
| $type = self::BITWISE_OR_OPERATOR; | ||
| } | ||
| } elseif ( '~' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::BITWISE_NOT_OPERATOR; | ||
| } elseif ( ',' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::COMMA_SYMBOL; | ||
| } elseif ( ';' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::SEMICOLON_SYMBOL; | ||
| } elseif ( '(' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::OPEN_PAR_SYMBOL; | ||
| } elseif ( ')' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::CLOSE_PAR_SYMBOL; | ||
| } elseif ( '{' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::OPEN_CURLY_SYMBOL; | ||
| } elseif ( '}' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::CLOSE_CURLY_SYMBOL; | ||
| } elseif ( '@' === $byte ) { | ||
| $this->bytes_already_read += 1; // Consume the '@'. | ||
|
|
||
|
|
@@ -2534,9 +2602,6 @@ private function read_next_token(): ?int { | |
| $type = self::AT_SIGN_SYMBOL; | ||
| } | ||
| } | ||
| } elseif ( '?' === $byte ) { | ||
| $this->bytes_already_read += 1; | ||
| $type = self::PARAM_MARKER; | ||
| } elseif ( '\\' === $byte ) { | ||
| $this->bytes_already_read += 1; // Consume the '\'. | ||
| if ( 'N' === $next_byte ) { | ||
|
|
@@ -2547,7 +2612,13 @@ private function read_next_token(): ?int { | |
| } | ||
| } elseif ( '#' === $byte ) { | ||
| $type = $this->read_line_comment(); | ||
| } elseif ( null !== $byte && strspn( $byte, self::WHITESPACE_MASK ) > 0 ) { | ||
| } elseif ( | ||
| ' ' === $byte | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would array + isset() be faster?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Marginally faster, but this branch rarely fires. |
||
| || "\t" === $byte | ||
| || "\n" === $byte | ||
| || "\r" === $byte | ||
| || "\f" === $byte | ||
| ) { | ||
| $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); | ||
| $type = self::WHITESPACE; | ||
| } elseif ( ( 'x' === $byte || 'X' === $byte || 'b' === $byte || 'B' === $byte ) && "'" === $next_byte ) { | ||
|
|
@@ -2561,13 +2632,9 @@ private function read_next_token(): ?int { | |
| } elseif ( null === $byte ) { | ||
| $type = self::EOF; | ||
| } else { | ||
| $started_at = $this->bytes_already_read; | ||
| $type = $this->read_identifier(); | ||
| $type = $this->read_identifier(); | ||
| if ( self::IDENTIFIER === $type ) { | ||
| // When preceded by a dot, it is always an identifier. | ||
| if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) { | ||
| $type = self::IDENTIFIER; | ||
| } elseif ( '_' === $byte && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->get_current_token_bytes() ) ] ) ) { | ||
| if ( '_' === $byte && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->get_current_token_bytes() ) ] ) ) { | ||
| $type = self::UNDERSCORE_CHARSET; | ||
| } else { | ||
| $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); | ||
|
|
@@ -2675,7 +2742,7 @@ private function read_number(): ?int { | |
| '0' === $byte | ||
| && 'x' === $next_byte | ||
| && null !== $third_byte | ||
| && strspn( $third_byte, self::HEX_DIGIT_MASK ) > 0 | ||
| && false !== strpos( self::HEX_DIGIT_MASK, $third_byte ) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. clever |
||
| ) | ||
| // HEX number in the form of x'N' or X'N'. | ||
| || ( ( 'x' === $byte || 'X' === $byte ) && "'" === $next_byte ) | ||
|
|
@@ -2685,7 +2752,7 @@ private function read_number(): ?int { | |
| $this->bytes_already_read += strspn( $this->sql, self::HEX_DIGIT_MASK, $this->bytes_already_read ); | ||
| if ( $is_quoted ) { | ||
| if ( | ||
| $this->bytes_already_read >= strlen( $this->sql ) | ||
| $this->bytes_already_read >= $this->sql_length | ||
| || "'" !== $this->sql[ $this->bytes_already_read ] | ||
| ) { | ||
| return null; // Invalid input. | ||
|
|
@@ -2708,7 +2775,7 @@ private function read_number(): ?int { | |
| $this->bytes_already_read += strspn( $this->sql, '01', $this->bytes_already_read ); | ||
| if ( $is_quoted ) { | ||
| if ( | ||
| $this->bytes_already_read >= strlen( $this->sql ) | ||
| $this->bytes_already_read >= $this->sql_length | ||
| || "'" !== $this->sql[ $this->bytes_already_read ] | ||
| ) { | ||
| return null; // Invalid input. | ||
|
|
@@ -2737,11 +2804,12 @@ private function read_number(): ?int { | |
| ( 'e' === $byte || 'E' === $byte ) | ||
| && null !== $next_byte | ||
| && ( | ||
| strspn( $next_byte, self::DIGIT_MASK ) > 0 | ||
| ( $next_byte >= '0' && $next_byte <= '9' ) | ||
| || ( | ||
| ( '+' === $next_byte || '-' === $next_byte ) | ||
| && $this->bytes_already_read + 2 < strlen( $this->sql ) | ||
| && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::DIGIT_MASK ) > 0 | ||
| && $this->bytes_already_read + 2 < $this->sql_length | ||
| && $this->sql[ $this->bytes_already_read + 2 ] >= '0' | ||
| && $this->sql[ $this->bytes_already_read + 2 ] <= '9' | ||
| ) | ||
| ); | ||
| if ( $has_exponent ) { | ||
|
|
@@ -2838,12 +2906,11 @@ private function read_quoted_text(): ?int { | |
| // in which case the escape sequence is consumed and the loop continues. | ||
| $at = $this->bytes_already_read; | ||
| while ( true ) { | ||
| $at += strcspn( $this->sql, $quote, $at ); | ||
|
|
||
| // Unclosed string - unexpected EOF. | ||
| if ( ( $this->sql[ $at ] ?? null ) !== $quote ) { | ||
| $quote_at = strpos( $this->sql, $quote, $at ); | ||
| if ( false === $quote_at ) { | ||
| return null; // Invalid input. | ||
| } | ||
| $at = $quote_at; | ||
|
|
||
| /* | ||
| * By default, quotes can be escaped with a "\". | ||
|
|
@@ -2853,9 +2920,17 @@ private function read_quoted_text(): ?int { | |
| * The quote is escaped only when the number of preceding backslashes | ||
| * is odd - "\" is an escape sequence, "\\" is an escaped backslash, | ||
| * "\\\" is an escaped backslash and an escape sequence, and so on. | ||
| * | ||
| * The `($at - $i - 1) >= 0` guard prevents PHP's negative-string- | ||
| * offset wraparound (PHP 7.1+) when the closing-quote candidate | ||
| * sits at the very start of the input. The `?? null` covers | ||
| * positive out-of-range indexes belt-and-suspenders. | ||
| */ | ||
| if ( ! $no_backslash_escapes ) { | ||
| for ( $i = 0; ( $at - $i - 1 ) >= 0 && '\\' === $this->sql[ $at - $i - 1 ]; $i += 1 ); | ||
| $i = 0; | ||
| while ( ( $at - $i - 1 ) >= 0 && '\\' === ( $this->sql[ $at - $i - 1 ] ?? null ) ) { | ||
| $i += 1; | ||
| } | ||
| if ( 1 === $i % 2 ) { | ||
| $at += 1; | ||
| continue; | ||
|
|
@@ -2920,17 +2995,11 @@ private function read_mysql_comment(): int { | |
| } | ||
|
|
||
| private function read_comment_content(): void { | ||
| while ( true ) { | ||
| $this->bytes_already_read += strcspn( $this->sql, '*', $this->bytes_already_read ); | ||
| $this->bytes_already_read += 1; // Consume the '*'. | ||
| $byte = $this->sql[ $this->bytes_already_read ] ?? null; | ||
| if ( null === $byte ) { | ||
| break; | ||
| } | ||
| if ( '/' === $byte ) { | ||
| $this->bytes_already_read += 1; // Consume the '/'. | ||
| break; | ||
| } | ||
| $comment_end = strpos( $this->sql, '*/', $this->bytes_already_read ); | ||
| if ( false === $comment_end ) { | ||
| $this->bytes_already_read = $this->sql_length; | ||
| } else { | ||
| $this->bytes_already_read = $comment_end + 2; | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed in f9172e1.